Compare commits
30 Commits
iterate-2I
...
iterate-4A
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
23189b95af | ||
|
|
acb29db444 | ||
|
|
dc1320cd4b | ||
|
|
9d24dd0eaa | ||
|
|
c62a355418 | ||
|
|
3f8d3b6f1c | ||
|
|
c0c6088e4d | ||
|
|
f6f3aac673 | ||
|
|
2a992db47b | ||
|
|
89b5c39d8a | ||
|
|
39723dfe37 | ||
|
|
da7c29b6d2 | ||
|
|
1b9918450f | ||
|
|
80fbff8bd1 | ||
|
|
6d8a2817a3 | ||
|
|
a3aa3cc7d6 | ||
|
|
6ff184694d | ||
|
|
504592ac13 | ||
|
|
6bb4355e3d | ||
|
|
3f5d5cf5f7 | ||
|
|
2f55d1fd7d | ||
|
|
a91f4c550b | ||
|
|
66bd805726 | ||
|
|
ad9c8e4cb8 | ||
|
|
873c197ff1 | ||
|
|
1ae472bd2b | ||
|
|
034ec8b47f | ||
|
|
93f60a3ba0 | ||
|
|
2bdb93e51e | ||
|
|
ed2e0e72fd |
4
.gitignore
vendored
4
.gitignore
vendored
@@ -16,3 +16,7 @@ audit-*.md
|
||||
# working dir by the Wine canary build)
|
||||
vkd3d-proton.cache*
|
||||
*.dxvk-cache
|
||||
|
||||
# local analysis-DB backups (regenerable; too large to track)
|
||||
*.db.bak*
|
||||
sylpheed.db.bak-*
|
||||
|
||||
101
Cargo.lock
generated
101
Cargo.lock
generated
@@ -418,6 +418,26 @@ version = "0.22.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
|
||||
|
||||
[[package]]
|
||||
name = "bindgen"
|
||||
version = "0.64.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"cexpr",
|
||||
"clang-sys",
|
||||
"lazy_static",
|
||||
"lazycell",
|
||||
"peeking_take_while",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"regex",
|
||||
"rustc-hash 1.1.0",
|
||||
"shlex",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bit-set"
|
||||
version = "0.6.0"
|
||||
@@ -600,6 +620,15 @@ dependencies = [
|
||||
"shlex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cexpr"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
|
||||
dependencies = [
|
||||
"nom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.4"
|
||||
@@ -639,6 +668,17 @@ dependencies = [
|
||||
"inout",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clang-sys"
|
||||
version = "1.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
|
||||
dependencies = [
|
||||
"glob",
|
||||
"libc",
|
||||
"libloading",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.6.0"
|
||||
@@ -1076,6 +1116,20 @@ version = "2.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
|
||||
|
||||
[[package]]
|
||||
name = "ffmpeg-sys-next"
|
||||
version = "6.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c2529ad916d08c3562c754c21bc9b17a26c7882c0f5706cc2cd69472175f1620"
|
||||
dependencies = [
|
||||
"bindgen",
|
||||
"cc",
|
||||
"libc",
|
||||
"num_cpus",
|
||||
"pkg-config",
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "filetime"
|
||||
version = "0.2.27"
|
||||
@@ -1317,6 +1371,12 @@ dependencies = [
|
||||
"xml-rs",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "glob"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
|
||||
|
||||
[[package]]
|
||||
name = "glow"
|
||||
version = "0.13.1"
|
||||
@@ -1898,6 +1958,12 @@ version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
||||
|
||||
[[package]]
|
||||
name = "lazycell"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
||||
|
||||
[[package]]
|
||||
name = "lexical-core"
|
||||
version = "1.0.6"
|
||||
@@ -2139,6 +2205,12 @@ dependencies = [
|
||||
"sketches-ddsketch",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "minimal-lexical"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.8.9"
|
||||
@@ -2262,6 +2334,16 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "7.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
"minimal-lexical",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.50.3"
|
||||
@@ -2325,6 +2407,16 @@ dependencies = [
|
||||
"libm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_cpus"
|
||||
version = "1.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_enum"
|
||||
version = "0.7.6"
|
||||
@@ -2657,6 +2749,12 @@ version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
|
||||
|
||||
[[package]]
|
||||
name = "peeking_take_while"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
||||
|
||||
[[package]]
|
||||
name = "percent-encoding"
|
||||
version = "2.3.2"
|
||||
@@ -4961,8 +5059,10 @@ dependencies = [
|
||||
name = "xenia-apu"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"ffmpeg-sys-next",
|
||||
"thiserror 2.0.18",
|
||||
"tracing",
|
||||
"xenia-memory",
|
||||
"xenia-types",
|
||||
]
|
||||
|
||||
@@ -5025,6 +5125,7 @@ dependencies = [
|
||||
"metrics",
|
||||
"thiserror 2.0.18",
|
||||
"tracing",
|
||||
"xenia-apu",
|
||||
"xenia-cpu",
|
||||
"xenia-gpu",
|
||||
"xenia-hid",
|
||||
|
||||
133
HANDOFF-iterate-4A-milestone2.md
Normal file
133
HANDOFF-iterate-4A-milestone2.md
Normal file
@@ -0,0 +1,133 @@
|
||||
# Handoff — branch `iterate-4A/apu-xma-stage1` (Milestone 2: intro-video / XMA audio + RE tooling)
|
||||
|
||||
Reverse-engineering Project Sylpheed under this Rust Xbox-360 emulator (`xenia-rs`), using Wine
|
||||
xenia-canary as the ground-truth oracle. This branch carries **Milestone 2** work plus major
|
||||
RE-tooling improvements, on top of the (uncommitted-until-now) Milestone-1 renderer history.
|
||||
|
||||
> Method: first-divergence vs canary · fix causes not symptoms · NO faking/masking · measure the
|
||||
> oracle, never infer · refute before believing · ground every claim in evidence.
|
||||
|
||||
---
|
||||
|
||||
## 0. SET UP ON A NEW MACHINE (do this first)
|
||||
|
||||
### a) FFmpeg system libraries — **REQUIRED to build** (crate `xenia-apu` links them via pkg-config)
|
||||
The XMA audio decoder uses `ffmpeg-sys-next` (`crates/xenia-apu/Cargo.toml`:
|
||||
`ffmpeg-sys-next = { version = "6.1", default-features = false, features = ["avcodec"] }`),
|
||||
which links the **system** FFmpeg dev libraries. Install them:
|
||||
|
||||
```bash
|
||||
sudo apt update
|
||||
sudo apt install -y libavcodec-dev libavformat-dev libavutil-dev libswresample-dev pkg-config ffmpeg
|
||||
```
|
||||
|
||||
Verify the toolchain (the XMA path needs the `xma1`/`xma2` decoders — present in distro FFmpeg ≥ ~2015):
|
||||
```bash
|
||||
pkg-config --modversion libavcodec # expect 60.x (this branch built against 60.31)
|
||||
ffmpeg -hide_banner -decoders | grep -iE 'xma1|xma2' # expect: A....D xma1 / A....D xma2
|
||||
```
|
||||
(Decoder note: distro FFmpeg has **no** `AV_CODEC_ID_XMAFRAMES`; we use `AV_CODEC_ID_XMA2` — see
|
||||
`crates/xenia-apu/src/xma2_codec.rs`.) On non-Debian distros install the equivalent `-dev` packages.
|
||||
|
||||
### b) The game ISO (gitignored — `*.iso`)
|
||||
Not in the repo. Place the Project Sylpheed ISO somewhere and create a `sylpheed.iso` symlink to it
|
||||
in the repo root (the run/test commands use `sylpheed.iso`):
|
||||
```bash
|
||||
ln -s "/path/to/Project Sylpheed - Arc of Deception (USA, Europe) (En,Ja).iso" sylpheed.iso
|
||||
```
|
||||
⚠️ For **canary** runs, point at the REAL ISO path, not the symlink (Wine can't resolve the symlink).
|
||||
|
||||
### c) Build — **always cap parallelism** (a default `-j` build OOM-crashed a 15 GB box)
|
||||
```bash
|
||||
export CARGO_BUILD_JOBS=4 # NEVER default -j12; check `free -h` first, drop to -j2 if <4GB free
|
||||
cargo build --release
|
||||
```
|
||||
|
||||
### d) Regenerate the static-analysis DB `sylpheed.db` (gitignored — `*.db`, ~586 MB, ~1h35m)
|
||||
Used by the RE/analysis queries (NOT needed to run the emulator). Rebuild from the ISO:
|
||||
```bash
|
||||
cargo run --release -- dis "/path/to/<the ISO>" --db sylpheed.db
|
||||
# analysis passes run in <1s; the ~1h35m is DuckDB persisting ~1.8M dispatch rows. Be patient.
|
||||
```
|
||||
This branch's analyzer fix (see §3) makes the regenerated DB include the previously-missing XMV
|
||||
engine vtables (`0x8200a1e8`/`0x8200a908`). A local pre-fix backup may exist as
|
||||
`sylpheed.db.bak-pre-vtablefix` (gitignored, not pushed).
|
||||
|
||||
---
|
||||
|
||||
## 1. WHAT'S ON THIS BRANCH (all in this one commit, on top of `acb29db` = iterate-3AL)
|
||||
**Milestone-1 renderer history** (publisher/dev splash renders) is in the ancestry (iterate-2x → 3M →
|
||||
3O → 3AL); pushing this branch carries it. **Milestone 2** + tooling added here:
|
||||
|
||||
### ✅ XMA AUDIO path — BUILT, WORKING, deterministic, tested
|
||||
- `crates/xenia-apu/src/xma.rs` — register-mapped XMA context system (MMIO `0x7FEA0000`, 320×64B
|
||||
context array, Kick/Lock/Clear decode). `xma_decode.rs` + `xma2_codec.rs` — the real FFmpeg
|
||||
`xma2` decoder (XMA_CONTEXT_DATA bitfields, BitStream packet parse, planar-f32→S16BE PCM).
|
||||
Decode runs synchronously on the CPU thread (deterministic, no host thread). Wired via
|
||||
`KernelState.xma` (`state.rs`), exports (`exports.rs`), `xaudio.rs` (`XAudioSubmitRenderDriverFrame`
|
||||
made faithful), `main.rs` (MMIO install + per-round pump).
|
||||
- **Audio-worker scheduler fix** (`main.rs` LR_HALT restore + `scheduler.rs`): the XAudio render
|
||||
callback worker was wrongly exited after ~2 deliveries → fixed → the guest now drives XMA decode.
|
||||
- Verified: real PCM out; golden `sylpheed_n50m` **re-baselined** (`crates/xenia-app/tests/golden/`)
|
||||
and PASSES; milestone-1 splash intact; apu/cpu/kernel tests pass.
|
||||
|
||||
### 🛠️ RE TOOLING (this branch's lasting wins)
|
||||
- **Runtime dispatch-recorder** `crates/xenia-cpu/src/dispatch_rec.rs` — records `(call-site → target,
|
||||
r3, lr)` for every indirect (`bcctr`-family) call. Off by default; enable with `XENIA_DISPATCH_REC=1`,
|
||||
optional filters `XENIA_DISPATCH_REC_TARGETS=<hex,…>` / `_SITES=<hex,…>`, dumps to
|
||||
`XENIA_DISPATCH_REC_OUT` (default `/tmp/dispatch_rec.txt`). Deterministic, observe-only.
|
||||
- **Repaired static analyzer** `crates/xenia-analysis/src/vtables.rs` — the vtable extractor silently
|
||||
**fragmented vtables with non-function head slots** (missed the XMV engine vtable entirely →
|
||||
blocked ~6 investigations). Fixed via **vptr-write-anchoring** (find `addis/addi → stw rX,0(rThis)`
|
||||
constant-vptr installs; read the fnptr run from each anchor). Result on rebuild: vtables 722→1150,
|
||||
dispatch candidates 688K→1.83M, engine fully typed. (Requires the §0d DB rebuild to take effect.)
|
||||
- **Probe Heisenbug FIXED** (`main.rs run_superblock`) — `--audit-pc-probe-hex` / `--mem-watch` used to
|
||||
**disable superblock chaining**, which changed thread scheduling and *starved the movie subsystem*
|
||||
so the probes couldn't observe it. Now probes fire *inside* the chain loop → scheduling is identical
|
||||
armed-vs-unarmed (verified byte-identical golden) → the probe suite is finally usable on the movie
|
||||
subsystem. Also fixed a `--quiet` bug that swallowed armed `--trace-handles`/`--dump-addr` reports.
|
||||
|
||||
---
|
||||
|
||||
## 2. CURRENT STATE & WHERE TO CONTINUE (the video still doesn't play)
|
||||
**Audio works; the intro VIDEO doesn't play yet.** Root, runtime-pinned: a 2000ms readiness timeout
|
||||
(`sub_821B66B8`) abandons because the XMV engine (`0x40d101c0`, runtime vtable `0x8200a1e8`) never
|
||||
**primes** — engine begin-playback `sub_825076F0` (slot 21) is **never dispatched** (0×), so the
|
||||
per-frame full-start always takes its skip branch and the playback clock never starts.
|
||||
- **Classification: (B) guest-side state machine.** The gate fields are the engine's *correct* reset
|
||||
defaults → there is **NO honest our-side fix at the gate** (forcing them = masking, forbidden). The
|
||||
defect is upstream: the guest SM reaches "create decoder (success)" but never issues begin-playback.
|
||||
- **Latest narrowing (evidence, fixed probes):** ARM2-setup `sub_821B55D8` runs once, create-decoder
|
||||
`sub_824F8398` succeeds, and ARM2 then calls engine-setup wrappers
|
||||
**`sub_824F7778` / `sub_824F7630` / `sub_824F7558` / `sub_824F7538` / `sub_824FCB68`** (on
|
||||
`[movie+104]`=engine) — the begin-playback dispatch is gated **inside one of these**. Tracing them
|
||||
(now possible with the fixed probes) for the begin-playback gate + why ours never satisfies it is
|
||||
**the next step**. The likely ultimate unlock is **measuring canary** (same XEX reaches begin-playback)
|
||||
to find the upstream state/signal we don't produce.
|
||||
|
||||
Full, evidence-grounded detail (engine/vtable/slot map, the eliminations, the investigation arc, the
|
||||
method lessons) lives in the agent-memory grounding file referenced in the project memory index
|
||||
(`milestone2_xma_grounding`). Key anchors: engine `0x40d101c0` vtable `0x8200a1e8` — PUMP slot19
|
||||
`sub_825078D8`, begin-playback slot21 `sub_825076F0`, submit slot27 `sub_82505C08`, full-start slot40
|
||||
`sub_825061E0`; movie host `0x40bb0440` (engine at `[host+104]`); SM ARM1 `sub_821B4C98` → ARM2
|
||||
`sub_821B55D8` → ARM3 `sub_821B5FB8` → poll `sub_821B66B8`.
|
||||
|
||||
### Useful commands
|
||||
```bash
|
||||
# Headless run to the video state (~30-40s, ~1B instr); add diagnostic flags as needed:
|
||||
./target/release/xenia-rs exec sylpheed.iso -n 6000000000 --quiet
|
||||
# Non-perturbing PC probes (now usable on the movie subsystem):
|
||||
RUST_LOG=warn,xenia_apu=info XENIA_AUDIT_PC_PROBE=0x825078d8,0x82505c08 \
|
||||
./target/release/xenia-rs exec sylpheed.iso -n 6000000000 --quiet
|
||||
# Dispatch recorder (filtered):
|
||||
XENIA_DISPATCH_REC=1 XENIA_DISPATCH_REC_TARGETS=0x825076f0,0x82505c08 \
|
||||
./target/release/xenia-rs exec sylpheed.iso -n 6000000000 --quiet
|
||||
# Golden / determinism check:
|
||||
CARGO_BUILD_JOBS=4 cargo test -p xenia-app --release --test sylpheed_oracles -- --ignored sylpheed_n50m
|
||||
# Visual (watch the splash; ASK a human to watch — never self-screenshot):
|
||||
./target/release/xenia-rs exec sylpheed.iso --ui
|
||||
```
|
||||
⚠️ Probe/run discipline: kill background runs by pid or `pkill -x xenia-rs` (NEVER `pkill -f`, it
|
||||
self-matches the launcher). Runs are deterministic (instruction-count clock).
|
||||
|
||||
🤖 Generated with [Claude Code](https://claude.com/claude-code)
|
||||
@@ -26,6 +26,14 @@ use xenia_xex::pe::PeSection;
|
||||
|
||||
use crate::demangle;
|
||||
|
||||
/// Maximum number of consecutive non-function slots tolerated inside an
|
||||
/// anchor-recovered vtable before the run is considered terminated. MSVC
|
||||
/// vtables can carry null / pure-virtual / unrecognised-thunk slots in their
|
||||
/// head or interior; a small budget lets those through without merging two
|
||||
/// physically-adjacent vtables. Kept small to avoid bridging the gap between
|
||||
/// distinct tables.
|
||||
const MAX_ANCHOR_GAP: usize = 2;
|
||||
|
||||
/// One detected vtable.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Vtable {
|
||||
@@ -56,6 +64,35 @@ pub fn analyze(
|
||||
image_base: u32,
|
||||
sections: &[PeSection],
|
||||
function_starts: &std::collections::BTreeSet<u32>,
|
||||
) -> Vec<Vtable> {
|
||||
analyze_with_anchors(pe, image_base, sections, function_starts, &std::collections::BTreeSet::new())
|
||||
}
|
||||
|
||||
/// Like [`analyze`], but additionally recovers vtables whose base address is
|
||||
/// known a-priori from a constructor vptr-write store (an "anchor"). The
|
||||
/// contiguity heuristic in pass 1 fragments any vtable whose head region
|
||||
/// contains words that don't resolve to recognised function entries (null /
|
||||
/// pure-virtual / unrecognised thunk slots); those vtables are never emitted
|
||||
/// and the downstream typed-dispatch resolver can't type objects of that
|
||||
/// class. An anchor is a *content-independent* vtable signal — the ctor
|
||||
/// literally installs `vtable_base` into `this+0` via
|
||||
/// `addis/addi (or lis/ori) → stw rX, 0(rThis)` — so for every anchor not
|
||||
/// already covered by a pass-1 run we synthesise a vtable starting at that
|
||||
/// base, reading the fnptr-array run while *tolerating* up to
|
||||
/// [`MAX_ANCHOR_GAP`] consecutive non-function slots before terminating.
|
||||
///
|
||||
/// `anchors` are absolute VAs of vtable bases (from
|
||||
/// [`scan_vptr_write_constants`]). Existing pass-1 vtables are kept unchanged
|
||||
/// (no regression): an anchor that already coincides with a detected vtable
|
||||
/// base is skipped, and an anchor that lands *inside* an existing run is also
|
||||
/// skipped (it's a sub-object pointer, not a fresh table).
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
|
||||
pub fn analyze_with_anchors(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
sections: &[PeSection],
|
||||
function_starts: &std::collections::BTreeSet<u32>,
|
||||
anchors: &std::collections::BTreeSet<u32>,
|
||||
) -> Vec<Vtable> {
|
||||
let started = std::time::Instant::now();
|
||||
// Sections we'll scan for vtable bodies.
|
||||
@@ -117,6 +154,120 @@ pub fn analyze(
|
||||
let _ = (va_start, va_end);
|
||||
}
|
||||
|
||||
// --- Anchor-driven recovery (vptr-write-anchored vtables) ---
|
||||
//
|
||||
// Build a coverage interval set from pass-1 runs so we don't re-emit a
|
||||
// table for an anchor that already lies within an extracted vtable.
|
||||
let mut covered: Vec<(u32, u32)> = candidates
|
||||
.iter()
|
||||
.map(|v| (v.address, v.address + v.length * 4))
|
||||
.collect();
|
||||
covered.sort_unstable();
|
||||
|
||||
let is_covered = |addr: u32, covered: &[(u32, u32)]| -> bool {
|
||||
covered.iter().any(|&(s, e)| addr >= s && addr < e)
|
||||
};
|
||||
|
||||
// Section lookup for "which scan target contains this VA?"
|
||||
let scan_targets_va: Vec<(u32, u32, usize, usize)> = sections
|
||||
.iter()
|
||||
.filter(|s| matches!(s.name.as_str(), ".rdata" | ".data"))
|
||||
.map(|s| {
|
||||
let va = image_base + s.virtual_address;
|
||||
(
|
||||
va,
|
||||
va + s.virtual_size,
|
||||
s.virtual_address as usize,
|
||||
(s.virtual_address + s.virtual_size) as usize,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Cap a recovered run at the *next anchor* so two physically-adjacent
|
||||
// anchored vtables don't merge. We deliberately do NOT cap at pass-1
|
||||
// fragments: a fragment is a sub-run the contiguity scan carved out of a
|
||||
// larger table, and the anchor legitimately re-absorbs it (subsumed
|
||||
// fragments are removed afterwards).
|
||||
let anchor_bases: std::collections::BTreeSet<u32> = anchors.iter().copied().collect();
|
||||
|
||||
let mut recovered = 0usize;
|
||||
let mut newly: Vec<Vtable> = Vec::new();
|
||||
for &anchor in anchors {
|
||||
if is_covered(anchor, &covered) { continue; }
|
||||
// Locate the containing .rdata/.data section.
|
||||
let Some(&(va_lo, va_hi, raw_lo, raw_hi)) =
|
||||
scan_targets_va.iter().find(|&&(lo, hi, _, _)| anchor >= lo && anchor < hi)
|
||||
else { continue };
|
||||
if anchor % 4 != 0 { continue; }
|
||||
let raw_hi = raw_hi.min(pe.len());
|
||||
// Read the fnptr-array run starting at the anchor. Tolerate small
|
||||
// gaps of non-function slots (null / pure-virtual / unrecognised),
|
||||
// but require the run to actually contain at least one real function
|
||||
// (otherwise it's just data, not a vtable).
|
||||
let next_base = anchor_bases.range((anchor + 4)..).next().copied();
|
||||
let mut methods: Vec<u32> = Vec::new();
|
||||
let mut gap = 0usize;
|
||||
let mut real_fns = 0usize;
|
||||
let mut off = (anchor - va_lo) as usize + raw_lo;
|
||||
let mut va = anchor;
|
||||
while off + 4 <= raw_hi && va < va_hi {
|
||||
if let Some(nb) = next_base && va >= nb { break; }
|
||||
let val = u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]]);
|
||||
if function_starts.contains(&val) {
|
||||
methods.push(val);
|
||||
real_fns += 1;
|
||||
gap = 0;
|
||||
} else {
|
||||
// A non-function slot. Keep the slot (so downstream slot
|
||||
// indexing stays aligned) but count toward the gap budget.
|
||||
gap += 1;
|
||||
if gap > MAX_ANCHOR_GAP {
|
||||
// Drop the trailing gap slots — they belong past the
|
||||
// table's end.
|
||||
methods.truncate(methods.len().saturating_sub(gap - 1));
|
||||
break;
|
||||
}
|
||||
methods.push(val);
|
||||
}
|
||||
off += 4;
|
||||
va += 4;
|
||||
}
|
||||
// Trim any trailing non-function slots (the table ends at its last
|
||||
// real method).
|
||||
while methods.last().is_some_and(|&m| !function_starts.contains(&m)) {
|
||||
methods.pop();
|
||||
}
|
||||
if real_fns == 0 || methods.is_empty() { continue; }
|
||||
let length = methods.len() as u32;
|
||||
newly.push(Vtable {
|
||||
address: anchor,
|
||||
length,
|
||||
col_address: None,
|
||||
class_name: synth_anon_name(&methods),
|
||||
rtti_present: false,
|
||||
base_classes_json: None,
|
||||
methods,
|
||||
});
|
||||
recovered += 1;
|
||||
}
|
||||
if recovered > 0 {
|
||||
// Drop pass-1 fragments fully subsumed by a recovered (anchored)
|
||||
// vtable — the anchor base is authoritative and the fragment was a
|
||||
// contiguity-scan artifact of the same table. Keep fragments that
|
||||
// only partially overlap (defensive; shouldn't happen for true
|
||||
// sub-runs) so we never lose method coverage.
|
||||
let recovered_spans: Vec<(u32, u32)> =
|
||||
newly.iter().map(|v| (v.address, v.address + v.length * 4)).collect();
|
||||
candidates.retain(|v| {
|
||||
!recovered_spans
|
||||
.iter()
|
||||
.any(|&(s, e)| v.address >= s && v.address + v.length * 4 <= e)
|
||||
});
|
||||
candidates.extend(newly);
|
||||
tracing::info!(recovered, "vtables recovered from vptr-write anchors");
|
||||
}
|
||||
let _ = &covered;
|
||||
|
||||
// RTTI walk: for each candidate, look at vtable[-1].
|
||||
let pe_image_base = image_base;
|
||||
for v in &mut candidates {
|
||||
@@ -268,6 +419,98 @@ fn read_class_hierarchy(
|
||||
serde_json::to_string(&names).ok()
|
||||
}
|
||||
|
||||
/// Pre-pass: discover candidate vtable *bases* from constructor vptr-write
|
||||
/// stores, independent of the static contiguity heuristic. A vptr install is
|
||||
/// the canonical `addis/addi` (or `lis/ori`) immediate build of a constant
|
||||
/// pointing into `.rdata` / `.data`, followed by `stw rX, 0(rThis)` — i.e. the
|
||||
/// ctor writing the vtable pointer to `this+0`. We return the set of such
|
||||
/// constants; these are fed to [`analyze_with_anchors`] so a vtable with
|
||||
/// non-function head words isn't lost.
|
||||
///
|
||||
/// We only consider stores at displacement 0 (the primary vptr; secondary
|
||||
/// MI vptrs land at non-zero offsets and are handled by the existing
|
||||
/// contiguity scan / typed-dispatch resolver well enough). The register
|
||||
/// tracker mirrors the lis+addi propagation used elsewhere and is reset at
|
||||
/// every basic-block boundary (`block_boundaries`).
|
||||
pub fn scan_vptr_write_constants(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
functions: &std::collections::BTreeMap<u32, (u32, bool)>, // start -> (end, is_saverestore)
|
||||
sections: &[PeSection],
|
||||
block_boundaries: &std::collections::HashSet<u32>,
|
||||
) -> std::collections::BTreeSet<u32> {
|
||||
// Ranges that a vtable base may legitimately live in.
|
||||
let data_ranges: Vec<(u32, u32)> = sections
|
||||
.iter()
|
||||
.filter(|s| matches!(s.name.as_str(), ".rdata" | ".data"))
|
||||
.map(|s| (image_base + s.virtual_address, image_base + s.virtual_address + s.virtual_size))
|
||||
.collect();
|
||||
let in_data = |a: u32| data_ranges.iter().any(|&(s, e)| a >= s && a < e);
|
||||
|
||||
const OP_ADDI: u32 = 14;
|
||||
const OP_ADDIS: u32 = 15;
|
||||
const OP_ORI: u32 = 24;
|
||||
const OP_STW: u32 = 36;
|
||||
const OP_X_FORM: u32 = 31;
|
||||
|
||||
let read = |addr: u32| -> Option<u32> {
|
||||
let off = addr.wrapping_sub(image_base) as usize;
|
||||
if off + 4 > pe.len() { return None; }
|
||||
Some(u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]]))
|
||||
};
|
||||
|
||||
let mut anchors: std::collections::BTreeSet<u32> = std::collections::BTreeSet::new();
|
||||
for (&fn_start, &(fn_end, is_saverestore)) in functions {
|
||||
if is_saverestore { continue; }
|
||||
let mut reg: [Option<u32>; 32] = [None; 32];
|
||||
let mut pc = fn_start;
|
||||
while pc < fn_end {
|
||||
if pc != fn_start && block_boundaries.contains(&pc) {
|
||||
reg = [None; 32];
|
||||
}
|
||||
let Some(instr) = read(pc) else { break };
|
||||
let op = instr >> 26;
|
||||
let rd = ((instr >> 21) & 0x1F) as usize;
|
||||
let ra = ((instr >> 16) & 0x1F) as usize;
|
||||
let simm = ((instr & 0xFFFF) as i16) as i32;
|
||||
let uimm = instr & 0xFFFF;
|
||||
match op {
|
||||
OP_ADDIS if ra == 0 => reg[rd] = Some(uimm << 16),
|
||||
OP_ADDIS => reg[rd] = reg[ra].map(|b| b.wrapping_add(uimm << 16)),
|
||||
OP_ADDI if ra != 0 => reg[rd] = reg[ra].map(|b| b.wrapping_add(simm as u32)),
|
||||
OP_ADDI => reg[rd] = Some(simm as u32),
|
||||
OP_ORI => {
|
||||
let rs = rd;
|
||||
reg[ra] = reg[rs].map(|b| b | uimm);
|
||||
}
|
||||
OP_STW => {
|
||||
// `stw rS, off(rA)` with displacement 0 = primary vptr install.
|
||||
if ra != 0
|
||||
&& simm == 0
|
||||
&& let Some(val) = reg[rd]
|
||||
&& in_data(val)
|
||||
{
|
||||
anchors.insert(val);
|
||||
}
|
||||
}
|
||||
32..=35 | 40..=43 | 48..=51 => reg[rd] = None,
|
||||
OP_X_FORM => {
|
||||
let xo = (instr >> 1) & 0x3FF;
|
||||
if xo != 444 && xo != 467 { reg[rd] = None; } // keep `or`(444=mr)/`mtspr`-ish
|
||||
}
|
||||
18 | 16 => {
|
||||
if (instr & 1) != 0 {
|
||||
for r in 0..=12 { reg[r] = None; }
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
pc = pc.wrapping_add(4);
|
||||
}
|
||||
}
|
||||
anchors
|
||||
}
|
||||
|
||||
/// Synthetic name for an RTTI-stripped vtable, derived from a stable hash of
|
||||
/// the sorted method-PC list. Two vtables with identical method ordering
|
||||
/// collapse to the same anonymous name.
|
||||
@@ -385,6 +628,112 @@ mod tests {
|
||||
assert!(!vtables[0].rtti_present);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn anchor_recovers_vtable_with_nonfn_head() {
|
||||
// A vtable whose head has a null + an unrecognised word, so the
|
||||
// contiguity scan (≥3 contiguous known fns) fragments it. The anchor
|
||||
// (from a ctor vptr-write) must recover the whole table from its base.
|
||||
let image_base = 0x82000000u32;
|
||||
let rdata_va = 0x1000u32;
|
||||
let text_va = 0x2000u32;
|
||||
let rdata_size = 0x40u32;
|
||||
let text_size = 0x100u32;
|
||||
let total = (text_va + text_size) as usize;
|
||||
let mut pe = vec![0u8; total];
|
||||
|
||||
let f0 = image_base + text_va;
|
||||
let f1 = image_base + text_va + 0x10;
|
||||
let f2 = image_base + text_va + 0x20;
|
||||
// Slots: [null, NONFN(0xDEAD), f0, f1, f2]
|
||||
let slots: [u32; 5] = [0, 0xDEADBEEF, f0, f1, f2];
|
||||
for (i, val) in slots.iter().enumerate() {
|
||||
pe[rdata_va as usize + i * 4..rdata_va as usize + (i + 1) * 4]
|
||||
.copy_from_slice(&val.to_be_bytes());
|
||||
}
|
||||
|
||||
let sections = vec![
|
||||
PeSection {
|
||||
name: ".rdata".into(),
|
||||
virtual_address: rdata_va,
|
||||
virtual_size: rdata_size,
|
||||
raw_offset: rdata_va,
|
||||
raw_size: rdata_size,
|
||||
flags: 0x4000_0040,
|
||||
},
|
||||
PeSection {
|
||||
name: ".text".into(),
|
||||
virtual_address: text_va,
|
||||
virtual_size: text_size,
|
||||
raw_offset: text_va,
|
||||
raw_size: text_size,
|
||||
flags: 0x6000_0020,
|
||||
},
|
||||
];
|
||||
let mut function_starts = std::collections::BTreeSet::new();
|
||||
for &pc in &[f0, f1, f2] { function_starts.insert(pc); }
|
||||
|
||||
// Without an anchor: the head gap (null + nonfn = 2 slots) means the
|
||||
// contiguous run is only [f0,f1,f2]=3 starting at +0x08, so pass-1
|
||||
// still finds it but at the WRONG base (0x...1008), not the true base.
|
||||
let no_anchor = analyze(&pe, image_base, §ions, &function_starts);
|
||||
assert!(
|
||||
!no_anchor.iter().any(|v| v.address == image_base + rdata_va),
|
||||
"without anchor the table is not recovered at its true base"
|
||||
);
|
||||
|
||||
// With the anchor at the true base:
|
||||
let mut anchors = std::collections::BTreeSet::new();
|
||||
anchors.insert(image_base + rdata_va);
|
||||
let with_anchor =
|
||||
analyze_with_anchors(&pe, image_base, §ions, &function_starts, &anchors);
|
||||
let v = with_anchor
|
||||
.iter()
|
||||
.find(|v| v.address == image_base + rdata_va)
|
||||
.expect("anchor must recover vtable at its true base");
|
||||
// length spans through f2 (slot 4): 5 slots.
|
||||
assert_eq!(v.length, 5, "table spans null/nonfn head through last fn");
|
||||
assert_eq!(v.methods[2], f0);
|
||||
assert_eq!(v.methods[4], f2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scan_vptr_write_constants_finds_ctor_store() {
|
||||
// Encode a ctor: addis r11,r0,0x8201; addi r11,r11,lo; stw r11,0(r31)
|
||||
// installing vtable base 0x8200A908 into this+0.
|
||||
let image_base = 0x82000000u32;
|
||||
let ctor = 0x82001000u32;
|
||||
let mut pe = vec![0u8; 0x4000];
|
||||
// Lay out a tiny .rdata at 0x...A900 so the constant lands in-range.
|
||||
let vt_base = 0x8200A908u32; // 0x82010000 - 22264
|
||||
let addis = (15u32 << 26) | (11 << 21) | (0 << 16) | 0x8201;
|
||||
let lo = (vt_base & 0xFFFF) as i16; // -22264
|
||||
let addi = (14u32 << 26) | (11 << 21) | (0 << 16) | ((lo as u16) as u32);
|
||||
// addi r11,r0,lo would set r11=lo (sign-extended); we need addis+addi
|
||||
// chained. Re-encode addis into r11 from r0, then addi r11,r11,lo.
|
||||
let addi2 = (14u32 << 26) | (11 << 21) | (11 << 16) | ((lo as u16) as u32);
|
||||
let stw = (36u32 << 26) | (11 << 21) | (31 << 16) | 0; // stw r11,0(r31)
|
||||
let at = (ctor - image_base) as usize;
|
||||
pe[at..at + 4].copy_from_slice(&addis.to_be_bytes());
|
||||
pe[at + 4..at + 8].copy_from_slice(&addi2.to_be_bytes());
|
||||
pe[at + 8..at + 12].copy_from_slice(&stw.to_be_bytes());
|
||||
let _ = addi;
|
||||
|
||||
let sections = vec![PeSection {
|
||||
name: ".rdata".into(),
|
||||
virtual_address: 0xA900,
|
||||
virtual_size: 0x200,
|
||||
raw_offset: 0xA900,
|
||||
raw_size: 0x200,
|
||||
flags: 0x4000_0040,
|
||||
}];
|
||||
let mut funcs: std::collections::BTreeMap<u32, (u32, bool)> = std::collections::BTreeMap::new();
|
||||
funcs.insert(ctor, (ctor + 0x40, false));
|
||||
let anchors = scan_vptr_write_constants(
|
||||
&pe, image_base, &funcs, §ions, &std::collections::HashSet::new(),
|
||||
);
|
||||
assert!(anchors.contains(&vt_base), "ctor vptr store must yield anchor {vt_base:#x}, got {anchors:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rejects_2_method_run() {
|
||||
let image_base = 0x82000000u32;
|
||||
|
||||
@@ -415,6 +415,18 @@ fn main() -> Result<()> {
|
||||
// metrics summary.
|
||||
let _obs = observability::init(&config)?;
|
||||
|
||||
// Env-gated indirect-dispatch recorder (off by default). Resolve the env
|
||||
// once here; a scope guard dumps the recorded (call_site -> target) table
|
||||
// at end-of-run no matter how the run terminates.
|
||||
xenia_cpu::dispatch_rec::install();
|
||||
struct DispatchRecGuard;
|
||||
impl Drop for DispatchRecGuard {
|
||||
fn drop(&mut self) {
|
||||
xenia_cpu::dispatch_rec::dump();
|
||||
}
|
||||
}
|
||||
let _dispatch_rec_guard = DispatchRecGuard;
|
||||
|
||||
let result = match cli.command {
|
||||
Commands::Disasm { path, count, at } => cmd_disasm(&path, count, at),
|
||||
Commands::Exec {
|
||||
@@ -1437,6 +1449,45 @@ fn cmd_exec_inner(
|
||||
// atoms that live inside `kernel.gpu.mmio`.
|
||||
mem.add_mmio_region(xenia_gpu::build_mmio_region(kernel.gpu.mmio()));
|
||||
|
||||
// apu stage 1 — reserve the 320-entry XMA context array and install the
|
||||
// `0x7FEA0000` register aperture (mirrors canary's `XmaDecoder::Setup`).
|
||||
//
|
||||
// Physical placement: canary stores a *physical* address in
|
||||
// `ContextArrayAddress` (reg 0x600) — `PhysicalHeap::GetPhysicalAddress`
|
||||
// returns `va - heap_base` (== `va & 0x1FFFFFFF` for the physical heaps).
|
||||
// Our memory model is FLAT: `translate_virtual` is a raw `membase + addr`
|
||||
// with no separate physical-window mirror, and `translate_physical` masks
|
||||
// `& 0x1FFFFFFF` — so the two only coincide for low (`< 0x2000_0000`) VAs.
|
||||
// `heap_alloc` returns a `0x40000000`-region VA, so `va & 0x1FFFFFFF` would
|
||||
// be 0 (disagreeing with the context pointers `XMACreateContext` hands out
|
||||
// at `va + i*64`). The guest reads `ContextArrayAddress` and indexes it as
|
||||
// `base + i*64`; for that to equal the pointers it dereferences, the base
|
||||
// MUST equal the VA. So we advertise `va` itself — self-consistent in the
|
||||
// flat model (the guest reaches every context through the same VA space).
|
||||
// Stage 3's decoder will read the context structs via this VA directly
|
||||
// (not via `translate_physical`). The 20480-byte buffer is page-committed
|
||||
// by `heap_alloc`, so the guest never faults writing the 64-byte structs.
|
||||
{
|
||||
let array_size =
|
||||
(xenia_apu::XMA_CONTEXT_COUNT as u32) * xenia_apu::XMA_CONTEXT_SIZE; // 320 * 64
|
||||
match kernel.heap_alloc(array_size, &mem) {
|
||||
Some(va) => {
|
||||
let phys = va; // flat model: array base == VA (see note above)
|
||||
kernel.xma.lock().unwrap().init(va, phys);
|
||||
mem.add_mmio_region(xenia_apu::build_mmio_region(kernel.xma.clone()));
|
||||
tracing::info!(
|
||||
va = format_args!("{va:#010x}"),
|
||||
phys = format_args!("{phys:#010x}"),
|
||||
size = format_args!("{array_size:#x}"),
|
||||
"xma: context array reserved + 0x7FEA0000 aperture installed"
|
||||
);
|
||||
}
|
||||
None => {
|
||||
tracing::error!("xma: failed to reserve context array (heap exhausted)");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Install the initial guest thread on HW slot 0. The thread handle we
|
||||
// hand the scheduler isn't visible to any guest API yet, but joiners
|
||||
// (XThreadWait-style) will see it via `find_by_tid`.
|
||||
@@ -1497,16 +1548,28 @@ fn cmd_exec_inner(
|
||||
mem.write_u32(addr, block);
|
||||
}
|
||||
("xboxkrnl.exe", 0x00AD) => {
|
||||
// KeTimeStampBundle — 0x18 block with FILETIME at +0 and
|
||||
// interrupt-time u64 at +0x10. Mirrors the clock used by
|
||||
// KeQuerySystemTime so fast-path readers see consistent values.
|
||||
// KeTimeStampBundle — X_TIME_STAMP_BUNDLE (canary layout,
|
||||
// kernel_state.h): +0x00 interrupt_time u64, +0x08
|
||||
// system_time u64 (FILETIME 100ns), +0x10 tick_count u32
|
||||
// (milliseconds since boot), +0x14 padding. The guest's
|
||||
// worker-hub channel-dispatch loop (sub_82450A68 @
|
||||
// 0x82450b10) polls [block+0x10] (tick_count) and gates
|
||||
// dispatch on a `tick_count + 66` (ms) deadline. The block
|
||||
// MUST be ticked over the run or that deadline never
|
||||
// elapses (tid14 0x109c starvation gate). Initialize to a
|
||||
// zero-uptime base; KernelState::update_timestamp_bundle
|
||||
// ticks it every round from the deterministic global_clock.
|
||||
let block = alloc_zero(0x18, &mut mem, &mut kernel);
|
||||
if block != 0 {
|
||||
let fake_time: u64 = 132_500_000_000_000_000; // ~2021 FILETIME
|
||||
mem.write_u32(block, (fake_time >> 32) as u32);
|
||||
mem.write_u32(block + 4, fake_time as u32);
|
||||
mem.write_u32(block + 0x10, (fake_time >> 32) as u32);
|
||||
mem.write_u32(block + 0x14, fake_time as u32);
|
||||
// FILETIME base (~2021) so system_time is plausible.
|
||||
let fake_time: u64 = 132_500_000_000_000_000;
|
||||
mem.write_u32(block, 0); // interrupt_time hi
|
||||
mem.write_u32(block + 4, 0); // interrupt_time lo
|
||||
mem.write_u32(block + 0x08, (fake_time >> 32) as u32); // system_time hi
|
||||
mem.write_u32(block + 0x0C, fake_time as u32); // system_time lo
|
||||
mem.write_u32(block + 0x10, 0); // tick_count (ms) = 0 at boot
|
||||
mem.write_u32(block + 0x14, 0); // padding
|
||||
kernel.timestamp_bundle_addr = block;
|
||||
}
|
||||
mem.write_u32(addr, block);
|
||||
}
|
||||
@@ -1528,8 +1591,19 @@ fn cmd_exec_inner(
|
||||
mem.write_u32(addr, block);
|
||||
}
|
||||
("xboxkrnl.exe", 0x01BE) => {
|
||||
// VdGlobalDevice — passed through to Vd* shims. Write 0.
|
||||
mem.write_u32(addr, 0);
|
||||
// VdGlobalDevice — a *pointer to* a global D3D-device cell.
|
||||
// Mirror xenia-canary RegisterVideoExports (xboxkrnl_video.cc:
|
||||
// 557-564): allocate a 4-byte cell, point the import slot at
|
||||
// it, and zero the cell. The guest's graphics init then stores
|
||||
// its device object INTO the cell (e.g. sub_824C6DC0 @
|
||||
// 0x824C6F18 `stw r31, 0([0x82000750])`), and the swap-complete
|
||||
// callback sub_824CE2B8 reads it back via the two-level
|
||||
// `[[VdGlobalDevice]+0]+15160` to bump the swap counter (clock
|
||||
// B). Writing 0 directly here (the old behaviour) made that
|
||||
// store land at address 0 and the swap counter never advance —
|
||||
// freezing the title-loop's per-frame manager update.
|
||||
let cell = alloc_zero(0x4, &mut mem, &mut kernel);
|
||||
mem.write_u32(addr, cell);
|
||||
}
|
||||
("xboxkrnl.exe", 0x01C0) => {
|
||||
// VdGpuClockInMHz
|
||||
@@ -2128,7 +2202,13 @@ fn coord_pre_round(
|
||||
let fired = if kernel.parallel_active {
|
||||
kernel.interrupts.tick_vsync_wallclock()
|
||||
} else {
|
||||
kernel.interrupts.tick_vsync_instr(stats.instruction_count)
|
||||
// iterate-3AJ: present-anchored — pass the guest's live present
|
||||
// (`VdSwap`) count so vsync tracks the real present rate once the
|
||||
// guest is presenting (≈1 vblank/present), instead of firing a
|
||||
// fixed instruction quantum that over-fires ~66× during one heavy
|
||||
// splash asset-load frame and collapsed the logo fade-in.
|
||||
let presents = kernel.gpu.swaps_seen();
|
||||
kernel.interrupts.tick_vsync_instr(stats.instruction_count, presents)
|
||||
};
|
||||
if fired {
|
||||
use std::sync::atomic::Ordering;
|
||||
@@ -2297,8 +2377,19 @@ fn coord_post_round(
|
||||
let mut gpu_runs = (executed_this_round
|
||||
/ xenia_cpu::scheduler::HW_THREAD_COUNT as u64)
|
||||
.max(1);
|
||||
if gpu_runs > 64 {
|
||||
gpu_runs = 64;
|
||||
// Fairness cap on GPU commands drained per round. Must scale with the
|
||||
// per-round instruction volume: with the superblock runner a single
|
||||
// round legitimately retires up to ~SUPERBLOCK_INSTR_BUDGET per slot
|
||||
// (vs ~6 for the old one-block path), so the rate `executed/6` is much
|
||||
// higher and a flat cap of 64 throttled GPU command processing ~17×
|
||||
// (packets 50279→1861 @50M) — collapsing the present loop / splash.
|
||||
// Cap at the budget so the GPU keeps pace with the CPU at the same
|
||||
// per-instruction rate the one-block path had. The inner loop already
|
||||
// early-breaks on `!gpu.is_ready`, so this only bounds a pathological
|
||||
// backlog, never busy-spins.
|
||||
let gpu_cap = superblock_budget().max(64);
|
||||
if gpu_runs > gpu_cap {
|
||||
gpu_runs = gpu_cap;
|
||||
}
|
||||
if let Some(gpu) = kernel.gpu.as_inline_mut() {
|
||||
gpu.sync_with_mmio();
|
||||
@@ -2314,11 +2405,31 @@ fn coord_post_round(
|
||||
let _ = gpu_runs;
|
||||
}
|
||||
|
||||
// APU stage 3 — pump the XMA decoder on the CPU thread, same cadence as the
|
||||
// inline GPU. Deterministic (no host thread / clock): for each context with
|
||||
// a pending kick it runs one Work() pass, decoding the guest's XMA packets
|
||||
// into PCM and writing it back into the output ring + context struct.
|
||||
if let Ok(mut xma) = kernel.xma.try_lock() {
|
||||
xma.decode_pending(mem);
|
||||
}
|
||||
|
||||
if kernel.gpu.has_pending_interrupts() {
|
||||
for _pi in kernel.gpu.take_pending_interrupts() {
|
||||
for pi in kernel.gpu.take_pending_interrupts() {
|
||||
// Canary `ExecutePacketType3_INTERRUPT` dispatches the callback
|
||||
// once per set bit of `cpu_mask` with that bit's index as the
|
||||
// target CPU (`DispatchInterruptCallback(1, n)`). The guest's
|
||||
// swap-acknowledge fence stores `cpu_mask`, and the ISR clears
|
||||
// `1 << current_cpu` from it — so the ISR must run impersonating
|
||||
// the masked CPU or the fence never reaches 0. Sylpheed uses a
|
||||
// single-bit mask (`0x4` → CPU 2); take the lowest set bit.
|
||||
let cpu = if pi.cpu_mask == 0 {
|
||||
xenia_kernel::interrupts::VSYNC_TARGET_CPU
|
||||
} else {
|
||||
pi.cpu_mask.trailing_zeros().min(5) as u8
|
||||
};
|
||||
kernel
|
||||
.interrupts
|
||||
.queue_interrupt(xenia_kernel::INTERRUPT_SOURCE_CP);
|
||||
.queue_interrupt(xenia_kernel::INTERRUPT_SOURCE_CP, cpu);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2393,7 +2504,7 @@ fn worker_prologue(
|
||||
stats: &mut ExecStats,
|
||||
) -> PrologueOutcome {
|
||||
use xenia_cpu::interpreter::{step_cached, StepResult};
|
||||
use xenia_cpu::scheduler::{HwState, INITIAL_GUEST_TID};
|
||||
use xenia_cpu::scheduler::{BlockReason, HwState, INITIAL_GUEST_TID};
|
||||
use xenia_cpu::PpcOpcode;
|
||||
const LR_HALT: u32 = xenia_cpu::context::LR_HALT_SENTINEL as u32;
|
||||
|
||||
@@ -2418,10 +2529,19 @@ fn worker_prologue(
|
||||
// and println one record. Read-only; lockstep digest unaffected.
|
||||
// Empty set is the common case → single `is_empty()` test inside
|
||||
// the helper, no overhead on the hot path.
|
||||
kernel.fire_ctor_probe_if_match(hw_id, mem);
|
||||
kernel.fire_branch_probe_if_match(hw_id);
|
||||
kernel.fire_audit_pc_probe_if_match(hw_id, mem);
|
||||
kernel.fire_lr_trace_if_match(hw_id);
|
||||
// Perf (Tier-A #3): all four `fire_*_if_match` helpers early-return
|
||||
// on an empty registry, but paying 4× call overhead per slot-visit
|
||||
// (~3.2M visits boot-to-splash) is itself measurable. Gate the whole
|
||||
// group behind a single `any_probe_active()` predicted branch so the
|
||||
// common (no-probe) headless path never even makes the calls. When a
|
||||
// probe IS configured each helper still re-checks its own set, so
|
||||
// behaviour is identical either way.
|
||||
if kernel.any_probe_active() {
|
||||
kernel.fire_ctor_probe_if_match(hw_id, mem);
|
||||
kernel.fire_branch_probe_if_match(hw_id);
|
||||
kernel.fire_audit_pc_probe_if_match(hw_id, mem);
|
||||
kernel.fire_lr_trace_if_match(hw_id);
|
||||
}
|
||||
|
||||
if mem.has_mem_watch() {
|
||||
let ctx = kernel.scheduler.ctx(hw_id);
|
||||
@@ -2431,12 +2551,26 @@ fn worker_prologue(
|
||||
|
||||
// 1) Halt-sentinel check (per HW thread).
|
||||
if pc == LR_HALT {
|
||||
// iterate-4A: the async audio-callback injection (`try_inject_audio_callback`)
|
||||
// sets `interrupts.saved`/`injected_ref` to the dedicated audio
|
||||
// worker and runs REAL guest code (`sub_824D29F0`, which calls
|
||||
// blocking kernel APIs) across MANY scheduler rounds before
|
||||
// returning to `LR_HALT_SENTINEL`. The restore must fire only when
|
||||
// the thread that *actually* reached the sentinel is the injected
|
||||
// worker itself — i.e. the FULL `ThreadRef` (hw_id AND idx), which
|
||||
// `scheduler.current` holds after `begin_slot_visit`. Matching on
|
||||
// `hw_id` alone let ANY OTHER thread sharing that HW slot reach
|
||||
// `LR_HALT` and consume the audio worker's `saved` slot; when the
|
||||
// worker later truly returned, `saved` was already `None`, the
|
||||
// guard failed, and control fell through to "marking exited" — the
|
||||
// worker was removed and every subsequent audio callback dropped
|
||||
// (`find_by_handle` skips Exited threads). The graphics ISR path is
|
||||
// fully synchronous (`dispatch_graphics_interrupts` restores inline
|
||||
// and never leaves `interrupts.saved` set across rounds), so this
|
||||
// restore lifecycle is exclusive to audio and graphics is
|
||||
// unaffected.
|
||||
let injected_here = kernel.interrupts.saved.is_some()
|
||||
&& kernel
|
||||
.interrupts
|
||||
.injected_ref
|
||||
.map(|r| r.hw_id == hw_id)
|
||||
== Some(true);
|
||||
&& kernel.interrupts.injected_ref == kernel.scheduler.current;
|
||||
if injected_here
|
||||
&& let Some(saved) = kernel.interrupts.saved.take()
|
||||
{
|
||||
@@ -2448,17 +2582,64 @@ fn worker_prologue(
|
||||
kernel.interrupts.delivered += 1;
|
||||
let source = saved.source;
|
||||
let mut restore_outcome = "ready";
|
||||
let current = kernel.scheduler.thread(target_ref).state.clone();
|
||||
if let HwState::ServicingIrq(reason) = current {
|
||||
kernel.scheduler.thread_mut(target_ref).state =
|
||||
HwState::Blocked(reason);
|
||||
restore_outcome = "reblocked";
|
||||
|
||||
// iterate-4A: the dedicated audio worker's canonical resting
|
||||
// state is "parked on its synthetic handle, awaiting the next
|
||||
// callback injection". The callback (`sub_824D29F0`) runs real
|
||||
// guest code that can be flipped `ServicingIrq -> Ready` by an
|
||||
// intervening `wake_ref` (a `KeSetEvent`/timeout targeting the
|
||||
// worker as a waiter mid-callback). The old re-block heuristic
|
||||
// only re-parked when the state was *still* `ServicingIrq`, so
|
||||
// such a wake left the worker `Ready` — it then ran its thread
|
||||
// entry to the `LR_HALT` sentinel, EXITED, and every subsequent
|
||||
// callback dropped (`find_by_handle` skips Exited workers),
|
||||
// wedging the intro-video audio→XMA pipeline. When this restore
|
||||
// is an audio callback (`source == INTERRUPT_SOURCE_AUDIO`),
|
||||
// re-park the worker UNCONDITIONALLY onto its synthetic
|
||||
// park-handle so it survives to receive the next fire. (Graphics
|
||||
// restores keep the `ServicingIrq`-only re-block: a graphics
|
||||
// victim is a borrowed real thread, not a parked worker, and the
|
||||
// old behavior there must stay byte-identical.)
|
||||
if source == xenia_kernel::INTERRUPT_SOURCE_AUDIO {
|
||||
let worker_handle =
|
||||
kernel.scheduler.thread(target_ref).thread_handle;
|
||||
let index = worker_handle.and_then(|h| {
|
||||
kernel
|
||||
.xaudio
|
||||
.worker_handles
|
||||
.iter()
|
||||
.position(|wh| *wh == Some(h))
|
||||
});
|
||||
if let Some(index) = index {
|
||||
let park = xenia_kernel::xaudio::synthetic_park_handle(index);
|
||||
kernel.scheduler.thread_mut(target_ref).state =
|
||||
HwState::Blocked(BlockReason::WaitAny {
|
||||
handles: vec![park],
|
||||
deadline: None,
|
||||
});
|
||||
restore_outcome = "reparked";
|
||||
} else if let HwState::ServicingIrq(reason) =
|
||||
kernel.scheduler.thread(target_ref).state.clone()
|
||||
{
|
||||
// Fallback (handle unresolved): preserve the legacy
|
||||
// ServicingIrq-only re-block rather than leak the worker.
|
||||
kernel.scheduler.thread_mut(target_ref).state =
|
||||
HwState::Blocked(reason);
|
||||
restore_outcome = "reblocked";
|
||||
}
|
||||
} else {
|
||||
let current = kernel.scheduler.thread(target_ref).state.clone();
|
||||
if let HwState::ServicingIrq(reason) = current {
|
||||
kernel.scheduler.thread_mut(target_ref).state =
|
||||
HwState::Blocked(reason);
|
||||
restore_outcome = "reblocked";
|
||||
}
|
||||
}
|
||||
tracing::debug!(
|
||||
source,
|
||||
hw_id,
|
||||
outcome = restore_outcome,
|
||||
"graphics interrupt: callback returned"
|
||||
"interrupt: callback returned"
|
||||
);
|
||||
return PrologueOutcome::Continue;
|
||||
}
|
||||
@@ -2487,8 +2668,15 @@ fn worker_prologue(
|
||||
return PrologueOutcome::Continue;
|
||||
}
|
||||
|
||||
// 2) Import thunk intercept.
|
||||
if let Some((module, ordinal, name)) = thunk_map.get(&pc) {
|
||||
// 2) Import thunk intercept. Perf (Tier-A #4): import thunks occupy a
|
||||
// small contiguous address band; the overwhelming majority of executing
|
||||
// PCs are ordinary guest code outside it. Range-reject against the band
|
||||
// (two integer compares) before paying the `thunk_map` hash. Faithful
|
||||
// no-op — any in-band PC still goes through the exact map lookup, and an
|
||||
// out-of-band PC can never be a registered thunk.
|
||||
if kernel.pc_in_thunk_band(pc)
|
||||
&& let Some((module, ordinal, name)) = thunk_map.get(&pc)
|
||||
{
|
||||
let module = *module;
|
||||
let ordinal_u32 = *ordinal as u32;
|
||||
let thunk_pc = pc;
|
||||
@@ -2755,6 +2943,212 @@ fn worker_epilogue(
|
||||
SlotOutcome::Continue
|
||||
}
|
||||
|
||||
/// Hard cap on the number of guest instructions a single superblock
|
||||
/// runner invocation executes before returning to the round-robin
|
||||
/// scheduler. Bounds how coarse the lockstep interleaving can get: a
|
||||
/// larger budget amortizes more per-round/per-slot tax (faster) but
|
||||
/// runs one HW thread for longer between scheduler returns (coarser
|
||||
/// cross-thread interleaving). 1024 keeps a slot-visit ~170× longer
|
||||
/// than the old single-block (~6 instr) granularity while still
|
||||
/// returning to the round well inside a single 50k quantum. Purely an
|
||||
/// instruction count → deterministic, schedule reproduces byte-identically.
|
||||
///
|
||||
/// Tuned empirically on the Sylpheed boot-to-splash workload (iterate-3AL):
|
||||
/// budgets up to 256 keep boot progression byte-for-byte healthy (draws /
|
||||
/// swaps / packets track the one-block baseline), then a sharp cliff at
|
||||
/// ~384 collapses the present loop (a producer/consumer boot handoff
|
||||
/// starves when one slot runs too long without returning to the round).
|
||||
/// 128 sits 3× below that cliff with ~1.65× boot-to-splash speedup — a
|
||||
/// deliberately conservative pick (correctness over the last few %). The
|
||||
/// `XENIA_SUPERBLOCK_BUDGET` env var overrides it for further tuning.
|
||||
const SUPERBLOCK_INSTR_BUDGET: u64 = 128;
|
||||
|
||||
/// Effective superblock budget. Defaults to [`SUPERBLOCK_INSTR_BUDGET`];
|
||||
/// `XENIA_SUPERBLOCK_BUDGET` overrides it (A/B tuning without a rebuild).
|
||||
/// A budget of 1 reproduces the old one-block-per-slot-visit behaviour
|
||||
/// (the chain always stops after the first block). Read once and cached.
|
||||
fn superblock_budget() -> u64 {
|
||||
use std::sync::OnceLock;
|
||||
static BUDGET: OnceLock<u64> = OnceLock::new();
|
||||
*BUDGET.get_or_init(|| {
|
||||
std::env::var("XENIA_SUPERBLOCK_BUDGET")
|
||||
.ok()
|
||||
.and_then(|v| v.parse::<u64>().ok())
|
||||
.filter(|&v| v >= 1)
|
||||
.unwrap_or(SUPERBLOCK_INSTR_BUDGET)
|
||||
})
|
||||
}
|
||||
|
||||
/// Superblock runner (iterate-3AL). Executes a *chain* of basic blocks
|
||||
/// for one slot-visit — following each block's terminating branch into
|
||||
/// the next block — instead of a single block, amortizing the per-round
|
||||
/// (timebase / coord / `round_schedule`) and per-slot (`worker_prologue`)
|
||||
/// dispatch tax over up to [`SUPERBLOCK_INSTR_BUDGET`] guest instructions.
|
||||
///
|
||||
/// Determinism + cross-thread correctness: the chain ENDS (returns to the
|
||||
/// round) at exactly the points where lockstep granularity matters, all
|
||||
/// pure functions of guest state (never wall-clock):
|
||||
/// - a non-`Continue` step result (Yield / SystemCall / Trap / Unimpl /
|
||||
/// Halted) — `step_block` already bails on these; `Yield` in
|
||||
/// particular is the db16cyc spin-wait hand-off that prevents a
|
||||
/// spinner from starving its producer.
|
||||
/// - the just-run block was `sync_sensitive` (reserved load/store or a
|
||||
/// memory barrier) — the guest's own ordering points.
|
||||
/// - the block touched MMIO (the `mem.mmio_access_count()` watermark
|
||||
/// advanced) — GPU/register ordering vs other HW threads stays at the
|
||||
/// same fine granularity as the old one-block path.
|
||||
/// - the next PC leaves ordinary guest code: an import thunk, the halt
|
||||
/// sentinel, or unmapped memory — those need the full `worker_prologue`
|
||||
/// dispatch, so we stop and let the next round's prologue handle them.
|
||||
/// - the instruction budget is reached.
|
||||
///
|
||||
/// Instruction-count / clock accounting stays exact: `executed` is summed
|
||||
/// from the per-block `cycle_count` delta across every chained block and
|
||||
/// handed to `worker_epilogue` once, which advances `stats.instruction_count`
|
||||
/// and `decrement_quantum` by precisely the retired count — identical to
|
||||
/// dispatching each block separately.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn run_superblock(
|
||||
wc: &mut WorkerCtx,
|
||||
kernel: &mut xenia_kernel::KernelState,
|
||||
mem: &xenia_memory::GuestMemory,
|
||||
debugger: &mut xenia_debugger::Debugger,
|
||||
thunk_map: &HashMap<u32, (ModuleId, u16, String)>,
|
||||
stats: &mut ExecStats,
|
||||
tid: Option<u32>,
|
||||
thread_ref: xenia_cpu::ThreadRef,
|
||||
first_block_ptr: *const xenia_cpu::block_cache::DecodedBlock,
|
||||
first_pc_before: u32,
|
||||
) -> SlotOutcome {
|
||||
use xenia_cpu::interpreter::{step_block, StepResult};
|
||||
const LR_HALT: u32 = xenia_cpu::context::LR_HALT_SENTINEL as u32;
|
||||
|
||||
let budget = superblock_budget();
|
||||
|
||||
// Heisenbug fix (toolkit audit, 2026-06-21): probes and mem-watch are
|
||||
// OBSERVE-ONLY diagnostics and must NOT change guest scheduling. The
|
||||
// previous implementation disabled superblock chaining whenever any
|
||||
// probe / mem-watch was armed (so the per-block-entry observation in
|
||||
// `worker_prologue` was reached for every block). But chaining is what
|
||||
// determines thread interleaving, so arming a probe perturbed the
|
||||
// schedule — it starved the movie/XMV subsystem so it never reached the
|
||||
// video state, making the probe useless on exactly the code we most
|
||||
// needed to observe (`XENIA_SUPERBLOCK_BUDGET=1` reproduces the same
|
||||
// starvation, confirming chaining is the lever).
|
||||
//
|
||||
// The fix fires the SAME per-block-entry observation INSIDE the chain
|
||||
// loop, at every chained block's entry PC (see `fire_block_entry_probes`
|
||||
// below), so chaining — and therefore scheduling — is byte-identical
|
||||
// whether or not a probe is armed. `chain_allowed` no longer depends on
|
||||
// the probe/mem-watch state.
|
||||
//
|
||||
// `wants_hooks()` (the interactive debugger / breakpoint path) still
|
||||
// forces the per-instruction path in `worker_prologue` and never reaches
|
||||
// `run_superblock`, so the only remaining reason to never chain here is
|
||||
// the explicit budget==1 reproduction request.
|
||||
let chain_allowed = budget > 1;
|
||||
|
||||
// Per-block-entry diagnostic observation, replicating exactly what
|
||||
// `worker_prologue` does at the first block of a slot visit:
|
||||
// 1. the four `fire_*_if_match` probe helpers (read-only; each
|
||||
// re-checks its own armed set against the live ctx PC), and
|
||||
// 2. the mem-watch writer-context publish, so a watched store that
|
||||
// fires mid-block is attributed to the CORRECT chained block's
|
||||
// entry PC / LR (matching the single-block reporting granularity)
|
||||
// instead of the stale superblock-entry PC.
|
||||
// The closure is a pure function of the live scheduler context; the
|
||||
// caller must ensure `ctx.pc` equals the block-entry PC before calling.
|
||||
let probe_hw_id = wc.hw_id;
|
||||
let fire_block_entry_probes =
|
||||
|kernel: &mut xenia_kernel::KernelState, mem: &xenia_memory::GuestMemory| {
|
||||
let hw_id = probe_hw_id;
|
||||
if kernel.any_probe_active() {
|
||||
kernel.fire_ctor_probe_if_match(hw_id, mem);
|
||||
kernel.fire_branch_probe_if_match(hw_id);
|
||||
kernel.fire_audit_pc_probe_if_match(hw_id, mem);
|
||||
kernel.fire_lr_trace_if_match(hw_id);
|
||||
}
|
||||
if mem.has_mem_watch() {
|
||||
let ctx = kernel.scheduler.ctx(hw_id);
|
||||
let tid_w = kernel.scheduler.tid(hw_id).unwrap_or(0);
|
||||
xenia_memory::set_writer_ctx(tid_w, ctx.pc, ctx.lr as u32);
|
||||
}
|
||||
};
|
||||
|
||||
let mut block_ptr = first_block_ptr;
|
||||
let mut pc_before = first_pc_before;
|
||||
let mut total_executed: u64 = 0;
|
||||
|
||||
let (result, last_block_ptr, last_pc_before) = loop {
|
||||
let cycle_before = kernel.scheduler.ctx_mut_ref(thread_ref).cycle_count;
|
||||
let mmio_before = mem.mmio_access_count();
|
||||
let block = unsafe { &*block_ptr };
|
||||
let result = {
|
||||
let ctx = kernel.scheduler.ctx_mut_ref(thread_ref);
|
||||
step_block(ctx, mem, block)
|
||||
};
|
||||
let executed = kernel
|
||||
.scheduler
|
||||
.ctx_mut_ref(thread_ref)
|
||||
.cycle_count
|
||||
.saturating_sub(cycle_before);
|
||||
total_executed = total_executed.saturating_add(executed);
|
||||
|
||||
// STOP conditions (any → end the superblock, hand to epilogue):
|
||||
// non-Continue result (let the epilogue apply it), chaining
|
||||
// disabled, a sync-sensitive block just ran, MMIO was touched,
|
||||
// or the budget is spent.
|
||||
if !chain_allowed
|
||||
|| !matches!(result, StepResult::Continue)
|
||||
|| block.sync_sensitive
|
||||
|| mem.mmio_access_count() != mmio_before
|
||||
|| total_executed >= budget
|
||||
{
|
||||
break (result, block_ptr, pc_before);
|
||||
}
|
||||
|
||||
// Decide whether the NEXT PC is an ordinary guest block we can
|
||||
// chain into. Anything else (thunk / halt sentinel / unmapped)
|
||||
// needs the full prologue dispatch next round.
|
||||
let next_pc = kernel.scheduler.ctx(wc.hw_id).pc;
|
||||
if next_pc == LR_HALT
|
||||
|| (kernel.pc_in_thunk_band(next_pc) && thunk_map.contains_key(&next_pc))
|
||||
|| !mem.is_mapped(next_pc)
|
||||
{
|
||||
break (result, block_ptr, pc_before);
|
||||
}
|
||||
|
||||
// Chain into the next block. `ctx.pc` now equals `next_pc` (the
|
||||
// chained block's entry), so fire the per-block-entry observation
|
||||
// BEFORE stepping it — identical to what `worker_prologue` did at
|
||||
// the first block. This keeps the probe firing at EVERY armed
|
||||
// block-entry while leaving the chaining decision (and thus the
|
||||
// schedule) untouched. The first block was already observed by the
|
||||
// prologue, so we only observe the newly-chained blocks here.
|
||||
pc_before = next_pc;
|
||||
fire_block_entry_probes(kernel, mem);
|
||||
|
||||
// Build/fetch the next block. Re-borrows `wc.block_cache`, which
|
||||
// invalidates the previous `block_ptr` — but we've already finished
|
||||
// using it (only `sync_sensitive`/diagnostics were read, above), so
|
||||
// the raw-pointer aliasing rule is respected.
|
||||
block_ptr = wc.block_cache.lookup_or_build(next_pc, mem) as *const _;
|
||||
};
|
||||
|
||||
worker_epilogue(
|
||||
wc,
|
||||
kernel,
|
||||
debugger,
|
||||
stats,
|
||||
tid,
|
||||
thread_ref,
|
||||
last_block_ptr,
|
||||
last_pc_before,
|
||||
result,
|
||||
total_executed,
|
||||
)
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(max = ?max_instructions, ips = ?ips_limit))]
|
||||
fn run_execution(
|
||||
mem: &xenia_memory::GuestMemory,
|
||||
@@ -2768,11 +3162,18 @@ fn run_execution(
|
||||
halt_on_deadlock: bool,
|
||||
shutdown: Option<std::sync::Arc<std::sync::atomic::AtomicBool>>,
|
||||
) -> ExecStats {
|
||||
use xenia_cpu::interpreter::step_block;
|
||||
|
||||
let mut stats = ExecStats::default();
|
||||
let _ = quiet; // retained for future per-kind suppression
|
||||
|
||||
// APU stage 3 — give the XMA decoder a stable pointer to the guest memory
|
||||
// mapping `run_execution` runs against, so the kick MMIO write can run
|
||||
// Work() synchronously (canary `!use_dedicated_xma_thread` semantics: the
|
||||
// game observes the updated context the instant its kick store retires).
|
||||
// `mem` outlives this call for both the headless and UI paths.
|
||||
if let Ok(mut xma) = kernel.xma.lock() {
|
||||
xma.set_memory(mem);
|
||||
}
|
||||
|
||||
// `--halt-on-deadlock` CLI flag OR `XENIA_HALT_ON_DEADLOCK=1|true` env var:
|
||||
// when the scheduler next hits a hard deadlock (every live HW thread
|
||||
// blocked on a handle wait with no pending timer) we bail out with a
|
||||
@@ -2813,6 +3214,10 @@ fn run_execution(
|
||||
// re-decoding the same handful of pages 60×/s.
|
||||
let mut isr_decode_cache = xenia_cpu::decoder::DecodeCache::new();
|
||||
|
||||
// Tier-A perf #2: reusable buffer for `round_schedule_into` so the round
|
||||
// loop doesn't heap-allocate a `Vec<u8>` every iteration.
|
||||
let mut order_buf = [0u8; xenia_cpu::scheduler::HW_THREAD_COUNT];
|
||||
|
||||
'outer: loop {
|
||||
// Per-round prologue: budget / shutdown / heartbeat / vsync /
|
||||
// timers / audio-interrupt injection. Carved into
|
||||
@@ -2852,6 +3257,12 @@ fn run_execution(
|
||||
kernel
|
||||
.scheduler
|
||||
.advance_global_clock_to(stats.instruction_count);
|
||||
// ITERATE-2J — tick the KeTimeStampBundle (ordinal 0x00AD) from the
|
||||
// same deterministic clock so the guest's worker-hub tick_count
|
||||
// deadline gate (`[block+0x10] + 66` ms) actually elapses. Without
|
||||
// this the block is frozen at boot and the hub spins forever,
|
||||
// starving tid14 on event 0x109c.
|
||||
kernel.update_timestamp_bundle(mem, kernel.scheduler.global_clock());
|
||||
kernel.fire_due_silph_autosignals(stats.instruction_count);
|
||||
dispatch_graphics_interrupts(
|
||||
kernel,
|
||||
@@ -2861,10 +3272,12 @@ fn run_execution(
|
||||
thunk_map,
|
||||
);
|
||||
|
||||
// Snapshot round schedule. `round_schedule` also advances rng state
|
||||
// when seeded; mutation is intentional.
|
||||
// Snapshot round schedule. `round_schedule_into` also advances rng
|
||||
// state when seeded; mutation is intentional. Perf (Tier-A #2): fill
|
||||
// a reusable stack array instead of allocating a fresh Vec per round.
|
||||
kernel.scheduler.begin_round();
|
||||
let order = kernel.scheduler.round_schedule();
|
||||
let order_n = kernel.scheduler.round_schedule_into(&mut order_buf);
|
||||
let order = &order_buf[..order_n];
|
||||
|
||||
if order.is_empty() {
|
||||
// No Ready threads — advance time to the earliest pending
|
||||
@@ -2886,7 +3299,7 @@ fn run_execution(
|
||||
// GPU when block dispatch engages.
|
||||
let instrs_at_round_start = stats.instruction_count;
|
||||
|
||||
for hw_id in order {
|
||||
for &hw_id in order {
|
||||
let wc = &mut workers[hw_id as usize];
|
||||
match worker_prologue(
|
||||
wc,
|
||||
@@ -2905,34 +3318,25 @@ fn run_execution(
|
||||
block_ptr,
|
||||
pc_before,
|
||||
} => {
|
||||
// Block-cache step. The lockstep path keeps the
|
||||
// kernel state borrowed straight through (single
|
||||
// host thread, no contention). Step 03 of the
|
||||
// M3 real-parallelism plan introduces a
|
||||
// drop-and-reacquire window around `step_block`
|
||||
// for the parallel branch.
|
||||
let cycle_before = kernel.scheduler.ctx_mut_ref(thread_ref).cycle_count;
|
||||
let block = unsafe { &*block_ptr };
|
||||
let result = {
|
||||
let ctx = kernel.scheduler.ctx_mut_ref(thread_ref);
|
||||
step_block(ctx, mem, block)
|
||||
};
|
||||
let executed = kernel
|
||||
.scheduler
|
||||
.ctx_mut_ref(thread_ref)
|
||||
.cycle_count
|
||||
.saturating_sub(cycle_before);
|
||||
match worker_epilogue(
|
||||
// SUPERBLOCK runner (iterate-3AL). Instead of one
|
||||
// basic block per slot-visit, chain straight-line
|
||||
// blocks through their branches up to a deterministic
|
||||
// instruction budget, yielding back to the round only
|
||||
// at cross-thread synchronization points. Amortizes
|
||||
// the per-round (timebase / coord / round_schedule)
|
||||
// and per-slot (prologue) tax over hundreds of
|
||||
// instructions instead of ~6. See `run_superblock`.
|
||||
match run_superblock(
|
||||
wc,
|
||||
kernel,
|
||||
mem,
|
||||
debugger,
|
||||
thunk_map,
|
||||
&mut stats,
|
||||
tid,
|
||||
thread_ref,
|
||||
block_ptr,
|
||||
pc_before,
|
||||
result,
|
||||
executed,
|
||||
) {
|
||||
SlotOutcome::Continue => continue,
|
||||
SlotOutcome::BreakOuter => break 'outer,
|
||||
@@ -3296,6 +3700,16 @@ fn run_execution_parallel(
|
||||
guard.fire_due_silph_autosignals(s.instruction_count);
|
||||
}
|
||||
|
||||
// ITERATE-2J — tick the KeTimeStampBundle (ordinal 0x00AD) from
|
||||
// the parallel-mode coherent global_clock (summed per-block
|
||||
// retired instructions). Same fix as the lockstep loop: keeps the
|
||||
// guest's worker-hub tick_count deadline gate advancing so it
|
||||
// dispatches channel-3 and unblocks tid14 on event 0x109c.
|
||||
{
|
||||
let clock = guard.scheduler.global_clock();
|
||||
guard.update_timestamp_bundle(mem, clock);
|
||||
}
|
||||
|
||||
// Iterate-2.BE — host-driven synchronous ISR dispatch.
|
||||
// Runs under the kernel lock while workers are still parked
|
||||
// at the phaser B2 barrier (the coordinator hasn't published
|
||||
@@ -3506,7 +3920,17 @@ fn dispatch_graphics_interrupts(
|
||||
None
|
||||
};
|
||||
|
||||
/// X_KPCR offset of `prcb_data.current_cpu` (canary `xthread.cc`
|
||||
/// `SetActiveCpu` → `pcr.prcb_data.current_cpu`). The guest graphics
|
||||
/// ISR reads it via `lbz r10, 268(r13)` to decide which per-CPU bit of
|
||||
/// the swap-acknowledge fence to clear.
|
||||
const PCR_CURRENT_CPU_OFF: u32 = 268;
|
||||
|
||||
while let Some(source) = kernel.interrupts.peek_next() {
|
||||
let target_cpu = kernel
|
||||
.interrupts
|
||||
.peek_next_cpu()
|
||||
.unwrap_or(xenia_kernel::interrupts::VSYNC_TARGET_CPU);
|
||||
// Victim selection: Ready first, then Blocked (canary's
|
||||
// `XThread::GetCurrentThread()` analog — any live thread will
|
||||
// do for borrowing context). Skip Idle/Exited/ServicingIrq.
|
||||
@@ -3576,6 +4000,19 @@ fn dispatch_graphics_interrupts(
|
||||
saved
|
||||
};
|
||||
|
||||
// Impersonate the interrupt's target CPU on the borrowed thread's
|
||||
// PCR, mirroring canary `EmulateCPInterruptDPC` →
|
||||
// `XThread::SetActiveCpu(cpu)`. The guest swap-complete ISR clears
|
||||
// `1 << [pcr.current_cpu]` from the per-present swap-acknowledge
|
||||
// fence; if it runs on the wrong CPU it clears the wrong bit and
|
||||
// the GPU's trailing `WAIT_REG_MEM` on that fence never releases —
|
||||
// stranding the present/title loop. Save/restore so borrowing a
|
||||
// thread doesn't permanently rewrite its processor number.
|
||||
let pcr_addr = (kernel.scheduler.ctx_mut_ref(target_ref).gpr[13] as u32)
|
||||
.wrapping_add(PCR_CURRENT_CPU_OFF);
|
||||
let saved_cpu = mem.read_u8(pcr_addr);
|
||||
mem.write_u8(pcr_addr, target_cpu);
|
||||
|
||||
// Stash the previous `scheduler.current` (call_export reaches
|
||||
// it; imports the ISR calls must dispatch on the borrowed
|
||||
// thread). Restore on the way out.
|
||||
@@ -3668,6 +4105,7 @@ fn dispatch_graphics_interrupts(
|
||||
|
||||
// Restore the borrowed context.
|
||||
saved.restore(kernel.scheduler.ctx_mut_ref(target_ref));
|
||||
mem.write_u8(pcr_addr, saved_cpu);
|
||||
kernel.scheduler.current = prev_current;
|
||||
kernel.interrupts.delivered += 1;
|
||||
|
||||
@@ -3836,10 +4274,18 @@ fn dump_thread_diagnostic(
|
||||
),
|
||||
}
|
||||
}
|
||||
if quiet {
|
||||
return;
|
||||
}
|
||||
use xenia_kernel::objects::KernelObject;
|
||||
|
||||
// Toolkit-audit fix (2026-06-21): only the ALWAYS-ON thread/waiter table
|
||||
// is suppressed by `--quiet`. The explicitly-armed diagnostics below
|
||||
// (`--trace-handles`, `--trace-handles-focus`, `--dump-addr`) are
|
||||
// requested output — arming the flag IS the user asking for it — and
|
||||
// were previously swallowed by the blanket `if quiet { return; }`, which
|
||||
// made the documented headless `--quiet` invocation silently drop every
|
||||
// handle/focus/dump report. They are each self-gated below (on
|
||||
// `audit.enabled` / `!audit.focus.is_empty()` / `!dump_addrs.is_empty()`)
|
||||
// so they only print when actually armed.
|
||||
if !quiet {
|
||||
println!("\n=== Thread diagnostics ===");
|
||||
for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() {
|
||||
if slot.runqueue.is_empty() {
|
||||
@@ -3936,6 +4382,7 @@ fn dump_thread_diagnostic(
|
||||
println!(" cs={:#010x} waiters(tid)={:?}", cs_ptr, tids);
|
||||
}
|
||||
}
|
||||
} // end `if !quiet` (always-on thread/waiter table)
|
||||
|
||||
// Audit trails (only when --trace-handles flipped the flag). For each
|
||||
// tracked handle, emit a compact block: kind, creator, and the bounded
|
||||
@@ -4348,6 +4795,12 @@ fn run_with_ui(
|
||||
.map_err(|e| anyhow::anyhow!("winit event loop build failed: {e}"))?;
|
||||
let (ui_handles, kernel_bridge) = xenia_ui::build(event_loop.create_proxy());
|
||||
kernel.ui = Some(kernel_bridge);
|
||||
// iterate-3O: enable per-draw geometry capture so the UI can replay real
|
||||
// guest draws. Only on the `--ui` path; headless `check` never gets here,
|
||||
// so the deterministic core/golden stays untouched.
|
||||
if let Some(gpu) = kernel.gpu.as_inline_mut() {
|
||||
gpu.enable_frame_capture();
|
||||
}
|
||||
|
||||
let shutdown = std::sync::Arc::clone(&ui_handles.shutdown);
|
||||
let title_owned = std::path::Path::new(title)
|
||||
@@ -4605,8 +5058,23 @@ fn cmd_dis(
|
||||
// pointer-validity oracle; runs over .rdata + .data.
|
||||
let function_starts: std::collections::BTreeSet<u32> =
|
||||
func_analysis.functions.keys().copied().collect();
|
||||
let vtables = xenia_analysis::vtables::analyze(
|
||||
&pe_image, base, §ions, &function_starts,
|
||||
// Anchor discovery: recover vtable bases from constructor vptr-write
|
||||
// stores so a vtable with non-function head words (null / pure-virtual /
|
||||
// unrecognised thunk slots) isn't fragmented away by the contiguity
|
||||
// heuristic. (Fixes e.g. the XMV engine vtable 0x8200a908.)
|
||||
let vptr_anchor_funcs: std::collections::BTreeMap<u32, (u32, bool)> = func_analysis
|
||||
.functions
|
||||
.iter()
|
||||
.map(|(&s, fi)| (s, (fi.end, fi.is_saverestore)))
|
||||
.collect();
|
||||
let vptr_block_boundaries: std::collections::HashSet<u32> =
|
||||
xref_result.labels.keys().copied().collect();
|
||||
let vtable_anchors = xenia_analysis::vtables::scan_vptr_write_constants(
|
||||
&pe_image, base, &vptr_anchor_funcs, §ions, &vptr_block_boundaries,
|
||||
);
|
||||
info!(vtable_anchors = vtable_anchors.len(), "vptr-write anchor scan complete");
|
||||
let vtables = xenia_analysis::vtables::analyze_with_anchors(
|
||||
&pe_image, base, §ions, &function_starts, &vtable_anchors,
|
||||
);
|
||||
let rtti_count = vtables.iter().filter(|v| v.rtti_present).count();
|
||||
info!(
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"instructions": 2000005,
|
||||
"instructions": 2000073,
|
||||
"imports": 5635,
|
||||
"unimpl": 0,
|
||||
"draws": 0,
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
{
|
||||
"instructions": 50000000,
|
||||
"imports": 339766,
|
||||
"instructions": 50000200,
|
||||
"imports": 189264,
|
||||
"unimpl": 0,
|
||||
"draws": 0,
|
||||
"swaps": 2,
|
||||
"unique_render_targets": 0,
|
||||
"shader_blobs_live": 0,
|
||||
"texture_cache_entries": 0
|
||||
"draws": 768,
|
||||
"swaps": 157,
|
||||
"unique_render_targets": 2,
|
||||
"shader_blobs_live": 6,
|
||||
"texture_cache_entries": 1
|
||||
}
|
||||
|
||||
@@ -57,6 +57,16 @@ fn run_oracle(label: &str, max_instr: u64, golden_rel: &str) {
|
||||
&iso,
|
||||
"-n",
|
||||
&max_instr_str,
|
||||
// Pin the inline (single-threaded) GPU backend. The default
|
||||
// threaded backend drains the ring on a separate host thread,
|
||||
// so the exact instruction at which a CP interrupt is queued —
|
||||
// and therefore when the guest's swap-complete ISR callback runs
|
||||
// (iterate-2S armed it via SCRATCH_REG writeback) — varies run to
|
||||
// run. Inline draining is instruction-count-deterministic, which
|
||||
// is what a regression golden needs. (The threaded path is the
|
||||
// documented "GPU thread race" the stable-digest already warns
|
||||
// about.)
|
||||
"--gpu-inline",
|
||||
"--stable-digest",
|
||||
"--expect",
|
||||
&golden_str,
|
||||
|
||||
@@ -6,5 +6,12 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
xenia-types = { workspace = true }
|
||||
xenia-memory = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
|
||||
# Raw FFmpeg FFI for the XMA2 audio decoder (stage 3). The system libs are
|
||||
# FFmpeg 6.1 (libavcodec 60), so we pin the matching `6.1` series. The `build`
|
||||
# feature regenerates bindings via bindgen against the installed headers, so
|
||||
# the FFI matches the distro FFmpeg exactly. We only need avcodec + avutil.
|
||||
ffmpeg-sys-next = { version = "6.1", default-features = false, features = ["avcodec"] }
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
pub mod xma;
|
||||
pub mod xma2_codec;
|
||||
pub mod xma_decode;
|
||||
|
||||
pub use xma::{build_mmio_region, XmaDecoder, XMA_CONTEXT_COUNT, XMA_CONTEXT_SIZE};
|
||||
|
||||
/// Audio processing unit stub. Logging only for now.
|
||||
pub struct AudioSystem {
|
||||
pub enabled: bool,
|
||||
|
||||
932
crates/xenia-apu/src/xma.rs
Normal file
932
crates/xenia-apu/src/xma.rs
Normal file
@@ -0,0 +1,932 @@
|
||||
//! Register-mapped XMA context system — a faithful port of xenia-canary's
|
||||
//! `apu/xma_decoder.cc` context-array + MMIO machinery, MINUS the audio
|
||||
//! decoder itself (stage 3).
|
||||
//!
|
||||
//! The guest allocates XMA contexts via `XMACreateContext` (which hands back a
|
||||
//! pointer into our 320-entry context array in physical guest memory), writes
|
||||
//! the 64-byte `XMA_CONTEXT_DATA` struct, then *kicks* decode by writing the
|
||||
//! per-context bit into the `0x7FEA0000` register aperture. This module
|
||||
//! satisfies all of that without faulting and records which contexts the guest
|
||||
//! kicked; stage 3 will consume the recorded `pending` flags to actually
|
||||
//! produce PCM.
|
||||
//!
|
||||
//! ## Byte order
|
||||
//! The guest accesses the aperture byte-reversed (`stwbrx`/`lwbrx`), so the raw
|
||||
//! `u32` our MMIO boundary delivers is byte-swapped relative to the logical
|
||||
//! register value — exactly the situation canary handles with `xe::byte_swap`.
|
||||
//! So `write_register` swaps the incoming value before decoding and the
|
||||
//! register file holds host-order values; `read_register` swaps on the way out.
|
||||
//! This was proven empirically: the guest's Clear writes arrive as
|
||||
//! `0x01000000`/`0x02000000`/`0x04000000`, i.e. byte-reversed `1`/`2`/`4`,
|
||||
//! targeting contexts 0/1/2 (which it had just allocated) — NOT 24/25/26. The
|
||||
//! register-index math (`(addr & 0xFFFF) / 4`) is the same as canary's.
|
||||
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use xenia_memory::access::MemoryAccess;
|
||||
use xenia_memory::{GuestMemory, MmioRegion};
|
||||
|
||||
use crate::xma_decode::{self, ContextDecodeState, XmaContextData};
|
||||
|
||||
/// Size in bytes of an `XMA_CONTEXT_DATA` struct (canary `xma_context.h`).
|
||||
/// Stage 1 does not decode the fields — only the stride matters.
|
||||
pub const XMA_CONTEXT_SIZE: u32 = 64;
|
||||
/// Number of XMA contexts the hardware exposes (canary `kContextCount`).
|
||||
pub const XMA_CONTEXT_COUNT: usize = 320;
|
||||
|
||||
/// Register aperture base (guest physical). Canary maps the XMA decoder at
|
||||
/// `0x7FEA0000` in `XmaDecoder::Setup`.
|
||||
pub const APERTURE_BASE: u32 = 0x7FEA_0000;
|
||||
/// Mask used by `MmioRegion::contains` so any `0x7FEAxxxx` address hits.
|
||||
pub const APERTURE_MASK: u32 = 0xFFFF_0000;
|
||||
/// Total aperture size in bytes (the low 16-bit register window).
|
||||
pub const APERTURE_SIZE: u32 = 0x0001_0000;
|
||||
|
||||
// ----- Register indices (canary `XmaRegister` enum / xma_register_table.inc).
|
||||
// Indices are dword indices: byte offset = index * 4.
|
||||
|
||||
/// `ContextArrayAddress` — physical base of the context array. byte 0x1800.
|
||||
const REG_CONTEXT_ARRAY_ADDRESS: u32 = 0x600;
|
||||
/// `CurrentContextIndex` — the context the HW is currently servicing. byte
|
||||
/// 0x1818. Polled by the guest; we rotate it so a poll never sticks.
|
||||
const REG_CURRENT_CONTEXT_INDEX: u32 = 0x606;
|
||||
|
||||
/// First of the 10 `ContextNKick` registers (`Context0Kick`..`Context9Kick`).
|
||||
/// byte 0x1940. Each register's bit N kicks context `base*32 + N`.
|
||||
const REG_CONTEXT_KICK_BASE: u32 = 0x650;
|
||||
/// First of the 10 `ContextNLock` registers. byte 0x1A40.
|
||||
const REG_CONTEXT_LOCK_BASE: u32 = 0x690;
|
||||
/// First of the 10 `ContextNClear` registers. byte 0x1A80.
|
||||
const REG_CONTEXT_CLEAR_BASE: u32 = 0x6A0;
|
||||
/// Each group spans 10 registers (320 contexts / 32-per-register).
|
||||
const CONTEXT_GROUP_LEN: u32 = 10;
|
||||
|
||||
/// Number of 32-bit words backing the register file. The highest index we
|
||||
/// touch is `0x6A9`; round up generously so any in-aperture index is in range
|
||||
/// (64 KB aperture / 4).
|
||||
const REGISTER_FILE_WORDS: usize = 0x4000;
|
||||
|
||||
/// Register-mapped XMA context array. Owns the allocation bitmap, the register
|
||||
/// file, and the per-context kick/enable bookkeeping that stage 3 consumes.
|
||||
pub struct XmaDecoder {
|
||||
/// Guest virtual address of the context array (handed back by
|
||||
/// `allocate_context`).
|
||||
context_array_guest_va: u32,
|
||||
/// Physical address stored into `ContextArrayAddress` (reg 0x600).
|
||||
context_array_phys: u32,
|
||||
/// 320-slot allocation bitmap, one bit per context (`bitmap[i>>6]` bit
|
||||
/// `i & 63`). A set bit means *allocated*.
|
||||
bitmap: [u64; (XMA_CONTEXT_COUNT + 63) / 64],
|
||||
/// Flat register file, host-native values. Indexed by dword register index.
|
||||
registers: Vec<u32>,
|
||||
/// Per-context "decode requested" flag, set on Kick, cleared on Clear.
|
||||
/// Stage 3 drains this to produce PCM.
|
||||
pending: [bool; XMA_CONTEXT_COUNT],
|
||||
/// Per-context enable flag. A Lock disables; a Kick (re-)enables. Mirrors
|
||||
/// canary's "is_enabled" notion loosely — exact decode semantics are
|
||||
/// stage 3.
|
||||
enabled: [bool; XMA_CONTEXT_COUNT],
|
||||
/// Total kicks observed (diagnostic; lets headless logs show progress).
|
||||
kick_count: u64,
|
||||
/// Rotating value served for `CurrentContextIndex` reads so a guest poll
|
||||
/// can't spin forever on a fixed value. Atomic so the read path can stay
|
||||
/// `&self`.
|
||||
current_context_index: AtomicU32,
|
||||
/// Per-context stage-3 decode state (FFmpeg codec, staged PCM frame, ring
|
||||
/// bookkeeping). Lazily populated as contexts are decoded.
|
||||
decode_state: Vec<ContextDecodeState>,
|
||||
/// Total PCM bytes written to guest output buffers (diagnostic).
|
||||
pcm_bytes_total: u64,
|
||||
/// Stable pointer to the guest memory mapping, captured at init. Used to run
|
||||
/// `Work()` SYNCHRONOUSLY inside the kick MMIO write — exactly as canary's
|
||||
/// default `!use_dedicated_xma_thread` path does (`context.Work()` right in
|
||||
/// `WriteRegister`), so the game sees the updated context the instant its
|
||||
/// kick store retires. The mapping lives for the whole run; decode is
|
||||
/// deterministic and happens on the CPU thread, so this is determinism-safe.
|
||||
mem_ptr: *const GuestMemory,
|
||||
}
|
||||
|
||||
// The decoder is owned behind an `Arc<Mutex<..>>` and only ever touched from the
|
||||
// CPU scheduler thread (kick MMIO writes + the per-round pump). The raw `mem_ptr`
|
||||
// is a stable whole-run mapping; access is single-threaded.
|
||||
unsafe impl Send for XmaDecoder {}
|
||||
|
||||
impl XmaDecoder {
|
||||
/// Construct an un-initialized decoder. Call [`Self::init`] once the
|
||||
/// context-array memory has been reserved.
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
context_array_guest_va: 0,
|
||||
context_array_phys: 0,
|
||||
bitmap: [0; (XMA_CONTEXT_COUNT + 63) / 64],
|
||||
registers: vec![0; REGISTER_FILE_WORDS],
|
||||
pending: [false; XMA_CONTEXT_COUNT],
|
||||
enabled: [false; XMA_CONTEXT_COUNT],
|
||||
kick_count: 0,
|
||||
current_context_index: AtomicU32::new(0),
|
||||
decode_state: (0..XMA_CONTEXT_COUNT).map(|_| ContextDecodeState::new()).collect(),
|
||||
pcm_bytes_total: 0,
|
||||
mem_ptr: std::ptr::null(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Capture the stable guest-memory mapping so the kick MMIO path can run
|
||||
/// `Work()` synchronously (canary semantics). Call once at boot, after the
|
||||
/// final `mem` is in its long-lived location.
|
||||
pub fn set_memory(&mut self, mem: &GuestMemory) {
|
||||
self.mem_ptr = mem as *const GuestMemory;
|
||||
}
|
||||
|
||||
/// Wire in the context-array addresses (after the app reserves the buffer)
|
||||
/// and publish the physical base into `ContextArrayAddress` (reg 0x600),
|
||||
/// exactly as canary's `XmaDecoder::Setup` does.
|
||||
pub fn init(&mut self, context_array_guest_va: u32, context_array_phys: u32) {
|
||||
self.context_array_guest_va = context_array_guest_va;
|
||||
self.context_array_phys = context_array_phys;
|
||||
self.registers[REG_CONTEXT_ARRAY_ADDRESS as usize] = context_array_phys;
|
||||
tracing::info!(
|
||||
va = format_args!("{context_array_guest_va:#010x}"),
|
||||
phys = format_args!("{context_array_phys:#010x}"),
|
||||
"xma: context array initialized"
|
||||
);
|
||||
}
|
||||
|
||||
/// Acquire a free context slot and return its guest pointer
|
||||
/// (`context_array_guest_va + i*64`), or 0 if all 320 slots are in use.
|
||||
/// Mirrors canary's `XmaDecoder::AllocateContext`.
|
||||
pub fn allocate_context(&mut self) -> u32 {
|
||||
for i in 0..XMA_CONTEXT_COUNT {
|
||||
let word = i >> 6;
|
||||
let bit = 1u64 << (i & 63);
|
||||
if self.bitmap[word] & bit == 0 {
|
||||
self.bitmap[word] |= bit;
|
||||
let ptr = self.context_array_guest_va + (i as u32) * XMA_CONTEXT_SIZE;
|
||||
tracing::info!(
|
||||
index = i,
|
||||
ptr = format_args!("{ptr:#010x}"),
|
||||
"xma: allocate_context"
|
||||
);
|
||||
return ptr;
|
||||
}
|
||||
}
|
||||
tracing::warn!("xma: allocate_context — all {} slots in use", XMA_CONTEXT_COUNT);
|
||||
0
|
||||
}
|
||||
|
||||
/// Free the slot backing `guest_ptr`. Mirrors canary's
|
||||
/// `XmaDecoder::ReleaseContext`. Out-of-range / unaligned pointers are
|
||||
/// ignored (the guest never faults).
|
||||
pub fn release_context(&mut self, guest_ptr: u32) {
|
||||
if guest_ptr < self.context_array_guest_va {
|
||||
return;
|
||||
}
|
||||
let offset = guest_ptr - self.context_array_guest_va;
|
||||
let i = (offset / XMA_CONTEXT_SIZE) as usize;
|
||||
if i >= XMA_CONTEXT_COUNT {
|
||||
return;
|
||||
}
|
||||
let word = i >> 6;
|
||||
let bit = 1u64 << (i & 63);
|
||||
self.bitmap[word] &= !bit;
|
||||
self.pending[i] = false;
|
||||
self.enabled[i] = false;
|
||||
tracing::info!(index = i, ptr = format_args!("{guest_ptr:#010x}"), "xma: release_context");
|
||||
}
|
||||
|
||||
/// Read a register. Returns the stored value, except `CurrentContextIndex`
|
||||
/// (0x606) which rotates `0..XMA_CONTEXT_COUNT` per read so a polling guest
|
||||
/// always sees forward progress. Out-of-range indices read 0.
|
||||
pub fn read_register(&self, reg_index: u32) -> u32 {
|
||||
// The guest accesses the aperture byte-reversed (`lwbrx`), so the
|
||||
// register file holds host-order values and we swap on the way out —
|
||||
// exactly as canary's `ReadRegister` returns `xe::byte_swap(reg)`.
|
||||
let host = if reg_index == REG_CURRENT_CONTEXT_INDEX {
|
||||
// Rotate mod context count on each read so a poll never sticks.
|
||||
let prev = self.current_context_index.fetch_add(1, Ordering::Relaxed);
|
||||
prev % XMA_CONTEXT_COUNT as u32
|
||||
} else {
|
||||
self.registers.get(reg_index as usize).copied().unwrap_or(0)
|
||||
};
|
||||
host.swap_bytes()
|
||||
}
|
||||
|
||||
/// Write a register, then apply the side-effect of the Kick / Lock / Clear
|
||||
/// register groups. Each register in a group covers 32 contexts; bit N maps
|
||||
/// to `context_id = (reg_index - group_base) * 32 + N`. We iterate set bits
|
||||
/// with `trailing_zeros` + clear-lowest-bit, mirroring canary's
|
||||
/// `std::countr_zero` loop. The incoming value is byte-swapped first (see
|
||||
/// below).
|
||||
pub fn write_register(&mut self, reg_index: u32, value: u32) {
|
||||
// The guest writes the aperture byte-reversed (`stwbrx`); undo it so the
|
||||
// register file holds host-order values, mirroring canary's
|
||||
// `WriteRegister` which does `value = xe::byte_swap(value)` first. Proven
|
||||
// by the guest's Clear writes (`0x01000000` == context 0, not 24).
|
||||
let value = value.swap_bytes();
|
||||
if let Some(slot) = self.registers.get_mut(reg_index as usize) {
|
||||
*slot = value;
|
||||
}
|
||||
|
||||
if (REG_CONTEXT_KICK_BASE..REG_CONTEXT_KICK_BASE + CONTEXT_GROUP_LEN).contains(®_index) {
|
||||
let base = (reg_index - REG_CONTEXT_KICK_BASE) * 32;
|
||||
let mut bits = value;
|
||||
while bits != 0 {
|
||||
let b = bits.trailing_zeros();
|
||||
bits &= bits - 1;
|
||||
let context_id = (base + b) as usize;
|
||||
if context_id < XMA_CONTEXT_COUNT {
|
||||
self.pending[context_id] = true;
|
||||
self.enabled[context_id] = true;
|
||||
self.kick_count += 1;
|
||||
tracing::debug!(
|
||||
context_id,
|
||||
kick_count = self.kick_count,
|
||||
"xma: kick (decode requested)"
|
||||
);
|
||||
// Canary `!use_dedicated_xma_thread`: run Work() right here so
|
||||
// the game observes the updated context when its kick store
|
||||
// retires. Safe — `mem_ptr` is a stable whole-run mapping and
|
||||
// we're on the CPU thread.
|
||||
if !self.mem_ptr.is_null() {
|
||||
let mem: &GuestMemory = unsafe { &*self.mem_ptr };
|
||||
self.enabled[context_id] = false;
|
||||
self.work_one(mem, context_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (REG_CONTEXT_LOCK_BASE..REG_CONTEXT_LOCK_BASE + CONTEXT_GROUP_LEN)
|
||||
.contains(®_index)
|
||||
{
|
||||
let base = (reg_index - REG_CONTEXT_LOCK_BASE) * 32;
|
||||
let mut bits = value;
|
||||
while bits != 0 {
|
||||
let b = bits.trailing_zeros();
|
||||
bits &= bits - 1;
|
||||
let context_id = (base + b) as usize;
|
||||
if context_id < XMA_CONTEXT_COUNT {
|
||||
self.enabled[context_id] = false;
|
||||
tracing::debug!(context_id, "xma: lock (context disabled)");
|
||||
}
|
||||
}
|
||||
} else if (REG_CONTEXT_CLEAR_BASE..REG_CONTEXT_CLEAR_BASE + CONTEXT_GROUP_LEN)
|
||||
.contains(®_index)
|
||||
{
|
||||
let base = (reg_index - REG_CONTEXT_CLEAR_BASE) * 32;
|
||||
let mut bits = value;
|
||||
while bits != 0 {
|
||||
let b = bits.trailing_zeros();
|
||||
bits &= bits - 1;
|
||||
let context_id = (base + b) as usize;
|
||||
if context_id < XMA_CONTEXT_COUNT {
|
||||
self.pending[context_id] = false;
|
||||
self.enabled[context_id] = false;
|
||||
tracing::debug!(context_id, "xma: clear (context state reset)");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Total kicks observed so far (diagnostic; stage 3 will consume `pending`).
|
||||
pub fn kick_count(&self) -> u64 {
|
||||
self.kick_count
|
||||
}
|
||||
|
||||
/// Whether context `i` has a pending (un-serviced) kick. Stage-3 hook.
|
||||
pub fn is_pending(&self, i: usize) -> bool {
|
||||
self.pending.get(i).copied().unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Total PCM bytes the decoder has written to guest output buffers.
|
||||
pub fn pcm_bytes_total(&self) -> u64 {
|
||||
self.pcm_bytes_total
|
||||
}
|
||||
|
||||
/// Stage-3 entry point. Called once per scheduler round from the CPU
|
||||
/// thread's per-round coordinator. For each context with a pending kick,
|
||||
/// run one `Work()` pass (canary `XmaContextNew::Work`): read the context,
|
||||
/// decode available input into PCM, drain into the output ring, and write
|
||||
/// the decoder-owned fields back. Deterministic — no host thread, no clock.
|
||||
pub fn decode_pending(&mut self, mem: &GuestMemory) {
|
||||
if self.context_array_guest_va == 0 {
|
||||
return;
|
||||
}
|
||||
for i in 0..XMA_CONTEXT_COUNT {
|
||||
if !self.pending[i] || !self.enabled[i] {
|
||||
continue;
|
||||
}
|
||||
// Canary `Work` clears is_enabled at entry; a fresh kick re-enables.
|
||||
self.enabled[i] = false;
|
||||
self.work_one(mem, i);
|
||||
}
|
||||
}
|
||||
|
||||
/// One `Work()` pass for context `i`. Faithful to canary's orchestration but
|
||||
/// uses the mainline xma2 decoder (whole-packet driven) for the actual
|
||||
/// frame decode in place of canary's per-frame `Decode()`.
|
||||
fn work_one(&mut self, mem: &GuestMemory, i: usize) {
|
||||
let ctx_va = self.context_array_guest_va + (i as u32) * XMA_CONTEXT_SIZE;
|
||||
let data = XmaContextData::read(mem, ctx_va);
|
||||
let initial = data;
|
||||
|
||||
if data.output_buffer_valid == 0 {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut data = data;
|
||||
self.decode_into_output(mem, i, ctx_va, &mut data, &initial);
|
||||
}
|
||||
|
||||
/// Decode available input packets into PCM and drain into the output ring.
|
||||
fn decode_into_output(
|
||||
&mut self,
|
||||
mem: &GuestMemory,
|
||||
i: usize,
|
||||
ctx_va: u32,
|
||||
data: &mut XmaContextData,
|
||||
initial: &XmaContextData,
|
||||
) {
|
||||
use xma_decode::*;
|
||||
|
||||
let output_capacity = data.output_buffer_block_count * OUTPUT_BYTES_PER_BLOCK;
|
||||
if output_capacity == 0 {
|
||||
return;
|
||||
}
|
||||
let out_backing = xma_phys_to_backing(data.output_buffer_ptr);
|
||||
let mut write_off = data.output_buffer_write_offset * OUTPUT_BYTES_PER_BLOCK;
|
||||
let read_off = data.output_buffer_read_offset * OUTPUT_BYTES_PER_BLOCK;
|
||||
|
||||
// write_count: free space in the ring from write to read.
|
||||
let free_bytes = ring_write_count(read_off, write_off, output_capacity);
|
||||
self.decode_state[i].remaining_subframe_blocks_in_output =
|
||||
(free_bytes / OUTPUT_BYTES_PER_BLOCK) as i32;
|
||||
|
||||
let effective_sdc = data.subframe_decode_count.max(1);
|
||||
let min_blocks = effective_sdc as i32 + data.output_buffer_padding as i32;
|
||||
|
||||
if min_blocks > self.decode_state[i].remaining_subframe_blocks_in_output {
|
||||
// No room — write back unchanged and wait for the game to drain.
|
||||
store_merged_pub(mem, ctx_va, data, initial);
|
||||
return;
|
||||
}
|
||||
|
||||
let mut produced_any = false;
|
||||
|
||||
// Ensure codec configured for current rate/channels.
|
||||
let rate = sample_rate_hz(data.sample_rate);
|
||||
let channels = if data.is_stereo != 0 { 2 } else { 1 };
|
||||
self.ensure_codec(i, rate, channels);
|
||||
|
||||
// Main decode loop: while there's output ring room and valid input.
|
||||
loop {
|
||||
if self.decode_state[i].remaining_subframe_blocks_in_output < min_blocks {
|
||||
break;
|
||||
}
|
||||
|
||||
// If we still have undrained subframes from a prior decode, consume
|
||||
// them first (canary Consume before next Decode).
|
||||
if self.decode_state[i].current_frame_remaining_subframes == 0 {
|
||||
// Need a fresh decoded frame. Pull from the codec, feeding input
|
||||
// packets as required.
|
||||
if !self.produce_frame(mem, i, data) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Consume: write up to `effective_sdc` subframes (256B blocks) of
|
||||
// the staged raw_frame into the output ring.
|
||||
let total_subframes =
|
||||
((BYTES_PER_FRAME_CHANNEL / OUTPUT_BYTES_PER_BLOCK) << data.is_stereo) as u8;
|
||||
let remaining = self.decode_state[i].current_frame_remaining_subframes;
|
||||
let to_write = remaining.min(effective_sdc as u8);
|
||||
let frame_read_off = (total_subframes - remaining) as usize * OUTPUT_BYTES_PER_BLOCK as usize;
|
||||
let nbytes = to_write as u32 * OUTPUT_BYTES_PER_BLOCK;
|
||||
|
||||
// Write into the output ring (handle wrap).
|
||||
let raw = &self.decode_state[i].raw_frame;
|
||||
write_off = ring_write(
|
||||
mem,
|
||||
out_backing,
|
||||
output_capacity,
|
||||
write_off,
|
||||
&raw[frame_read_off..frame_read_off + nbytes as usize],
|
||||
);
|
||||
self.pcm_bytes_total += nbytes as u64;
|
||||
produced_any = true;
|
||||
|
||||
let headroom = if remaining - to_write == 0 {
|
||||
data.output_buffer_padding as i32
|
||||
} else {
|
||||
0
|
||||
};
|
||||
self.decode_state[i].remaining_subframe_blocks_in_output -=
|
||||
to_write as i32 + headroom;
|
||||
self.decode_state[i].current_frame_remaining_subframes -= to_write;
|
||||
}
|
||||
|
||||
// Writeback offsets.
|
||||
data.output_buffer_write_offset = write_off / OUTPUT_BYTES_PER_BLOCK;
|
||||
|
||||
if self.decode_state[i].remaining_subframe_blocks_in_output == 0
|
||||
&& write_off == read_off
|
||||
{
|
||||
data.output_buffer_valid = 0;
|
||||
}
|
||||
if !produced_any && !data.is_any_input_buffer_valid() {
|
||||
data.output_buffer_valid = 0;
|
||||
}
|
||||
|
||||
store_merged_pub(mem, ctx_va, data, initial);
|
||||
}
|
||||
|
||||
/// Configure (or reconfigure) the FFmpeg xma2 codec for this context.
|
||||
fn ensure_codec(&mut self, i: usize, rate: u32, channels: u32) {
|
||||
let st = &mut self.decode_state[i];
|
||||
if st.codec.is_some() && st.codec_rate == rate && st.codec_channels == channels {
|
||||
return;
|
||||
}
|
||||
match crate::xma2_codec::Xma2Codec::new(rate, channels) {
|
||||
Ok(c) => {
|
||||
st.codec = Some(c);
|
||||
st.codec_rate = rate;
|
||||
st.codec_channels = channels;
|
||||
tracing::info!(ctx = i, rate, channels, "xma: xma2 codec configured");
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!(ctx = i, rate, channels, error = %e, "xma: xma2 codec init failed");
|
||||
st.codec = None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Produce one decoded 512-sample frame into `raw_frame` (interleaved S16BE).
|
||||
///
|
||||
/// Input-consumption model (faithful to canary's packet/buffer contract).
|
||||
///
|
||||
/// The mainline xma2 decoder consumes whole 2 KB packets via `send_packet`
|
||||
/// and emits frames in bursts (internal FIFO + lookahead), so its intake
|
||||
/// position can't be read per-frame. We therefore keep TWO cursors:
|
||||
///
|
||||
/// 1. A private FFmpeg *feed* cursor (`feed_buffer`/`feed_packet_index`)
|
||||
/// that hands raw packets to FFmpeg only far enough ahead to keep the
|
||||
/// PCM queue stocked. This follows the same buffer ping-pong as the
|
||||
/// guest but is NOT what the guest observes.
|
||||
/// 2. The guest-visible `input_buffer_read_offset`, advanced by exactly
|
||||
/// ONE compressed frame each time we emit a 512-sample frame to the
|
||||
/// guest — via `advance_read_offset_one_frame`, a faithful port of the
|
||||
/// offset arithmetic in canary's `Decode()`. This crosses packet and
|
||||
/// buffer boundaries (and fires SwapInputBuffer, clearing the drained
|
||||
/// buffer's valid bit) at canary's true per-frame cadence, which is
|
||||
/// what the WMV demuxer polls to refill ADV.wmv.
|
||||
///
|
||||
/// Decoupling the two means FFmpeg's whole-packet burst framing no longer
|
||||
/// freezes the guest-visible offset: the offset now tracks emitted output,
|
||||
/// so the input buffer is consumed and swapped as the movie actually plays.
|
||||
fn produce_frame(&mut self, mem: &GuestMemory, i: usize, data: &mut XmaContextData) -> bool {
|
||||
use xma_decode::*;
|
||||
let channels = if data.is_stereo != 0 { 2u32 } else { 1u32 };
|
||||
let frame_bytes = (BYTES_PER_FRAME_CHANNEL * channels) as usize;
|
||||
|
||||
// Top up FFmpeg's internal FIFO (and our queue) just enough to satisfy
|
||||
// one frame, feeding raw packets via the private feed cursor.
|
||||
if self.decode_state[i].pcm_queue.len() < frame_bytes {
|
||||
self.feed_codec(mem, i, data);
|
||||
}
|
||||
|
||||
// Pop exactly one 512-sample frame from the queue into raw_frame.
|
||||
if self.decode_state[i].pcm_queue.len() < frame_bytes {
|
||||
return false;
|
||||
}
|
||||
{
|
||||
let st = &mut self.decode_state[i];
|
||||
st.raw_frame.iter_mut().for_each(|b| *b = 0);
|
||||
for b in st.raw_frame[..frame_bytes].iter_mut() {
|
||||
*b = st.pcm_queue.pop_front().unwrap();
|
||||
}
|
||||
st.current_frame_remaining_subframes = (4u8) << data.is_stereo;
|
||||
}
|
||||
|
||||
// We just emitted one frame to the guest — advance its visible read
|
||||
// offset by one compressed frame at canary's cadence (may swap buffer).
|
||||
self.advance_read_offset_one_frame(mem, data);
|
||||
true
|
||||
}
|
||||
|
||||
/// Feed raw 2 KB packets to FFmpeg from the private feed cursor until the
|
||||
/// PCM queue holds at least one frame or the codec stops accepting input.
|
||||
/// The feed cursor follows the guest's `current_buffer` ping-pong but keeps
|
||||
/// its own packet index (`feed_packet_index`), so feeding ahead of the
|
||||
/// guest-visible read offset is fine — the offset advances separately per
|
||||
/// emitted frame.
|
||||
fn feed_codec(&mut self, mem: &GuestMemory, i: usize, data: &XmaContextData) {
|
||||
use xma_decode::*;
|
||||
let channels = if data.is_stereo != 0 { 2u32 } else { 1u32 };
|
||||
let frame_bytes = (BYTES_PER_FRAME_CHANNEL * channels) as usize;
|
||||
|
||||
// Re-sync the feed buffer to the guest's current buffer if the guest has
|
||||
// swapped past us (the buffer we were feeding was consumed).
|
||||
if self.decode_state[i].feed_buffer != data.current_buffer
|
||||
&& !data.is_input_buffer_valid(self.decode_state[i].feed_buffer)
|
||||
{
|
||||
self.decode_state[i].feed_buffer = data.current_buffer;
|
||||
self.decode_state[i].feed_packet_index = 0;
|
||||
}
|
||||
|
||||
const MAX_FEED: u32 = 8;
|
||||
let mut fed = 0u32;
|
||||
while self.decode_state[i].pcm_queue.len() < frame_bytes && fed < MAX_FEED {
|
||||
let fb = self.decode_state[i].feed_buffer;
|
||||
if !data.is_input_buffer_valid(fb) {
|
||||
// Nothing to feed from this buffer; try the other if valid.
|
||||
let other = fb ^ 1;
|
||||
if data.is_input_buffer_valid(other) {
|
||||
self.decode_state[i].feed_buffer = other;
|
||||
self.decode_state[i].feed_packet_index = 0;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
let pkt_count = data.input_buffer_packet_count(fb);
|
||||
let pidx = self.decode_state[i].feed_packet_index;
|
||||
if pidx >= pkt_count {
|
||||
// Exhausted this buffer's packets at the feed cursor; advance to
|
||||
// the other buffer if it's valid (it was refilled), else wait.
|
||||
let other = fb ^ 1;
|
||||
if data.is_input_buffer_valid(other) {
|
||||
self.decode_state[i].feed_buffer = other;
|
||||
self.decode_state[i].feed_packet_index = 0;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
let backing = xma_phys_to_backing(data.input_buffer_address(fb));
|
||||
let pkt_va = backing + pidx * BYTES_PER_PACKET;
|
||||
let mut packet = vec![0u8; BYTES_PER_PACKET as usize];
|
||||
mem.read_bytes(pkt_va, &mut packet);
|
||||
let send_res = match self.decode_state[i].codec.as_mut() {
|
||||
Some(codec) => codec.send_packet(&packet),
|
||||
None => break,
|
||||
};
|
||||
match send_res {
|
||||
Ok(()) => {
|
||||
self.decode_state[i].feed_packet_index += 1;
|
||||
fed += 1;
|
||||
self.drain_codec_frames(i);
|
||||
}
|
||||
// Decoder full — drain what it has and stop; re-offer this same
|
||||
// packet next time (don't advance the feed cursor).
|
||||
Err(ref e) if e == "EAGAIN" => {
|
||||
self.drain_codec_frames(i);
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(ctx = i, error = %e, "xma: send_packet failed");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Pull all currently-available decoded frames from the codec and append
|
||||
/// their interleaved S16BE PCM to the context's queue.
|
||||
fn drain_codec_frames(&mut self, i: usize) {
|
||||
loop {
|
||||
let out = match self.decode_state[i].codec.as_mut() {
|
||||
Some(c) => c.receive_frame(),
|
||||
None => None,
|
||||
};
|
||||
let Some((nb, bytes)) = out else { break };
|
||||
let st = &mut self.decode_state[i];
|
||||
st.frames_decoded += 1;
|
||||
if !st.first_frame_logged {
|
||||
st.first_frame_logged = true;
|
||||
tracing::info!(
|
||||
ctx = i,
|
||||
samples = nb,
|
||||
pcm_bytes = bytes.len(),
|
||||
"xma: first PCM frame decoded"
|
||||
);
|
||||
}
|
||||
st.pcm_queue.extend(bytes);
|
||||
}
|
||||
}
|
||||
|
||||
/// Advance `input_buffer_read_offset` by exactly ONE compressed frame,
|
||||
/// faithfully mirroring the offset arithmetic in canary's
|
||||
/// `XmaContextNew::Decode` (frame-size parse + packet-boundary handling +
|
||||
/// SwapInputBuffer when the buffer's packets are exhausted). Called once per
|
||||
/// 512-sample frame we emit to the guest, so the guest-visible read offset
|
||||
/// crosses packet/buffer boundaries at canary's true cadence — independent
|
||||
/// of the mainline xma2 decoder's whole-packet burst framing. This is what
|
||||
/// lets `input_buffer_0_valid` toggle and the WMV demuxer refill ADV.wmv.
|
||||
fn advance_read_offset_one_frame(&mut self, mem: &GuestMemory, data: &mut XmaContextData) {
|
||||
use xma_decode::*;
|
||||
|
||||
if !data.is_any_input_buffer_valid() {
|
||||
return;
|
||||
}
|
||||
if !data.is_current_input_buffer_valid() {
|
||||
self.swap_input_buffer(data);
|
||||
if !data.is_current_input_buffer_valid() {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Clamp a header-region offset (canary's Dirt-2 guard).
|
||||
if data.input_buffer_read_offset < BITS_PER_PACKET_HEADER {
|
||||
data.input_buffer_read_offset = BITS_PER_PACKET_HEADER;
|
||||
}
|
||||
|
||||
let pkt_count = data.current_input_buffer_packet_count();
|
||||
let input_size = pkt_count * BYTES_PER_PACKET;
|
||||
let Some(packet_index) = packet_number(input_size, data.input_buffer_read_offset) else {
|
||||
return;
|
||||
};
|
||||
let buf_backing = xma_phys_to_backing(data.current_input_buffer_address());
|
||||
let pkt_va = buf_backing + packet_index * BYTES_PER_PACKET;
|
||||
let mut packet = vec![0u8; BYTES_PER_PACKET as usize];
|
||||
mem.read_bytes(pkt_va, &mut packet);
|
||||
|
||||
let first_frame_offset = packet_frame_offset(&packet);
|
||||
let mut relative_offset = data.input_buffer_read_offset % BITS_PER_PACKET;
|
||||
if relative_offset < first_frame_offset {
|
||||
// Tail of a split frame — skip to this packet's first frame.
|
||||
data.input_buffer_read_offset =
|
||||
packet_index * BITS_PER_PACKET + first_frame_offset;
|
||||
relative_offset = first_frame_offset;
|
||||
}
|
||||
|
||||
let skip_count = packet_skip_count(&packet);
|
||||
// Full-packet skip (0xFF): no frames begin here — advance to the next
|
||||
// packet that does, swapping the buffer if exhausted.
|
||||
if skip_count == 0xFF {
|
||||
let next_packet_index = packet_index + 1;
|
||||
let next_off =
|
||||
self.next_packet_read_offset(mem, data, next_packet_index, pkt_count);
|
||||
if next_packet_index >= pkt_count || next_off == BITS_PER_PACKET_HEADER {
|
||||
self.swap_input_buffer(data);
|
||||
}
|
||||
data.input_buffer_read_offset = next_off;
|
||||
return;
|
||||
}
|
||||
|
||||
let info = get_packet_info(&packet, relative_offset);
|
||||
let packet_to_skip = (skip_count as u32) + 1;
|
||||
let next_packet_index = packet_index + packet_to_skip;
|
||||
|
||||
// Frame size: clamp to the bits remaining in the packet stream (canary
|
||||
// GetAmountOfBitsToRead over the (packet_index+1)*kBitsPerPacket stream).
|
||||
let stream_remaining =
|
||||
((packet_index + 1) * BITS_PER_PACKET).saturating_sub(data.input_buffer_read_offset);
|
||||
let frame_size = if info.current_frame_size == 0 {
|
||||
// Split header we can't resolve from this packet alone; fall back to
|
||||
// advancing past the rest of this packet so we don't stall.
|
||||
stream_remaining
|
||||
} else {
|
||||
info.current_frame_size
|
||||
};
|
||||
let bits_to_copy = amount_of_bits_to_read(stream_remaining, frame_size);
|
||||
|
||||
if !info.is_last_frame_in_packet() {
|
||||
let next_frame_offset =
|
||||
(data.input_buffer_read_offset + bits_to_copy) % BITS_PER_PACKET;
|
||||
data.input_buffer_read_offset =
|
||||
packet_index * BITS_PER_PACKET + next_frame_offset;
|
||||
return;
|
||||
}
|
||||
|
||||
// Last frame in this packet: move to the next packet's first frame, or
|
||||
// swap the input buffer if the packets are exhausted (canary's
|
||||
// `next_packet_index >= current_input_packet_count`).
|
||||
let mut next_off =
|
||||
self.next_packet_read_offset(mem, data, next_packet_index, pkt_count);
|
||||
if next_packet_index >= pkt_count || next_off == BITS_PER_PACKET_HEADER {
|
||||
self.swap_input_buffer(data);
|
||||
}
|
||||
if next_off == BITS_PER_PACKET_HEADER && data.is_any_input_buffer_valid() {
|
||||
// At the start of the next buffer: jump to its first frame offset.
|
||||
let nb_backing = xma_phys_to_backing(data.current_input_buffer_address());
|
||||
let mut hdr = [0u8; 4];
|
||||
mem.read_bytes(nb_backing, &mut hdr);
|
||||
let fo = packet_frame_offset(&hdr);
|
||||
if fo <= MAX_FRAME_SIZE_IN_BITS {
|
||||
next_off = fo;
|
||||
}
|
||||
}
|
||||
data.input_buffer_read_offset = next_off;
|
||||
}
|
||||
|
||||
/// Scan forward from `next_packet_index` (possibly into the *next* buffer)
|
||||
/// for the next packet that begins a frame and return its bit offset, or
|
||||
/// `BITS_PER_PACKET_HEADER` if none (canary `GetNextPacketReadOffset`).
|
||||
fn next_packet_read_offset(
|
||||
&self,
|
||||
mem: &GuestMemory,
|
||||
data: &XmaContextData,
|
||||
next_packet_index: u32,
|
||||
current_input_packet_count: u32,
|
||||
) -> u32 {
|
||||
use xma_decode::*;
|
||||
// Resolve which buffer the packet lives in (current or the other).
|
||||
let (buffer_index, mut pidx) = if next_packet_index >= current_input_packet_count {
|
||||
(data.current_buffer ^ 1, next_packet_index - current_input_packet_count)
|
||||
} else {
|
||||
(data.current_buffer, next_packet_index)
|
||||
};
|
||||
if !data.is_input_buffer_valid(buffer_index) {
|
||||
return BITS_PER_PACKET_HEADER;
|
||||
}
|
||||
let addr = data.input_buffer_address(buffer_index);
|
||||
if addr == 0 {
|
||||
return BITS_PER_PACKET_HEADER;
|
||||
}
|
||||
let pkt_count = data.input_buffer_packet_count(buffer_index);
|
||||
let backing = xma_phys_to_backing(addr);
|
||||
while pidx < pkt_count {
|
||||
let mut hdr = [0u8; 4];
|
||||
mem.read_bytes(backing + pidx * BYTES_PER_PACKET, &mut hdr);
|
||||
let fo = packet_frame_offset(&hdr);
|
||||
if fo <= MAX_FRAME_SIZE_IN_BITS {
|
||||
return pidx * BITS_PER_PACKET + fo;
|
||||
}
|
||||
pidx += 1;
|
||||
}
|
||||
BITS_PER_PACKET_HEADER
|
||||
}
|
||||
|
||||
fn swap_input_buffer(&mut self, data: &mut XmaContextData) {
|
||||
use xma_decode::*;
|
||||
tracing::debug!(
|
||||
from = data.current_buffer,
|
||||
to = data.current_buffer ^ 1,
|
||||
"xma: SwapInputBuffer (input buffer consumed)"
|
||||
);
|
||||
if data.current_buffer == 0 {
|
||||
data.input_buffer_0_valid = 0;
|
||||
} else {
|
||||
data.input_buffer_1_valid = 0;
|
||||
}
|
||||
data.current_buffer ^= 1;
|
||||
data.input_buffer_read_offset = BITS_PER_PACKET_HEADER;
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for XmaDecoder {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Build the [`MmioRegion`] for the XMA register aperture at `0x7FEA0000`.
|
||||
/// Mirrors the GPU's `build_region`: the closures lock the shared decoder,
|
||||
/// compute the dword register index, and dispatch to `read`/`write_register`.
|
||||
pub fn build_mmio_region(dec: Arc<Mutex<XmaDecoder>>) -> MmioRegion {
|
||||
let read_dec = dec.clone();
|
||||
let write_dec = dec;
|
||||
|
||||
MmioRegion {
|
||||
base_address: APERTURE_BASE,
|
||||
mask: APERTURE_MASK,
|
||||
size: APERTURE_SIZE,
|
||||
read_callback: Box::new(move |addr: u32| {
|
||||
let reg_index = (addr & 0xFFFF) / 4;
|
||||
read_dec.lock().unwrap().read_register(reg_index)
|
||||
}),
|
||||
write_callback: Box::new(move |addr: u32, value: u32| {
|
||||
let reg_index = (addr & 0xFFFF) / 4;
|
||||
write_dec.lock().unwrap().write_register(reg_index, value);
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn inited() -> XmaDecoder {
|
||||
let mut d = XmaDecoder::new();
|
||||
// Pick a plausible physical-window VA/phys pair.
|
||||
d.init(0xA010_0000, 0x0010_0000);
|
||||
d
|
||||
}
|
||||
|
||||
/// The guest writes/reads the aperture byte-reversed; `wire(v)` is the raw
|
||||
/// bus value the guest sends to mean host-order `v` (and what a read of a
|
||||
/// host-order `v` returns). Equivalent to `lwbrx`/`stwbrx` semantics.
|
||||
fn wire(v: u32) -> u32 {
|
||||
v.swap_bytes()
|
||||
}
|
||||
|
||||
/// (a) `allocate_context` hands back distinct, increasing pointers spaced by
|
||||
/// the 64-byte stride, exhausts at 320, and `release_context` frees the slot.
|
||||
#[test]
|
||||
fn allocate_distinct_then_exhaust_then_release() {
|
||||
let mut d = inited();
|
||||
let first = d.allocate_context();
|
||||
let second = d.allocate_context();
|
||||
assert_eq!(first, 0xA010_0000);
|
||||
assert_eq!(second, 0xA010_0000 + XMA_CONTEXT_SIZE);
|
||||
assert!(second > first);
|
||||
|
||||
// Drain the remaining slots (2 already taken).
|
||||
for _ in 0..(XMA_CONTEXT_COUNT - 2) {
|
||||
assert_ne!(d.allocate_context(), 0);
|
||||
}
|
||||
// 321st allocation fails.
|
||||
assert_eq!(d.allocate_context(), 0);
|
||||
|
||||
// Free the first slot and re-acquire it.
|
||||
d.release_context(first);
|
||||
assert_eq!(d.allocate_context(), first);
|
||||
}
|
||||
|
||||
/// (b) A Kick to `Context0Kick` with host value `0b101` marks contexts 0
|
||||
/// and 2. The guest sends it byte-reversed (`wire`).
|
||||
#[test]
|
||||
fn kick_context0_marks_correct_contexts() {
|
||||
let mut d = inited();
|
||||
d.write_register(REG_CONTEXT_KICK_BASE, wire(0b101));
|
||||
assert!(d.is_pending(0));
|
||||
assert!(!d.is_pending(1));
|
||||
assert!(d.is_pending(2));
|
||||
assert_eq!(d.kick_count(), 2);
|
||||
}
|
||||
|
||||
/// (c) A Kick to `Context1Kick` (0x651) bit 0 maps to context_id 32.
|
||||
#[test]
|
||||
fn kick_context1_bit0_is_context_32() {
|
||||
let mut d = inited();
|
||||
d.write_register(REG_CONTEXT_KICK_BASE + 1, wire(0b1));
|
||||
assert!(d.is_pending(32));
|
||||
assert!(!d.is_pending(0));
|
||||
assert_eq!(d.kick_count(), 1);
|
||||
}
|
||||
|
||||
/// Regression for the byte-order fix: the guest's real Clear writes were
|
||||
/// `0x01000000`/`0x02000000`/`0x04000000` (bytes-reversed `1`/`2`/`4`),
|
||||
/// meaning contexts 0/1/2 — NOT 24/25/26. Verify the raw bus values decode
|
||||
/// to the low contexts.
|
||||
#[test]
|
||||
fn byte_reversed_clear_targets_low_contexts() {
|
||||
let mut d = inited();
|
||||
for i in 0..3 {
|
||||
d.write_register(REG_CONTEXT_KICK_BASE, wire(1 << i));
|
||||
}
|
||||
assert!(d.is_pending(0) && d.is_pending(1) && d.is_pending(2));
|
||||
// The exact bus values observed from the guest.
|
||||
d.write_register(REG_CONTEXT_CLEAR_BASE, 0x0100_0000);
|
||||
d.write_register(REG_CONTEXT_CLEAR_BASE, 0x0200_0000);
|
||||
d.write_register(REG_CONTEXT_CLEAR_BASE, 0x0400_0000);
|
||||
assert!(!d.is_pending(0) && !d.is_pending(1) && !d.is_pending(2));
|
||||
}
|
||||
|
||||
/// (d) `read_register(0x600)` returns the base byte-reversed (the guest
|
||||
/// `lwbrx`-reverses it back to the host-order base on its side).
|
||||
#[test]
|
||||
fn context_array_address_reads_phys() {
|
||||
let d = inited();
|
||||
assert_eq!(
|
||||
d.read_register(REG_CONTEXT_ARRAY_ADDRESS),
|
||||
wire(0x0010_0000)
|
||||
);
|
||||
}
|
||||
|
||||
/// (e) `CurrentContextIndex` rotates on each read and wraps at the count
|
||||
/// (values returned byte-reversed).
|
||||
#[test]
|
||||
fn current_context_index_rotates() {
|
||||
let d = inited();
|
||||
assert_eq!(d.read_register(REG_CURRENT_CONTEXT_INDEX), wire(0));
|
||||
assert_eq!(d.read_register(REG_CURRENT_CONTEXT_INDEX), wire(1));
|
||||
assert_eq!(d.read_register(REG_CURRENT_CONTEXT_INDEX), wire(2));
|
||||
// Advance to the wrap boundary.
|
||||
for _ in 3..XMA_CONTEXT_COUNT as u32 {
|
||||
d.read_register(REG_CURRENT_CONTEXT_INDEX);
|
||||
}
|
||||
// Next read wraps back to 0.
|
||||
assert_eq!(d.read_register(REG_CURRENT_CONTEXT_INDEX), wire(0));
|
||||
}
|
||||
|
||||
/// Clear must drop a previously-kicked pending flag.
|
||||
#[test]
|
||||
fn clear_resets_pending() {
|
||||
let mut d = inited();
|
||||
d.write_register(REG_CONTEXT_KICK_BASE, wire(0b1));
|
||||
assert!(d.is_pending(0));
|
||||
d.write_register(REG_CONTEXT_CLEAR_BASE, wire(0b1));
|
||||
assert!(!d.is_pending(0));
|
||||
}
|
||||
|
||||
/// The MMIO region routes a guest write at `BASE + 0x600*4` to reg 0x600
|
||||
/// and a read back through the same byte address, applying the byte swap.
|
||||
#[test]
|
||||
fn mmio_region_round_trips_register() {
|
||||
let dec = Arc::new(Mutex::new(inited()));
|
||||
let region = build_mmio_region(dec.clone());
|
||||
let kick_byte = APERTURE_BASE + REG_CONTEXT_KICK_BASE * 4;
|
||||
(region.write_callback)(kick_byte, wire(0b1));
|
||||
assert!(dec.lock().unwrap().is_pending(0));
|
||||
// ContextArrayAddress read-back via the bus (byte-reversed).
|
||||
let addr_byte = APERTURE_BASE + REG_CONTEXT_ARRAY_ADDRESS * 4;
|
||||
assert_eq!((region.read_callback)(addr_byte), wire(0x0010_0000));
|
||||
}
|
||||
}
|
||||
217
crates/xenia-apu/src/xma2_codec.rs
Normal file
217
crates/xenia-apu/src/xma2_codec.rs
Normal file
@@ -0,0 +1,217 @@
|
||||
//! Thin unsafe wrapper around the mainline FFmpeg `AV_CODEC_ID_XMA2` decoder.
|
||||
//!
|
||||
//! Unlike canary's vendored `XMAFRAMES` (one frame per packet, custom padding
|
||||
//! header), the distro xma2 decoder consumes whole 2 KB XMA2 packets
|
||||
//! (`block_align == 2048`), needs `extradata` declaring the channel/stream
|
||||
//! layout, and buffers samples internally across packets. We drive it with the
|
||||
//! guest's raw 2 KB packets and pull whatever 512-sample float-planar frames it
|
||||
//! emits, returning them as interleaved S16 big-endian PCM (canary `ConvertFrame`).
|
||||
|
||||
use std::os::raw::c_int;
|
||||
use std::ptr;
|
||||
|
||||
use ffmpeg_sys_next as ff;
|
||||
|
||||
/// One xma2 decoder instance, configured for a fixed (sample_rate, channels).
|
||||
pub struct Xma2Codec {
|
||||
codec: *const ff::AVCodec,
|
||||
ctx: *mut ff::AVCodecContext,
|
||||
frame: *mut ff::AVFrame,
|
||||
packet: *mut ff::AVPacket,
|
||||
extradata: Vec<u8>,
|
||||
channels: u32,
|
||||
}
|
||||
|
||||
// FFmpeg objects are not Send/Sync by default; the decoder is only ever touched
|
||||
// on the CPU scheduler thread (decode_pending), so this is sound for our use.
|
||||
unsafe impl Send for Xma2Codec {}
|
||||
|
||||
impl Xma2Codec {
|
||||
/// Build XMA2WAVEFORMATEX extradata (34 bytes) for a single XMA2 stream.
|
||||
/// Layout (little-endian, per FFmpeg `xma_decode_init` / xma2defs.h):
|
||||
/// [0..2] NumStreams (u16) = 1
|
||||
/// [2..6] ChannelMask (u32) = mono/stereo mask
|
||||
/// [6..34] remaining XMA2WAVEFORMATEX fields (unused by the decoder)
|
||||
fn build_extradata(channels: u32) -> Vec<u8> {
|
||||
let mut e = vec![0u8; 34];
|
||||
// NumStreams = 1
|
||||
e[0..2].copy_from_slice(&1u16.to_le_bytes());
|
||||
// ChannelMask: 0x3 (FL|FR) for stereo, 0x4 (FC) for mono.
|
||||
let mask: u32 = if channels >= 2 { 0x3 } else { 0x4 };
|
||||
e[2..6].copy_from_slice(&mask.to_le_bytes());
|
||||
e
|
||||
}
|
||||
|
||||
pub fn new(sample_rate: u32, channels: u32) -> Result<Self, String> {
|
||||
unsafe {
|
||||
let codec = ff::avcodec_find_decoder(ff::AVCodecID::AV_CODEC_ID_XMA2);
|
||||
if codec.is_null() {
|
||||
return Err("xma2 decoder not found in libavcodec".into());
|
||||
}
|
||||
let ctx = ff::avcodec_alloc_context3(codec);
|
||||
if ctx.is_null() {
|
||||
return Err("avcodec_alloc_context3 failed".into());
|
||||
}
|
||||
|
||||
let mut extradata = Self::build_extradata(channels);
|
||||
// FFmpeg requires extradata to be allocated with av_malloc and
|
||||
// padded; copy our bytes into an av_malloc'd buffer.
|
||||
let pad = ff::AV_INPUT_BUFFER_PADDING_SIZE as usize;
|
||||
let raw = ff::av_mallocz(extradata.len() + pad) as *mut u8;
|
||||
if raw.is_null() {
|
||||
ff::avcodec_free_context(&mut (ctx as *mut _));
|
||||
return Err("av_mallocz extradata failed".into());
|
||||
}
|
||||
ptr::copy_nonoverlapping(extradata.as_ptr(), raw, extradata.len());
|
||||
(*ctx).extradata = raw;
|
||||
(*ctx).extradata_size = extradata.len() as c_int;
|
||||
|
||||
(*ctx).sample_rate = sample_rate as c_int;
|
||||
(*ctx).block_align = 2048;
|
||||
ff::av_channel_layout_default(&mut (*ctx).ch_layout, channels as c_int);
|
||||
|
||||
let ret = ff::avcodec_open2(ctx, codec, ptr::null_mut());
|
||||
if ret < 0 {
|
||||
let mut ctxm = ctx;
|
||||
ff::avcodec_free_context(&mut ctxm);
|
||||
return Err(format!("avcodec_open2 failed: {}", av_err(ret)));
|
||||
}
|
||||
|
||||
let frame = ff::av_frame_alloc();
|
||||
let packet = ff::av_packet_alloc();
|
||||
if frame.is_null() || packet.is_null() {
|
||||
let mut ctxm = ctx;
|
||||
ff::avcodec_free_context(&mut ctxm);
|
||||
return Err("av_frame_alloc/av_packet_alloc failed".into());
|
||||
}
|
||||
|
||||
// keep our Vec alive as the source of truth for length
|
||||
extradata.shrink_to_fit();
|
||||
|
||||
Ok(Self {
|
||||
codec,
|
||||
ctx,
|
||||
frame,
|
||||
packet,
|
||||
extradata,
|
||||
channels,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn channels(&self) -> u32 {
|
||||
self.channels
|
||||
}
|
||||
|
||||
/// Feed one raw 2 KB XMA2 packet (header + data) to the decoder. Returns the
|
||||
/// number of bytes the decoder accepted (0 = buffered, needs no new packet
|
||||
/// yet / EAGAIN). Decoded frames are pulled via [`receive_frame`].
|
||||
pub fn send_packet(&mut self, packet: &[u8]) -> Result<(), String> {
|
||||
unsafe {
|
||||
// av_packet_from_data takes ownership of an av_malloc buffer; simpler
|
||||
// to point at our own bytes via a stack packet with a padded copy.
|
||||
let pad = ff::AV_INPUT_BUFFER_PADDING_SIZE as usize;
|
||||
let buf = ff::av_malloc(packet.len() + pad) as *mut u8;
|
||||
if buf.is_null() {
|
||||
return Err("av_malloc packet failed".into());
|
||||
}
|
||||
ptr::copy_nonoverlapping(packet.as_ptr(), buf, packet.len());
|
||||
ptr::write_bytes(buf.add(packet.len()), 0, pad);
|
||||
ff::av_packet_unref(self.packet);
|
||||
// Wrap buf so FFmpeg frees it.
|
||||
let ret = ff::av_packet_from_data(self.packet, buf, packet.len() as c_int);
|
||||
if ret < 0 {
|
||||
ff::av_free(buf as *mut _);
|
||||
return Err(format!("av_packet_from_data failed: {}", av_err(ret)));
|
||||
}
|
||||
let ret = ff::avcodec_send_packet(self.ctx, self.packet);
|
||||
if ret == ff::AVERROR(ff::EAGAIN) {
|
||||
// Decoder full — caller should drain frames first then retry.
|
||||
return Err("EAGAIN".into());
|
||||
}
|
||||
if ret < 0 {
|
||||
return Err(format!("avcodec_send_packet failed: {}", av_err(ret)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Signal end-of-stream so the decoder flushes its internal FIFO.
|
||||
pub fn send_eof(&mut self) {
|
||||
unsafe {
|
||||
let _ = ff::avcodec_send_packet(self.ctx, ptr::null());
|
||||
}
|
||||
}
|
||||
|
||||
/// Pull one decoded frame as interleaved S16 big-endian PCM, or None if the
|
||||
/// decoder needs more input (EAGAIN) or is drained (EOF). Returns
|
||||
/// (samples_per_channel, interleaved_s16be_bytes).
|
||||
pub fn receive_frame(&mut self) -> Option<(u32, Vec<u8>)> {
|
||||
unsafe {
|
||||
let ret = ff::avcodec_receive_frame(self.ctx, self.frame);
|
||||
if ret < 0 {
|
||||
return None;
|
||||
}
|
||||
let nb = (*self.frame).nb_samples as u32;
|
||||
if nb == 0 {
|
||||
return None;
|
||||
}
|
||||
let ch = (*self.frame).ch_layout.nb_channels.max(1) as u32;
|
||||
let out = convert_frame_planar_to_s16be(self.frame, ch, nb);
|
||||
Some((nb, out))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Xma2Codec {
|
||||
fn drop(&mut self) {
|
||||
unsafe {
|
||||
if !self.frame.is_null() {
|
||||
ff::av_frame_free(&mut self.frame);
|
||||
}
|
||||
if !self.packet.is_null() {
|
||||
ff::av_packet_free(&mut self.packet);
|
||||
}
|
||||
if !self.ctx.is_null() {
|
||||
ff::avcodec_free_context(&mut self.ctx);
|
||||
}
|
||||
let _ = &self.codec;
|
||||
let _ = &self.extradata;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert FFmpeg planar-float output to interleaved S16 big-endian PCM
|
||||
/// (faithful to canary `XmaContext::ConvertFrame`: saturate to [-1,1], scale by
|
||||
/// 2^15-1, byte-swap each sample). `channels` planes of `nb_samples` floats.
|
||||
unsafe fn convert_frame_planar_to_s16be(
|
||||
frame: *mut ff::AVFrame,
|
||||
channels: u32,
|
||||
nb_samples: u32,
|
||||
) -> Vec<u8> {
|
||||
const SCALE: f32 = ((1i32 << 15) - 1) as f32;
|
||||
let mut out = Vec::with_capacity((nb_samples * channels * 2) as usize);
|
||||
unsafe {
|
||||
// extended_data[ch] points to a plane of f32 (AV_SAMPLE_FMT_FLTP).
|
||||
let ext = (*frame).extended_data;
|
||||
for i in 0..nb_samples as isize {
|
||||
for ch in 0..channels as isize {
|
||||
let plane = *ext.offset(ch) as *const f32;
|
||||
let s = if plane.is_null() { 0.0 } else { *plane.offset(i) };
|
||||
let clamped = s.clamp(-1.0, 1.0) * SCALE;
|
||||
let v = clamped as i16;
|
||||
out.extend_from_slice(&v.to_be_bytes());
|
||||
}
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
fn av_err(code: c_int) -> String {
|
||||
unsafe {
|
||||
let mut buf = [0i8; ff::AV_ERROR_MAX_STRING_SIZE as usize];
|
||||
ff::av_strerror(code, buf.as_mut_ptr(), buf.len());
|
||||
let cstr = std::ffi::CStr::from_ptr(buf.as_ptr());
|
||||
cstr.to_string_lossy().into_owned()
|
||||
}
|
||||
}
|
||||
690
crates/xenia-apu/src/xma_decode.rs
Normal file
690
crates/xenia-apu/src/xma_decode.rs
Normal file
@@ -0,0 +1,690 @@
|
||||
//! Stage 3 — the real XMA2→PCM decoder.
|
||||
//!
|
||||
//! A faithful port of xenia-canary's `apu/xma_context_new.cc` decode pipeline
|
||||
//! (`Work`/`Decode`/`Consume`/`StoreContextMerged`), adapted to the *mainline*
|
||||
//! distro FFmpeg `AV_CODEC_ID_XMA2` decoder rather than canary's vendored
|
||||
//! `AV_CODEC_ID_XMAFRAMES`.
|
||||
//!
|
||||
//! ## Determinism
|
||||
//! There is no host decoder thread. [`super::xma::XmaDecoder::decode_pending`]
|
||||
//! is invoked from the CPU scheduler's per-round coordinator
|
||||
//! (`coord_post_round` in xenia-app). FFmpeg decode is itself deterministic
|
||||
//! (same input bytes → same PCM), so the lockstep golden stays reproducible.
|
||||
//!
|
||||
//! ## FFmpeg framing — why this differs from canary
|
||||
//! Canary feeds FFmpeg one *frame* at a time (it bit-extracts a single 512-
|
||||
//! sample frame from the guest packet stream and hands it to the vendored
|
||||
//! `XMAFRAMES` codec with a custom 1-byte padding header). The mainline
|
||||
//! `xma2` decoder does NOT have `XMAFRAMES`; instead it consumes whole 2 KB
|
||||
//! XMA2 *packets* (`block_align == 2048`), needs `extradata` declaring the
|
||||
//! stream/channel layout, and manages frame splitting + a per-stream sample
|
||||
//! FIFO internally. So this module keeps canary's *guest-facing* contract
|
||||
//! (the `XMA_CONTEXT_DATA` packet/frame bookkeeping, the 256-byte-block output
|
||||
//! ring buffer, the field writeback) but replaces canary's per-frame
|
||||
//! `Decode()` body with: feed the current 2 KB packet to the xma2 decoder,
|
||||
//! pull any 512-sample PCM frames it emits, convert them to interleaved S16BE,
|
||||
//! and stage them as the "raw frame" that `Consume()` drains into the output
|
||||
//! ring.
|
||||
//!
|
||||
//! See `xma2_codec.rs` for the unsafe FFmpeg wrapper.
|
||||
|
||||
use std::collections::VecDeque;
|
||||
|
||||
use xenia_memory::access::MemoryAccess;
|
||||
use xenia_memory::GuestMemory;
|
||||
|
||||
use crate::xma2_codec::Xma2Codec;
|
||||
|
||||
// ---- Constants (canary `XmaContext` / `XmaContextNew`).
|
||||
|
||||
pub const BYTES_PER_PACKET: u32 = 2048;
|
||||
pub const BYTES_PER_PACKET_HEADER: u32 = 4;
|
||||
pub const BYTES_PER_PACKET_DATA: u32 = BYTES_PER_PACKET - BYTES_PER_PACKET_HEADER;
|
||||
pub const BITS_PER_PACKET: u32 = BYTES_PER_PACKET * 8;
|
||||
/// Canary `kBitsPerPacketHeader` (in the *new* context) is 32.
|
||||
pub const BITS_PER_PACKET_HEADER: u32 = 32;
|
||||
pub const BITS_PER_FRAME_HEADER: u32 = 15;
|
||||
|
||||
pub const SAMPLES_PER_FRAME: u32 = 512;
|
||||
pub const BYTES_PER_SAMPLE: u32 = 2;
|
||||
pub const BYTES_PER_FRAME_CHANNEL: u32 = SAMPLES_PER_FRAME * BYTES_PER_SAMPLE; // 1024
|
||||
pub const OUTPUT_BYTES_PER_BLOCK: u32 = 256;
|
||||
pub const OUTPUT_MAX_SIZE_BYTES: u32 = 31 * OUTPUT_BYTES_PER_BLOCK;
|
||||
|
||||
pub const MAX_FRAME_LENGTH: u32 = 0x7FFF;
|
||||
pub const MAX_FRAME_SIZE_IN_BITS: u32 = 0x4000 - BITS_PER_PACKET_HEADER;
|
||||
|
||||
const ID_TO_SAMPLE_RATE: [u32; 4] = [24000, 32000, 44100, 48000];
|
||||
|
||||
/// Project a bare-physical XMA buffer pointer (`0x0xxxxxxx`) to the host-backed
|
||||
/// guest VA used by the rest of the emulator. Identical formula to
|
||||
/// `xenia_gpu::physical_to_backing` for the physical window; the input/output
|
||||
/// buffer pointers in the context are always in the low physical window.
|
||||
#[inline]
|
||||
pub fn xma_phys_to_backing(p: u32) -> u32 {
|
||||
0x4000_0000 | (p & 0x1FFF_FFFF)
|
||||
}
|
||||
|
||||
// ---- XMA_CONTEXT_DATA (canary `xma_context.h`, 64 bytes, 16 dwords).
|
||||
//
|
||||
// Stored big-endian in guest memory. We load all 16 dwords (BE) and unpack the
|
||||
// bitfields exactly per the canary layout (bitfields pack LSB-first within each
|
||||
// host-order dword). All fields below are kept as plain integers.
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct XmaContextData {
|
||||
// DWORD 0
|
||||
pub input_buffer_0_packet_count: u32, // :12
|
||||
pub loop_count: u32, // :8
|
||||
pub input_buffer_0_valid: u32, // :1
|
||||
pub input_buffer_1_valid: u32, // :1
|
||||
pub output_buffer_block_count: u32, // :5
|
||||
pub output_buffer_write_offset: u32, // :5
|
||||
// DWORD 1
|
||||
pub input_buffer_1_packet_count: u32, // :12
|
||||
pub loop_subframe_start: u32, // :2
|
||||
pub loop_subframe_end: u32, // :3
|
||||
pub loop_subframe_skip: u32, // :3
|
||||
pub subframe_decode_count: u32, // :4
|
||||
pub output_buffer_padding: u32, // :3
|
||||
pub sample_rate: u32, // :2
|
||||
pub is_stereo: u32, // :1
|
||||
pub unk_dword_1_c: u32, // :1
|
||||
pub output_buffer_valid: u32, // :1
|
||||
// DWORD 2
|
||||
pub input_buffer_read_offset: u32, // :26
|
||||
pub error_status: u32, // :5
|
||||
pub error_set: u32, // :1
|
||||
// DWORD 3
|
||||
pub loop_start: u32, // :26
|
||||
pub parser_error_status: u32, // :5
|
||||
pub parser_error_set: u32, // :1
|
||||
// DWORD 4
|
||||
pub loop_end: u32, // :26
|
||||
pub packet_metadata: u32, // :5
|
||||
pub current_buffer: u32, // :1
|
||||
// DWORD 5..8
|
||||
pub input_buffer_0_ptr: u32,
|
||||
pub input_buffer_1_ptr: u32,
|
||||
pub output_buffer_ptr: u32,
|
||||
pub work_buffer_ptr: u32,
|
||||
// DWORD 9
|
||||
pub output_buffer_read_offset: u32, // :5
|
||||
pub stop_when_done: u32, // :1 (bit 30)
|
||||
pub interrupt_when_done: u32, // :1 (bit 31)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn bits(v: u32, shift: u32, width: u32) -> u32 {
|
||||
(v >> shift) & ((1u32 << width) - 1)
|
||||
}
|
||||
|
||||
impl XmaContextData {
|
||||
/// Read the 64-byte context struct from guest VA `ctx_va` (already a VA,
|
||||
/// not a physical ptr). Each dword is read big-endian via `read_u32`.
|
||||
pub fn read(mem: &GuestMemory, ctx_va: u32) -> Self {
|
||||
let mut d = [0u32; 16];
|
||||
for (i, w) in d.iter_mut().enumerate() {
|
||||
*w = mem.read_u32(ctx_va + (i as u32) * 4);
|
||||
}
|
||||
let mut c = Self::default();
|
||||
// DWORD 0
|
||||
c.input_buffer_0_packet_count = bits(d[0], 0, 12);
|
||||
c.loop_count = bits(d[0], 12, 8);
|
||||
c.input_buffer_0_valid = bits(d[0], 20, 1);
|
||||
c.input_buffer_1_valid = bits(d[0], 21, 1);
|
||||
c.output_buffer_block_count = bits(d[0], 22, 5);
|
||||
c.output_buffer_write_offset = bits(d[0], 27, 5);
|
||||
// DWORD 1
|
||||
c.input_buffer_1_packet_count = bits(d[1], 0, 12);
|
||||
c.loop_subframe_start = bits(d[1], 12, 2);
|
||||
c.loop_subframe_end = bits(d[1], 14, 3);
|
||||
c.loop_subframe_skip = bits(d[1], 17, 3);
|
||||
c.subframe_decode_count = bits(d[1], 20, 4);
|
||||
c.output_buffer_padding = bits(d[1], 24, 3);
|
||||
c.sample_rate = bits(d[1], 27, 2);
|
||||
c.is_stereo = bits(d[1], 29, 1);
|
||||
c.unk_dword_1_c = bits(d[1], 30, 1);
|
||||
c.output_buffer_valid = bits(d[1], 31, 1);
|
||||
// DWORD 2
|
||||
c.input_buffer_read_offset = bits(d[2], 0, 26);
|
||||
c.error_status = bits(d[2], 26, 5);
|
||||
c.error_set = bits(d[2], 31, 1);
|
||||
// DWORD 3
|
||||
c.loop_start = bits(d[3], 0, 26);
|
||||
c.parser_error_status = bits(d[3], 26, 5);
|
||||
c.parser_error_set = bits(d[3], 31, 1);
|
||||
// DWORD 4
|
||||
c.loop_end = bits(d[4], 0, 26);
|
||||
c.packet_metadata = bits(d[4], 26, 5);
|
||||
c.current_buffer = bits(d[4], 31, 1);
|
||||
// DWORD 5..8
|
||||
c.input_buffer_0_ptr = d[5];
|
||||
c.input_buffer_1_ptr = d[6];
|
||||
c.output_buffer_ptr = d[7];
|
||||
c.work_buffer_ptr = d[8];
|
||||
// DWORD 9
|
||||
c.output_buffer_read_offset = bits(d[9], 0, 5);
|
||||
c.stop_when_done = bits(d[9], 30, 1);
|
||||
c.interrupt_when_done = bits(d[9], 31, 1);
|
||||
c
|
||||
}
|
||||
|
||||
/// Repack the bitfields back into the 16 dwords (host order). Only the
|
||||
/// decoder-owned fields differ from what was read; callers use
|
||||
/// [`store_merged`] to write back without clobbering game-owned fields.
|
||||
fn pack(&self) -> [u32; 16] {
|
||||
let mut d = [0u32; 16];
|
||||
d[0] = (self.input_buffer_0_packet_count & 0xFFF)
|
||||
| ((self.loop_count & 0xFF) << 12)
|
||||
| ((self.input_buffer_0_valid & 1) << 20)
|
||||
| ((self.input_buffer_1_valid & 1) << 21)
|
||||
| ((self.output_buffer_block_count & 0x1F) << 22)
|
||||
| ((self.output_buffer_write_offset & 0x1F) << 27);
|
||||
d[1] = (self.input_buffer_1_packet_count & 0xFFF)
|
||||
| ((self.loop_subframe_start & 0x3) << 12)
|
||||
| ((self.loop_subframe_end & 0x7) << 14)
|
||||
| ((self.loop_subframe_skip & 0x7) << 17)
|
||||
| ((self.subframe_decode_count & 0xF) << 20)
|
||||
| ((self.output_buffer_padding & 0x7) << 24)
|
||||
| ((self.sample_rate & 0x3) << 27)
|
||||
| ((self.is_stereo & 1) << 29)
|
||||
| ((self.unk_dword_1_c & 1) << 30)
|
||||
| ((self.output_buffer_valid & 1) << 31);
|
||||
d[2] = (self.input_buffer_read_offset & 0x3FF_FFFF)
|
||||
| ((self.error_status & 0x1F) << 26)
|
||||
| ((self.error_set & 1) << 31);
|
||||
d[3] = (self.loop_start & 0x3FF_FFFF)
|
||||
| ((self.parser_error_status & 0x1F) << 26)
|
||||
| ((self.parser_error_set & 1) << 31);
|
||||
d[4] = (self.loop_end & 0x3FF_FFFF)
|
||||
| ((self.packet_metadata & 0x1F) << 26)
|
||||
| ((self.current_buffer & 1) << 31);
|
||||
d[5] = self.input_buffer_0_ptr;
|
||||
d[6] = self.input_buffer_1_ptr;
|
||||
d[7] = self.output_buffer_ptr;
|
||||
d[8] = self.work_buffer_ptr;
|
||||
d[9] = (self.output_buffer_read_offset & 0x1F)
|
||||
| ((self.stop_when_done & 1) << 30)
|
||||
| ((self.interrupt_when_done & 1) << 31);
|
||||
d
|
||||
}
|
||||
|
||||
pub fn is_input_buffer_valid(&self, idx: u32) -> bool {
|
||||
if idx == 0 {
|
||||
self.input_buffer_0_valid != 0
|
||||
} else {
|
||||
self.input_buffer_1_valid != 0
|
||||
}
|
||||
}
|
||||
pub fn is_current_input_buffer_valid(&self) -> bool {
|
||||
self.is_input_buffer_valid(self.current_buffer)
|
||||
}
|
||||
pub fn is_any_input_buffer_valid(&self) -> bool {
|
||||
self.input_buffer_0_valid != 0 || self.input_buffer_1_valid != 0
|
||||
}
|
||||
pub fn input_buffer_address(&self, idx: u32) -> u32 {
|
||||
if idx == 0 {
|
||||
self.input_buffer_0_ptr
|
||||
} else {
|
||||
self.input_buffer_1_ptr
|
||||
}
|
||||
}
|
||||
pub fn current_input_buffer_address(&self) -> u32 {
|
||||
self.input_buffer_address(self.current_buffer)
|
||||
}
|
||||
pub fn input_buffer_packet_count(&self, idx: u32) -> u32 {
|
||||
if idx == 0 {
|
||||
self.input_buffer_0_packet_count
|
||||
} else {
|
||||
self.input_buffer_1_packet_count
|
||||
}
|
||||
}
|
||||
pub fn current_input_buffer_packet_count(&self) -> u32 {
|
||||
self.input_buffer_packet_count(self.current_buffer)
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge decoder-owned fields back into guest memory (canary `StoreContextMerged`).
|
||||
/// Re-reads the current context (game may have raced an update), overwrites only
|
||||
/// the fields the decoder owns, and writes all 16 dwords back BE.
|
||||
fn store_merged(
|
||||
mem: &GuestMemory,
|
||||
ctx_va: u32,
|
||||
data: &XmaContextData,
|
||||
initial: &XmaContextData,
|
||||
) {
|
||||
let mut fresh = XmaContextData::read(mem, ctx_va);
|
||||
// DWORD 0
|
||||
fresh.loop_count = data.loop_count;
|
||||
fresh.output_buffer_write_offset = data.output_buffer_write_offset;
|
||||
if initial.input_buffer_0_valid != 0 && data.input_buffer_0_valid == 0 {
|
||||
fresh.input_buffer_0_valid = 0;
|
||||
}
|
||||
if initial.input_buffer_1_valid != 0 && data.input_buffer_1_valid == 0 {
|
||||
fresh.input_buffer_1_valid = 0;
|
||||
}
|
||||
// DWORD 1
|
||||
if initial.output_buffer_valid != 0 && data.output_buffer_valid == 0 {
|
||||
fresh.output_buffer_valid = 0;
|
||||
}
|
||||
// DWORD 2
|
||||
fresh.input_buffer_read_offset = data.input_buffer_read_offset;
|
||||
fresh.error_status = data.error_status;
|
||||
// DWORD 4
|
||||
fresh.current_buffer = data.current_buffer;
|
||||
// DWORD 9
|
||||
fresh.output_buffer_read_offset = data.output_buffer_read_offset;
|
||||
|
||||
let d = fresh.pack();
|
||||
for (i, w) in d.iter().enumerate() {
|
||||
mem.write_u32(ctx_va + (i as u32) * 4, *w);
|
||||
}
|
||||
}
|
||||
|
||||
/// Public wrapper for [`store_merged`] (called from the orchestrator in xma.rs).
|
||||
pub fn store_merged_pub(
|
||||
mem: &GuestMemory,
|
||||
ctx_va: u32,
|
||||
data: &XmaContextData,
|
||||
initial: &XmaContextData,
|
||||
) {
|
||||
store_merged(mem, ctx_va, data, initial);
|
||||
}
|
||||
|
||||
/// Free byte count in a ring buffer from `write_off` to `read_off`
|
||||
/// (canary `RingBuffer::write_count`).
|
||||
pub fn ring_write_count(read_off: u32, write_off: u32, capacity: u32) -> u32 {
|
||||
if read_off == write_off {
|
||||
capacity
|
||||
} else if write_off < read_off {
|
||||
read_off - write_off
|
||||
} else {
|
||||
(capacity - write_off) + read_off
|
||||
}
|
||||
}
|
||||
|
||||
/// Write `bytes` into the guest ring buffer at `backing + write_off`, wrapping
|
||||
/// at `capacity`. Returns the new write offset (canary `RingBuffer::Write`).
|
||||
pub fn ring_write(
|
||||
mem: &GuestMemory,
|
||||
backing: u32,
|
||||
capacity: u32,
|
||||
write_off: u32,
|
||||
bytes: &[u8],
|
||||
) -> u32 {
|
||||
let count = (bytes.len() as u32).min(capacity);
|
||||
if count == 0 {
|
||||
return write_off;
|
||||
}
|
||||
if write_off + count < capacity {
|
||||
mem.write_bytes(backing + write_off, &bytes[..count as usize]);
|
||||
write_off + count
|
||||
} else {
|
||||
let left = capacity - write_off;
|
||||
mem.write_bytes(backing + write_off, &bytes[..left as usize]);
|
||||
let right = count - left;
|
||||
mem.write_bytes(backing, &bytes[left as usize..(left + right) as usize]);
|
||||
right
|
||||
}
|
||||
}
|
||||
|
||||
// ---- BitStream (port of canary `base/bit_stream.cc`). Big-endian source.
|
||||
|
||||
pub struct BitStream<'a> {
|
||||
buf: &'a [u8],
|
||||
offset_bits: usize,
|
||||
size_bits: usize,
|
||||
}
|
||||
|
||||
impl<'a> BitStream<'a> {
|
||||
pub fn new(buf: &'a [u8], size_bits: usize) -> Self {
|
||||
Self { buf, offset_bits: 0, size_bits }
|
||||
}
|
||||
pub fn offset_bits(&self) -> usize {
|
||||
self.offset_bits
|
||||
}
|
||||
pub fn set_offset(&mut self, off: usize) {
|
||||
self.offset_bits = off.min(self.size_bits);
|
||||
}
|
||||
pub fn advance(&mut self, n: usize) {
|
||||
self.set_offset(self.offset_bits + n);
|
||||
}
|
||||
pub fn bits_remaining(&self) -> usize {
|
||||
self.size_bits - self.offset_bits
|
||||
}
|
||||
/// Peek up to 57 bits (canary contract). Reads 8 bytes BE then shifts.
|
||||
pub fn peek(&self, num_bits: usize) -> u64 {
|
||||
debug_assert!(num_bits <= 57);
|
||||
// offset_bytes = min(offset>>3, (size-64)>>3), matching canary so an
|
||||
// 8-byte load near the buffer end stays in range.
|
||||
let max_byte = if self.size_bits >= 64 {
|
||||
(self.size_bits - 64) >> 3
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let offset_bytes = (self.offset_bits >> 3).min(max_byte);
|
||||
let rel = self.offset_bits - (offset_bytes << 3);
|
||||
let mut tmp = [0u8; 8];
|
||||
let avail = self.buf.len().saturating_sub(offset_bytes).min(8);
|
||||
tmp[..avail].copy_from_slice(&self.buf[offset_bytes..offset_bytes + avail]);
|
||||
let mut value = u64::from_be_bytes(tmp);
|
||||
value >>= 64 - (rel + num_bits);
|
||||
value &= (1u64 << num_bits) - 1;
|
||||
value
|
||||
}
|
||||
pub fn read(&mut self, num_bits: usize) -> u64 {
|
||||
let v = self.peek(num_bits);
|
||||
self.advance(num_bits);
|
||||
v
|
||||
}
|
||||
/// Copy `num_bits` from the stream into `dest` (bit-packed, MSB-first within
|
||||
/// each byte). Returns the starting bit offset within the first byte
|
||||
/// (canary returns `rel_offset_bits` — the frame's intra-byte alignment).
|
||||
pub fn copy(&mut self, dest: &mut [u8], num_bits: usize) -> usize {
|
||||
let offset_bytes = self.offset_bits >> 3;
|
||||
let rel = self.offset_bits - (offset_bytes << 3);
|
||||
let mut bits_left = num_bits;
|
||||
let mut out = 0usize;
|
||||
|
||||
if rel != 0 {
|
||||
let bits = self.peek(8 - rel) as u8;
|
||||
let clear_mask = !(((1u8 << rel) - 1)) as u8;
|
||||
dest[out] &= clear_mask;
|
||||
dest[out] |= bits;
|
||||
bits_left -= 8 - rel;
|
||||
self.advance(8 - rel);
|
||||
out += 1;
|
||||
}
|
||||
if bits_left >= 8 {
|
||||
let nbytes = bits_left / 8;
|
||||
let src_off = (self.offset_bits >> 3).min(self.buf.len());
|
||||
let copy = nbytes.min(self.buf.len().saturating_sub(src_off));
|
||||
dest[out..out + copy]
|
||||
.copy_from_slice(&self.buf[src_off..src_off + copy]);
|
||||
out += nbytes;
|
||||
self.advance(nbytes * 8);
|
||||
bits_left -= nbytes * 8;
|
||||
}
|
||||
if bits_left != 0 {
|
||||
let mut b = self.peek(bits_left) as u8;
|
||||
b <<= 8 - bits_left;
|
||||
let clear_mask = ((1u16 << bits_left) - 1) as u8;
|
||||
dest[out] &= clear_mask;
|
||||
dest[out] |= b;
|
||||
self.advance(bits_left);
|
||||
}
|
||||
rel
|
||||
}
|
||||
}
|
||||
|
||||
// ---- XMA packet header helpers (canary `xma_helpers.h`).
|
||||
|
||||
#[inline]
|
||||
pub fn packet_frame_count(packet: &[u8]) -> u8 {
|
||||
packet[0] >> 2
|
||||
}
|
||||
#[inline]
|
||||
pub fn packet_metadata(packet: &[u8]) -> u8 {
|
||||
packet[2] & 0x7
|
||||
}
|
||||
#[inline]
|
||||
pub fn is_packet_xma2(packet: &[u8]) -> bool {
|
||||
packet_metadata(packet) == 1
|
||||
}
|
||||
#[inline]
|
||||
pub fn packet_skip_count(packet: &[u8]) -> u8 {
|
||||
packet[3]
|
||||
}
|
||||
/// First frame offset in bits (canary `GetPacketFrameOffset`): a 15-bit value
|
||||
/// across bytes 0..2, plus the 32-bit header.
|
||||
#[inline]
|
||||
pub fn packet_frame_offset(packet: &[u8]) -> u32 {
|
||||
let val = (((packet[0] as u32 & 0x3) << 13)
|
||||
| ((packet[1] as u32) << 5)
|
||||
| ((packet[2] as u32) >> 3))
|
||||
& 0xFFFF;
|
||||
val + 32
|
||||
}
|
||||
|
||||
/// Sample-rate id → Hz.
|
||||
pub fn sample_rate_hz(id: u32) -> u32 {
|
||||
ID_TO_SAMPLE_RATE[id.min(3) as usize]
|
||||
}
|
||||
|
||||
// ---- Packet-walk for faithful input-offset advance (canary `GetPacketInfo`,
|
||||
// `GetNextPacketReadOffset`, and the offset arithmetic at the tail of
|
||||
// `XmaContextNew::Decode`). These let us advance `input_buffer_read_offset` one
|
||||
// *frame* at a time at canary's exact cadence — independent of the mainline
|
||||
// xma2 decoder's whole-packet/burst framing — so the offset crosses packet and
|
||||
// buffer boundaries (and triggers SwapInputBuffer) at the true input-drain
|
||||
// rate the guest's WMV demuxer polls.
|
||||
|
||||
/// Info about the frame at a given bit offset within a packet (canary
|
||||
/// `kPacketInfo` / `GetPacketInfo`). `frame_count_` is the number of frames
|
||||
/// that begin in the packet; `current_frame_size_` is the compressed bit size
|
||||
/// of the frame at `frame_offset` (0 if it can't be resolved within this
|
||||
/// packet — a split header).
|
||||
#[derive(Default, Clone, Copy)]
|
||||
pub struct PacketInfo {
|
||||
pub frame_count: u32,
|
||||
pub current_frame: u32,
|
||||
pub current_frame_size: u32,
|
||||
}
|
||||
|
||||
impl PacketInfo {
|
||||
pub fn is_last_frame_in_packet(&self) -> bool {
|
||||
self.current_frame + 1 == self.frame_count
|
||||
}
|
||||
}
|
||||
|
||||
/// Faithful port of canary `XmaContextNew::GetPacketInfo`.
|
||||
pub fn get_packet_info(packet: &[u8], frame_offset: u32) -> PacketInfo {
|
||||
let mut info = PacketInfo::default();
|
||||
let first_frame_offset = packet_frame_offset(packet);
|
||||
let mut stream = BitStream::new(packet, BITS_PER_PACKET as usize);
|
||||
stream.set_offset(first_frame_offset as usize);
|
||||
|
||||
// Split frame from previous packet.
|
||||
if frame_offset < first_frame_offset {
|
||||
info.current_frame = 0;
|
||||
info.current_frame_size = first_frame_offset - frame_offset;
|
||||
}
|
||||
|
||||
loop {
|
||||
if stream.bits_remaining() < BITS_PER_FRAME_HEADER as usize {
|
||||
break;
|
||||
}
|
||||
let frame_size = stream.peek(BITS_PER_FRAME_HEADER as usize) as u32;
|
||||
if frame_size == 0 || frame_size == MAX_FRAME_LENGTH {
|
||||
break;
|
||||
}
|
||||
if stream.offset_bits() == frame_offset as usize {
|
||||
info.current_frame = info.frame_count;
|
||||
info.current_frame_size = frame_size;
|
||||
}
|
||||
info.frame_count += 1;
|
||||
if frame_size as usize > stream.bits_remaining() {
|
||||
// Last frame.
|
||||
break;
|
||||
}
|
||||
stream.advance((frame_size - 1) as usize);
|
||||
// Trailing continuation bit.
|
||||
if stream.read(1) == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if is_packet_xma2(packet) {
|
||||
let xma2_frame_count = packet_frame_count(packet) as u32;
|
||||
if xma2_frame_count > info.frame_count {
|
||||
if info.current_frame_size == 0 {
|
||||
info.current_frame = info.frame_count;
|
||||
}
|
||||
info.frame_count = xma2_frame_count;
|
||||
}
|
||||
}
|
||||
info
|
||||
}
|
||||
|
||||
/// Packet number for a bit offset (canary `GetPacketNumber`). Returns None when
|
||||
/// the offset is in the header or past the buffer.
|
||||
pub fn packet_number(size_bytes: u32, bit_offset: u32) -> Option<u32> {
|
||||
if bit_offset < BITS_PER_PACKET_HEADER {
|
||||
return None;
|
||||
}
|
||||
if bit_offset >= size_bytes * 8 {
|
||||
return None;
|
||||
}
|
||||
Some((bit_offset >> 3) / BYTES_PER_PACKET)
|
||||
}
|
||||
|
||||
/// min(remaining_stream_bits, frame_size) (canary `GetAmountOfBitsToRead`).
|
||||
pub fn amount_of_bits_to_read(remaining_stream_bits: u32, frame_size: u32) -> u32 {
|
||||
remaining_stream_bits.min(frame_size)
|
||||
}
|
||||
|
||||
// ---- Per-context decode state (lives in the XmaDecoder, one per ctx).
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct ContextDecodeState {
|
||||
/// FFmpeg xma2 codec for this context (lazily created / reconfigured).
|
||||
pub codec: Option<Xma2Codec>,
|
||||
pub codec_rate: u32,
|
||||
pub codec_channels: u32,
|
||||
/// Staged interleaved S16BE PCM for the current decoded frame
|
||||
/// (`raw_frame_`), drained by Consume in 256-byte blocks.
|
||||
pub raw_frame: Vec<u8>,
|
||||
/// Decoded interleaved S16BE PCM not yet split into per-frame `raw_frame`s.
|
||||
/// The mainline xma2 decoder emits bursts of many 512-sample frames at once
|
||||
/// (internal FIFO + 4096-sample lookahead); we queue the bytes here and
|
||||
/// hand the guest exactly one 512-sample frame per `produce_frame`.
|
||||
pub pcm_queue: VecDeque<u8>,
|
||||
pub current_frame_remaining_subframes: u8,
|
||||
pub remaining_subframe_blocks_in_output: i32,
|
||||
/// Total 512-sample frames decoded for this context (diagnostic).
|
||||
pub frames_decoded: u64,
|
||||
/// Whether a "first frame" diagnostic has been emitted.
|
||||
pub first_frame_logged: bool,
|
||||
/// FFmpeg feed cursor: the next packet index (within the *current* input
|
||||
/// buffer at feed time) we will hand to FFmpeg. This is the decoder's
|
||||
/// internal intake position and is intentionally decoupled from the
|
||||
/// guest-visible `input_buffer_read_offset` (which advances per *emitted*
|
||||
/// frame via the faithful packet-walk). We feed ahead so FFmpeg always has
|
||||
/// enough buffered input to satisfy the guest's drain, while the guest sees
|
||||
/// the read offset move at canary's true per-frame cadence.
|
||||
pub feed_packet_index: u32,
|
||||
/// `current_buffer` the feed cursor is reading from; reset on swap so the
|
||||
/// feed follows the same ping-pong as the guest-visible buffer.
|
||||
pub feed_buffer: u32,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// The bitfield unpack/pack must round-trip every decoder-relevant field at
|
||||
/// the exact canary offsets (regression against a shifted bit).
|
||||
#[test]
|
||||
fn context_bitfields_round_trip() {
|
||||
let mut c = XmaContextData::default();
|
||||
c.input_buffer_0_packet_count = 632;
|
||||
c.loop_count = 0;
|
||||
c.input_buffer_0_valid = 1;
|
||||
c.input_buffer_1_valid = 0;
|
||||
c.output_buffer_block_count = 30;
|
||||
c.output_buffer_write_offset = 5;
|
||||
c.subframe_decode_count = 8;
|
||||
c.output_buffer_padding = 1;
|
||||
c.sample_rate = 3;
|
||||
c.is_stereo = 1;
|
||||
c.output_buffer_valid = 1;
|
||||
c.input_buffer_read_offset = 16416;
|
||||
c.error_status = 4;
|
||||
c.current_buffer = 1;
|
||||
c.input_buffer_0_ptr = 0x0b9f_d000;
|
||||
c.output_buffer_ptr = 0x01f6_6e00;
|
||||
c.output_buffer_read_offset = 7;
|
||||
c.interrupt_when_done = 1;
|
||||
|
||||
// pack → words → re-read via the same word layout.
|
||||
let d = c.pack();
|
||||
// Simulate read() decode from the packed words.
|
||||
let mut c2 = XmaContextData::default();
|
||||
c2.input_buffer_0_packet_count = bits(d[0], 0, 12);
|
||||
c2.input_buffer_0_valid = bits(d[0], 20, 1);
|
||||
c2.output_buffer_block_count = bits(d[0], 22, 5);
|
||||
c2.output_buffer_write_offset = bits(d[0], 27, 5);
|
||||
c2.subframe_decode_count = bits(d[1], 20, 4);
|
||||
c2.output_buffer_padding = bits(d[1], 24, 3);
|
||||
c2.sample_rate = bits(d[1], 27, 2);
|
||||
c2.is_stereo = bits(d[1], 29, 1);
|
||||
c2.output_buffer_valid = bits(d[1], 31, 1);
|
||||
c2.input_buffer_read_offset = bits(d[2], 0, 26);
|
||||
c2.error_status = bits(d[2], 26, 5);
|
||||
c2.current_buffer = bits(d[4], 31, 1);
|
||||
c2.output_buffer_read_offset = bits(d[9], 0, 5);
|
||||
c2.interrupt_when_done = bits(d[9], 31, 1);
|
||||
|
||||
assert_eq!(c2.input_buffer_0_packet_count, 632);
|
||||
assert_eq!(c2.input_buffer_0_valid, 1);
|
||||
assert_eq!(c2.output_buffer_block_count, 30);
|
||||
assert_eq!(c2.output_buffer_write_offset, 5);
|
||||
assert_eq!(c2.subframe_decode_count, 8);
|
||||
assert_eq!(c2.output_buffer_padding, 1);
|
||||
assert_eq!(c2.sample_rate, 3);
|
||||
assert_eq!(c2.is_stereo, 1);
|
||||
assert_eq!(c2.output_buffer_valid, 1);
|
||||
assert_eq!(c2.input_buffer_read_offset, 16416);
|
||||
assert_eq!(c2.error_status, 4);
|
||||
assert_eq!(c2.current_buffer, 1);
|
||||
assert_eq!(c2.output_buffer_read_offset, 7);
|
||||
assert_eq!(c2.interrupt_when_done, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn phys_to_backing_projects_physical_window() {
|
||||
assert_eq!(xma_phys_to_backing(0x0b9f_d000), 0x4b9f_d000);
|
||||
assert_eq!(xma_phys_to_backing(0x01f6_6e00), 0x41f6_6e00);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ring_write_count_matches_canary() {
|
||||
// empty (read==write) → full capacity.
|
||||
assert_eq!(ring_write_count(0, 0, 7680), 7680);
|
||||
// write ahead of read.
|
||||
assert_eq!(ring_write_count(0, 256, 7680), 7680 - 256);
|
||||
// write wrapped behind read.
|
||||
assert_eq!(ring_write_count(512, 256, 7680), 256);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn packet_header_helpers() {
|
||||
// Matches the observed first packet word 0x08000000: byte0=0x08.
|
||||
let pkt = [0x08u8, 0x00, 0x00, 0x00];
|
||||
assert_eq!(packet_frame_count(&pkt), 2); // 0x08>>2 = 2
|
||||
// frame offset: ((0x08&3)<<13 | 0<<5 | 0x00>>3) + 32 = 32.
|
||||
assert_eq!(packet_frame_offset(&pkt), 32);
|
||||
// A non-zero byte2 shifts the offset: 0x08>>3 = 1 → +1.
|
||||
let pkt2 = [0x08u8, 0x00, 0x08, 0x00];
|
||||
assert_eq!(packet_frame_offset(&pkt2), 33);
|
||||
}
|
||||
}
|
||||
|
||||
impl ContextDecodeState {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
codec: None,
|
||||
codec_rate: 0,
|
||||
codec_channels: 0,
|
||||
raw_frame: vec![0u8; (BYTES_PER_FRAME_CHANNEL * 2) as usize],
|
||||
pcm_queue: VecDeque::new(),
|
||||
current_frame_remaining_subframes: 0,
|
||||
remaining_subframe_blocks_in_output: 0,
|
||||
frames_decoded: 0,
|
||||
first_frame_logged: false,
|
||||
feed_packet_index: 0,
|
||||
feed_buffer: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -79,6 +79,14 @@ pub struct DecodedBlock {
|
||||
/// a successful build (`MAX_BLOCK_INSTRS >= 1` and the build walk
|
||||
/// pushes the first decoded word unconditionally).
|
||||
pub instrs: Vec<DecodedInstr>,
|
||||
/// True if this block contains a cross-thread synchronization point
|
||||
/// (`PpcOpcode::is_sync_sensitive`: reserved load/store or a memory
|
||||
/// barrier). Computed once at build time. The superblock runner ends
|
||||
/// the run after executing a sync-sensitive block so the lockstep
|
||||
/// interleaving stays fine-grained at exactly those points (preserving
|
||||
/// the cross-thread ordering the 2E/2F/2J boot work depends on),
|
||||
/// while chaining freely through ordinary straight-line blocks.
|
||||
pub sync_sensitive: bool,
|
||||
}
|
||||
|
||||
/// Per-slot status from a `lookup_or_build` probe. Internal only.
|
||||
@@ -187,11 +195,13 @@ fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> Deco
|
||||
let mut instrs: Vec<DecodedInstr> = Vec::with_capacity(8);
|
||||
let page_base = start_pc & GUEST_PAGE_MASK;
|
||||
let mut cur = start_pc;
|
||||
let mut sync_sensitive = false;
|
||||
|
||||
loop {
|
||||
let raw = mem.read_u32(cur);
|
||||
let decoded = decode(raw, cur);
|
||||
let terminates = decoded.opcode.terminates_block();
|
||||
sync_sensitive |= decoded.opcode.is_sync_sensitive();
|
||||
instrs.push(decoded);
|
||||
|
||||
if terminates {
|
||||
@@ -215,6 +225,7 @@ fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> Deco
|
||||
end_pc,
|
||||
page_version,
|
||||
instrs,
|
||||
sync_sensitive,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -335,6 +346,40 @@ mod tests {
|
||||
assert_eq!(b.end_pc, 0x110);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sync_sensitive_flag_set_for_barrier_block() {
|
||||
// A block containing `sync` (0x7C0004AC) must flag sync_sensitive
|
||||
// so the superblock runner ends the chain there (cross-thread
|
||||
// ordering point). `sync` does NOT terminate a block, so it sits
|
||||
// mid-block followed by straight-line code up to a terminator.
|
||||
let mem = BlockTestMem::new();
|
||||
mem.put(0x100, enc_addi(3, 3, 1));
|
||||
mem.put(0x104, 0x7C00_04AC); // sync
|
||||
mem.put(0x108, enc_addi(3, 3, 1));
|
||||
mem.put(0x10C, enc_b_self()); // terminator
|
||||
let mut bc = BlockCache::new();
|
||||
let b = bc.lookup_or_build(0x100, &mem);
|
||||
assert!(
|
||||
b.sync_sensitive,
|
||||
"block containing `sync` must flag sync_sensitive; decoded last={:?}",
|
||||
b.instrs.iter().map(|i| i.opcode).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sync_sensitive_flag_clear_for_plain_block() {
|
||||
// A straight-line ALU block with no reserved-op / barrier must
|
||||
// NOT flag sync_sensitive (so the superblock runner is free to
|
||||
// chain through it).
|
||||
let mem = BlockTestMem::new();
|
||||
mem.put(0x100, enc_addi(3, 3, 1));
|
||||
mem.put(0x104, enc_addi(3, 3, 1));
|
||||
mem.put(0x108, enc_b_self());
|
||||
let mut bc = BlockCache::new();
|
||||
let b = bc.lookup_or_build(0x100, &mem);
|
||||
assert!(!b.sync_sensitive, "plain ALU block must not flag sync_sensitive");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn block_stops_at_page_boundary() {
|
||||
// Build from 0x1FFC. The next PC (0x2000) is in a different
|
||||
|
||||
217
crates/xenia-cpu/src/dispatch_rec.rs
Normal file
217
crates/xenia-cpu/src/dispatch_rec.rs
Normal file
@@ -0,0 +1,217 @@
|
||||
//! Runtime indirect-dispatch recorder.
|
||||
//!
|
||||
//! A reusable, env-gated facility that captures every indirect call performed
|
||||
//! through CTR (`bcctr`/`bcctrl`/`bctr`) as a unique `(call_site_pc ->
|
||||
//! target_pc)` pair, together with the object register `r3` seen at the call
|
||||
//! and a hit count. It exists to provide GROUND-TRUTH indirect-dispatch
|
||||
//! resolution for reverse-engineering vtable dispatch that the static
|
||||
//! analyzer fails to resolve (e.g. the Sylpheed movie engine vtable
|
||||
//! `0x8200a908`).
|
||||
//!
|
||||
//! ## Gating & overhead
|
||||
//! Recording is OFF by default. It is enabled only when the environment
|
||||
//! variable `XENIA_DISPATCH_REC` is set to a non-empty, non-`0` value at
|
||||
//! process start. When OFF, [`record`] is a single relaxed atomic-bool load
|
||||
//! followed by an early return — no allocation, no locking, no behavior
|
||||
//! change. The recorder is pure: it never reads the clock, never touches
|
||||
//! scheduling, and never mutates guest/CPU state, so enabling it does not
|
||||
//! perturb deterministic runs (only adds a HashMap insert behind a mutex).
|
||||
//!
|
||||
//! ## Focus filters (optional)
|
||||
//! Two env vars narrow what is recorded (both default to "record everything"):
|
||||
//! - `XENIA_DISPATCH_REC_TARGETS=0x82505c08,...` — only edges whose resolved
|
||||
//! target is in the list. Answers "who calls `<target>`": every recorded
|
||||
//! edge then carries the caller `site` and `lr`.
|
||||
//! - `XENIA_DISPATCH_REC_SITES=0x825078d8,...` — only edges from the listed
|
||||
//! call-site PCs.
|
||||
//! When both are set, an edge must satisfy BOTH. These keep a long focused
|
||||
//! run (e.g. the intro-movie trace) producing a small, relevant table instead
|
||||
//! of the whole program-wide dispatch set. Pure observe-only — filtering only
|
||||
//! affects which edges are stored, never guest/CPU state.
|
||||
//!
|
||||
//! ## Output
|
||||
//! On [`dump`] (call at end-of-run) the table is written to the path in
|
||||
//! `XENIA_DISPATCH_REC_OUT` (default `/tmp/dispatch_rec.txt`), sorted by
|
||||
//! descending hit count, one record per line:
|
||||
//! `callsite_pc target_pc count r3=<obj>` (all hex).
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Mutex;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
/// Enabled flag, resolved once from the environment at first touch.
|
||||
static ENABLED: OnceLock<bool> = OnceLock::new();
|
||||
/// Fast-path mirror of `ENABLED` so the hot path is a single relaxed load
|
||||
/// (avoids the `OnceLock` get + deref on every indirect branch when OFF).
|
||||
static ENABLED_FAST: AtomicBool = AtomicBool::new(false);
|
||||
|
||||
/// One observed indirect-dispatch edge.
|
||||
#[derive(Default, Clone, Copy)]
|
||||
struct Edge {
|
||||
count: u64,
|
||||
/// Last-seen object register (`r3`) at this (site,target) edge. Stable for
|
||||
/// a vtable dispatch where the same call site always dispatches on the
|
||||
/// same kind of object.
|
||||
last_r3: u64,
|
||||
/// Last-seen link register (return address) for the call.
|
||||
last_lr: u64,
|
||||
}
|
||||
|
||||
/// (call_site_pc, target_pc) -> Edge
|
||||
static TABLE: OnceLock<Mutex<HashMap<(u32, u32), Edge>>> = OnceLock::new();
|
||||
|
||||
/// Optional focus filters, resolved once from the environment. When either is
|
||||
/// non-empty, an edge is recorded only if its `target` is in `TARGET_FILTER`
|
||||
/// (when that set is non-empty) AND its `site` is in `SITE_FILTER` (when that
|
||||
/// set is non-empty). Empty sets mean "no constraint on that axis". This lets
|
||||
/// a long focused run (e.g. the intro-movie trace) record ONLY the dispatch
|
||||
/// edges relevant to a target-set under investigation — for example "every
|
||||
/// indirect call whose target is the XMV submit `sub_82505C08`", which answers
|
||||
/// the milestone-2 "who calls submit on the engine" question with the caller
|
||||
/// `lr` — instead of the whole (large) program-wide dispatch table.
|
||||
static TARGET_FILTER: OnceLock<Vec<u32>> = OnceLock::new();
|
||||
static SITE_FILTER: OnceLock<Vec<u32>> = OnceLock::new();
|
||||
|
||||
/// Parse a comma-separated list of hex PCs (`0x` prefix optional) into a
|
||||
/// sorted, deduped Vec. Empty/garbage tokens are skipped.
|
||||
fn parse_pc_list_str(s: &str) -> Vec<u32> {
|
||||
let mut v: Vec<u32> = s
|
||||
.split(',')
|
||||
.map(str::trim)
|
||||
.filter(|t| !t.is_empty())
|
||||
.filter_map(|t| {
|
||||
let hex = t.strip_prefix("0x").or_else(|| t.strip_prefix("0X")).unwrap_or(t);
|
||||
u32::from_str_radix(hex, 16).ok()
|
||||
})
|
||||
.collect();
|
||||
v.sort_unstable();
|
||||
v.dedup();
|
||||
v
|
||||
}
|
||||
|
||||
/// Parse a PC list from an env var. Missing var → empty Vec (no constraint).
|
||||
fn parse_pc_list(var: &str) -> Vec<u32> {
|
||||
match std::env::var(var) {
|
||||
Ok(s) => parse_pc_list_str(&s),
|
||||
Err(_) => Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolve the enabled flag (and focus filters) from the environment exactly
|
||||
/// once.
|
||||
fn init_enabled() -> bool {
|
||||
let on = match std::env::var("XENIA_DISPATCH_REC") {
|
||||
Ok(v) => !v.is_empty() && v != "0",
|
||||
Err(_) => false,
|
||||
};
|
||||
ENABLED_FAST.store(on, Ordering::Relaxed);
|
||||
let _ = TARGET_FILTER.set(parse_pc_list("XENIA_DISPATCH_REC_TARGETS"));
|
||||
let _ = SITE_FILTER.set(parse_pc_list("XENIA_DISPATCH_REC_SITES"));
|
||||
on
|
||||
}
|
||||
|
||||
/// Whether recording is enabled. Cheap after the first call.
|
||||
#[inline(always)]
|
||||
pub fn enabled() -> bool {
|
||||
// Hot path: relaxed atomic load. ENABLED_FAST is initialised by the first
|
||||
// call to `enabled_init` (below); until then it is `false`, which is also
|
||||
// the correct default. We force initialisation eagerly from `install`.
|
||||
ENABLED_FAST.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
/// Force the env resolution (call once early in startup). Idempotent.
|
||||
pub fn install() {
|
||||
let _ = ENABLED.get_or_init(init_enabled);
|
||||
}
|
||||
|
||||
/// Record one indirect (CTR) call edge. No-op when disabled.
|
||||
///
|
||||
/// `site` = PC of the `bcctr`/`bctr` instruction, `target` = resolved CTR
|
||||
/// target, `r3` = object register at the call, `lr` = link register.
|
||||
#[inline(always)]
|
||||
pub fn record(site: u32, target: u32, r3: u64, lr: u64) {
|
||||
// Single predictable branch when OFF.
|
||||
if !ENABLED_FAST.load(Ordering::Relaxed) {
|
||||
return;
|
||||
}
|
||||
// Focus filters (only consulted when recording is ON, i.e. rare). An empty
|
||||
// filter set imposes no constraint on its axis.
|
||||
if let Some(targets) = TARGET_FILTER.get()
|
||||
&& !targets.is_empty()
|
||||
&& targets.binary_search(&target).is_err()
|
||||
{
|
||||
return;
|
||||
}
|
||||
if let Some(sites) = SITE_FILTER.get()
|
||||
&& !sites.is_empty()
|
||||
&& sites.binary_search(&site).is_err()
|
||||
{
|
||||
return;
|
||||
}
|
||||
let table = TABLE.get_or_init(|| Mutex::new(HashMap::new()));
|
||||
if let Ok(mut t) = table.lock() {
|
||||
let e = t.entry((site, target)).or_default();
|
||||
e.count += 1;
|
||||
e.last_r3 = r3;
|
||||
e.last_lr = lr;
|
||||
}
|
||||
}
|
||||
|
||||
/// Dump the recorded table to the output file. No-op when disabled or empty.
|
||||
pub fn dump() {
|
||||
if !enabled() {
|
||||
return;
|
||||
}
|
||||
let path = std::env::var("XENIA_DISPATCH_REC_OUT")
|
||||
.unwrap_or_else(|_| "/tmp/dispatch_rec.txt".to_string());
|
||||
let table = match TABLE.get() {
|
||||
Some(t) => t,
|
||||
None => return,
|
||||
};
|
||||
let guard = match table.lock() {
|
||||
Ok(g) => g,
|
||||
Err(_) => return,
|
||||
};
|
||||
let mut rows: Vec<((u32, u32), Edge)> =
|
||||
guard.iter().map(|(k, v)| (*k, *v)).collect();
|
||||
// Deterministic order: count desc, then site, then target.
|
||||
rows.sort_by(|a, b| {
|
||||
b.1.count
|
||||
.cmp(&a.1.count)
|
||||
.then(a.0 .0.cmp(&b.0 .0))
|
||||
.then(a.0 .1.cmp(&b.0 .1))
|
||||
});
|
||||
let mut out = String::with_capacity(rows.len() * 48);
|
||||
out.push_str("# callsite_pc target_pc count r3 lr\n");
|
||||
for ((site, target), e) in rows {
|
||||
out.push_str(&format!(
|
||||
"{:#010x} {:#010x} {} r3={:#018x} lr={:#018x}\n",
|
||||
site, target, e.count, e.last_r3, e.last_lr
|
||||
));
|
||||
}
|
||||
if let Err(err) = std::fs::write(&path, out) {
|
||||
eprintln!("dispatch_rec: failed to write {}: {}", path, err);
|
||||
} else {
|
||||
eprintln!("dispatch_rec: wrote {} edges to {}", guard.len(), path);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::parse_pc_list_str;
|
||||
|
||||
#[test]
|
||||
fn parse_pc_list_handles_prefixes_whitespace_and_dedup() {
|
||||
// Mixed 0x / bare hex, surrounding whitespace, an empty token, and a
|
||||
// duplicate. Result is sorted + deduped; garbage tokens are dropped.
|
||||
let got = parse_pc_list_str(" 0x82505c08 , 825078d8,, 82505c08 , zzz ");
|
||||
assert_eq!(got, vec![0x82505c08, 0x825078d8]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_pc_list_empty_is_no_constraint() {
|
||||
assert!(parse_pc_list_str("").is_empty());
|
||||
assert!(parse_pc_list_str(" , , ").is_empty());
|
||||
}
|
||||
}
|
||||
@@ -1012,7 +1012,13 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
|
||||
if cond_ok {
|
||||
let next_pc = ctx.pc + 4;
|
||||
ctx.pc = (ctx.ctr as u32) & !3;
|
||||
let target = (ctx.ctr as u32) & !3;
|
||||
// Ground-truth indirect-dispatch recording (env-gated, off by
|
||||
// default; pure record-only, no scheduling/state change).
|
||||
if crate::dispatch_rec::enabled() {
|
||||
crate::dispatch_rec::record(ctx.pc, target, ctx.gpr[3], ctx.lr);
|
||||
}
|
||||
ctx.pc = target;
|
||||
if instr.lk() {
|
||||
ctx.lr = next_pc as u64;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
pub mod block_cache;
|
||||
pub mod context;
|
||||
pub mod decoder;
|
||||
pub mod dispatch_rec;
|
||||
pub mod disasm;
|
||||
pub mod fpscr;
|
||||
pub mod interpreter;
|
||||
|
||||
@@ -204,6 +204,34 @@ impl PpcOpcode {
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns true if this opcode is a cross-thread synchronization
|
||||
/// point at which the superblock runner MUST yield back to the
|
||||
/// round-robin scheduler so the lockstep interleaving stays
|
||||
/// fine-grained enough to preserve correct cross-thread ordering:
|
||||
///
|
||||
/// - reserved load/store (`lwarx`/`ldarx`/`stwcx.`/`stdcx.`): the
|
||||
/// atomic primitive other threads race on. Running past one
|
||||
/// without returning to the scheduler would let a single slot
|
||||
/// win/lose a reservation across many blocks before any peer
|
||||
/// observes it.
|
||||
/// - memory barriers (`sync`/`eieio`/`isync`): the guest explicitly
|
||||
/// demands a global ordering point here; honour it by ending the
|
||||
/// superblock so the scheduler re-interleaves.
|
||||
///
|
||||
/// Purely a function of the opcode (no guest data), so the yield
|
||||
/// decision is deterministic and the schedule reproduces byte-identically.
|
||||
/// Note: `sc` (syscall) and traps already `terminates_block`, and
|
||||
/// import-thunk / halt-sentinel PCs are handled by the per-block
|
||||
/// prologue re-check in the superblock loop — they are not listed here.
|
||||
#[inline]
|
||||
pub fn is_sync_sensitive(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
Self::lwarx | Self::ldarx | Self::stwcx | Self::stdcx
|
||||
| Self::sync | Self::eieio | Self::isync
|
||||
)
|
||||
}
|
||||
|
||||
pub fn name(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Invalid => "invalid",
|
||||
|
||||
@@ -205,6 +205,21 @@ pub enum BlockReason {
|
||||
CriticalSection(u32),
|
||||
}
|
||||
|
||||
/// Floor of the **synthetic park-handle** range. Handles at or above this
|
||||
/// value are deliberately OUTSIDE the kernel object allocator (which starts
|
||||
/// at `0x1000`); they are used to park threads that must NEVER be woken by
|
||||
/// the normal signal/wait machinery — currently the dedicated audio-worker
|
||||
/// threads (`xenia_kernel::xaudio::XAUDIO_SYNTHETIC_HANDLE_BASE = 0xF000_0000`),
|
||||
/// which are only ever un-parked by audio-callback injection. The deadlock
|
||||
/// force-wake ([`Scheduler::unblock_on_deadlock`]) must skip waiters parked
|
||||
/// solely on such handles: they are not deadlock participants (the guest
|
||||
/// genuinely blocked on its own objects), and waking one runs its thread
|
||||
/// entry to the `LR_HALT` sentinel → premature exit, which then drops every
|
||||
/// subsequent injection. Kept in `xenia-cpu` (not imported from
|
||||
/// `xenia-kernel`, which depends on this crate); the kernel const must stay
|
||||
/// within `[SYNTHETIC_PARK_HANDLE_FLOOR, u32::MAX]`.
|
||||
pub const SYNTHETIC_PARK_HANDLE_FLOOR: u32 = 0xF000_0000;
|
||||
|
||||
/// Sink for PCR+0x2C writes — the scheduler writes the guest-visible
|
||||
/// current-processor-id here at spawn and Axis 4 rewrites on affinity
|
||||
/// migration. Implemented by `xenia-kernel` for `GuestMemory`; keeping it
|
||||
@@ -795,31 +810,46 @@ impl Scheduler {
|
||||
/// the fast path — zero bits mean no slot has work and the caller
|
||||
/// falls through to `advance_to_next_wake`.
|
||||
pub fn round_schedule(&mut self) -> Vec<u8> {
|
||||
let mut buf = [0u8; HW_THREAD_COUNT];
|
||||
let n = self.round_schedule_into(&mut buf);
|
||||
buf[..n].to_vec()
|
||||
}
|
||||
|
||||
/// Allocation-free variant of [`Self::round_schedule`] (Tier-A perf #2).
|
||||
/// Fills `buf` with the runnable slot ids and returns the count `n`; the
|
||||
/// valid range is `buf[..n]`. The hot scheduler loop (lockstep +
|
||||
/// parallel) calls this with a reusable stack array so it does not
|
||||
/// `__rust_alloc`/`__rust_dealloc` a fresh `Vec` every round (~7 instr
|
||||
/// apart at boot-to-splash → millions of churned allocations). Identical
|
||||
/// ordering / RNG-advance semantics to `round_schedule`, so the schedule
|
||||
/// — and thus the lockstep digest — is byte-for-byte unchanged.
|
||||
pub fn round_schedule_into(&mut self, buf: &mut [u8; HW_THREAD_COUNT]) -> usize {
|
||||
if self.non_empty_runnable == 0 {
|
||||
return Vec::new();
|
||||
return 0;
|
||||
}
|
||||
let start = self.rotation_cursor as usize;
|
||||
let mut out: Vec<u8> = Vec::with_capacity(HW_THREAD_COUNT);
|
||||
let mut n = 0usize;
|
||||
for off in 0..HW_THREAD_COUNT {
|
||||
let i = (start + off) % HW_THREAD_COUNT;
|
||||
if self.non_empty_runnable & (1 << i) != 0 {
|
||||
out.push(i as u8);
|
||||
buf[n] = i as u8;
|
||||
n += 1;
|
||||
}
|
||||
}
|
||||
// Seeded mode layers a deterministic shuffle on top of the
|
||||
// already-filtered list. Same spawn/wake sequence + same seed ⇒
|
||||
// same schedule (invariant preserved from pre-Axis-1).
|
||||
if let OrderMode::Seeded { .. } = self.order {
|
||||
for i in (1..out.len()).rev() {
|
||||
for i in (1..n).rev() {
|
||||
self.rng_state ^= self.rng_state << 13;
|
||||
self.rng_state ^= self.rng_state >> 7;
|
||||
self.rng_state ^= self.rng_state << 17;
|
||||
let j = (self.rng_state as usize) % (i + 1);
|
||||
out.swap(i, j);
|
||||
buf.swap(i, j);
|
||||
}
|
||||
}
|
||||
self.rotation_cursor = ((start + 1) % HW_THREAD_COUNT) as u8;
|
||||
out
|
||||
n
|
||||
}
|
||||
|
||||
pub fn begin_round(&mut self) {
|
||||
@@ -1293,7 +1323,15 @@ impl Scheduler {
|
||||
};
|
||||
t.quantum_remaining = QUANTUM_DEFAULT;
|
||||
self.recompute_slot_runnable(r.hw_id);
|
||||
tracing::info!(
|
||||
// DEBUG, not INFO: this fires once per timed-wait deadline-wake, which
|
||||
// during the boot idle-spin happens hundreds of thousands of times. At
|
||||
// INFO it floods the console/log file and throttles the interactive
|
||||
// `exec --ui` path so hard (≈286K lines flushed to disk) that the guest
|
||||
// crawls and never reaches the ~30–150M-instruction splash window —
|
||||
// which masqueraded as a "--ui early termination" (iterate-3R). The
|
||||
// headless `check` path runs `--quiet` (WARN) so it was never throttled.
|
||||
// No execution-semantics change; deterministic golden is unaffected.
|
||||
tracing::debug!(
|
||||
"scheduler: advanced to deadline {} waking hw={} idx={}",
|
||||
deadline,
|
||||
r.hw_id,
|
||||
@@ -1376,6 +1414,27 @@ impl Scheduler {
|
||||
let mut woken = Vec::new();
|
||||
for (hw_id, slot) in self.slots.iter_mut().enumerate() {
|
||||
for (idx, t) in slot.runqueue.iter_mut().enumerate() {
|
||||
// Skip threads parked SOLELY on synthetic park-handles
|
||||
// (audio workers). They are not deadlock participants — the
|
||||
// guest blocked on its own objects — and waking one runs its
|
||||
// thread entry to the LR_HALT sentinel, exiting it and
|
||||
// dropping every subsequent audio-callback injection. Only
|
||||
// audio-callback injection may un-park them. A wait whose
|
||||
// handle set mixes synthetic and real handles is still
|
||||
// eligible (the real handle makes it a genuine waiter).
|
||||
let synthetic_park = match &t.state {
|
||||
HwState::Blocked(BlockReason::WaitAny { handles, .. })
|
||||
| HwState::Blocked(BlockReason::WaitAll { handles, .. }) => {
|
||||
!handles.is_empty()
|
||||
&& handles
|
||||
.iter()
|
||||
.all(|&h| h >= SYNTHETIC_PARK_HANDLE_FLOOR)
|
||||
}
|
||||
_ => false,
|
||||
};
|
||||
if synthetic_park {
|
||||
continue;
|
||||
}
|
||||
if matches!(
|
||||
t.state,
|
||||
HwState::Blocked(BlockReason::WaitAny { .. })
|
||||
@@ -1462,6 +1521,41 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unblock_on_deadlock_skips_synthetic_park_waiters() {
|
||||
// The audio worker parks on a synthetic handle (>= FLOOR) and must
|
||||
// survive the deadlock force-wake; a peer parked on a real handle
|
||||
// must be woken. Regression for the milestone-2 stall where the
|
||||
// force-wake destroyed the audio worker → all callbacks dropped.
|
||||
let mut s = mk_scheduler_with_initial();
|
||||
s.spawn(worker_spawn_params(2, 0x2000), &mut NullPcr).unwrap();
|
||||
s.spawn(worker_spawn_params(3, 0x2010), &mut NullPcr).unwrap();
|
||||
let audio = ThreadRef { hw_id: 1, idx: 0, generation: 0 };
|
||||
let real = ThreadRef { hw_id: 2, idx: 0, generation: 0 };
|
||||
s.thread_mut(audio).state = HwState::Blocked(BlockReason::WaitAny {
|
||||
handles: vec![SYNTHETIC_PARK_HANDLE_FLOOR],
|
||||
deadline: None,
|
||||
});
|
||||
s.thread_mut(real).state = HwState::Blocked(BlockReason::WaitAny {
|
||||
handles: vec![0x1234],
|
||||
deadline: None,
|
||||
});
|
||||
let woken = s.unblock_on_deadlock();
|
||||
assert!(
|
||||
woken.contains(&real),
|
||||
"real-handle waiter must be force-woken"
|
||||
);
|
||||
assert!(
|
||||
!woken.contains(&audio),
|
||||
"synthetic-park audio worker must NOT be force-woken"
|
||||
);
|
||||
assert!(matches!(
|
||||
s.thread(audio).state,
|
||||
HwState::Blocked(BlockReason::WaitAny { .. })
|
||||
));
|
||||
assert_eq!(s.thread(real).state, HwState::Ready);
|
||||
}
|
||||
|
||||
// ---- preserved from pre-Axis-1 (updated names and params) ----
|
||||
|
||||
#[test]
|
||||
|
||||
372
crates/xenia-gpu/src/draw_capture.rs
Normal file
372
crates/xenia-gpu/src/draw_capture.rs
Normal file
@@ -0,0 +1,372 @@
|
||||
//! Per-draw geometry capture for the host UI's faithful-render path.
|
||||
//!
|
||||
//! The deterministic headless core (`check --gpu-inline`) never touches this
|
||||
//! module — it is populated only when a UI bridge is installed and consumed
|
||||
//! only by `crates/xenia-ui`. The goal is to hand the UI the *real* guest
|
||||
//! geometry behind each `PM4_DRAW_INDX*` packet so it can rasterize the
|
||||
//! actual splash vertices instead of synthetic placeholder shapes.
|
||||
//!
|
||||
//! What the WGSL pipeline needs to reconstruct one draw (see
|
||||
//! `shaders/xenos_interp.wgsl` `vs_main` / `interpret_vertex_fetch`):
|
||||
//! * the active VS/PS blob keys (already published as assets),
|
||||
//! * the primitive type + the host vertex count to issue,
|
||||
//! * the raw guest vertex-buffer bytes for the fetched window, and
|
||||
//! * the *dword base* of that window so the shader can rebase the absolute
|
||||
//! fetch-constant address into the uploaded buffer.
|
||||
//!
|
||||
//! The hard part is sourcing the vertex window: the VS reads a vertex-fetch
|
||||
//! constant (`xe_gpu_vertex_fetch_t`) whose dword-0 carries the absolute
|
||||
//! guest dword address. We parse the active VS, find its first vertex fetch,
|
||||
//! read that fetch constant out of the register file, then copy a bounded
|
||||
//! window of guest memory starting at the fetch base.
|
||||
|
||||
use xenia_memory::access::MemoryAccess;
|
||||
|
||||
use crate::draw_state::{IndexSize, IndexSource, PrimitiveType};
|
||||
use crate::register_file::RegisterFile;
|
||||
|
||||
/// Texture-fetch / vertex-fetch constant region base, in register indices.
|
||||
/// Each fetch constant is 6 dwords (`xe_gpu_*_fetch_t`).
|
||||
const CONST_BASE_FETCH: u32 = 0x4800;
|
||||
|
||||
/// Upper bound (in dwords) on the vertex window we copy per draw. The splash
|
||||
/// UI draws are tiny (3–4 verts × ≤4 dwords); 64 KiB of dwords is generous
|
||||
/// slack while bounding the per-frame copy cost and the 16 MiB host buffer.
|
||||
const MAX_WINDOW_DWORDS: u32 = 16 * 1024;
|
||||
|
||||
/// One captured draw, with enough real state for the UI to replay it through
|
||||
/// the existing wgpu Xenos pipeline.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DrawCapture {
|
||||
/// Monotonic global draw index (matches `GpuStats::draws_seen` at capture).
|
||||
pub draw_index: u32,
|
||||
/// Xenos primitive-type code (see `SwapInfo::last_draw_prim` encoding).
|
||||
pub prim_code: u32,
|
||||
/// Host vertex count to issue (post primitive-processor rewrite).
|
||||
pub host_vertex_count: u32,
|
||||
/// Active VS blob key at draw time (0 = none).
|
||||
pub vs_key: u32,
|
||||
/// Active PS blob key at draw time (0 = none).
|
||||
pub ps_key: u32,
|
||||
/// Raw guest dwords of the fetched vertex window (host-endian as stored in
|
||||
/// guest memory — the WGSL applies the per-format endian swap). `addr 0`
|
||||
/// of this buffer corresponds to guest dword `window_base_dwords`.
|
||||
pub vertex_dwords: Vec<u32>,
|
||||
/// Guest dword address that maps to index 0 of `vertex_dwords`. The shader
|
||||
/// subtracts this from the fetch-constant base to index `vertex_dwords`.
|
||||
pub window_base_dwords: u32,
|
||||
/// `true` when we successfully resolved a real vertex window. When `false`
|
||||
/// the UI falls back to its procedural geometry for this draw (honest:
|
||||
/// nothing faked, just "couldn't source real vertices").
|
||||
pub has_real_vertices: bool,
|
||||
/// iterate-3S: per-draw NDC transform derived from the guest viewport /
|
||||
/// clip / VTE registers (mirrors canary `GetHostViewportInfo`). The host VS
|
||||
/// converts the guest-VS position to wgpu clip space via
|
||||
/// `clip.xy = pos.xy * ndc_scale + ndc_offset * pos.w`. The Y component
|
||||
/// already carries the render-target → wgpu Y-flip (negated).
|
||||
pub ndc_scale: [f32; 2],
|
||||
pub ndc_offset: [f32; 2],
|
||||
/// iterate-3T: the decoded texture(s) this draw's active pixel shader
|
||||
/// samples, keyed off its real `tfetch` fetch-constant slots (the 3M
|
||||
/// decoder makes these decode). The UI uploads + binds the FIRST entry
|
||||
/// per-draw so the textured logo samples the real artwork instead of the
|
||||
/// magenta stub. Empty for flat (no-tfetch) draws. Populated by
|
||||
/// `gpu_system` after decode (left empty by `build`).
|
||||
///
|
||||
/// Each entry is `(key, content_version, bytes)`. iterate-3AD: the
|
||||
/// `content_version` (from `span_max_version` over the texel span) lets the
|
||||
/// UI host texture cache RE-UPLOAD when the guest fills more of an evolving
|
||||
/// atlas. The publisher and the 2nd splash logo share one K8888 surface
|
||||
/// (base `0x4dbee000`); the 2nd logo's texels are CPU-written *after* the
|
||||
/// publisher's first upload. Without the real version the host cache (which
|
||||
/// previously pinned `version_when_uploaded = 1`) kept the first partial
|
||||
/// upload, so the 2nd logo sampled its still-zero atlas region as black.
|
||||
pub textures: Vec<(crate::texture_cache::TextureKey, u64, Vec<u8>)>,
|
||||
/// iterate-3Y: per-draw color/blend render state captured from the
|
||||
/// register file so the host pipeline composites the way the guest
|
||||
/// intends (instead of one fixed alpha-blend state). Mirrors the fields
|
||||
/// canary feeds into `GetCurrentStateDescription` (D3D12
|
||||
/// `pipeline_cache.cc`):
|
||||
/// * `blend_control` = `RB_BLENDCONTROL0` (RT0 src/dst factors + op,
|
||||
/// color and alpha). The Xbox 360 has no separate "blend enable" bit;
|
||||
/// `One,Zero,Add` *is* the opaque case.
|
||||
/// * `color_mask` = RT0 nibble of `RB_COLOR_MASK` (per-channel write
|
||||
/// enable). When 0, canary forces `One,Zero` (no blend).
|
||||
/// * `color_control` = `RB_COLORCONTROL` (alpha-test enable/func).
|
||||
/// * `depth_control` = `RB_DEPTHCONTROL` (z-test enable/func/write).
|
||||
pub blend_control: u32,
|
||||
pub color_mask: u8,
|
||||
pub color_control: u32,
|
||||
pub depth_control: u32,
|
||||
}
|
||||
|
||||
/// iterate-3S: compute the guest→host NDC XY transform for a draw, mirroring
|
||||
/// canary's `draw_util.cc::GetHostViewportInfo` (the XY half). The Xbox 360 VS
|
||||
/// emits a clip-space position which the HW then scales/offsets by the viewport
|
||||
/// (`PA_CL_VPORT_*`, gated by `PA_CL_VTE_CNTL`) into render-target pixels, OR,
|
||||
/// when clipping is disabled (`PA_CL_CLIP_CNTL.clip_disable`), the VS emits
|
||||
/// render-target-pixel coordinates directly (the screen-space UI / clear case —
|
||||
/// this is what Sylpheed's splash quads do). Either way we must rescale into the
|
||||
/// host's [-1,1] clip space and flip Y (render-target Y-down → wgpu Y-up).
|
||||
///
|
||||
/// Returns `(ndc_scale[2], ndc_offset[2])` such that
|
||||
/// `host_clip.xy = guest_pos.xy * ndc_scale + ndc_offset * guest_pos.w`.
|
||||
/// The Y entries are pre-negated to flip into wgpu's Y-up clip space.
|
||||
pub fn compute_ndc_xy(rf: &RegisterFile) -> ([f32; 2], [f32; 2]) {
|
||||
const PA_CL_CLIP_CNTL: u32 = 0x2204;
|
||||
const PA_SU_SC_MODE_CNTL: u32 = 0x2205;
|
||||
const PA_CL_VTE_CNTL: u32 = 0x2206;
|
||||
const PA_SU_VTX_CNTL: u32 = 0x2302;
|
||||
const PA_CL_VPORT_XSCALE: u32 = 0x210F;
|
||||
const PA_CL_VPORT_XOFFSET: u32 = 0x2110;
|
||||
const PA_CL_VPORT_YSCALE: u32 = 0x2111;
|
||||
const PA_CL_VPORT_YOFFSET: u32 = 0x2112;
|
||||
const PA_SC_WINDOW_OFFSET: u32 = 0x2080;
|
||||
const PA_SC_WINDOW_SCISSOR_BR: u32 = 0x2082;
|
||||
const RB_SURFACE_INFO: u32 = 0x2000;
|
||||
|
||||
let clip_cntl = rf.read(PA_CL_CLIP_CNTL);
|
||||
let vte = rf.read(PA_CL_VTE_CNTL);
|
||||
let su_sc_mode = rf.read(PA_SU_SC_MODE_CNTL);
|
||||
let su_vtx = rf.read(PA_SU_VTX_CNTL);
|
||||
let fbits = |r: u32| f32::from_bits(rf.read(r));
|
||||
|
||||
// VTE enable bits (xenos.h PA_CL_VTE_CNTL): bit0 vport_x_scale_ena,
|
||||
// bit1 vport_x_offset_ena, bit2 vport_y_scale_ena, bit3 vport_y_offset_ena.
|
||||
let scale_x = if vte & (1 << 0) != 0 { fbits(PA_CL_VPORT_XSCALE) } else { 1.0 };
|
||||
let off_x = if vte & (1 << 1) != 0 { fbits(PA_CL_VPORT_XOFFSET) } else { 0.0 };
|
||||
let scale_y = if vte & (1 << 2) != 0 { fbits(PA_CL_VPORT_YSCALE) } else { 1.0 };
|
||||
let off_y = if vte & (1 << 3) != 0 { fbits(PA_CL_VPORT_YOFFSET) } else { 0.0 };
|
||||
|
||||
// Render-target extent in guest pixels: clamp to the texture max (2048),
|
||||
// sourced from the window scissor BR (matches canary `x_max`/`y_max`).
|
||||
let br = rf.read(PA_SC_WINDOW_SCISSOR_BR);
|
||||
let x_max = ((br & 0x7FFF).max(1)).min(2048) as f32;
|
||||
let y_max = (((br >> 16) & 0x7FFF).max(1)).min(2048) as f32;
|
||||
let _ = RB_SURFACE_INFO;
|
||||
|
||||
// Half-pixel + window offsets added in render-target pixels.
|
||||
let mut add_x = 0.0f32;
|
||||
let mut add_y = 0.0f32;
|
||||
if su_sc_mode & (1 << 16) != 0 {
|
||||
let wo = rf.read(PA_SC_WINDOW_OFFSET);
|
||||
// 15-bit signed each (x: [14:0], y: [30:16]).
|
||||
let sx = (((wo & 0x7FFF) << 1) as i32) >> 1;
|
||||
let sy = ((((wo >> 16) & 0x7FFF) << 1) as i32) >> 1;
|
||||
add_x += sx as f32;
|
||||
add_y += sy as f32;
|
||||
}
|
||||
if su_vtx & 1 == 0 {
|
||||
// pix_center == kD3DZero → +0.5 half-pixel offset.
|
||||
add_x += 0.5;
|
||||
add_y += 0.5;
|
||||
}
|
||||
|
||||
let (s, o);
|
||||
if clip_cntl & (1 << 16) != 0 {
|
||||
// clip_disable: VS outputs render-target-*pixel* coords (Y-DOWN: pixel
|
||||
// y=0 is the top row of the render target). Rescale the whole RT extent
|
||||
// to [-1,1] and FLIP Y so pixel-top → wgpu clip-top (canary's
|
||||
// huge-host-viewport path; the framebuffer→clip flip is real here).
|
||||
let px2ndc_x = 2.0 / x_max;
|
||||
let px2ndc_y = 2.0 / y_max;
|
||||
let sx = scale_x * px2ndc_x;
|
||||
let ox = (off_x - x_max * 0.5 + add_x) * px2ndc_x;
|
||||
let sy = scale_y * px2ndc_y;
|
||||
let oy = (off_y - y_max * 0.5 + add_y) * px2ndc_y;
|
||||
// Flip Y: pixel-Y-down → wgpu clip-Y-up.
|
||||
s = [sx, -sy];
|
||||
o = [ox, -oy];
|
||||
} else {
|
||||
// iterate-3AA (DEFECT 1 ROOT): clipping enabled → the VS already emits
|
||||
// *clip-space* coordinates (Y-UP: +Y is the top of the screen), exactly
|
||||
// the convention the Xbox 360's D3D9 and wgpu BOTH use for clip space
|
||||
// (NDC +Y → framebuffer top in each API; the framebuffer Y-direction is
|
||||
// an internal viewport detail handled identically by both). A clip-space
|
||||
// position is therefore portable to wgpu with NO Y-flip. The previous
|
||||
// code unconditionally negated Y (the same flip the screen-space pixel
|
||||
// path needs), which mirrored the publisher logo vertically: its quad is
|
||||
// centered (±0.085 around 0) so the *position* stayed centered, but the
|
||||
// negation swapped top↔bottom vertices while the texture V was unchanged
|
||||
// → the sampled sub-rect (UV v 0.001→0.090) read bottom-up → "SQUARE
|
||||
// ENIX" rendered upside down in place. Measured (readback): the red dots
|
||||
// sit at 43% from the texture top but rendered at 58% from the top
|
||||
// (= a clean vertical mirror); removing the flip restores them to 43%.
|
||||
// Identity XY (no flip) maps guest clip-Y-up straight to wgpu clip-Y-up.
|
||||
s = [1.0, 1.0];
|
||||
o = [0.0, 0.0];
|
||||
return (s, o);
|
||||
}
|
||||
(s, o)
|
||||
}
|
||||
|
||||
/// Encode a [`PrimitiveType`] as the raw Xenos code used across the bridge.
|
||||
pub fn prim_code(p: PrimitiveType) -> u32 {
|
||||
match p {
|
||||
PrimitiveType::None => 0,
|
||||
PrimitiveType::PointList => 1,
|
||||
PrimitiveType::LineList => 2,
|
||||
PrimitiveType::LineStrip => 3,
|
||||
PrimitiveType::TriangleList => 4,
|
||||
PrimitiveType::TriangleFan => 5,
|
||||
PrimitiveType::TriangleStrip => 6,
|
||||
PrimitiveType::RectangleList => 8,
|
||||
PrimitiveType::QuadList => 13,
|
||||
PrimitiveType::Unknown(x) => x as u32,
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolve the first vertex-fetch window referenced by the parsed VS.
|
||||
///
|
||||
/// Walks the VS instruction stream for the first `vfetch` (mini) instruction,
|
||||
/// reads its fetch constant from `rf`, and copies a bounded window of guest
|
||||
/// memory starting at the fetch base. Returns `(dwords, window_base_dwords)`
|
||||
/// or `None` if the VS has no vertex fetch or the constant is malformed.
|
||||
fn resolve_vertex_window(
|
||||
parsed_vs: &crate::ucode::ParsedShader,
|
||||
rf: &RegisterFile,
|
||||
mem: &dyn MemoryAccess,
|
||||
) -> Option<(Vec<u32>, u32)> {
|
||||
// iterate-3W (GPUBUG-109): the instruction block packs ALU and fetch
|
||||
// instructions identically (96 bits / 3 dwords each); ONLY the owning
|
||||
// `Exec` control-flow clause's `sequence` bitmap (2 bits per instruction,
|
||||
// bit[2*i]=fetch/ALU) tells them apart. The previous blind triple-walk
|
||||
// decoded ALU triples as fetches → garbage fetch-constant indices and a
|
||||
// bogus `type==3` guard, never reaching the real vertex fetch. Walk the CF
|
||||
// exec clauses exactly as the translator does (`translator.rs::emit_exec`)
|
||||
// and take the FIRST sequence-flagged *vertex* fetch.
|
||||
let instrs = &parsed_vs.instructions;
|
||||
let mut const_off: Option<u32> = None;
|
||||
'clauses: for clause in &parsed_vs.cf {
|
||||
let crate::ucode::control_flow::ControlFlowInstruction::Exec {
|
||||
address,
|
||||
count,
|
||||
sequence,
|
||||
..
|
||||
} = *clause
|
||||
else {
|
||||
continue;
|
||||
};
|
||||
for i in 0..(count as usize) {
|
||||
// bit[2*i] of the sequence bitmap: 1 = fetch, 0 = ALU.
|
||||
if (sequence >> (i * 2)) & 1 == 0 {
|
||||
continue;
|
||||
}
|
||||
let base = (address as usize + i) * 3;
|
||||
if base + 2 >= instrs.len() {
|
||||
break;
|
||||
}
|
||||
if let crate::ucode::fetch::FetchInstruction::Vertex(vf) =
|
||||
crate::ucode::fetch::decode_fetch([instrs[base], instrs[base + 1], instrs[base + 2]])
|
||||
{
|
||||
const_off = Some(vf.const_reg_offset());
|
||||
break 'clauses;
|
||||
}
|
||||
}
|
||||
}
|
||||
// iterate-3X (GPUBUG-110): vertex fetch constants are addressed by
|
||||
// `const_index * 3 + const_index_sel` (canary `ucode.h:700` —
|
||||
// `VertexFetchInstruction::fetch_constant_index`), NOT by `const_index`
|
||||
// alone. The register region packs 3 two-dword vertex-fetch constants per
|
||||
// 6-dword group, so the constant lives at
|
||||
// `0x4800 + const_index*6 + const_index_sel*2`. The previous decode dropped
|
||||
// `const_index_sel` and read sub-slot 0 (`fc*6`), which for the publisher
|
||||
// logo (`const_index=31, sel=2`) held `0x00000001` (an unused slot) instead
|
||||
// of the real vertex-buffer base at sub-slot 2 (`0x48BE`). That made
|
||||
// `has_real_vertices=false` → the logo fell to the procedural fullscreen
|
||||
// magenta fallback. (Refutes iterate-3W's "geometry is auto-generated from
|
||||
// vertex_id" — measured: the real fetch constant is a 4-vertex QuadList
|
||||
// buffer at `0x0adf60f0`.)
|
||||
let const_reg = CONST_BASE_FETCH + const_off?;
|
||||
let dword0 = rf.read(const_reg);
|
||||
let dword1 = rf.read(const_reg + 1);
|
||||
// address:30 at bits[31:2] of dword0 (in bytes once masked). The fetch
|
||||
// constant carries a guest *physical* dword address — canary reads the
|
||||
// vertex buffer via `Memory::TranslatePhysical(fetch.address * 4)`
|
||||
// (`draw_util.cc:961`). On the Xbox 360 the physical range is mirrored at
|
||||
// several virtual windows; ours only maps the cached-physical window at
|
||||
// `0x4000_0000` (`gpu_system::physical_to_backing`). Reading the bare low
|
||||
// address (`0x0adf_xxxx`) hits an unmapped VA and returns zeros, so rebase
|
||||
// a low physical base onto the mapped `0x4000_0000` alias when the raw VA
|
||||
// is not itself mapped. `window_base_dwords` keeps the *original* base so
|
||||
// the shader's rebase against the (unmodified) fetch-constant address still
|
||||
// indexes the uploaded window correctly.
|
||||
let base_bytes = dword0 & 0xFFFF_FFFC;
|
||||
if base_bytes == 0 {
|
||||
return None;
|
||||
}
|
||||
let read_base = if mem.translate(base_bytes).is_some() {
|
||||
base_bytes
|
||||
} else if base_bytes < 0x2000_0000 && mem.translate(base_bytes | 0x4000_0000).is_some() {
|
||||
base_bytes | 0x4000_0000
|
||||
} else {
|
||||
base_bytes
|
||||
};
|
||||
// size:24 at bits[25:2] of dword1, in dwords. Clamp to our window cap.
|
||||
let size_dwords = ((dword1 >> 2) & 0x00FF_FFFF).clamp(1, MAX_WINDOW_DWORDS);
|
||||
let window_base_dwords = base_bytes >> 2;
|
||||
let mut dwords = Vec::with_capacity(size_dwords as usize);
|
||||
for i in 0..size_dwords {
|
||||
let addr = read_base.wrapping_add(i * 4);
|
||||
if addr < read_base {
|
||||
break; // wrap guard
|
||||
}
|
||||
// `read_u32` composes big-endian bytes into the u32 value; the WGSL's
|
||||
// `gpu_swap` expects the *raw little-endian dword* as it sits in guest
|
||||
// memory, so undo the BE composition with `swap_bytes`.
|
||||
dwords.push(mem.read_u32(addr).swap_bytes());
|
||||
}
|
||||
if dwords.is_empty() {
|
||||
return None;
|
||||
}
|
||||
Some((dwords, window_base_dwords))
|
||||
}
|
||||
|
||||
/// Build a [`DrawCapture`] for one draw. Best-effort: when the vertex window
|
||||
/// can't be resolved, `has_real_vertices` is `false` and the UI falls back to
|
||||
/// procedural geometry (never fabricated pixels).
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn build(
|
||||
draw_index: u32,
|
||||
primitive: PrimitiveType,
|
||||
host_vertex_count: u32,
|
||||
_index_source: IndexSource,
|
||||
_index_size: IndexSize,
|
||||
vs_key: u32,
|
||||
ps_key: u32,
|
||||
parsed_vs: Option<&crate::ucode::ParsedShader>,
|
||||
rf: &RegisterFile,
|
||||
mem: &dyn MemoryAccess,
|
||||
) -> DrawCapture {
|
||||
let (vertex_dwords, window_base_dwords, has_real) = match parsed_vs
|
||||
.and_then(|vs| resolve_vertex_window(vs, rf, mem))
|
||||
{
|
||||
Some((d, base)) => (d, base, true),
|
||||
None => (Vec::new(), 0, false),
|
||||
};
|
||||
let (ndc_scale, ndc_offset) = compute_ndc_xy(rf);
|
||||
// iterate-3Y: capture RT0 color/blend/depth render state. Registers per
|
||||
// canary `registers.h`: RB_BLENDCONTROL0=0x2201, RB_COLOR_MASK=0x2104
|
||||
// (RT0 = bits[3:0]), RB_COLORCONTROL=0x2202, RB_DEPTHCONTROL=0x2200.
|
||||
const RB_BLENDCONTROL_0: u32 = 0x2201;
|
||||
const RB_COLOR_MASK: u32 = 0x2104;
|
||||
const RB_COLORCONTROL: u32 = 0x2202;
|
||||
const RB_DEPTHCONTROL: u32 = 0x2200;
|
||||
DrawCapture {
|
||||
draw_index,
|
||||
prim_code: prim_code(primitive),
|
||||
host_vertex_count,
|
||||
vs_key,
|
||||
ps_key,
|
||||
vertex_dwords,
|
||||
window_base_dwords,
|
||||
has_real_vertices: has_real,
|
||||
ndc_scale,
|
||||
ndc_offset,
|
||||
textures: Vec::new(),
|
||||
blend_control: rf.read(RB_BLENDCONTROL_0),
|
||||
color_mask: (rf.read(RB_COLOR_MASK) & 0xF) as u8,
|
||||
color_control: rf.read(RB_COLORCONTROL),
|
||||
depth_control: rf.read(RB_DEPTHCONTROL),
|
||||
}
|
||||
}
|
||||
@@ -28,6 +28,80 @@ use crate::primitive::{self, ProcessedPrimitive};
|
||||
use crate::register_file::RegisterFile;
|
||||
use crate::ring_view::RingBufferView;
|
||||
|
||||
/// The guest-virtual window that physical allocations are committed into.
|
||||
/// `xenia-kernel`'s `heap_alloc` bumps its cursor through `0x4000_0000..=
|
||||
/// 0x6FFF_FFFF` and commits the host backing for `MmAllocatePhysicalMemoryEx`
|
||||
/// there, so this write-combine mirror is the canonical home of physical DRAM.
|
||||
/// Keep in sync with `KernelState::heap_cursor`'s initial value.
|
||||
pub const PHYSICAL_BACKING_BASE: u32 = 0x4000_0000;
|
||||
|
||||
/// Re-project a guest *physical* address — as handed to the Vd/GPU ABI and
|
||||
/// embedded in PM4 pointers (`INDIRECT_BUFFER`, `WAIT_REG_MEM`-memory,
|
||||
/// `MEM_WRITE`, `EVENT_WRITE*`, `IM_LOAD`, …) — onto the guest-virtual window
|
||||
/// where its host backing is actually committed.
|
||||
///
|
||||
/// The Xbox 360 maps its 512 MB of physical DRAM into several virtual mirror
|
||||
/// windows that differ only in cache policy: bare physical (`0x0xxxxxxx`),
|
||||
/// write-combine (`0x4xxxxxxx`), and the cached `0xA/0xC/0xExxxxxxx` mirrors —
|
||||
/// all aliasing `addr & 0x1FFF_FFFF`. On real hardware (and in xenia-canary
|
||||
/// via overlapping `mmap`s) these are literally the same bytes.
|
||||
///
|
||||
/// Ours has a single flat `membase` and `MmAllocatePhysicalMemoryEx` commits
|
||||
/// physical backing in the write-combine `0x4xxxxxxx` window. The guest then
|
||||
/// masks its allocation base to *bare physical* before passing it to
|
||||
/// `VdInitializeRingBuffer` / `VdEnableRingBufferRPtrWriteBack`, and PM4
|
||||
/// pointers are likewise bare-physical. A flat `membase + phys` access
|
||||
/// therefore hits a never-committed, zero-filled page instead of the committed
|
||||
/// `0x4xxxxxxx` backing — so the GPU decoded zero PM4 headers and never ran
|
||||
/// the real command stream.
|
||||
///
|
||||
/// Projecting any physical-mirror address back onto the `0x4xxxxxxx` window
|
||||
/// lands on the page `heap_alloc` actually backed, regardless of which mirror
|
||||
/// the guest used (idempotent for `0x4xxxxxxx` itself). The projection is
|
||||
/// derived from `heap_alloc`'s placement, not a guess — if that window ever
|
||||
/// moves, `PHYSICAL_BACKING_BASE` must move with it.
|
||||
///
|
||||
/// This is deliberately applied only at the GPU/Vd boundary (where addresses
|
||||
/// arrive in their bare-physical form), NOT on the CPU's flat load/store path:
|
||||
/// the guest CPU already accesses its allocations through the `0x4xxxxxxx`
|
||||
/// base, and non-physical guest-virtual addresses (image `0x82xxxxxx`, stacks
|
||||
/// `0x7xxxxxxx`) must stay flat.
|
||||
#[inline]
|
||||
pub fn physical_to_backing(addr: u32) -> u32 {
|
||||
match addr {
|
||||
0x0000_0000..=0x1FFF_FFFF
|
||||
| 0x4000_0000..=0x4FFF_FFFF
|
||||
| 0xA000_0000..=0xBFFF_FFFF
|
||||
| 0xC000_0000..=0xDFFF_FFFF
|
||||
| 0xE000_0000..=0xFFFF_FFFF => PHYSICAL_BACKING_BASE | (addr & 0x1FFF_FFFF),
|
||||
_ => addr,
|
||||
}
|
||||
}
|
||||
|
||||
/// Max guest page-version over the `[base, base+len)` span, walking 4 KiB
|
||||
/// pages via the `MemoryAccess` trait's `page_version`.
|
||||
///
|
||||
/// The concrete heap exposes an inherent `max_page_version(base, len)`, but
|
||||
/// the draw handler only holds `&dyn MemoryAccess` (which carries the coarser
|
||||
/// `page_version(addr)` accessor). This is byte-equivalent to
|
||||
/// `heap::max_page_version` and stays a pure function of the per-page write
|
||||
/// counters (no wall-clock), so texture-decode timing remains deterministic.
|
||||
fn span_max_version(mem: &dyn MemoryAccess, base: u32, len: u32) -> u64 {
|
||||
const PAGE: u32 = 0x1000;
|
||||
let last = base.saturating_add(len.saturating_sub(1));
|
||||
let mut page = base & !(PAGE - 1);
|
||||
let last_page = last & !(PAGE - 1);
|
||||
let mut max = 0u64;
|
||||
loop {
|
||||
max = max.max(mem.page_version(page));
|
||||
if page >= last_page {
|
||||
break;
|
||||
}
|
||||
page = page.wrapping_add(PAGE);
|
||||
}
|
||||
max
|
||||
}
|
||||
|
||||
/// Cached Xenos microcode blob, produced by `PM4_IM_LOAD*` packets.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ShaderBlob {
|
||||
@@ -58,21 +132,37 @@ pub enum WaitCmp {
|
||||
GreaterEq,
|
||||
/// value > ref
|
||||
Greater,
|
||||
/// Always — caller wants to sleep regardless.
|
||||
/// Always — caller wants to sleep regardless (selector bit 7).
|
||||
Always,
|
||||
/// Never matches — `wait_info & 7 == 0` selects bit 0 of canary's
|
||||
/// selector word, which is always zero.
|
||||
Never,
|
||||
}
|
||||
|
||||
impl WaitCmp {
|
||||
/// Interpret the lower 3 bits of `wait_info` per canary's `MatchValueAndRef`.
|
||||
/// Interpret the lower 3 bits of `wait_info` per canary's `MatchValueAndRef`
|
||||
/// (`pm4_command_processor_implement.h:685-696`). Canary forms a selector
|
||||
/// `((value<ref)<<1) | ((value<=ref)<<2) | ((value==ref)<<3) |
|
||||
/// ((value!=ref)<<4) | ((value>=ref)<<5) | ((value>ref)<<6) | (1<<7)` and
|
||||
/// evaluates `(selector >> (wait_info & 7)) & 1`. So the index is the bit
|
||||
/// position: 1=Less, 2=LessEq, 3=Equal, 4=NotEqual, 5=GreaterEq,
|
||||
/// 6=Greater, 7=always-true, 0=never (bit 0 is always clear).
|
||||
///
|
||||
/// GPUBUG: the prior mapping was off by one (it started at `0 => Less`),
|
||||
/// so `wait_info & 7 == 3` decoded as `NotEqual` instead of `Equal`. That
|
||||
/// inverted the standard CP coherency wait
|
||||
/// (`WAIT_REG_MEM COHER_STATUS_HOST, Equal 0`): the GPU parked forever on
|
||||
/// the first INDIRECT_BUFFER and never reached any draw.
|
||||
pub fn from_wait_info(wait_info: u32) -> Self {
|
||||
match wait_info & 0x7 {
|
||||
0 => WaitCmp::Less,
|
||||
1 => WaitCmp::LessEq,
|
||||
2 => WaitCmp::Equal,
|
||||
3 => WaitCmp::NotEqual,
|
||||
4 => WaitCmp::GreaterEq,
|
||||
5 => WaitCmp::Greater,
|
||||
_ => WaitCmp::Always,
|
||||
1 => WaitCmp::Less,
|
||||
2 => WaitCmp::LessEq,
|
||||
3 => WaitCmp::Equal,
|
||||
4 => WaitCmp::NotEqual,
|
||||
5 => WaitCmp::GreaterEq,
|
||||
6 => WaitCmp::Greater,
|
||||
7 => WaitCmp::Always,
|
||||
_ => WaitCmp::Never,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -85,6 +175,7 @@ impl WaitCmp {
|
||||
WaitCmp::GreaterEq => value >= reference,
|
||||
WaitCmp::Greater => value > reference,
|
||||
WaitCmp::Always => true,
|
||||
WaitCmp::Never => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -333,12 +424,24 @@ pub struct GpuSystem {
|
||||
/// on every texture-fetch resolution; the UI thread sees the decoded
|
||||
/// bytes via `UiBridge::publish_texture`.
|
||||
pub texture_cache: crate::texture_cache::TextureCache,
|
||||
/// P5b: textures decoded at the most recent `PM4_DRAW_INDX*`, keyed off
|
||||
/// the *active* pixel shader's real `tfetch` fetch-constant slots (not a
|
||||
/// hardcoded slot). `vd_swap` publishes the first of these to the UI so
|
||||
/// the replay binds the texture the draw actually samples. Cleared and
|
||||
/// repopulated each draw; empty when the active PS issues no `tfetch`.
|
||||
pub last_draw_textures: Vec<(crate::texture_cache::TextureKey, u64, Vec<u8>)>,
|
||||
/// 10 MiB shadow of the Xenos EDRAM. Written by clear-resolves and
|
||||
/// (future) host-render-target readback; read by the resolve byte-copy
|
||||
/// path that writes tiled pixels into guest memory. Allocated once at
|
||||
/// `GpuSystem::new` and lives for the whole GPU lifetime — no
|
||||
/// per-frame churn.
|
||||
pub edram: crate::edram::ShadowEdram,
|
||||
/// UI-only: when `Some`, every `PM4_DRAW_INDX*` appends a
|
||||
/// [`crate::draw_capture::DrawCapture`] here so the host UI can replay the
|
||||
/// real guest geometry. `None` in headless/deterministic mode — the
|
||||
/// `--gpu-inline` golden never enables this, so capture is entirely inert
|
||||
/// for `check`. Drained (taken) by `vd_swap` at each present.
|
||||
pub frame_captures: Option<Vec<crate::draw_capture::DrawCapture>>,
|
||||
}
|
||||
|
||||
impl GpuSystem {
|
||||
@@ -364,7 +467,17 @@ impl GpuSystem {
|
||||
rt_cache: crate::render_target_cache::RenderTargetCache::new(),
|
||||
last_resolve: None,
|
||||
texture_cache: crate::texture_cache::TextureCache::new(),
|
||||
last_draw_textures: Vec::new(),
|
||||
edram: crate::edram::ShadowEdram::new(),
|
||||
frame_captures: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Enable per-draw geometry capture for the host UI. Inert (and never
|
||||
/// called) in headless/deterministic mode. Idempotent.
|
||||
pub fn enable_frame_capture(&mut self) {
|
||||
if self.frame_captures.is_none() {
|
||||
self.frame_captures = Some(Vec::new());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -536,14 +649,21 @@ impl GpuSystem {
|
||||
/// Release.
|
||||
pub fn sync_with_mmio(&mut self) {
|
||||
let wptr_dwords = self.mmio.cp_rb_wptr.load(Ordering::Acquire);
|
||||
if wptr_dwords != self.ring.write_offset_dwords && self.ring.size_dwords != 0 {
|
||||
self.ring.write_offset_dwords = wptr_dwords % self.ring.size_dwords;
|
||||
// CP_RB_WPTR governs ONLY the primary ring. While an indirect buffer
|
||||
// is executing, the active `self.ring` is a fixed linear sub-stream
|
||||
// and the primary ring is saved at the bottom of the IB stack —
|
||||
// applying the (primary) write pointer to the IB would corrupt its
|
||||
// extent (e.g. `wptr % ib_size`) and strand the GPU mid-buffer.
|
||||
let primary = self.ib_stack.first_mut().unwrap_or(&mut self.ring);
|
||||
if wptr_dwords != primary.write_offset_dwords && primary.size_dwords != 0 {
|
||||
primary.write_offset_dwords = wptr_dwords % primary.size_dwords;
|
||||
}
|
||||
// Mirror our read pointer (Release pairs with any guest-side
|
||||
let primary_rptr = primary.read_offset_dwords;
|
||||
// Mirror the *primary* read pointer (Release pairs with any guest-side
|
||||
// Acquire-load of CP_RB_RPTR for ring writeback bookkeeping).
|
||||
self.mmio
|
||||
.cp_rb_rptr
|
||||
.store(self.ring.read_offset_dwords, Ordering::Release);
|
||||
.store(primary_rptr, Ordering::Release);
|
||||
}
|
||||
|
||||
/// True iff `execute_one` is expected to make progress without blocking.
|
||||
@@ -551,7 +671,11 @@ impl GpuSystem {
|
||||
if let Some(block) = &self.pending_block {
|
||||
return block.is_satisfied(mem, &self.register_file);
|
||||
}
|
||||
self.ring.has_pending()
|
||||
// Pending work may be in the active ring OR in a saved caller ring
|
||||
// further down the IB stack (an exhausted IB still needs `execute_one`
|
||||
// to pop back and resume the primary ring, whose WPTR may have since
|
||||
// advanced).
|
||||
self.ring.has_pending() || self.ib_stack.iter().any(|r| r.has_pending())
|
||||
}
|
||||
|
||||
/// Execute exactly one PM4 packet. Returns [`ExecOutcome::Idle`] when
|
||||
@@ -561,6 +685,12 @@ impl GpuSystem {
|
||||
pub fn execute_one(&mut self, mem: &dyn MemoryAccess) -> ExecOutcome {
|
||||
// 0) If currently parked, probe the condition and either wake up or stay blocked.
|
||||
if let Some(block) = self.pending_block.clone() {
|
||||
// Re-service the CP coherency handshake on each probe so a
|
||||
// COHER_STATUS_HOST wait can clear (canary does this in its WAIT
|
||||
// loop body, not just at entry).
|
||||
if let GpuBlock::WaitRegMem { poll_addr, is_memory: false, .. } = &block {
|
||||
self.make_coherent(*poll_addr);
|
||||
}
|
||||
if block.is_satisfied(mem, &self.register_file) {
|
||||
tracing::debug!(?block, "gpu: wait satisfied — resuming");
|
||||
self.pending_block = None;
|
||||
@@ -642,10 +772,13 @@ impl GpuSystem {
|
||||
width,
|
||||
height,
|
||||
});
|
||||
self.pending_interrupts.push(PendingInterrupt {
|
||||
source: InterruptSource::Swap,
|
||||
cpu_mask: 0x1,
|
||||
});
|
||||
// iterate-2T: do NOT raise a CP swap-complete interrupt here. Canary's
|
||||
// `VdSwap`/PM4_XE_SWAP path raises no interrupt; swap-complete CP
|
||||
// interrupts come ONLY from in-stream `PM4_INTERRUPT` packets, which
|
||||
// are naturally ordered after D3D has armed the swap-callback slot.
|
||||
// Synthesizing one out of band (as we did pre-2T) delivered a CP
|
||||
// interrupt while the slot still held the `0xBADF00D` placeholder,
|
||||
// tripping the graphics ISR's "Unanticipated CPU_INTERRUPT" assert.
|
||||
tracing::info!(
|
||||
frame = self.swap_counter,
|
||||
fb = format_args!("{frontbuffer_phys:#010x}"),
|
||||
@@ -657,9 +790,21 @@ impl GpuSystem {
|
||||
|
||||
/// Called by `VdInitializeRingBuffer` to give us the primary ring.
|
||||
pub fn initialize_ring_buffer(&mut self, base: u32, size_log2: u32) {
|
||||
let size_bytes = 1u32 << size_log2.min(31);
|
||||
// Canary `CommandProcessor::InitializeRingBuffer` (command_processor.cc:
|
||||
// 436): `primary_buffer_size_ = 1 << (size_log2 + 3)` *bytes*. The
|
||||
// `VdInitializeRingBuffer` `r4` argument is log2(size-in-quadwords),
|
||||
// so the byte size is `1 << (size_log2 + 3)` (× 8 bytes/quadword), i.e.
|
||||
// `1 << (size_log2 + 1)` dwords. (Sylpheed passes size_log2=12 →
|
||||
// 32768 bytes / 8192 dwords; the previous `1 << size_log2` undersized
|
||||
// the ring 8× and desynced WPTR wrap math from the guest.)
|
||||
let size_bytes = 1u32 << size_log2.saturating_add(3).min(31);
|
||||
// The guest hands us a bare *physical* ring base; project it onto the
|
||||
// committed backing window so ring reads hit real PM4 packets (see
|
||||
// `physical_to_backing`).
|
||||
let base = physical_to_backing(base);
|
||||
self.ring.base = base;
|
||||
self.ring.size_dwords = size_bytes / 4;
|
||||
self.ring.indirect = false;
|
||||
self.ring.read_offset_dwords = 0;
|
||||
// `write_offset` is driven by the guest — start at 0 so the ring
|
||||
// appears empty until MMIO writes advance it.
|
||||
@@ -675,6 +820,10 @@ impl GpuSystem {
|
||||
/// Called by `VdEnableRingBufferRPtrWriteBack` to record where the guest
|
||||
/// expects us to mirror `read_offset_dwords`.
|
||||
pub fn enable_rptr_writeback(&mut self, addr: u32, block_log2: u32) {
|
||||
// The guest registers a bare *physical* writeback address and polls
|
||||
// the same allocation through its `0x4xxxxxxx` base; project so our
|
||||
// RPtr store lands on the page the guest actually reads.
|
||||
let addr = physical_to_backing(addr);
|
||||
self.ring.rptr_writeback_addr = addr;
|
||||
self.ring.rptr_writeback_block_dwords = 1u32 << block_log2.min(31);
|
||||
tracing::info!(
|
||||
@@ -724,6 +873,58 @@ impl GpuSystem {
|
||||
/// upstream packet effects (memory writes, register file updates
|
||||
/// the guest reads via subsequent MMIO) happen-before the
|
||||
/// CPU-visible RPTR bump.
|
||||
/// Service a CP coherency request, mirroring canary's
|
||||
/// `CommandProcessor::MakeCoherent` (`command_processor.cc:801-838`).
|
||||
///
|
||||
/// The guest requests a vertex/texture-cache flush by writing
|
||||
/// `COHER_STATUS_HOST` with its status bit (bit 31) set, then spins on a
|
||||
/// `WAIT_REG_MEM COHER_STATUS_HOST, Equal 0`. We have no host cache to
|
||||
/// flush (memory is shared, coherency is implicit), so completing the
|
||||
/// request is simply clearing the register — which lets the wait satisfy.
|
||||
/// No-op unless `poll_addr` is `COHER_STATUS_HOST` and its status bit is
|
||||
/// set, so it is safe to call on every coherency-register WAIT probe.
|
||||
fn make_coherent(&mut self, poll_addr: u32) {
|
||||
if poll_addr != reg::COHER_STATUS_HOST {
|
||||
return;
|
||||
}
|
||||
let status = self.register_file.read(reg::COHER_STATUS_HOST);
|
||||
if status & 0x8000_0000 != 0 {
|
||||
self.register_file.write(reg::COHER_STATUS_HOST, 0);
|
||||
}
|
||||
}
|
||||
|
||||
/// CP scratch-register memory writeback, mirroring canary's
|
||||
/// `CommandProcessor::HandleSpecialRegisterWrite`
|
||||
/// (`command_processor.cc:545-552`). Every register write runs through
|
||||
/// here; when the target is one of the eight `SCRATCH_REG{n}`
|
||||
/// (`0x0578..=0x057F`) **and** the matching bit in `SCRATCH_UMSK` is set,
|
||||
/// the value is also written (big-endian, as `mem.write_u32` already
|
||||
/// stores) to `SCRATCH_ADDR + n*4` in guest physical memory.
|
||||
///
|
||||
/// Sylpheed arms its CP swap-complete interrupt callback through this
|
||||
/// path: it programs `SCRATCH_ADDR` to the GPU command-block descriptor
|
||||
/// (`[gfx+10772]`, runtime `0x0b1d5000`), `SCRATCH_UMSK` bit 4, then a
|
||||
/// Type-0 write of the callback PC `0x824ce2b8` into `SCRATCH_REG4`
|
||||
/// (`0x057C`). The writeback lands it at descriptor+16 (`0x4b1d5010`),
|
||||
/// which the graphics ISR (`sub_824BE9A0`) reads via `[[gfx+10772]+16]`
|
||||
/// and `bcctrl`s to fire the swap-complete callback. Without this
|
||||
/// writeback the slot stayed NULL, the ISR skipped the callback, the
|
||||
/// swap counter never advanced, and the title's per-frame manager
|
||||
/// re-fired once then plateaued.
|
||||
fn scratch_register_writeback(&self, mem: &dyn MemoryAccess, index: u32, value: u32) {
|
||||
if !(reg::SCRATCH_REG0..=reg::SCRATCH_REG7).contains(&index) {
|
||||
return;
|
||||
}
|
||||
let scratch_reg = index - reg::SCRATCH_REG0;
|
||||
let umsk = self.register_file.read(reg::SCRATCH_UMSK);
|
||||
if (1u32 << scratch_reg) & umsk == 0 {
|
||||
return;
|
||||
}
|
||||
let scratch_addr = self.register_file.read(reg::SCRATCH_ADDR);
|
||||
let mem_addr = physical_to_backing(scratch_addr.wrapping_add(scratch_reg * 4));
|
||||
mem.write_u32(mem_addr, value);
|
||||
}
|
||||
|
||||
fn writeback_read_ptr(&mut self, mem: &dyn MemoryAccess) {
|
||||
if self.ring.rptr_writeback_addr != 0 && self.ring.is_initialized() {
|
||||
mem.write_u32_fence(
|
||||
@@ -748,6 +949,7 @@ impl GpuSystem {
|
||||
let value = mem.read_u32(dword_addr);
|
||||
let target = if write_one { base_index } else { base_index + i };
|
||||
self.register_file.write(target, value);
|
||||
self.scratch_register_writeback(mem, target, value);
|
||||
}
|
||||
tracing::trace!(
|
||||
base = format_args!("{base_index:#x}"),
|
||||
@@ -770,6 +972,8 @@ impl GpuSystem {
|
||||
let b = mem.read_u32(b_addr);
|
||||
self.register_file.write(reg_index_1, a);
|
||||
self.register_file.write(reg_index_2, b);
|
||||
self.scratch_register_writeback(mem, reg_index_1, a);
|
||||
self.scratch_register_writeback(mem, reg_index_2, b);
|
||||
tracing::trace!(
|
||||
r1 = format_args!("{reg_index_1:#x}"),
|
||||
r2 = format_args!("{reg_index_2:#x}"),
|
||||
@@ -816,7 +1020,9 @@ impl GpuSystem {
|
||||
}
|
||||
pm4::PM4_INDIRECT_BUFFER | pm4::PM4_INDIRECT_BUFFER_PFD => {
|
||||
self.stats.indirect_buffer_jumps += 1;
|
||||
let ib_ptr = self.read_payload(mem, 1);
|
||||
// The IB pointer is a guest *physical* address — project it
|
||||
// onto the committed backing window (see `physical_to_backing`).
|
||||
let ib_ptr = physical_to_backing(self.read_payload(mem, 1));
|
||||
let ib_size = self.read_payload(mem, 2);
|
||||
// Advance past the IB header + payload before recursing so
|
||||
// the return location is correct.
|
||||
@@ -832,6 +1038,10 @@ impl GpuSystem {
|
||||
write_offset_dwords: ib_size, // IB is fully-written at jump time
|
||||
rptr_writeback_addr: 0,
|
||||
rptr_writeback_block_dwords: 0,
|
||||
// Linear sub-stream: drain [0, ib_size) then pop. Never
|
||||
// wraps, and `sync_with_mmio`'s CP_RB_WPTR must not touch
|
||||
// it (canary executes IBs through a separate reader).
|
||||
indirect: true,
|
||||
};
|
||||
tracing::debug!(
|
||||
ib_ptr = format_args!("{ib_ptr:#010x}"),
|
||||
@@ -854,7 +1064,8 @@ impl GpuSystem {
|
||||
let is_memory = (wait_info & 0x10) != 0;
|
||||
let cmp = WaitCmp::from_wait_info(wait_info);
|
||||
let poll_addr = if is_memory {
|
||||
poll_addr_raw & !3
|
||||
// Physical memory poll address → committed backing.
|
||||
physical_to_backing(poll_addr_raw & !3)
|
||||
} else {
|
||||
poll_addr_raw
|
||||
};
|
||||
@@ -865,6 +1076,12 @@ impl GpuSystem {
|
||||
mask,
|
||||
cmp,
|
||||
};
|
||||
// A WAIT polling COHER_STATUS_HOST is the CP coherency
|
||||
// handshake: service it now so the status bit clears (see
|
||||
// `make_coherent`), exactly as canary does in its WAIT loop.
|
||||
if !is_memory {
|
||||
self.make_coherent(poll_addr);
|
||||
}
|
||||
if block.is_satisfied(mem, &self.register_file) {
|
||||
// Condition already true; proceed past this packet.
|
||||
tracing::trace!(?block, "gpu: WAIT_REG_MEM immediately satisfied");
|
||||
@@ -908,7 +1125,7 @@ impl GpuSystem {
|
||||
pm4::PM4_REG_TO_MEM => {
|
||||
// payload[0] = reg_index, payload[1] = mem addr
|
||||
let reg_index = self.read_payload(mem, 1) & 0x1FFF;
|
||||
let dst = self.read_payload(mem, 2) & !3;
|
||||
let dst = physical_to_backing(self.read_payload(mem, 2) & !3);
|
||||
let value = self.register_file.read(reg_index);
|
||||
mem.write_u32(dst, value);
|
||||
tracing::trace!(
|
||||
@@ -920,7 +1137,7 @@ impl GpuSystem {
|
||||
}
|
||||
pm4::PM4_MEM_WRITE => {
|
||||
// payload[0] = dst, payload[1..=count-1] = values
|
||||
let mut dst = self.read_payload(mem, 1) & !3;
|
||||
let mut dst = physical_to_backing(self.read_payload(mem, 1) & !3);
|
||||
for i in 2..=count {
|
||||
let val = self.read_payload(mem, i);
|
||||
mem.write_u32(dst, val);
|
||||
@@ -936,7 +1153,7 @@ impl GpuSystem {
|
||||
let mask = self.read_payload(mem, 4);
|
||||
let is_memory = (wait_info & 0x10) != 0;
|
||||
let cmp = WaitCmp::from_wait_info(wait_info);
|
||||
let poll_addr = if is_memory { poll_raw & !3 } else { poll_raw };
|
||||
let poll_addr = if is_memory { physical_to_backing(poll_raw & !3) } else { poll_raw };
|
||||
let cur_raw = if is_memory {
|
||||
mem.read_u32(poll_addr)
|
||||
} else {
|
||||
@@ -946,7 +1163,7 @@ impl GpuSystem {
|
||||
let write_addr = self.read_payload(mem, 5);
|
||||
let write_data = self.read_payload(mem, 6);
|
||||
if (wait_info & 0x100) != 0 {
|
||||
mem.write_u32(write_addr & !3, write_data);
|
||||
mem.write_u32(physical_to_backing(write_addr & !3), write_data);
|
||||
} else {
|
||||
self.register_file
|
||||
.write(write_addr & 0x1FFF, write_data);
|
||||
@@ -965,7 +1182,7 @@ impl GpuSystem {
|
||||
// payload[0] = initiator (bit 31: write counter, else write `value`)
|
||||
// payload[1] = address, payload[2] = value
|
||||
let initiator = self.read_payload(mem, 1);
|
||||
let address = self.read_payload(mem, 2);
|
||||
let address = physical_to_backing(self.read_payload(mem, 2));
|
||||
let value = self.read_payload(mem, 3);
|
||||
self.register_file
|
||||
.write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
|
||||
@@ -993,7 +1210,7 @@ impl GpuSystem {
|
||||
// payload[0] = initiator, [1] = address. Writes 6 u16 extents
|
||||
// (min/max x/y/z) — we're not tracking scissors yet, so write zeros.
|
||||
let initiator = self.read_payload(mem, 1);
|
||||
let address = self.read_payload(mem, 2) & !3;
|
||||
let address = physical_to_backing(self.read_payload(mem, 2) & !3);
|
||||
self.register_file
|
||||
.write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
|
||||
self.handle_event_initiator(initiator & 0x3F, mem);
|
||||
@@ -1093,7 +1310,146 @@ impl GpuSystem {
|
||||
"gpu: DRAW_INDX captured"
|
||||
);
|
||||
self.last_draw = Some(ds);
|
||||
let host_vertex_count = processed.host_vertex_count;
|
||||
self.last_primitive = Some(processed);
|
||||
|
||||
// iterate-3O: UI-only per-draw geometry capture. Resolves the
|
||||
// real guest vertex window behind this draw (from the active
|
||||
// VS's vertex-fetch constant) so the host UI can replay the
|
||||
// actual splash geometry instead of synthetic shapes. Entirely
|
||||
// inert in headless/deterministic mode (`frame_captures` is
|
||||
// `None`), so the `--gpu-inline` golden is unaffected.
|
||||
if self.frame_captures.is_some() {
|
||||
let vs_key = self.active_vs_key.unwrap_or(0);
|
||||
let ps_key = self.active_ps_key.unwrap_or(0);
|
||||
let parsed_vs = self
|
||||
.active_vs_key
|
||||
.and_then(|k| self.shader_blobs.get(&k))
|
||||
.map(|b| crate::ucode::parse_shader(&b.dwords));
|
||||
let (idx_src, idx_size) = match ds.index_source {
|
||||
crate::draw_state::IndexSource::Dma { index_size, .. } => {
|
||||
(ds.index_source, index_size)
|
||||
}
|
||||
crate::draw_state::IndexSource::Immediate { index_size } => {
|
||||
(ds.index_source, index_size)
|
||||
}
|
||||
crate::draw_state::IndexSource::AutoIndex => {
|
||||
(ds.index_source, crate::draw_state::IndexSize::Sixteen)
|
||||
}
|
||||
};
|
||||
let cap = crate::draw_capture::build(
|
||||
self.stats.draws_seen as u32,
|
||||
ds.primitive,
|
||||
host_vertex_count,
|
||||
idx_src,
|
||||
idx_size,
|
||||
vs_key,
|
||||
ps_key,
|
||||
parsed_vs.as_ref(),
|
||||
&self.register_file,
|
||||
mem,
|
||||
);
|
||||
if let Some(caps) = self.frame_captures.as_mut() {
|
||||
// Bound the per-frame list so a runaway frame can't grow
|
||||
// host memory without limit; keep the most recent.
|
||||
const MAX_CAPS: usize = 4096;
|
||||
if caps.len() >= MAX_CAPS {
|
||||
caps.remove(0);
|
||||
}
|
||||
caps.push(cap);
|
||||
}
|
||||
}
|
||||
|
||||
// P5b: decode the textures the *active pixel shader* actually
|
||||
// samples. Parse the bound PS, collect its `tfetch`
|
||||
// fetch-constant slots, read each 6-dword fetch constant from
|
||||
// the register file, and decode+cache it. `vd_swap` publishes
|
||||
// the result. Empty for flat (no-tfetch) shaders — the
|
||||
// dominant case on Sylpheed's current splash, where this stays
|
||||
// inert until the textured logo draw is reached.
|
||||
self.last_draw_textures.clear();
|
||||
if let Some(ps_key) = self.active_ps_key {
|
||||
// Collect slots under an immutable borrow of `shader_blobs`,
|
||||
// then drop it before mutating `texture_cache`.
|
||||
let slots: Vec<u8> = match self.shader_blobs.get(&ps_key) {
|
||||
Some(blob) => {
|
||||
let parsed = crate::ucode::parse_shader(&blob.dwords);
|
||||
crate::shader_metrics::tfetch_slots(&parsed)
|
||||
}
|
||||
None => Vec::new(),
|
||||
};
|
||||
for slot in slots {
|
||||
let mut fetch6 = [0u32; 6];
|
||||
for (k, w) in fetch6.iter_mut().enumerate() {
|
||||
*w = self
|
||||
.register_file
|
||||
.read(CONST_BASE_FETCH + slot as u32 * 6 + k as u32);
|
||||
}
|
||||
let Some(mut key) = crate::texture_cache::decode_fetch_constant(fetch6) else {
|
||||
continue;
|
||||
};
|
||||
// The Xenos texture fetch constant carries a guest
|
||||
// *physical* base address (`base >> 12`). On the Xbox
|
||||
// 360 the GPU reads the unified physical memory; the
|
||||
// CPU writes the (decompressed) texels through its
|
||||
// cached-physical aperture, which ours backs at the
|
||||
// committed `0x4000_0000` window. Map the physical
|
||||
// base onto that backing window so the GPU samples the
|
||||
// bytes the guest actually wrote — exactly as the
|
||||
// vertex-fetch path does (`draw_capture.rs`) and as
|
||||
// canary reads textures through its GPU shared memory
|
||||
// (= physical). Without this the decode reads the
|
||||
// low VA `0x0dbee000` (always zero) instead of the
|
||||
// filled `0x4dbee000`, flattening every disk-asset
|
||||
// texture (e.g. the publisher logo `E59B2B3D`).
|
||||
key.base_address = physical_to_backing(key.base_address);
|
||||
let bi = key.format.block_info();
|
||||
let span_bytes = (key.pitch_texels as u32)
|
||||
* (key.height as u32)
|
||||
* (bi.bytes_per_block as u32)
|
||||
/ (bi.block_w as u32);
|
||||
let version = span_max_version(mem, key.base_address, span_bytes.max(4));
|
||||
match self.texture_cache.ensure_cached(key, version, mem) {
|
||||
Ok(entry) => {
|
||||
// iterate-3AD: carry the real content `version`
|
||||
// (from `span_max_version`) so the UI host
|
||||
// texture cache re-uploads when the guest fills
|
||||
// more of an evolving atlas (e.g. the 2nd splash
|
||||
// logo's texels land after the publisher's, in
|
||||
// the SAME K8888 surface). Previously the UI
|
||||
// pinned `version_when_uploaded = 1`, so the
|
||||
// first (partial) upload stuck and later draws
|
||||
// sampled the not-yet-filled region as black.
|
||||
self.last_draw_textures
|
||||
.push((entry.key, version, entry.bytes.clone()));
|
||||
metrics::counter!(
|
||||
"gpu.texture.decode",
|
||||
"fmt" => format!("{:?}", key.format),
|
||||
)
|
||||
.increment(1);
|
||||
}
|
||||
Err(e) => {
|
||||
metrics::counter!(
|
||||
"gpu.texture.reject",
|
||||
"reason" => format!("{e:?}"),
|
||||
)
|
||||
.increment(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// iterate-3T: attach this draw's decoded textures to the just-
|
||||
// captured draw so the UI can bind the real artwork per-draw
|
||||
// (keyed off the active PS's real tfetch slots) instead of a
|
||||
// single last-draw `primary_texture`. UI-only (`frame_captures`
|
||||
// is `None` headless); does not touch the deterministic core.
|
||||
if !self.last_draw_textures.is_empty()
|
||||
&& let Some(caps) = self.frame_captures.as_mut()
|
||||
&& let Some(last) = caps.last_mut()
|
||||
{
|
||||
last.textures = self.last_draw_textures.clone();
|
||||
}
|
||||
}
|
||||
pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => {
|
||||
// payload[0] = offset_type — bits[10:0] index, bits[23:16] type
|
||||
@@ -1123,7 +1479,7 @@ impl GpuSystem {
|
||||
}
|
||||
pm4::PM4_LOAD_ALU_CONSTANT => {
|
||||
// payload[0] = source mem addr, [1] = offset_type, [2] = size_dwords
|
||||
let src = self.read_payload(mem, 1) & !3;
|
||||
let src = physical_to_backing(self.read_payload(mem, 1) & !3);
|
||||
let offset_type = self.read_payload(mem, 2);
|
||||
let size_dwords = self.read_payload(mem, 3);
|
||||
let index = offset_type & 0x7FF;
|
||||
@@ -1155,7 +1511,7 @@ impl GpuSystem {
|
||||
}
|
||||
v
|
||||
} else {
|
||||
let addr = self.read_payload(mem, 1) & !3;
|
||||
let addr = physical_to_backing(self.read_payload(mem, 1) & !3);
|
||||
let mut v = Vec::with_capacity(size_dwords as usize);
|
||||
for i in 0..size_dwords {
|
||||
v.push(mem.read_u32(addr + i * 4));
|
||||
@@ -1373,11 +1729,31 @@ pub mod reg {
|
||||
/// `XE_GPU_REG_D1MODE_VBLANK_VLINE_STATUS` (Canary register_table.inc:1126).
|
||||
/// Bit 0 = VBLANK_INT_OCCURRED.
|
||||
pub const D1MODE_VBLANK_VLINE_STATUS: u32 = 0x1951;
|
||||
/// `XE_GPU_REG_D1MODE_VIEWPORT_SIZE` / `AVIVO_D1MODE_VIEWPORT_SIZE`
|
||||
/// (Canary `register_table.inc:1134`). Packs the active display resolution
|
||||
/// as `(width << 16) | height` with 12-bit fields. The guest's
|
||||
/// swap-complete interrupt callback (`sub_824CE2B8`) divides by the low
|
||||
/// 12 bits (`height`) as a refresh-pacing term, so a 0 read makes its
|
||||
/// `twi` divide-by-zero guard trap and abort the ISR before it clears the
|
||||
/// swap-acknowledge fence. Canary returns the constant below from
|
||||
/// `GraphicsSystem::ReadRegister` (graphics_system.cc:311).
|
||||
pub const D1MODE_VIEWPORT_SIZE: u32 = 0x1961;
|
||||
/// `XE_GPU_REG_VGT_EVENT_INITIATOR` — set by EVENT_WRITE.
|
||||
pub const VGT_EVENT_INITIATOR: u32 = 0x21F9;
|
||||
/// `XE_GPU_REG_COHER_STATUS_HOST` — coherency bits
|
||||
/// (Canary `register_table.inc:530`).
|
||||
pub const COHER_STATUS_HOST: u32 = 0x0A31;
|
||||
/// `XE_GPU_REG_SCRATCH_UMSK` — bitmask of which `SCRATCH_REG{n}` writes are
|
||||
/// mirrored to memory (Canary `register_table.inc:139`).
|
||||
pub const SCRATCH_UMSK: u32 = 0x01DC;
|
||||
/// `XE_GPU_REG_SCRATCH_ADDR` — base physical address of the scratch
|
||||
/// writeback block (Canary `register_table.inc:141`).
|
||||
pub const SCRATCH_ADDR: u32 = 0x01DD;
|
||||
/// `XE_GPU_REG_SCRATCH_REG0` — first of 8 CP scratch registers
|
||||
/// (`0x0578..=0x057F`, Canary `register_table.inc:331-338`).
|
||||
pub const SCRATCH_REG0: u32 = 0x0578;
|
||||
/// `XE_GPU_REG_SCRATCH_REG7` — last CP scratch register.
|
||||
pub const SCRATCH_REG7: u32 = 0x057F;
|
||||
}
|
||||
|
||||
/// 32-bit FNV-1a over a u32 seed + a slice of u32s. Used to derive a
|
||||
@@ -1468,6 +1844,38 @@ mod tests {
|
||||
assert_eq!(gpu.register_file.read(0x101), 0xCAFE_BABE);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scratch_reg_write_mirrors_to_memory_when_umsk_enabled() {
|
||||
// Mirrors Sylpheed's CP swap-callback arming: SCRATCH_ADDR points at a
|
||||
// descriptor, SCRATCH_UMSK enables bit 4, and a Type-0 write of the
|
||||
// callback PC into SCRATCH_REG4 (0x57C) must land at SCRATCH_ADDR + 16.
|
||||
let mut gpu = GpuSystem::new();
|
||||
let mut mem = build_mem();
|
||||
gpu.initialize_ring_buffer(0x4000_0000, 10);
|
||||
// Program SCRATCH_ADDR = 0x4000_1000 (physical-mirror identity), and
|
||||
// SCRATCH_UMSK = bit 4 only (so SCRATCH_REG4 mirrors, REG3 does not).
|
||||
gpu.register_file.write(reg::SCRATCH_ADDR, 0x4000_1000);
|
||||
gpu.register_file.write(reg::SCRATCH_UMSK, 1 << 4);
|
||||
// Type0 write run: base = SCRATCH_REG3 (0x57B), count = 2 → writes
|
||||
// 0x11111111 → SCRATCH_REG3 (UMSK bit 3 clear), 0x824CE2B8 →
|
||||
// SCRATCH_REG4 (UMSK bit 4 set → mirrored to ADDR + 4*4 = +16).
|
||||
const SCRATCH_REG3: u32 = 0x057B;
|
||||
let hdr = (1u32 << 16) | SCRATCH_REG3;
|
||||
mem.write_u32(0x4000_0000, hdr);
|
||||
mem.write_u32(0x4000_0004, 0x1111_1111);
|
||||
mem.write_u32(0x4000_0008, 0x824C_E2B8);
|
||||
gpu.extend_write_ptr(3);
|
||||
assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
|
||||
// SCRATCH_REG3 (bit 3 clear) must NOT mirror; SCRATCH_REG4 (bit 4 set)
|
||||
// must mirror to SCRATCH_ADDR + 16.
|
||||
assert_eq!(mem.read_u32(0x4000_1000 + 12), 0, "reg3 must not mirror");
|
||||
assert_eq!(
|
||||
mem.read_u32(0x4000_1000 + 16),
|
||||
0x824C_E2B8,
|
||||
"reg4 must mirror to SCRATCH_ADDR+16"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wait_reg_mem_blocks_then_unblocks_when_mem_changes() {
|
||||
let mut gpu = GpuSystem::new();
|
||||
@@ -1477,8 +1885,9 @@ mod tests {
|
||||
// header
|
||||
let hdr = (3u32 << 30) | ((5u32 - 1) << 16) | ((pm4::PM4_WAIT_REG_MEM as u32) << 8);
|
||||
mem.write_u32(0x4000_0000, hdr);
|
||||
// wait_info: is_memory=1 (bit 4), cmp=equal (bits 2:0 = 2)
|
||||
mem.write_u32(0x4000_0004, 0x12);
|
||||
// wait_info: is_memory=1 (bit 4), cmp=equal (bits 2:0 = 3, per canary's
|
||||
// MatchValueAndRef selector: 1=Less, 2=LessEq, 3=Equal, …).
|
||||
mem.write_u32(0x4000_0004, 0x13);
|
||||
mem.write_u32(0x4000_0008, 0x4000_1000);
|
||||
mem.write_u32(0x4000_000C, 0x42);
|
||||
mem.write_u32(0x4000_0010, 0xFFFF_FFFF);
|
||||
|
||||
@@ -444,6 +444,23 @@ impl GpuBackend {
|
||||
}
|
||||
}
|
||||
|
||||
/// Current guest present (`VdSwap`) count. Cheap single-field read used
|
||||
/// by the present-anchored vsync ticker (iterate-3AJ) every scheduler
|
||||
/// round. Inline mode reads the live counter directly; threaded mode
|
||||
/// reads the last-published digest mirror under a brief lock (the
|
||||
/// `--parallel` path uses the wall-clock vsync ticker anyway, so the
|
||||
/// exact freshness here is not load-bearing).
|
||||
pub fn swaps_seen(&self) -> u64 {
|
||||
match self {
|
||||
GpuBackend::Inline(s) => s.stats.swaps_seen,
|
||||
GpuBackend::Threaded(h) => h
|
||||
.digest
|
||||
.lock()
|
||||
.map(|d| d.stats.swaps_seen)
|
||||
.unwrap_or(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Forward [`GpuSystem::has_pending_interrupts`] under inline mode;
|
||||
/// under threaded mode peek the `int_rx` channel.
|
||||
pub fn has_pending_interrupts(&self) -> bool {
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
//! [`gpu_system::GpuSystem`].
|
||||
|
||||
pub mod command_processor;
|
||||
pub mod draw_capture;
|
||||
pub mod draw_state;
|
||||
pub mod edram;
|
||||
pub mod gpu_system;
|
||||
@@ -34,7 +35,7 @@ pub mod xenos_constants;
|
||||
|
||||
pub use gpu_system::{
|
||||
ExecOutcome, GpuBlock, GpuMmio, GpuStats, GpuSystem, InterruptSource, PendingInterrupt,
|
||||
ShaderBlob, SwapNotification, WaitCmp,
|
||||
PHYSICAL_BACKING_BASE, ShaderBlob, SwapNotification, WaitCmp, physical_to_backing,
|
||||
};
|
||||
pub use handle::{
|
||||
DrainReply, GpuBackend, GpuCommand, GpuDigestSnapshot, GpuHandle, GpuWorker,
|
||||
|
||||
@@ -58,6 +58,15 @@ pub fn build_region(mmio: &GpuMmio) -> MmioRegion {
|
||||
reg::D1MODE_VBLANK_VLINE_STATUS => {
|
||||
read_vblank_status.load(Ordering::Relaxed)
|
||||
}
|
||||
// AVIVO_D1MODE_VIEWPORT_SIZE: the active display resolution
|
||||
// (1280x720) packed as `(width << 16) | height`. Canary
|
||||
// serves this constant from `GraphicsSystem::ReadRegister`
|
||||
// (graphics_system.cc:311). The guest swap-complete interrupt
|
||||
// callback divides by the low 12 bits (`height = 0x2D0`); a 0
|
||||
// read trips its `twi` divide-guard and aborts the ISR before
|
||||
// it acknowledges the per-present swap fence — which strands
|
||||
// the present/title loop. Mirror canary exactly.
|
||||
reg::D1MODE_VIEWPORT_SIZE => 0x0500_02D0,
|
||||
_ => {
|
||||
tracing::trace!(
|
||||
reg = format_args!("{reg_index:#x}"),
|
||||
|
||||
@@ -5,9 +5,8 @@
|
||||
//! rectangles) we rewrite indices on the CPU side so the host just sees a
|
||||
//! triangle list. Ground truth: `xenia-canary/src/xenia/gpu/primitive_processor.h/cc`.
|
||||
//!
|
||||
//! P3 scope: only the shapes Sylpheed's UI + early gameplay paths need
|
||||
//! (list, strip, fan). Rectangle + quad expansions are stubs logged via
|
||||
//! `tracing::warn!` for later.
|
||||
//! Scope: list, strip, fan, quad, and rectangle expansions are all handled
|
||||
//! (rectangles via CPU triangle-list rewrite — see `expand_rectangles`).
|
||||
|
||||
use crate::draw_state::{IndexSize, PrimitiveType};
|
||||
|
||||
@@ -138,18 +137,43 @@ fn expand_quads(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitiv
|
||||
}
|
||||
|
||||
/// Rectangle lists: a Xenos-specific primitive where each group of 3
|
||||
/// vertices defines a right-angle rectangle by its three non-repeated
|
||||
/// corners (the 4th is derived). The uber-shader doesn't support this yet;
|
||||
/// the ucode translator will emulate it as a geometry-stage fake. For P3
|
||||
/// we emit an empty draw.
|
||||
fn expand_rectangles(_indices: Option<&[u32]>, _vertex_count: u32) -> ProcessedPrimitive {
|
||||
tracing::warn!("gpu: rectangle list primitive not yet implemented (P3 stub)");
|
||||
metrics::counter!("gpu.primitive.rejected", "reason" => "rectangle_list").increment(1);
|
||||
/// vertices defines a rectangle; the 4th corner is extrapolated as
|
||||
/// `v3 = v0 + v2 - v1` (parallelogram completion). Canary expands this in a
|
||||
/// host vertex-shader variant (`kRectangleListAsTriangleStrip`,
|
||||
/// `primitive_processor.cc:389-456`): a 4-vertex triangle strip per rect with
|
||||
/// the 4th corner synthesized *in the VS* from the host-vertex index.
|
||||
///
|
||||
/// Our replay pipeline has no host-VS corner synthesis (and the procedural
|
||||
/// `vs_main` does not consume `rewritten_indices` yet), so we mirror the
|
||||
/// `expand_quads`/`expand_fan` CPU idiom and emit the 3 real vertices of each
|
||||
/// rect as one triangle list `(v0,v1,v2)` — the visible lower half of the
|
||||
/// rect. This un-rejects the draw and gives a faithful `host_vertex_count`.
|
||||
///
|
||||
/// TODO: once `vs_main` does real vertex fetch + interpolation, upgrade to the
|
||||
/// full quad — 6 indices `[v0,v1,v2, v2,v1,v3]` with a synthesized `v3` corner
|
||||
/// — mirroring canary's `kRectangleListAsTriangleStrip`.
|
||||
fn expand_rectangles(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive {
|
||||
let rect_count = vertex_count / 3;
|
||||
let mut out = Vec::with_capacity(3 * rect_count as usize);
|
||||
let get = |i: u32| -> u32 {
|
||||
match indices {
|
||||
Some(buf) => buf[i as usize],
|
||||
None => i,
|
||||
}
|
||||
};
|
||||
for r in 0..rect_count {
|
||||
let base = r * 3;
|
||||
out.push(get(base));
|
||||
out.push(get(base + 1));
|
||||
out.push(get(base + 2));
|
||||
}
|
||||
let host_vertex_count = out.len() as u32;
|
||||
metrics::counter!("gpu.primitive.expanded", "shape" => "rectangle_list").increment(1);
|
||||
ProcessedPrimitive {
|
||||
topology: HostTopology::TriangleList,
|
||||
rewritten_indices: Some(Vec::new()),
|
||||
host_vertex_count: 0,
|
||||
rejected: true,
|
||||
rewritten_indices: Some(out),
|
||||
host_vertex_count,
|
||||
rejected: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -213,6 +237,17 @@ mod tests {
|
||||
assert_eq!(idx, vec![0, 1, 2, 0, 2, 3, 4, 5, 6, 4, 6, 7]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rectangle_list_expansion() {
|
||||
// 2 rects (6 verts) → one triangle (v0,v1,v2) per rect, not rejected.
|
||||
let p = process(PrimitiveType::RectangleList, 6, None);
|
||||
let idx = p.rewritten_indices.unwrap();
|
||||
assert_eq!(idx, vec![0, 1, 2, 3, 4, 5]);
|
||||
assert_eq!(p.topology, HostTopology::TriangleList);
|
||||
assert_eq!(p.host_vertex_count, 6);
|
||||
assert!(!p.rejected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn widen_u16_indices_big_endian() {
|
||||
// 3 indices [1, 2, 0x1234] in BE u16.
|
||||
|
||||
@@ -364,7 +364,11 @@ pub fn copy_to_memory(
|
||||
// Destination coordinates are 0-based against `dest_base` — the
|
||||
// base already points at the top-left of the copy rectangle.
|
||||
let dst_off = tiled_2d_offset(dx, dy, pitch_aligned, bpp_log2);
|
||||
let dst_addr = info.dest_base.wrapping_add(dst_off);
|
||||
// `dest_base` is a bare guest *physical* address; project onto the
|
||||
// committed backing window so resolved pixels land where the guest
|
||||
// (and `vd_swap`'s frontbuffer read) actually see them.
|
||||
let dst_addr =
|
||||
crate::gpu_system::physical_to_backing(info.dest_base.wrapping_add(dst_off));
|
||||
|
||||
if info.source_is_64bpp {
|
||||
let (lo, hi) = match single_sample_idx {
|
||||
|
||||
@@ -32,6 +32,16 @@ pub struct RingBufferView {
|
||||
/// `VdEnableRingBufferRPtrWriteBack`). We always write back eagerly, so
|
||||
/// we don't actually use this for scheduling — kept for observability.
|
||||
pub rptr_writeback_block_dwords: u32,
|
||||
/// True for an indirect-buffer (`INDIRECT_BUFFER`) view. An IB is a fixed
|
||||
/// *linear* sub-stream, not a circular ring: it is fully written when the
|
||||
/// GPU jumps to it, so the read pointer advances monotonically from `0` to
|
||||
/// `size_dwords` and then the buffer is exhausted (the caller ring is
|
||||
/// popped). It must NOT wrap, and the primary `CP_RB_WPTR` must not be
|
||||
/// applied to it. Mirrors canary `ExecuteIndirectBuffer`, which executes
|
||||
/// the IB through a separate `RingBuffer reader_` and restores the primary
|
||||
/// reader afterward (command_processor.cc). Circular (primary-ring)
|
||||
/// semantics are used when this is `false`.
|
||||
pub indirect: bool,
|
||||
}
|
||||
|
||||
impl RingBufferView {
|
||||
@@ -46,7 +56,16 @@ impl RingBufferView {
|
||||
|
||||
/// True if there is pending unread data to consume.
|
||||
pub fn has_pending(&self) -> bool {
|
||||
self.is_initialized() && self.read_offset_dwords != self.write_offset_dwords
|
||||
if !self.is_initialized() {
|
||||
return false;
|
||||
}
|
||||
if self.indirect {
|
||||
// Linear sub-stream: exhausted once the read pointer reaches the
|
||||
// (fixed) write pointer. Never wraps.
|
||||
self.read_offset_dwords < self.write_offset_dwords
|
||||
} else {
|
||||
self.read_offset_dwords != self.write_offset_dwords
|
||||
}
|
||||
}
|
||||
|
||||
/// Number of dwords we can consume without wrapping past the write ptr.
|
||||
@@ -54,7 +73,10 @@ impl RingBufferView {
|
||||
if !self.is_initialized() {
|
||||
return 0;
|
||||
}
|
||||
if self.write_offset_dwords >= self.read_offset_dwords {
|
||||
if self.indirect {
|
||||
self.write_offset_dwords
|
||||
.saturating_sub(self.read_offset_dwords)
|
||||
} else if self.write_offset_dwords >= self.read_offset_dwords {
|
||||
self.write_offset_dwords - self.read_offset_dwords
|
||||
} else {
|
||||
// write has wrapped — we can read up to the end of the ring.
|
||||
@@ -62,13 +84,19 @@ impl RingBufferView {
|
||||
}
|
||||
}
|
||||
|
||||
/// Advance the read pointer by `dwords`, wrapping at `size_dwords`.
|
||||
/// Advance the read pointer by `dwords`. Circular rings wrap at
|
||||
/// `size_dwords`; an indirect buffer advances linearly (no wrap) so it
|
||||
/// terminates exactly at its fixed write pointer.
|
||||
pub fn advance_read(&mut self, dwords: u32) {
|
||||
if self.size_dwords == 0 {
|
||||
return;
|
||||
}
|
||||
self.read_offset_dwords =
|
||||
(self.read_offset_dwords + dwords) % self.size_dwords;
|
||||
if self.indirect {
|
||||
self.read_offset_dwords = self.read_offset_dwords.saturating_add(dwords);
|
||||
} else {
|
||||
self.read_offset_dwords =
|
||||
(self.read_offset_dwords + dwords) % self.size_dwords;
|
||||
}
|
||||
}
|
||||
|
||||
/// Guest address for the dword at relative offset `i` from the current
|
||||
@@ -77,7 +105,11 @@ impl RingBufferView {
|
||||
if !self.is_initialized() {
|
||||
return None;
|
||||
}
|
||||
let off = (self.read_offset_dwords + offset_dwords) % self.size_dwords;
|
||||
let off = if self.indirect {
|
||||
self.read_offset_dwords.saturating_add(offset_dwords)
|
||||
} else {
|
||||
(self.read_offset_dwords + offset_dwords) % self.size_dwords
|
||||
};
|
||||
Some(self.base.wrapping_add(off.wrapping_mul(4)))
|
||||
}
|
||||
}
|
||||
@@ -120,4 +152,52 @@ mod tests {
|
||||
assert_eq!(v.addr_at_offset(1), Some(0x4000_0000));
|
||||
assert_eq!(v.addr_at_offset(2), Some(0x4000_0004));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn indirect_buffer_drains_linearly_and_terminates() {
|
||||
// An indirect buffer is a fixed linear sub-stream: read advances from
|
||||
// 0 to `size_dwords` and then is exhausted — it must NOT wrap back to
|
||||
// 0 (which previously caused an infinite re-read of a system command
|
||||
// buffer; iterate-2O). write_offset == size, exactly as the
|
||||
// INDIRECT_BUFFER handler sets it.
|
||||
let mut ib = RingBufferView {
|
||||
base: 0x4adf_5080,
|
||||
size_dwords: 11,
|
||||
read_offset_dwords: 0,
|
||||
write_offset_dwords: 11,
|
||||
rptr_writeback_addr: 0,
|
||||
rptr_writeback_block_dwords: 0,
|
||||
indirect: true,
|
||||
};
|
||||
assert!(ib.has_pending());
|
||||
// Drain the exact packet layout observed for Sylpheed's init IB:
|
||||
// 2 + 3 + 6 dwords = 11.
|
||||
ib.advance_read(2);
|
||||
assert!(ib.has_pending());
|
||||
ib.advance_read(3);
|
||||
assert!(ib.has_pending());
|
||||
ib.advance_read(6); // reaches 11 == write
|
||||
assert_eq!(ib.read_offset_dwords, 11);
|
||||
assert!(
|
||||
!ib.has_pending(),
|
||||
"indirect buffer must terminate at write ptr, not wrap to 0"
|
||||
);
|
||||
// addr_at_offset must not modulo-wrap for an indirect buffer.
|
||||
ib.read_offset_dwords = 9;
|
||||
assert_eq!(ib.addr_at_offset(1), Some(0x4adf_5080 + 10 * 4));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn indirect_flag_does_not_affect_circular_ring() {
|
||||
// Sanity: a circular (primary) ring still wraps as before.
|
||||
let mut v = RingBufferView::new();
|
||||
v.base = 0x4adc_c000;
|
||||
v.size_dwords = 8192;
|
||||
v.read_offset_dwords = 8190;
|
||||
v.write_offset_dwords = 2;
|
||||
assert!(v.has_pending());
|
||||
v.advance_read(4); // (8190 + 4) % 8192 = 2
|
||||
assert_eq!(v.read_offset_dwords, 2);
|
||||
assert!(!v.has_pending());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,8 +45,9 @@ pub fn emit_for(parsed: &ParsedShader, stage: &'static str) {
|
||||
parsed.instructions[base + 1],
|
||||
parsed.instructions[base + 2],
|
||||
];
|
||||
// sequence bit layout: 2 bits per triple, hi bit = is-fetch.
|
||||
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
|
||||
// sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0),
|
||||
// bit[1]=serialize (Xenos `ucode.h:226`).
|
||||
let is_fetch = ((sequence >> (i * 2)) & 1) != 0;
|
||||
if is_fetch {
|
||||
match decode_fetch(words) {
|
||||
FetchInstruction::Vertex(_) => vfetch_count += 1,
|
||||
@@ -174,6 +175,50 @@ pub fn emit_for(parsed: &ParsedShader, stage: &'static str) {
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect the unique texture-fetch-constant slot indices a shader samples.
|
||||
///
|
||||
/// Walks the same exec-clause / sequence-bitmap path as [`emit_for`] but only
|
||||
/// extracts `TextureFetch.fetch_const` slots, deduplicated and in first-seen
|
||||
/// order. The GPU draw handler uses this to decide which fetch constants to
|
||||
/// decode + cache at draw time (keyed off the *active* pixel shader's real
|
||||
/// `tfetch` instructions rather than a hardcoded slot).
|
||||
pub fn tfetch_slots(parsed: &ParsedShader) -> Vec<u8> {
|
||||
let mut slots: Vec<u8> = Vec::new();
|
||||
for clause in &parsed.cf {
|
||||
if let ControlFlowInstruction::Exec {
|
||||
address,
|
||||
count,
|
||||
sequence,
|
||||
..
|
||||
} = clause
|
||||
{
|
||||
for i in 0..(*count as usize) {
|
||||
let base = (*address as usize + i) * 3;
|
||||
if base + 2 >= parsed.instructions.len() {
|
||||
break;
|
||||
}
|
||||
// sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0),
|
||||
// bit[1]=serialize (Xenos `ucode.h:226`).
|
||||
let is_fetch = ((sequence >> (i * 2)) & 1) != 0;
|
||||
if !is_fetch {
|
||||
continue;
|
||||
}
|
||||
let words = [
|
||||
parsed.instructions[base],
|
||||
parsed.instructions[base + 1],
|
||||
parsed.instructions[base + 2],
|
||||
];
|
||||
if let FetchInstruction::Texture(tf) = decode_fetch(words) {
|
||||
if !slots.contains(&tf.fetch_const) {
|
||||
slots.push(tf.fetch_const);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
slots
|
||||
}
|
||||
|
||||
fn mark_feature(buf: &mut Vec<&'static str>, name: &'static str) {
|
||||
if !buf.contains(&name) {
|
||||
buf.push(name);
|
||||
@@ -298,6 +343,46 @@ mod tests {
|
||||
emit_for(&shader, "vs");
|
||||
}
|
||||
|
||||
/// `tfetch_slots` should extract the fetch-constant slot of a texture
|
||||
/// fetch (and dedup), and return empty for a flat ALU-only shader.
|
||||
#[test]
|
||||
fn tfetch_slots_extracts_texture_fetch_constants() {
|
||||
// word0: opcode TEXTURE_FETCH (0x01) in low 5 bits, const_index=3 in
|
||||
// bits[24:20] (Xenos `ucode.h:844`) → 0x01 | (3 << 20).
|
||||
let tfetch_w0: u32 = 0x01 | (3u32 << 20);
|
||||
let shader = ParsedShader {
|
||||
cf: vec![
|
||||
ControlFlowInstruction::Exec {
|
||||
address: 0,
|
||||
count: 2,
|
||||
// instruction 0 is a fetch (bit[0] of its 2-bit field set),
|
||||
// instruction 1 is ALU. is_fetch = (sequence >> (i*2)) & 1.
|
||||
sequence: 0b00_01,
|
||||
is_end: false,
|
||||
predicated: false,
|
||||
predicate_condition: false,
|
||||
},
|
||||
ControlFlowInstruction::Exit,
|
||||
],
|
||||
instructions: vec![tfetch_w0, 0, 0, /* ALU triple */ 0, 0, 0],
|
||||
};
|
||||
assert_eq!(tfetch_slots(&shader), vec![3]);
|
||||
|
||||
// Flat shader: no fetch bits → no slots.
|
||||
let flat = ParsedShader {
|
||||
cf: vec![ControlFlowInstruction::Exec {
|
||||
address: 0,
|
||||
count: 1,
|
||||
sequence: 0,
|
||||
is_end: false,
|
||||
predicated: false,
|
||||
predicate_condition: false,
|
||||
}],
|
||||
instructions: vec![0, 0, 0],
|
||||
};
|
||||
assert!(tfetch_slots(&flat).is_empty());
|
||||
}
|
||||
|
||||
/// P8: a shader containing `LoopStart` should mark `cf_loop` as used
|
||||
/// so the HUD can surface which deferred feature a game triggers.
|
||||
#[test]
|
||||
|
||||
@@ -20,7 +20,15 @@ struct XenosDrawConstants {
|
||||
draw_index: u32,
|
||||
vertex_count: u32,
|
||||
prim_kind: u32,
|
||||
_pad: u32,
|
||||
// iterate-3O: guest dword address that maps to index 0 of `vertex_buffer`.
|
||||
// The CPU uploads a bounded guest-memory window starting at the active
|
||||
// vertex-fetch base; the shader subtracts this base from the absolute
|
||||
// fetch-constant address so it indexes the uploaded window. 0 means "no
|
||||
// real vertex window" (procedural fallback path).
|
||||
vertex_base_dwords: u32,
|
||||
// iterate-3S: guest viewport → host NDC XY transform (Y pre-flipped).
|
||||
ndc_scale: vec2<f32>,
|
||||
ndc_offset: vec2<f32>,
|
||||
};
|
||||
|
||||
struct XenosConstants {
|
||||
@@ -56,6 +64,7 @@ const CF_KIND_LOOP_END: u32 = 5u;
|
||||
const CF_KIND_COND_JMP: u32 = 6u;
|
||||
const CF_KIND_COND_CALL: u32 = 7u;
|
||||
const CF_KIND_RETURN: u32 = 8u;
|
||||
const CF_KIND_NOP: u32 = 9u;
|
||||
const CF_KIND_UNKNOWN: u32 = 15u;
|
||||
|
||||
// ── Alloc-kind codes (mirrors `xenia_gpu::ucode::cf_alloc_kind`). ──────
|
||||
@@ -628,8 +637,8 @@ const VFMT_32_32_32_FLOAT: u32 = 57u;
|
||||
// layout in `ucode.h:690`):
|
||||
// w0 [4:0] opcode
|
||||
// w0 [10:5] src_reg[5:0]
|
||||
// w0 [17:11] dst_reg[6:0] + must-be-one
|
||||
// w0 [21:17] const_index[4:0], [23:22] const_index_sel[1:0]
|
||||
// w0 [17:12] dst_reg[5:0]
|
||||
// w0 [24:20] const_index[4:0], [26:25] const_index_sel[1:0]
|
||||
// w1 [21:16] format[5:0]
|
||||
// w2 [7:0] stride (in dwords)
|
||||
// w2 [30:8] offset (signed, in dwords)
|
||||
@@ -641,9 +650,9 @@ fn interpret_vertex_fetch(t: u32) {
|
||||
let w0 = vs_instr_dword(t, 0u);
|
||||
let w1 = vs_instr_dword(t, 1u);
|
||||
let w2 = vs_instr_dword(t, 2u);
|
||||
let fetch_const = (w0 >> 5u) & 0x1Fu;
|
||||
let dst_reg = (w0 >> 10u) & 0x7Fu;
|
||||
let src_reg = (w0 >> 17u) & 0x7Fu;
|
||||
let fetch_const = (w0 >> 20u) & 0x1Fu;
|
||||
let dst_reg = (w0 >> 12u) & 0x3Fu;
|
||||
let src_reg = (w0 >> 5u) & 0x3Fu;
|
||||
let format = (w1 >> 16u) & 0x3Fu;
|
||||
let stride = w2 & 0xFFu;
|
||||
|
||||
@@ -651,7 +660,20 @@ fn interpret_vertex_fetch(t: u32) {
|
||||
// dword 1 carries (endian[1:0], size[25:2]).
|
||||
let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
|
||||
let fc1 = xenos_consts.fetch[fetch_const * 2u + 1u];
|
||||
let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;
|
||||
// iterate-3O: the fetch constant holds an *absolute* guest dword address.
|
||||
// The CPU uploaded a window of guest memory starting at
|
||||
// `draw_ctx.vertex_base_dwords`, so rebase the absolute address into that
|
||||
// window. When no real window was published (`vertex_base_dwords == 0`)
|
||||
// keep the absolute value (the `addr < n` guards below then skip the read
|
||||
// and the procedural fallback position is used).
|
||||
// GPUBUG-108 (iterate-3S): the captured window begins exactly at the fetch
|
||||
// base, so index from 0 (vertex i at i*stride). The uniform `fetch[]` holds
|
||||
// the last-published per-frame constant, not this draw's — recomputing
|
||||
// `abs_base` from it produced a stale out-of-window address (the splash
|
||||
// collapsed to one pixel). Only consult the uniform for the no-window
|
||||
// synthetic fallback.
|
||||
let abs_base = (fc0 & 0xFFFFFFFCu) >> 2u;
|
||||
let base_dwords = select(abs_base, 0u, draw_ctx.vertex_base_dwords != 0u);
|
||||
// GPUBUG-102: per-format endian byte-swap. Xbox 360 vertex data is
|
||||
// big-endian; the host is little-endian. Pre-fix every dword was
|
||||
// bitcast as-is — vertex positions were byte-reversed garbage.
|
||||
@@ -773,20 +795,20 @@ fn interpret_texture_fetch(t: u32, is_vertex: bool) {
|
||||
} else {
|
||||
w0 = ps_instr_dword(t, 0u);
|
||||
}
|
||||
let dst_reg = (w0 >> 10u) & 0x7Fu;
|
||||
let src_reg = (w0 >> 17u) & 0x7Fu;
|
||||
let uv = registers[src_reg & 0x7Fu].xy;
|
||||
let dst_reg = (w0 >> 12u) & 0x3Fu;
|
||||
let src_reg = (w0 >> 5u) & 0x3Fu;
|
||||
let uv = registers[src_reg & 0x3Fu].xy;
|
||||
let sample = textureSampleLevel(xenos_tex, xenos_samp, uv, 0.0);
|
||||
registers[dst_reg & 0x7Fu] = sample;
|
||||
registers[dst_reg & 0x3Fu] = sample;
|
||||
}
|
||||
|
||||
// Walk an Exec clause's instruction triples.
|
||||
// sequence: 2-bit-per-triple bitmap. Bit 0 of a pair = serialize flag
|
||||
// (we ignore in MVP); bit 1 = is-fetch.
|
||||
// sequence: 2-bit-per-instruction bitmap. Bit 0 of a pair = fetch(1)/ALU(0);
|
||||
// bit 1 = serialize (ignored). (Xenos `ucode.h:226`.)
|
||||
fn exec_vs(address: u32, count: u32, sequence: u32) {
|
||||
for (var i: u32 = 0u; i < count; i = i + 1u) {
|
||||
let t = address + i;
|
||||
let is_fetch = ((sequence >> (i * 2u + 1u)) & 1u) != 0u;
|
||||
let is_fetch = ((sequence >> (i * 2u)) & 1u) != 0u;
|
||||
if is_fetch {
|
||||
let opcode = vs_instr_dword(t, 0u) & 0x1Fu;
|
||||
// 0x00 = vertex fetch, 0x01 = texture fetch.
|
||||
@@ -803,7 +825,7 @@ fn exec_vs(address: u32, count: u32, sequence: u32) {
|
||||
fn exec_ps(address: u32, count: u32, sequence: u32) {
|
||||
for (var i: u32 = 0u; i < count; i = i + 1u) {
|
||||
let t = address + i;
|
||||
let is_fetch = ((sequence >> (i * 2u + 1u)) & 1u) != 0u;
|
||||
let is_fetch = ((sequence >> (i * 2u)) & 1u) != 0u;
|
||||
if is_fetch {
|
||||
interpret_texture_fetch(t, false);
|
||||
} else {
|
||||
@@ -871,7 +893,13 @@ fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {
|
||||
// Use registers[OPOS_REG] as position; the procedural fallback above
|
||||
// seeded it so an un-interpreted shader still draws a recognisable
|
||||
// circle.
|
||||
out.position = vec4<f32>(registers[OPOS_REG].xyz, registers[OPOS_REG].w);
|
||||
var opos = vec4<f32>(registers[OPOS_REG].xyz, registers[OPOS_REG].w);
|
||||
// iterate-3S: guest VS position → host clip space (see translator.rs). When
|
||||
// the transform is unset (procedural fallback) pass through unchanged.
|
||||
if (draw_ctx.ndc_scale.x != 0.0 || draw_ctx.ndc_scale.y != 0.0) {
|
||||
opos = vec4<f32>(opos.xy * draw_ctx.ndc_scale + draw_ctx.ndc_offset * opos.w, opos.z, opos.w);
|
||||
}
|
||||
out.position = opos;
|
||||
out.color = vec4<f32>(registers[OCOLOR_REG].rgb + vec3<f32>(vb_live), registers[OCOLOR_REG].a);
|
||||
return out;
|
||||
}
|
||||
@@ -962,6 +990,9 @@ fn walk_cf_vs() {
|
||||
// No call stack — mark and continue.
|
||||
reject_mask |= REJECT_CF_CALL;
|
||||
}
|
||||
case CF_KIND_NOP: {
|
||||
// kNop padding / kMarkVsFetchDone hint — no-op, just advance.
|
||||
}
|
||||
default: { reject_mask |= REJECT_CF_JUMP; }
|
||||
}
|
||||
if stop { break; }
|
||||
|
||||
@@ -94,7 +94,9 @@ struct XenosDrawConstants {
|
||||
draw_index: u32,
|
||||
vertex_count: u32,
|
||||
prim_kind: u32,
|
||||
_pad: u32,
|
||||
vertex_base_dwords: u32,
|
||||
ndc_scale: vec2<f32>,
|
||||
ndc_offset: vec2<f32>,
|
||||
};
|
||||
|
||||
struct XenosConstants {
|
||||
@@ -113,9 +115,21 @@ struct XenosConstants {
|
||||
@group(1) @binding(0) var xenos_tex : texture_2d<f32>;
|
||||
@group(1) @binding(1) var xenos_samp : sampler;
|
||||
|
||||
// iterate-3T: real interpolator passthrough. The Xenos VS exports up to 16
|
||||
// interpolators (export index 0..15); the PS reads interpolator i from its
|
||||
// general register r[i]. We carry 8 interpolator vec4s (covers Sylpheed's
|
||||
// splash: r0=color, r1=texcoord). `color` retained as an alias of interp0 so
|
||||
// older single-color paths keep working.
|
||||
struct VsOut {
|
||||
@builtin(position) position: vec4<f32>,
|
||||
@location(0) color: vec4<f32>,
|
||||
@location(0) interp0: vec4<f32>,
|
||||
@location(1) interp1: vec4<f32>,
|
||||
@location(2) interp2: vec4<f32>,
|
||||
@location(3) interp3: vec4<f32>,
|
||||
@location(4) interp4: vec4<f32>,
|
||||
@location(5) interp5: vec4<f32>,
|
||||
@location(6) interp6: vec4<f32>,
|
||||
@location(7) interp7: vec4<f32>,
|
||||
};
|
||||
|
||||
struct FsOut {
|
||||
@@ -154,6 +168,14 @@ struct EmitCtx {
|
||||
stage: Stage,
|
||||
out: String,
|
||||
indent: usize,
|
||||
/// GPUBUG-114: dword stride of the most recent *full* vfetch, keyed by
|
||||
/// fetch-const register offset. A vfetch_mini carries stride=0 and reuses
|
||||
/// the address + stride of the preceding full vfetch of the same stream
|
||||
/// (canary ucode.h:733). Without this a mini color attribute indexes by its
|
||||
/// tight dword count instead of the real vertex stride → reads the wrong
|
||||
/// vertex's data (Sylpheed's background fill `0x36660986` read garbage →
|
||||
/// white instead of the intended color).
|
||||
last_full_stride: std::collections::HashMap<u32, u32>,
|
||||
}
|
||||
|
||||
impl EmitCtx {
|
||||
@@ -162,6 +184,7 @@ impl EmitCtx {
|
||||
stage,
|
||||
out: String::with_capacity(2048),
|
||||
indent: 0,
|
||||
last_full_stride: std::collections::HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,19 +221,74 @@ impl EmitCtx {
|
||||
self.push("var ps: f32 = 0.0;");
|
||||
match self.stage {
|
||||
Stage::Vertex => {
|
||||
// iterate-3T: host→guest vertex-index remap for primitives the
|
||||
// replay draws non-indexed as a flat triangle list. wgpu has no
|
||||
// QuadList/RectangleList topology, so the host issues 6 vertices
|
||||
// per quad/rect and we map them back to the guest's 4/3 source
|
||||
// vertices here (mirrors `primitive.rs` index rewrite, but in the
|
||||
// VS since the replay path is non-indexed):
|
||||
// QuadList(13): 6 host verts → guest [0,1,2, 0,2,3]
|
||||
// RectangleList(8): drawn as one triangle [0,1,2] (the 4th
|
||||
// corner needs cross-vertex synthesis — TODO), so host
|
||||
// indices >=3 fold onto the existing triangle.
|
||||
// Other prims pass through unchanged.
|
||||
self.push("var gvidx: u32 = vidx;");
|
||||
self.push("if (draw_ctx.prim_kind == 13u) {");
|
||||
self.indent += 1;
|
||||
self.push("let q = vidx % 6u; let qbase = (vidx / 6u) * 4u;");
|
||||
self.push("var lut = array<u32, 6>(0u, 1u, 2u, 0u, 2u, 3u);");
|
||||
self.push("gvidx = qbase + lut[q];");
|
||||
self.indent -= 1;
|
||||
self.push("} else if (draw_ctx.prim_kind == 8u) {");
|
||||
self.indent += 1;
|
||||
self.push("let t = vidx % 3u; let rbase = (vidx / 3u) * 3u;");
|
||||
self.push("gvidx = rbase + t;");
|
||||
self.indent -= 1;
|
||||
self.push("}");
|
||||
// Seed r0 with vertex index for simple shaders that read it.
|
||||
self.push("r[0] = vec4<f32>(f32(vidx), 0.0, 0.0, 1.0);");
|
||||
// Synthetic export slots — match the interpreter's layout so
|
||||
// the fallback path and translator path produce the same
|
||||
// visual output on shaders both support.
|
||||
self.push("r[0] = vec4<f32>(f32(gvidx), 0.0, 0.0, 1.0);");
|
||||
// iterate-3T: real export model. Xenos export index 62 = oPos;
|
||||
// indices 0..15 = interpolators. We hold position + 8
|
||||
// interpolator vec4s; `emit_export` writes the right slot keyed
|
||||
// on the export index.
|
||||
//
|
||||
// iterate-3AE (WHITE-TRIANGLE ROOT): interpolators a VS does NOT
|
||||
// export must default to ZERO, not white. The old `ointerp[0] =
|
||||
// (1,1,1,1)` was an iterate-3T debug convenience ("so a VS that
|
||||
// only exports position still yields a visible non-zero color")
|
||||
// — but it is a FAKE: it injects white that no guest value backs.
|
||||
// The transition/background draws use the position-only VS
|
||||
// `0xd4c14f46` (one vfetch → oPos; it exports NO color) paired
|
||||
// with PS `0xed732b5a` (`ocolor0 = interp0`). With the white
|
||||
// seed, interp0 stayed (1,1,1,1) → the fullscreen fill rendered
|
||||
// OPAQUE WHITE (the diagonal half-triangle artifact that flashed
|
||||
// before each splash logo and persisted across the dev-logo
|
||||
// transition). Canary shows a black background there because the
|
||||
// un-exported interpolator carries no white. Default to
|
||||
// (0,0,0,0): a position-only VS now contributes nothing visible
|
||||
// under its real (opaque or premultiplied) blend, matching
|
||||
// canary, while every VS that really exports interp0 (the logo
|
||||
// `0x03b7b020`, the `0x36660986` color fill) overwrites this seed
|
||||
// and is unaffected. RGB=0 → black fill; A=0 → premultiplied
|
||||
// overlays stay transparent.
|
||||
self.push("var opos: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);");
|
||||
self.push("var ocolor: vec4<f32> = vec4<f32>(1.0, 1.0, 1.0, 1.0);");
|
||||
self.push("var ointerp: array<vec4<f32>, 8>;");
|
||||
self.push("for (var i = 0u; i < 8u; i = i + 1u) { ointerp[i] = vec4<f32>(0.0, 0.0, 0.0, 0.0); }");
|
||||
}
|
||||
Stage::Pixel => {
|
||||
// Seed r0.xy with interpolated color lane so trivial shaders
|
||||
// that read r0 still produce something.
|
||||
self.push("r[0] = in.color;");
|
||||
self.push("var ocolor0: vec4<f32> = in.color;");
|
||||
// iterate-3T: the PS reads interpolator i from general register
|
||||
// r[i] (Xenos PS input GPR mapping). Seed r0..r7 from the VS's
|
||||
// interpolators so e.g. the logo PS's texcoord (r1) and color
|
||||
// (r0) arrive correctly; tfetch then samples at the real UV.
|
||||
self.push("r[0] = in.interp0;");
|
||||
self.push("r[1] = in.interp1;");
|
||||
self.push("r[2] = in.interp2;");
|
||||
self.push("r[3] = in.interp3;");
|
||||
self.push("r[4] = in.interp4;");
|
||||
self.push("r[5] = in.interp5;");
|
||||
self.push("r[6] = in.interp6;");
|
||||
self.push("r[7] = in.interp7;");
|
||||
self.push("var ocolor0: vec4<f32> = in.interp0;");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -237,6 +315,10 @@ impl EmitCtx {
|
||||
current_alloc = *kind;
|
||||
}
|
||||
ControlFlowInstruction::Exit => break,
|
||||
// Non-executing CF clauses: padding (`kNop`) and the
|
||||
// vertex-fetch-done hint (`kMarkVsFetchDone`). Skip them.
|
||||
ControlFlowInstruction::Nop
|
||||
| ControlFlowInstruction::MarkVsFetchDone => {}
|
||||
ControlFlowInstruction::LoopStart { .. }
|
||||
| ControlFlowInstruction::LoopEnd { .. } => return Err(reject::CF_LOOP),
|
||||
ControlFlowInstruction::CondJmp { .. } => return Err(reject::CF_COND),
|
||||
@@ -250,13 +332,41 @@ impl EmitCtx {
|
||||
match self.stage {
|
||||
Stage::Vertex => {
|
||||
self.push("var out: VsOut;");
|
||||
// iterate-3S: guest VS position → host clip space. The guest
|
||||
// emits either clip-space or (screen-space, clip disabled)
|
||||
// render-target-pixel coords; `ndc_scale`/`ndc_offset` (from
|
||||
// canary's GetHostViewportInfo, computed CPU-side per draw)
|
||||
// rescale XY into wgpu clip space with Y already flipped. When
|
||||
// the transform is unset (all-zero scale, procedural fallback)
|
||||
// pass the position through unchanged.
|
||||
self.push("if (draw_ctx.ndc_scale.x != 0.0 || draw_ctx.ndc_scale.y != 0.0) {");
|
||||
self.indent += 1;
|
||||
self.push("opos = vec4<f32>(opos.xy * draw_ctx.ndc_scale + draw_ctx.ndc_offset * opos.w, opos.z, opos.w);");
|
||||
self.indent -= 1;
|
||||
self.push("}");
|
||||
self.push("out.position = opos;");
|
||||
self.push("out.color = ocolor;");
|
||||
self.push("out.interp0 = ointerp[0];");
|
||||
self.push("out.interp1 = ointerp[1];");
|
||||
self.push("out.interp2 = ointerp[2];");
|
||||
self.push("out.interp3 = ointerp[3];");
|
||||
self.push("out.interp4 = ointerp[4];");
|
||||
self.push("out.interp5 = ointerp[5];");
|
||||
self.push("out.interp6 = ointerp[6];");
|
||||
self.push("out.interp7 = ointerp[7];");
|
||||
self.push("return out;");
|
||||
}
|
||||
Stage::Pixel => {
|
||||
self.push("var out: FsOut;");
|
||||
self.push("out.color0 = ocolor0;");
|
||||
// GPUBUG-115: saturate the color export to [0,1], flushing NaN
|
||||
// to 0 — exactly what canary does before writing a UNORM render
|
||||
// target (spirv_shader_translator.cc:3607 "Saturate, flushing
|
||||
// NaN to 0"). The Xenos RB clamps PS output for UNORM targets;
|
||||
// without this an out-of-range guest color (Sylpheed's
|
||||
// background fill exports a huge negative float `-32896.5` as a
|
||||
// fullscreen-clear value) writes garbage/NaN to the sRGB target
|
||||
// → renders white instead of the clamped black canary shows.
|
||||
// `clamp(x,0,1)` returns 0 for NaN under WGSL's clamp semantics.
|
||||
self.push("out.color0 = clamp(ocolor0, vec4<f32>(0.0), vec4<f32>(1.0));");
|
||||
self.push("return out;");
|
||||
}
|
||||
}
|
||||
@@ -284,7 +394,9 @@ impl EmitCtx {
|
||||
parsed.instructions[base + 1],
|
||||
parsed.instructions[base + 2],
|
||||
];
|
||||
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
|
||||
// sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0),
|
||||
// bit[1]=serialize (Xenos `ucode.h:226`).
|
||||
let is_fetch = ((sequence >> (i * 2)) & 1) != 0;
|
||||
if is_fetch {
|
||||
match decode_fetch(words) {
|
||||
FetchInstruction::Vertex(vf) => self.emit_vfetch(&vf)?,
|
||||
@@ -378,53 +490,185 @@ impl EmitCtx {
|
||||
}
|
||||
|
||||
fn emit_export(&mut self, dst_reg: u8, alloc: AllocKind, expr: &str, mask: u8) {
|
||||
// Xenos's export "register" indexing within an alloc range is
|
||||
// normally (alloc_base + offset). Since our CF stream doesn't
|
||||
// carry per-export slot offsets cleanly, use `alloc` to pick the
|
||||
// target.
|
||||
let lhs = match (self.stage, alloc) {
|
||||
(Stage::Vertex, AllocKind::Position) => "opos",
|
||||
(Stage::Vertex, AllocKind::Interpolators) => "ocolor",
|
||||
(Stage::Vertex, AllocKind::Colors) => "ocolor",
|
||||
(Stage::Vertex, _) => "ocolor", // fall through — any other alloc
|
||||
(Stage::Pixel, AllocKind::Colors) => "ocolor0",
|
||||
(Stage::Pixel, _) => "ocolor0",
|
||||
};
|
||||
let _ = dst_reg; // per-slot export indexing reserved for a richer v2
|
||||
self.emit_masked_write(lhs, expr, mask);
|
||||
// iterate-3T: real Xenos export-index model (replaces the `AllocKind`
|
||||
// heuristic, which collapsed every VS export to a single color slot and
|
||||
// dropped the texcoord interpolator → tfetch sampled (0,0) → flat).
|
||||
// When `export_data` is set the 6-bit vector_dest IS the export index:
|
||||
// VS: 62 = oPos, 63 = oPointSize/edge (ignored), 0..15 = interpolators.
|
||||
// PS: 0..3 = color render targets (we honor RT0).
|
||||
let _ = alloc;
|
||||
match self.stage {
|
||||
Stage::Vertex => {
|
||||
let lhs = if dst_reg == 62 {
|
||||
"opos".to_string()
|
||||
} else if dst_reg <= 15 {
|
||||
// Clamp to the 8 interpolator slots we carry; higher slots
|
||||
// are unused by Sylpheed's splash.
|
||||
let i = (dst_reg as usize).min(7);
|
||||
format!("ointerp[{i}u]")
|
||||
} else {
|
||||
// oPointSize (63) / unknown export slot — discard.
|
||||
return;
|
||||
};
|
||||
self.emit_masked_write(&lhs, expr, mask);
|
||||
}
|
||||
Stage::Pixel => {
|
||||
// Only RT0 (export index 0) is wired to the single host target.
|
||||
if dst_reg == 0 {
|
||||
self.emit_masked_write("ocolor0", expr, mask);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn emit_vfetch(&mut self, vf: &crate::ucode::fetch::VertexFetch) -> Result<(), &'static str> {
|
||||
// v1: treat all vertex fetches as R32G32B32A32_FLOAT, stride = 4
|
||||
// dwords. Matches the interpreter's MVP semantics; unlocks more
|
||||
// formats alongside the CPU texture cache's format expansion.
|
||||
// GPUBUG-107 (iterate-3S): decode the vertex FORMAT + dword STRIDE from
|
||||
// the vfetch instruction instead of hardcoding R32G32B32A32 (4 floats,
|
||||
// stride 4). Sylpheed's splash quads are `k_32_32_FLOAT` (2 floats,
|
||||
// stride 2); over-reading them put the next vertex's X into .w → a
|
||||
// negative W → the whole rectangle clipped behind the camera. We cover
|
||||
// the float vertex formats (the UI / screen-space draws); other formats
|
||||
// reject to the interpreter.
|
||||
//
|
||||
// GPUBUG-102: the fetch constant (xe_gpu_vertex_fetch_t,
|
||||
// xenos.h:1158-1172) holds the endian field in dword_1's low
|
||||
// 2 bits. Vertex data on Xbox 360 is big-endian; the host is
|
||||
// little-endian. Pre-fix, every dword was bitcast as-is →
|
||||
// vertex positions were byte-reversed garbage and any draw
|
||||
// that did reach the host produced clipped / NaN positions.
|
||||
let fetch_const = (vf.raw[0] >> 5) & 0x1F;
|
||||
// GPUBUG-102: the fetch constant holds the endian field in dword_1's
|
||||
// low 2 bits; Xbox 360 vertex data is big-endian, so `gpu_swap` undoes
|
||||
// it per component.
|
||||
// (comps, dwords_read) per format. Float formats are 1 dword/component;
|
||||
// iterate-3T adds the packed-16 `k_16_16` (format 6) used for the logo
|
||||
// UV interpolator — 2 components packed into ONE dword.
|
||||
#[derive(PartialEq)]
|
||||
enum Pack {
|
||||
Float, // N f32 lanes, N dwords
|
||||
Norm16x2, // 2× u16 normalized into [0,1], 1 dword (k_16_16)
|
||||
Norm8x4, // 4× u8 normalized into [0,1], 1 dword (k_8_8_8_8)
|
||||
}
|
||||
let (comps, dwords_read, pack): (u32, u32, Pack) = match vf.format {
|
||||
36 => (1, 1, Pack::Float), // k_32_FLOAT
|
||||
37 => (2, 2, Pack::Float), // k_32_32_FLOAT
|
||||
57 => (3, 3, Pack::Float), // k_32_32_32_FLOAT
|
||||
38 => (4, 4, Pack::Float), // k_32_32_32_32_FLOAT
|
||||
6 => (4, 1, Pack::Norm8x4), // k_8_8_8_8 (packed RGBA8 — GPUBUG-112)
|
||||
25 => (2, 1, Pack::Norm16x2), // k_16_16
|
||||
_ => return Err(reject::VFETCH_FMT),
|
||||
};
|
||||
// iterate-3X (GPUBUG-110): index the fetch-constant region by the full
|
||||
// `const_index*3 + const_index_sel` mapping (canary `ucode.h:700`),
|
||||
// packed as `const_index*6 + sel*2` dwords. The previous expression
|
||||
// `(vf.raw[0] >> 5) & 0x1F` read the *src_reg* bits, not the const
|
||||
// index — wrong for the endian term and the no-window fallback base.
|
||||
let const_off = vf.const_reg_offset();
|
||||
// GPUBUG-114: a full vfetch carries the real vertex dword stride; a
|
||||
// vfetch_mini reuses the address + stride of the preceding full vfetch
|
||||
// of the same stream (canary ucode.h:733). Track the last full stride
|
||||
// per fetch-const and inherit it for mini-fetches (stride field == 0).
|
||||
let stride = if vf.is_mini_fetch || vf.stride == 0 {
|
||||
*self
|
||||
.last_full_stride
|
||||
.get(&const_off)
|
||||
.unwrap_or(&dwords_read)
|
||||
} else {
|
||||
self.last_full_stride.insert(const_off, vf.stride as u32);
|
||||
vf.stride as u32
|
||||
};
|
||||
// iterate-3T: per-attribute dword offset within the vertex (vfetches
|
||||
// sharing one fetch constant read different attributes).
|
||||
let attr_off = vf.offset;
|
||||
let src_reg = vf.src_register & 0x7F;
|
||||
let dst_reg = vf.dest_register & 0x7F;
|
||||
// is_signed selects [-1,1] vs [0,1] for normalized integer formats.
|
||||
let signed = vf.is_signed;
|
||||
// Build the per-component reads; unread lanes default to 0/0/0/1 so an
|
||||
// XY-only position keeps W=1 (and Z=0).
|
||||
let lane = |i: u32| -> String {
|
||||
match pack {
|
||||
Pack::Float => {
|
||||
if i < comps {
|
||||
format!("bitcast<f32>(gpu_swap(vertex_buffer[addr + {i}u], endian))")
|
||||
} else if i == 3 {
|
||||
"1.0".to_string()
|
||||
} else {
|
||||
"0.0".to_string()
|
||||
}
|
||||
}
|
||||
Pack::Norm16x2 => {
|
||||
// One dword holds [u16 lo | u16 hi] after the endian swap.
|
||||
// Component 0 = low halfword, component 1 = high halfword.
|
||||
if i == 0 {
|
||||
if signed {
|
||||
"(max(f32(i32(w16 << 16u) >> 16u) / 32767.0, -1.0))".to_string()
|
||||
} else {
|
||||
"(f32(w16 & 0xFFFFu) / 65535.0)".to_string()
|
||||
}
|
||||
} else if i == 1 {
|
||||
if signed {
|
||||
"(max(f32(i32(w16) >> 16u) / 32767.0, -1.0))".to_string()
|
||||
} else {
|
||||
"(f32(w16 >> 16u) / 65535.0)".to_string()
|
||||
}
|
||||
} else if i == 3 {
|
||||
"1.0".to_string()
|
||||
} else {
|
||||
"0.0".to_string()
|
||||
}
|
||||
}
|
||||
Pack::Norm8x4 => {
|
||||
// One dword holds 4× u8 (canary spirv_shader_translator_fetch
|
||||
// k_8_8_8_8: comp0@bit0, comp1@bit8, comp2@bit16, comp3@bit24)
|
||||
// after the endian swap. All four channels present → normalize
|
||||
// to [0,1]. GPUBUG-112: this is the logo/background vertex
|
||||
// COLOR (RGBA8), previously misdecoded as k_16_16 (2 chans,
|
||||
// B forced 0) → white texture × (R,G,0) = yellow.
|
||||
let sh = i * 8;
|
||||
if signed {
|
||||
format!(
|
||||
"(max(f32(i32(w16 << {l}u) >> 24u) / 127.0, -1.0))",
|
||||
l = 24 - sh
|
||||
)
|
||||
} else {
|
||||
format!("(f32((w16 >> {sh}u) & 0xFFu) / 255.0)")
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
let read_bound = dwords_read - 1;
|
||||
// GPUBUG-108 (iterate-3S): for the captured-geometry path the CPU
|
||||
// uploads a vertex window that begins EXACTLY at the fetch base, so the
|
||||
// base within `vertex_buffer` is 0 and vertex i sits at `i * stride`.
|
||||
// The previous `abs_base - vertex_base_dwords` rebase recomputed the
|
||||
// base from `xenos_consts.fetch[]`, but that uniform carries the
|
||||
// *last-published* (per-frame) fetch constant, not this draw's — for
|
||||
// the splash it was stale (0x8a000002 vs the real 0x0adf… base), so the
|
||||
// rebase produced a huge out-of-window address, the bounds guard
|
||||
// failed, and every vertex kept its seed (vertex_index, 0, 0, 1) →
|
||||
// every quad collapsed to ~one pixel at the origin. Index from 0 when a
|
||||
// real window is present (`vertex_base_dwords != 0`); only the
|
||||
// synthetic/no-window fallback consults the uniform fetch constant.
|
||||
let endian_term = format!("xenos_consts.fetch[{}u] & 0x3u", const_off + 1);
|
||||
// For packed formats (k_16_16, k_8_8_8_8) we read one dword into `w16`
|
||||
// (post endian-swap) and the `lane()` exprs above unpack the channels.
|
||||
let w16_decl = if pack == Pack::Norm16x2 || pack == Pack::Norm8x4 {
|
||||
"let w16 = gpu_swap(vertex_buffer[addr], endian); "
|
||||
} else {
|
||||
""
|
||||
};
|
||||
self.push(&format!(
|
||||
"{{ let fc0 = xenos_consts.fetch[{fc0_idx}u]; \
|
||||
let fc1 = xenos_consts.fetch[{fc1_idx}u]; \
|
||||
let endian = fc1 & 0x3u; \
|
||||
let base = (fc0 & 0xFFFFFFFCu) >> 2u; \
|
||||
"{{ let endian = {endian_term}; \
|
||||
let vidx = u32(r[{src_reg}u].x); \
|
||||
let addr = base + vidx * 4u; \
|
||||
var base = 0u; \
|
||||
if (draw_ctx.vertex_base_dwords == 0u) {{ \
|
||||
base = (xenos_consts.fetch[{fc0_idx}u] & 0xFFFFFFFCu) >> 2u; \
|
||||
}} \
|
||||
let addr = base + vidx * {stride}u + {attr_off}u; \
|
||||
let n = arrayLength(&vertex_buffer); \
|
||||
if (addr + 3u < n) {{ \
|
||||
r[{dst_reg}u] = vec4<f32>( \
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 0u], endian)), \
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 1u], endian)), \
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 2u], endian)), \
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 3u], endian))); \
|
||||
if (addr + {read_bound}u < n) {{ \
|
||||
{w16_decl}\
|
||||
r[{dst_reg}u] = vec4<f32>({l0}, {l1}, {l2}, {l3}); \
|
||||
}} }}",
|
||||
fc0_idx = fetch_const * 2,
|
||||
fc1_idx = fetch_const * 2 + 1,
|
||||
fc0_idx = const_off,
|
||||
l0 = lane(0),
|
||||
l1 = lane(1),
|
||||
l2 = lane(2),
|
||||
l3 = lane(3),
|
||||
));
|
||||
Ok(())
|
||||
}
|
||||
@@ -477,6 +721,22 @@ fn src_operand(src_byte: u8, is_temp: bool, swizzle: u8, negate: bool) -> String
|
||||
}
|
||||
|
||||
fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option<String> {
|
||||
// Semantics mirror the runtime interpreter's `exec_vector_op`
|
||||
// (`shaders/xenos_interp.wgsl`), which in turn mirrors canary's
|
||||
// `AluVectorOpcode` (ucode.h:1001+). Side-effecting ops (kill*, setp_push)
|
||||
// need per-invocation state the AOT emitter doesn't track yet → still
|
||||
// `None` (interpreter fallback).
|
||||
let cmp4 = |op: &str| {
|
||||
format!(
|
||||
"vec4<f32>(select(0.0,1.0,{a}.x{op}{b}.x), select(0.0,1.0,{a}.y{op}{b}.y), select(0.0,1.0,{a}.z{op}{b}.z), select(0.0,1.0,{a}.w{op}{b}.w))"
|
||||
)
|
||||
};
|
||||
// CND* : per-lane select(c, b, a <cmp> 0).
|
||||
let cnd4 = |op: &str| {
|
||||
format!(
|
||||
"vec4<f32>(select({c}.x,{b}.x,{a}.x{op}0.0), select({c}.y,{b}.y,{a}.y{op}0.0), select({c}.z,{b}.z,{a}.z{op}0.0), select({c}.w,{b}.w,{a}.w{op}0.0))"
|
||||
)
|
||||
};
|
||||
let s = match op {
|
||||
vop::ADD => format!("({a} + {b})"),
|
||||
vop::MUL => format!("({a} * {b})"),
|
||||
@@ -485,37 +745,63 @@ fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option<String> {
|
||||
vop::MAD => format!("({a} * {b} + {c})"),
|
||||
vop::DOT4 => format!("vec4<f32>(dot({a}, {b}))"),
|
||||
vop::DOT3 => format!("vec4<f32>(dot({a}.xyz, {b}.xyz))"),
|
||||
vop::DOT2_ADD => format!(
|
||||
"vec4<f32>({a}.x * {b}.x + {a}.y * {b}.y + {c}.x)"
|
||||
),
|
||||
vop::SEQ => format!(
|
||||
"vec4<f32>(select(0.0,1.0,{a}.x=={b}.x), select(0.0,1.0,{a}.y=={b}.y), select(0.0,1.0,{a}.z=={b}.z), select(0.0,1.0,{a}.w=={b}.w))"
|
||||
),
|
||||
vop::SGT => format!(
|
||||
"vec4<f32>(select(0.0,1.0,{a}.x>{b}.x), select(0.0,1.0,{a}.y>{b}.y), select(0.0,1.0,{a}.z>{b}.z), select(0.0,1.0,{a}.w>{b}.w))"
|
||||
),
|
||||
vop::SGE => format!(
|
||||
"vec4<f32>(select(0.0,1.0,{a}.x>={b}.x), select(0.0,1.0,{a}.y>={b}.y), select(0.0,1.0,{a}.z>={b}.z), select(0.0,1.0,{a}.w>={b}.w))"
|
||||
),
|
||||
vop::SNE => format!(
|
||||
"vec4<f32>(select(0.0,1.0,{a}.x!={b}.x), select(0.0,1.0,{a}.y!={b}.y), select(0.0,1.0,{a}.z!={b}.z), select(0.0,1.0,{a}.w!={b}.w))"
|
||||
),
|
||||
vop::DOT2_ADD => format!("vec4<f32>({a}.x * {b}.x + {a}.y * {b}.y + {c}.x)"),
|
||||
vop::SEQ => cmp4("=="),
|
||||
vop::SGT => cmp4(">"),
|
||||
vop::SGE => cmp4(">="),
|
||||
vop::SNE => cmp4("!="),
|
||||
vop::CND_EQ => cnd4("=="),
|
||||
vop::CND_GE => cnd4(">="),
|
||||
vop::CND_GT => cnd4(">"),
|
||||
vop::FRC => format!("fract({a})"),
|
||||
vop::TRUNC => format!("trunc({a})"),
|
||||
vop::FLOOR => format!("floor({a})"),
|
||||
vop::MAX4 => format!("vec4<f32>(max(max({a}.x,{a}.y), max({a}.z,{a}.w)))"),
|
||||
// dst = (1, src0.y*src1.y, src0.z, src1.w) (canary kDst)
|
||||
vop::DST => format!("vec4<f32>(1.0, {a}.y * {b}.y, {a}.z, {b}.w)"),
|
||||
_ => return None,
|
||||
};
|
||||
Some(s)
|
||||
}
|
||||
|
||||
fn scalar_expr(op: u8, a: &str, b: &str, prev: &str) -> Option<String> {
|
||||
// Semantics mirror the runtime interpreter's `exec_scalar_op`
|
||||
// (`shaders/xenos_interp.wgsl`) / canary's `AluScalarOpcode`
|
||||
// (ucode.h:1001+). Side-effecting ops (setp*, kills*, maxas*) need
|
||||
// per-invocation predicate/kill/address state the AOT emitter doesn't
|
||||
// track yet → still `None` (interpreter fallback).
|
||||
let s = match op {
|
||||
sop::ADDS => format!("({a} + {b})"),
|
||||
sop::ADDS_PREV => format!("({a} + {prev})"),
|
||||
sop::MULS => format!("({a} * {b})"),
|
||||
sop::MULS_PREV => format!("({a} * {prev})"),
|
||||
// muls_prev2 / LIT emulation (canary kMulsPrev2): guard against
|
||||
// -FLT_MAX / non-finite ps & b, and b <= 0.
|
||||
sop::MULS_PREV2 => format!(
|
||||
"select({a} * {prev}, -3.4028235e38, {prev} == -3.4028235e38 || !(\
|
||||
{prev} == {prev}) || abs({prev}) > 3.4028235e38 || !({b} == {b}) || \
|
||||
abs({b}) > 3.4028235e38 || {b} <= 0.0)"
|
||||
),
|
||||
sop::MAXS => format!("max({a}, {b})"),
|
||||
sop::MINS => format!("min({a}, {b})"),
|
||||
sop::RCP => format!("xe_rcp({a})"),
|
||||
sop::SEQS => format!("select(0.0, 1.0, {a} == 0.0)"),
|
||||
sop::SGTS => format!("select(0.0, 1.0, {a} > 0.0)"),
|
||||
sop::SGES => format!("select(0.0, 1.0, {a} >= 0.0)"),
|
||||
sop::SNES => format!("select(0.0, 1.0, {a} != 0.0)"),
|
||||
sop::FRCS => format!("fract({a})"),
|
||||
sop::TRUNCS => format!("trunc({a})"),
|
||||
sop::FLOORS => format!("floor({a})"),
|
||||
sop::SUBS => format!("({a} - {b})"),
|
||||
sop::SUBS_PREV => format!("({a} - {prev})"),
|
||||
sop::EXP => format!("exp2({a})"),
|
||||
sop::LOG | sop::LOGC => format!("select(log2({a}), 0.0, {a} == 1.0)"),
|
||||
sop::RCP | sop::RCPC | sop::RCPF => format!("xe_rcp({a})"),
|
||||
sop::RSQ | sop::RSQC | sop::RSQF => {
|
||||
format!("select(0.0, inverseSqrt({a}), {a} > 0.0)")
|
||||
}
|
||||
sop::SQRT => format!("select(0.0, sqrt({a}), {a} >= 0.0)"),
|
||||
sop::SIN => format!("sin({a})"),
|
||||
sop::COS => format!("cos({a})"),
|
||||
sop::RETAIN_PREV => prev.to_string(),
|
||||
_ => return None,
|
||||
};
|
||||
@@ -528,17 +814,68 @@ mod tests {
|
||||
use crate::ucode::alu::{sop, vop};
|
||||
use crate::ucode::control_flow::ControlFlowInstruction;
|
||||
|
||||
/// iterate-3T: the real publisher-logo VS (`vs_key 0x03b7b020`, captured
|
||||
/// from the live boot) must now TRANSLATE — pre-3T it rejected with
|
||||
/// `vfetch_fmt` because (a) the `k_16_16` color stream (format 6) was
|
||||
/// unsupported and (b) the export-index model (62=oPos, 0/1=interpolators)
|
||||
/// was a wrong AllocKind heuristic. This locks in the format-6 + per-
|
||||
/// attribute-offset + export-index work so the UV interpolator reaches the
|
||||
/// pixel shader (texcoord in r1) instead of collapsing to a single color.
|
||||
#[test]
|
||||
fn real_logo_vs_translates_with_interpolators() {
|
||||
let ucode: [u32; 30] = [
|
||||
0x70153003, 0x00001200, 0xC2000000, 0x00001006, 0x00001200, 0xC4000000,
|
||||
0x00002007, 0x00002200, 0x00000000, 0x2DF82000, 0x00393A88, 0x00000006,
|
||||
0x05F81000, 0x4006060A, 0x00000306, 0x05F80000, 0x40253FC8, 0x00000406,
|
||||
0xC80F803E, 0x00000000, 0xC2020200, 0xC8038001, 0x00B0B000, 0xC2000000,
|
||||
0xC80F8000, 0x00000000, 0xC2010100, 0x00000000, 0x00000000, 0x00000000,
|
||||
];
|
||||
let p = crate::ucode::parse_shader(&ucode);
|
||||
let body = match translate(&p, Stage::Vertex) {
|
||||
Translation::Ok(b) => b,
|
||||
Translation::Reject(r) => panic!("logo VS rejected: {r}"),
|
||||
};
|
||||
// Position must come from the export-index-62 path (`opos`) and the
|
||||
// UV/color interpolators must be exported as distinct slots.
|
||||
assert!(body.contains("opos ="), "no position export: {body}");
|
||||
assert!(body.contains("ointerp[0u]"), "no interp0 export: {body}");
|
||||
assert!(body.contains("ointerp[1u]"), "no interp1 export: {body}");
|
||||
// The k_16_16 attribute must unpack via the packed-16 helper.
|
||||
assert!(body.contains("w16"), "no packed-16 unpack for k_16_16: {body}");
|
||||
}
|
||||
|
||||
/// The logo pixel shader (`ps_key 0x03b79001`) samples its texture at the
|
||||
/// interpolated texcoord register r1 — which the PS now seeds from the VS
|
||||
/// interpolator `in.interp1` (Xenos PS-input-GPR mapping). Verifies the UV
|
||||
/// chain so tfetch samples the real UV instead of (0,0).
|
||||
#[test]
|
||||
fn ps_seeds_interpolators_into_registers() {
|
||||
// A trivial PS that just exports — we only assert the preamble wiring.
|
||||
let p = crate::ucode::ParsedShader {
|
||||
cf: vec![ControlFlowInstruction::Exit],
|
||||
instructions: vec![],
|
||||
};
|
||||
let body = match translate(&p, Stage::Pixel) {
|
||||
Translation::Ok(b) => b,
|
||||
Translation::Reject(r) => panic!("trivial PS rejected: {r}"),
|
||||
};
|
||||
assert!(body.contains("r[1] = in.interp1;"), "PS must seed r1 from interp1: {body}");
|
||||
}
|
||||
|
||||
fn synthetic_trivial_shader() -> ParsedShader {
|
||||
// Single Exec clause: ALU add r0 = r0 + r0; scalar_op = RETAIN_PREV
|
||||
// with full write-mask on vector, zero on scalar. Alloc(Position)
|
||||
// precedes so the ALU's export (if it were one) would target oPos.
|
||||
// Word-0 bits 29-31 set so all three operands resolve as temps —
|
||||
// matches the prior assertion `r[0u] = (r[0u] + r[0u])`.
|
||||
let w0 = (1u32 << 29) | (1u32 << 30) | (1u32 << 31);
|
||||
let w2 = (vop::ADD as u32)
|
||||
| ((sop::RETAIN_PREV as u32) << 6)
|
||||
| (0xF << 12) // vector_write_mask
|
||||
| (0u32 << 16); // vector_dest = 0
|
||||
// GPUBUG-106 canary layout: dest/mask/scalar_opc in w0; vector_opc +
|
||||
// src_sel in w2. All three operands temps → r0.
|
||||
let w0 = (0u32) // vector_dest = 0
|
||||
| (0xFu32 << 16) // vector_write_mask = 0xF
|
||||
| ((sop::RETAIN_PREV as u32) << 26); // scalar_opc
|
||||
let w1 = 0u32;
|
||||
let w2 = ((vop::ADD as u32) << 24) // vector_opc
|
||||
| (1u32 << 31) // src1_sel = temp
|
||||
| (1u32 << 30) // src2_sel = temp
|
||||
| (1u32 << 29); // src3_sel = temp
|
||||
ParsedShader {
|
||||
cf: vec![
|
||||
ControlFlowInstruction::Alloc {
|
||||
@@ -554,7 +891,7 @@ mod tests {
|
||||
predicate_condition: false,
|
||||
},
|
||||
],
|
||||
instructions: vec![w0, 0, w2],
|
||||
instructions: vec![w0, w1, w2],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -642,19 +979,17 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn shader_using_c0_emits_xenos_consts_read() {
|
||||
// ALU: r0 = c0 + r0. src_a (low byte) is constant index 0;
|
||||
// src_b (next byte) is temp index 0. src_a_is_temp=false →
|
||||
// src1_sel-style bit at w0 bit 29 = 0; src_b_is_temp=true →
|
||||
// bit 30 = 1. (src_c left as 0/temp; unused.)
|
||||
let w0 = 0x00u32 // src_a = c0
|
||||
| (0x00u32 << 8) // src_b = r0
|
||||
| (0x00u32 << 16) // src_c
|
||||
| (0u32 << 29) // src_a_is_temp = false (constant)
|
||||
| (1u32 << 30); // src_b_is_temp = true (register)
|
||||
let w2 = (vop::ADD as u32)
|
||||
| ((sop::RETAIN_PREV as u32) << 6)
|
||||
| (0xF << 12)
|
||||
| (0u32 << 16);
|
||||
// ALU: r0 = c0 + r0. GPUBUG-106 canary layout. src_a = src1 (w2
|
||||
// 16:23), src_b = src2 (w2 8:15). src1_sel (w2 bit31) = 0 → c0;
|
||||
// src2_sel (w2 bit30) = 1 → r0.
|
||||
let w0 = (0u32) // vector_dest = 0
|
||||
| (0xFu32 << 16) // vector_write_mask
|
||||
| ((sop::RETAIN_PREV as u32) << 26); // scalar_opc
|
||||
let w2 = ((vop::ADD as u32) << 24) // vector_opc
|
||||
| (0u32 << 16) // src1_reg = 0 → c0
|
||||
| (0u32 << 8) // src2_reg = 0 → r0
|
||||
| (0u32 << 31) // src1_sel = 0 (constant)
|
||||
| (1u32 << 30); // src2_sel = 1 (temp)
|
||||
let shader = ParsedShader {
|
||||
cf: vec![
|
||||
ControlFlowInstruction::Alloc {
|
||||
@@ -695,9 +1030,16 @@ mod tests {
|
||||
let mut ctx = EmitCtx::new(Stage::Vertex);
|
||||
let vf = crate::ucode::fetch::VertexFetch {
|
||||
fetch_const: 0,
|
||||
const_index_sel: 0,
|
||||
src_register: 0,
|
||||
dest_register: 0,
|
||||
dest_write_mask: 0xF,
|
||||
format: 38, // k_32_32_32_32_FLOAT (4 floats)
|
||||
stride: 4,
|
||||
offset: 0,
|
||||
is_signed: false,
|
||||
is_normalized: true,
|
||||
is_mini_fetch: false,
|
||||
raw: [0; 3],
|
||||
};
|
||||
ctx.emit_vfetch(&vf).expect("emit_vfetch");
|
||||
@@ -705,6 +1047,70 @@ mod tests {
|
||||
assert!(body.contains("gpu_swap("), "emitted vfetch body: {body}");
|
||||
}
|
||||
|
||||
fn vf(format: u8, stride: u8, offset: u32, mini: bool) -> crate::ucode::fetch::VertexFetch {
|
||||
crate::ucode::fetch::VertexFetch {
|
||||
fetch_const: 0,
|
||||
const_index_sel: 0,
|
||||
src_register: 0,
|
||||
dest_register: 0,
|
||||
dest_write_mask: 0xF,
|
||||
format,
|
||||
stride,
|
||||
offset,
|
||||
is_signed: false,
|
||||
is_normalized: true,
|
||||
is_mini_fetch: mini,
|
||||
raw: [0; 3],
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vfetch_k8888_unpacks_four_channels() {
|
||||
// GPUBUG-112: VertexFormat 6 = k_8_8_8_8 (4× u8 normalized, 1 dword),
|
||||
// NOT k_16_16. All four channels (R,G,B,A) must be unpacked so a
|
||||
// vertex COLOR keeps its blue channel (white texture × white color =
|
||||
// white, not yellow).
|
||||
let mut ctx = EmitCtx::new(Stage::Vertex);
|
||||
ctx.emit_vfetch(&vf(6, 6, 3, false)).expect("emit");
|
||||
let body = ctx.finish();
|
||||
// Four /255.0 channel reads from one packed dword `w16`.
|
||||
assert!(body.contains("let w16 ="), "needs packed dword: {body}");
|
||||
assert_eq!(body.matches("/ 255.0").count(), 4, "four 8-bit channels: {body}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vfetch_mini_inherits_full_stride() {
|
||||
// GPUBUG-114: a vfetch_mini (stride field 0) inherits the stride of the
|
||||
// preceding full vfetch of the same stream (canary ucode.h:733). Emit a
|
||||
// full fetch (stride 7) then a mini fetch and assert the mini indexes by
|
||||
// stride 7, not its tight dword count.
|
||||
let mut ctx = EmitCtx::new(Stage::Vertex);
|
||||
ctx.emit_vfetch(&vf(57, 7, 0, false)).expect("full"); // k_32_32_32_FLOAT
|
||||
ctx.emit_vfetch(&vf(38, 0, 3, true)).expect("mini"); // k_32_32_32_32_FLOAT, mini
|
||||
let body = ctx.finish();
|
||||
assert!(body.contains("vidx * 7u + 3u"), "mini must inherit stride 7: {body}");
|
||||
assert!(!body.contains("vidx * 4u"), "mini must not use tight stride 4: {body}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ps_color_export_is_saturated() {
|
||||
// GPUBUG-115: the PS color export must be clamped to [0,1] (canary
|
||||
// saturates before UNORM RT write) so an out-of-range guest color
|
||||
// doesn't write garbage/white to the sRGB target.
|
||||
let p = crate::ucode::ParsedShader {
|
||||
cf: vec![ControlFlowInstruction::Exit],
|
||||
instructions: vec![],
|
||||
};
|
||||
let body = match translate(&p, Stage::Pixel) {
|
||||
Translation::Ok(b) => b,
|
||||
Translation::Reject(r) => panic!("PS rejected: {r}"),
|
||||
};
|
||||
assert!(
|
||||
body.contains("clamp(ocolor0, vec4<f32>(0.0), vec4<f32>(1.0))"),
|
||||
"PS must saturate color export: {body}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn loop_clause_rejected() {
|
||||
let shader = ParsedShader {
|
||||
@@ -722,9 +1128,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn unsupported_op_rejected() {
|
||||
let w2 = (29u32) // VOP_MAX_A, not in v1 subset
|
||||
| ((sop::RETAIN_PREV as u32) << 6)
|
||||
| (0xF << 12);
|
||||
// GPUBUG-106 layout: vector_write_mask in w0 (16:19), vector_opc in
|
||||
// w2 (24:28). MAX_A (29) is outside the supported subset → reject.
|
||||
let w0 = (0xFu32 << 16) | ((sop::RETAIN_PREV as u32) << 26);
|
||||
let w2 = (29u32) << 24; // VOP_MAX_A
|
||||
let shader = ParsedShader {
|
||||
cf: vec![ControlFlowInstruction::Exec {
|
||||
address: 0,
|
||||
@@ -734,7 +1141,7 @@ mod tests {
|
||||
predicated: false,
|
||||
predicate_condition: false,
|
||||
}],
|
||||
instructions: vec![0, 0, w2],
|
||||
instructions: vec![w0, 0, w2],
|
||||
};
|
||||
assert!(matches!(
|
||||
translate(&shader, Stage::Vertex),
|
||||
|
||||
@@ -71,33 +71,50 @@ pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
|
||||
let w0 = words[0];
|
||||
let w1 = words[1];
|
||||
let w2 = words[2];
|
||||
// GPUBUG-106 (iterate-3S): correct the dword field map to match canary's
|
||||
// `AluInstruction` union (ucode.h:2036-2086). Pre-fix this read the
|
||||
// dest/mask/export/scalar-opcode out of `w2`, but they live in `w0`; the
|
||||
// vector opcode + source registers live in `w2`, and swizzle/negate/pred
|
||||
// in `w1`. The misread made every *export* ALU decode with
|
||||
// `vector_write_mask=0` → no oPos/oColor export emitted → the translated VS
|
||||
// collapsed every vertex to the clip origin (degenerate, nothing drawn).
|
||||
//
|
||||
// w0: vector_dest(0:5) vector_dest_rel(6) abs_constants(7)
|
||||
// scalar_dest(8:13) scalar_dest_rel(14) export_data(15)
|
||||
// vector_write_mask(16:19) scalar_write_mask(20:23)
|
||||
// vector_clamp(24) scalar_clamp(25) scalar_opc(26:31)
|
||||
// w1: src3_swiz(0:7) src2_swiz(8:15) src1_swiz(16:23)
|
||||
// src3/2/1_reg_negate(24/25/26) pred_condition(27) is_predicated(28)
|
||||
// w2: src3_reg(0:7) src2_reg(8:15) src1_reg(16:23)
|
||||
// vector_opc(24:28) src3/2/1_sel(29/30/31)
|
||||
//
|
||||
// Our (a,b,c) operands map to canary's (src1,src2,src3).
|
||||
AluInstruction {
|
||||
vector_opcode: (w2 & 0x3F) as u8,
|
||||
scalar_opcode: ((w2 >> 6) & 0x3F) as u8,
|
||||
vector_dest: ((w2 >> 16) & 0x7F) as u8,
|
||||
scalar_dest: ((w2 >> 24) & 0x7F) as u8,
|
||||
vector_write_mask: ((w2 >> 12) & 0xF) as u8,
|
||||
scalar_write_mask: ((w2 >> 8) & 0xF) as u8,
|
||||
vector_dest_is_export: ((w2 >> 23) & 1) != 0,
|
||||
scalar_src_is_ps: ((w0 >> 26) & 1) != 0,
|
||||
src_a: (w0 & 0xFF) as u8,
|
||||
src_b: ((w0 >> 8) & 0xFF) as u8,
|
||||
src_c: ((w0 >> 16) & 0xFF) as u8,
|
||||
// Word-0 bits 29-31 are the per-operand temp-vs-constant
|
||||
// selector (canary `src3_sel`/`src2_sel`/`src1_sel`,
|
||||
// ucode.h:2078-2086). Our `src_a` is canary's third operand
|
||||
// (low byte of w0), so its selector is bit 29.
|
||||
src_a_is_temp: ((w0 >> 29) & 1) != 0,
|
||||
src_b_is_temp: ((w0 >> 30) & 1) != 0,
|
||||
src_c_is_temp: ((w0 >> 31) & 1) != 0,
|
||||
src_a_swiz: (w1 & 0xFF) as u8,
|
||||
vector_opcode: ((w2 >> 24) & 0x1F) as u8,
|
||||
scalar_opcode: ((w0 >> 26) & 0x3F) as u8,
|
||||
vector_dest: (w0 & 0x3F) as u8,
|
||||
scalar_dest: ((w0 >> 8) & 0x3F) as u8,
|
||||
vector_write_mask: ((w0 >> 16) & 0xF) as u8,
|
||||
scalar_write_mask: ((w0 >> 20) & 0xF) as u8,
|
||||
vector_dest_is_export: ((w0 >> 15) & 1) != 0,
|
||||
// Not a real microcode bit — the scalar pipe selects `ps` implicitly
|
||||
// via the *_PREV opcodes, which `scalar_expr` handles by opcode.
|
||||
scalar_src_is_ps: false,
|
||||
src_a: ((w2 >> 16) & 0xFF) as u8,
|
||||
src_b: ((w2 >> 8) & 0xFF) as u8,
|
||||
src_c: (w2 & 0xFF) as u8,
|
||||
// sel==1 → operand is a temp register; sel==0 → ALU constant.
|
||||
src_a_is_temp: ((w2 >> 31) & 1) != 0,
|
||||
src_b_is_temp: ((w2 >> 30) & 1) != 0,
|
||||
src_c_is_temp: ((w2 >> 29) & 1) != 0,
|
||||
src_a_swiz: ((w1 >> 16) & 0xFF) as u8,
|
||||
src_b_swiz: ((w1 >> 8) & 0xFF) as u8,
|
||||
src_c_swiz: ((w1 >> 16) & 0xFF) as u8,
|
||||
src_a_negate: ((w1 >> 24) & 1) != 0,
|
||||
src_c_swiz: (w1 & 0xFF) as u8,
|
||||
src_a_negate: ((w1 >> 26) & 1) != 0,
|
||||
src_b_negate: ((w1 >> 25) & 1) != 0,
|
||||
src_c_negate: ((w1 >> 26) & 1) != 0,
|
||||
predicated: ((w0 >> 27) & 1) != 0,
|
||||
predicate_condition: ((w0 >> 28) & 1) != 0,
|
||||
src_c_negate: ((w1 >> 24) & 1) != 0,
|
||||
predicated: ((w1 >> 28) & 1) != 0,
|
||||
predicate_condition: ((w1 >> 27) & 1) != 0,
|
||||
raw: words,
|
||||
}
|
||||
}
|
||||
@@ -225,19 +242,24 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn decode_extracts_opcodes_and_dests() {
|
||||
// Build a minimal ALU word:
|
||||
// vector_opcode = ADD (0), scalar_opcode = RCP (22),
|
||||
// vector_dest = 3, scalar_dest = 7, vector_write_mask = 0xF
|
||||
let w2 = (vop::ADD as u32)
|
||||
| ((sop::RCP as u32) << 6)
|
||||
| (0xF << 12) // vector_write_mask
|
||||
| (3u32 << 16) // vector_dest
|
||||
| (7u32 << 24); // scalar_dest
|
||||
let alu = decode_alu([0, 0, w2]);
|
||||
// GPUBUG-106: correct canary field map. w0 carries dest/mask/scalar_opc;
|
||||
// w2 carries vector_opc + source regs.
|
||||
// vector_opcode = ADD (0) → w2 bits 24:28
|
||||
// scalar_opcode = RCP (22) → w0 bits 26:31
|
||||
// vector_dest = 3 → w0 bits 0:5, scalar_dest = 7 → w0 bits 8:13
|
||||
// vector_write_mask = 0xF → w0 bits 16:19, export_data → w0 bit 15
|
||||
let w0 = 3u32 // vector_dest
|
||||
| (7u32 << 8) // scalar_dest
|
||||
| (1u32 << 15) // export_data
|
||||
| (0xFu32 << 16) // vector_write_mask
|
||||
| ((sop::RCP as u32) << 26); // scalar_opc
|
||||
let w2 = (vop::ADD as u32) << 24; // vector_opc
|
||||
let alu = decode_alu([w0, 0, w2]);
|
||||
assert_eq!(alu.vector_opcode, vop::ADD);
|
||||
assert_eq!(alu.scalar_opcode, sop::RCP);
|
||||
assert_eq!(alu.vector_dest, 3);
|
||||
assert_eq!(alu.scalar_dest, 7);
|
||||
assert_eq!(alu.vector_write_mask, 0xF);
|
||||
assert!(alu.vector_dest_is_export);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,7 +43,15 @@ pub enum ControlFlowInstruction {
|
||||
Return,
|
||||
/// `kAlloc` — pre-allocate export registers (position, interpolators, colors).
|
||||
Alloc { size: u32, kind: AllocKind },
|
||||
/// Exit the shader (terminal).
|
||||
/// `kNop` — fills space in the CF block; executes nothing, does not end
|
||||
/// the shader. (Xenos opcode 0.)
|
||||
Nop,
|
||||
/// `kMarkVsFetchDone` — hint that no more vertex fetches will be performed.
|
||||
/// (Xenos opcode 15.) Non-terminating.
|
||||
MarkVsFetchDone,
|
||||
/// Exit the shader (terminal). Synthesized — Xenos has no dedicated exit
|
||||
/// opcode; the shader ends after an `Exec`/`CondExec` clause with the
|
||||
/// END bit set (`is_end`). Retained for callers/tests that reference it.
|
||||
Exit,
|
||||
/// Unknown / unhandled opcode.
|
||||
Unknown { opcode: u8 },
|
||||
@@ -88,42 +96,66 @@ pub fn decode_cf_pair(word0: u32, word1: u32, word2: u32) -> (ControlFlowInstruc
|
||||
fn decode_single(payload: u64) -> ControlFlowInstruction {
|
||||
// Top 4 bits of the 48-bit payload.
|
||||
let opcode = ((payload >> 44) & 0xF) as u8;
|
||||
// Predicate bit + condition live at the 28..30 range for exec/jmp. Rough
|
||||
// extraction — good enough for the interpreter, which logs unknowns.
|
||||
let predicated = ((payload >> 28) & 1) != 0;
|
||||
let predicate_condition = ((payload >> 29) & 1) != 0;
|
||||
|
||||
// GPUBUG-103 (iterate-3P): clause-level predication is determined by the
|
||||
// *opcode*, not by free bits. The 48-bit CF payload is word0 = bits 0..31,
|
||||
// word1 = bits 32..47. Per canary `ucode.h`:
|
||||
// * `ControlFlowExecInstruction` (kExec/kExecEnd, opcodes 1/2): NOT
|
||||
// predicate-gated — it runs unconditionally.
|
||||
// * `ControlFlowCondExecInstruction` (kCondExec/kCondExecEnd, 3/4): gated
|
||||
// by a *bool constant*, `condition_` at word1 bit 10 = payload bit 42.
|
||||
// We don't model bool-constant gating in the WGSL paths (the bool is
|
||||
// virtually always set for these), so treat as unconditional.
|
||||
// * `ControlFlowCondExecPredInstruction` (kCondExecPred/...End/Clean...,
|
||||
// 5/6/13/14): gated by the *predicate register*; `condition_` at word1
|
||||
// bit 9 = payload bit 41.
|
||||
// The prior code read bits 28/29 (which fall inside `sequence_`/`vc_hi_`)
|
||||
// and stamped `predicated=true` on plenty of plain `kExec` clauses — which
|
||||
// made the P7 translator reject EVERY splash VS as `cf_cond`, forcing the
|
||||
// interpreter (placeholder geometry) for all draws.
|
||||
let is_pred_gated = matches!(opcode, 5 | 6 | 13 | 14);
|
||||
let predicated = is_pred_gated;
|
||||
let predicate_condition = is_pred_gated && ((payload >> 41) & 1) != 0;
|
||||
|
||||
// Xenos `ControlFlowOpcode` (canary `ucode.h:86-160`):
|
||||
// 0 kNop, 1 kExec, 2 kExecEnd, 3 kCondExec, 4 kCondExecEnd,
|
||||
// 5 kCondExecPred, 6 kCondExecPredEnd, 7 kLoopStart, 8 kLoopEnd,
|
||||
// 9 kCondCall, 10 kReturn, 11 kCondJmp, 12 kAlloc,
|
||||
// 13 kCondExecPredClean, 14 kCondExecPredCleanEnd, 15 kMarkVsFetchDone.
|
||||
// All exec variants share the address(12)/count(3)/sequence(12) layout
|
||||
// of `ControlFlowExecInstruction`; the `*End` variants terminate the
|
||||
// shader. (Prior table was off-by-one — it mapped 0→Exec and 1→Exit,
|
||||
// so a real `kExec` clause was misread as a terminal `Exit`, truncating
|
||||
// the CF block and dropping every `tfetch` in it.)
|
||||
let exec = |is_end: bool| ControlFlowInstruction::Exec {
|
||||
address: (payload & 0xFFF) as u32,
|
||||
count: ((payload >> 12) & 0x7) as u32,
|
||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||
is_end,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
};
|
||||
match opcode {
|
||||
0 => ControlFlowInstruction::Exec {
|
||||
address: (payload & 0xFFF) as u32,
|
||||
count: ((payload >> 12) & 0x7) as u32,
|
||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||
is_end: false,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
},
|
||||
1 => ControlFlowInstruction::Exit,
|
||||
2 => ControlFlowInstruction::Exec {
|
||||
address: (payload & 0xFFF) as u32,
|
||||
count: ((payload >> 12) & 0x7) as u32,
|
||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||
is_end: true,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
},
|
||||
6 => ControlFlowInstruction::LoopStart {
|
||||
0 => ControlFlowInstruction::Nop,
|
||||
1 => exec(false),
|
||||
2 => exec(true),
|
||||
3 => exec(false),
|
||||
4 => exec(true),
|
||||
5 => exec(false),
|
||||
6 => exec(true),
|
||||
7 => ControlFlowInstruction::LoopStart {
|
||||
address: (payload & 0x3FF) as u32,
|
||||
loop_id: ((payload >> 16) & 0x1F) as u32,
|
||||
},
|
||||
7 => ControlFlowInstruction::LoopEnd {
|
||||
8 => ControlFlowInstruction::LoopEnd {
|
||||
address: (payload & 0x3FF) as u32,
|
||||
loop_id: ((payload >> 16) & 0x1F) as u32,
|
||||
},
|
||||
8 => ControlFlowInstruction::CondCall {
|
||||
9 => ControlFlowInstruction::CondCall {
|
||||
target: (payload & 0x3FF) as u32,
|
||||
},
|
||||
9 => ControlFlowInstruction::Return,
|
||||
10 => ControlFlowInstruction::CondJmp {
|
||||
10 => ControlFlowInstruction::Return,
|
||||
11 => ControlFlowInstruction::CondJmp {
|
||||
target: (payload & 0x3FF) as u32,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
@@ -132,6 +164,9 @@ fn decode_single(payload: u64) -> ControlFlowInstruction {
|
||||
size: (payload & 0x7) as u32,
|
||||
kind: AllocKind::from_bits(((payload >> 4) & 0x7) as u32),
|
||||
},
|
||||
13 => exec(false),
|
||||
14 => exec(true),
|
||||
15 => ControlFlowInstruction::MarkVsFetchDone,
|
||||
other => ControlFlowInstruction::Unknown { opcode: other },
|
||||
}
|
||||
}
|
||||
@@ -141,12 +176,49 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn opcode_exit_decodes() {
|
||||
// opcode 1 (Exit) in bits 44..47 of A's 48-bit payload.
|
||||
fn opcode_nop_and_exec_decode() {
|
||||
// Xenos opcode 0 = kNop (non-terminating padding).
|
||||
let payload: u64 = 0u64 << 44;
|
||||
let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32);
|
||||
assert_eq!(decode_cf_pair(hi, lo, 0).0, ControlFlowInstruction::Nop);
|
||||
// Xenos opcode 1 = kExec (executes instructions; NOT a terminal exit).
|
||||
let payload: u64 = 1u64 << 44;
|
||||
let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32);
|
||||
let cf = decode_cf_pair(hi, lo, 0).0;
|
||||
assert_eq!(cf, ControlFlowInstruction::Exit);
|
||||
match decode_cf_pair(hi, lo, 0).0 {
|
||||
ControlFlowInstruction::Exec { is_end, .. } => assert!(!is_end),
|
||||
other => panic!("opcode 1 should be non-end Exec, got {other:?}"),
|
||||
}
|
||||
// Xenos opcode 15 = kMarkVsFetchDone (non-terminating hint).
|
||||
let payload: u64 = 15u64 << 44;
|
||||
let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32);
|
||||
assert_eq!(
|
||||
decode_cf_pair(hi, lo, 0).0,
|
||||
ControlFlowInstruction::MarkVsFetchDone
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn real_logo_shader_has_tfetch_clauses() {
|
||||
// The publisher-logo pixel shader E59B2B3DA4AA9008 (captured from the
|
||||
// canary oracle, byte-identical to the microcode our guest IM_LOADs).
|
||||
// Regression for iterate-3M: the old off-by-one opcode table decoded
|
||||
// its leading `kExec` (opcode 1) as a terminal `Exit`, truncating the
|
||||
// CF block so the `tfetch2D` never appeared → flat splash.
|
||||
let ucode: [u32; 24] = [
|
||||
0x00011002, 0x00001200, 0xC4000000, 0x00004003, 0x00002200, 0x00000000,
|
||||
0x10082021, 0x1F1FF688, 0x00004000, 0xC8080001, 0x001B1B00, 0xC1020000,
|
||||
0xC8070000, 0x00C0C000, 0xC1020000, 0xC8070001, 0x00C01B00, 0xC1000100,
|
||||
0xC80F8000, 0x00000000, 0xC2010100, 0x00000000, 0x00000000, 0x00000000,
|
||||
];
|
||||
let p = crate::ucode::parse_shader(&ucode);
|
||||
let exec_clauses = p
|
||||
.cf
|
||||
.iter()
|
||||
.filter(|c| matches!(c, ControlFlowInstruction::Exec { .. }))
|
||||
.count();
|
||||
assert!(exec_clauses >= 1, "expected >=1 Exec clause, cf={:?}", p.cf);
|
||||
let slots = crate::shader_metrics::tfetch_slots(&p);
|
||||
assert!(!slots.is_empty(), "expected tfetch slots, got none; cf={:?}", p.cf);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -17,17 +17,64 @@ pub enum FetchInstruction {
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct VertexFetch {
|
||||
/// Vertex fetch constant index (0..=95).
|
||||
/// Vertex fetch *const_index* (5 bits, w0[20:24]). The full fetch-constant
|
||||
/// index is `const_index * 3 + const_index_sel` (canary `ucode.h:700`); use
|
||||
/// [`VertexFetch::const_reg_offset`] for the register-region dword offset.
|
||||
pub fetch_const: u8,
|
||||
/// iterate-3X (GPUBUG-110): `const_index_sel` (2 bits, w0[25:26]) — selects
|
||||
/// one of the 3 two-dword vertex-fetch constants packed in each 6-dword
|
||||
/// register group. Dropping this read sub-slot 0 of the group, missing the
|
||||
/// real vertex-buffer base for shaders that use sub-slot 1/2 (the publisher
|
||||
/// logo uses `const_index=31, sel=2`).
|
||||
pub const_index_sel: u8,
|
||||
/// Source register index (vertex index in r#).
|
||||
pub src_register: u8,
|
||||
/// Destination register for the fetched value.
|
||||
pub dest_register: u8,
|
||||
/// 4-bit write mask.
|
||||
pub dest_write_mask: u8,
|
||||
/// iterate-3S (GPUBUG-107): `xenos::VertexFormat` (6 bits, dword1[16:21]).
|
||||
/// Determines how many components to read and their packing. Pre-fix the
|
||||
/// translator hardcoded `k_32_32_32_32_FLOAT` (4 floats, stride 4),
|
||||
/// over-striding 2-float UI quads (`k_32_32_FLOAT`) → wrong/clipped
|
||||
/// positions (the next vertex's X bled into .w, giving negative W → the
|
||||
/// whole rectangle was clipped behind the camera).
|
||||
pub format: u8,
|
||||
/// Dword stride between consecutive vertices (dword2[0:7]).
|
||||
pub stride: u8,
|
||||
/// iterate-3T: dword offset of THIS attribute within the vertex stride
|
||||
/// (dword2[16:38] in canary's `VertexFetchInstruction`; the low 23 bits).
|
||||
/// A 6-dword vertex with position@0 + UV@2 + extra@3 needs this so the
|
||||
/// three vfetches sharing one fetch-constant read different attributes
|
||||
/// instead of all reading offset 0.
|
||||
pub offset: u32,
|
||||
/// `is_signed` = canary `fomat_comp_all`, word1 bit 12 (ucode.h:757) —
|
||||
/// selects signed vs unsigned interpretation of packed integer formats.
|
||||
/// (GPUBUG-113: was read from word1 bit 24, which is inside `exp_adjust`.)
|
||||
pub is_signed: bool,
|
||||
/// `is_normalized` = canary `num_format_all == 0`, word1 bit 13
|
||||
/// (ucode.h:758). Set bit ⇒ integer (un-normalized); clear ⇒ normalized.
|
||||
/// We store the normalized sense directly. (GPUBUG-113: was word1 bit 25.)
|
||||
pub is_normalized: bool,
|
||||
/// `is_mini_fetch` = canary word1 bit 30 (ucode.h:764). A mini-fetch reuses
|
||||
/// the address AND STRIDE of the preceding full vfetch of the same stream;
|
||||
/// its own `stride` field is 0. Required so a vfetch_mini color attribute
|
||||
/// indexes by the real vertex stride instead of its tight dword count.
|
||||
pub is_mini_fetch: bool,
|
||||
pub raw: [u32; 3],
|
||||
}
|
||||
|
||||
impl VertexFetch {
|
||||
/// Dword offset of this fetch's 2-dword constant within the fetch-constant
|
||||
/// register region (`CONST_BASE_FETCH`). Vertex fetch constants are packed
|
||||
/// 3 per 6-dword group: `const_index * 6 + const_index_sel * 2`
|
||||
/// (canary `ucode.h:700` `fetch_constant_index = const_index*3 + sel`,
|
||||
/// each constant 2 dwords).
|
||||
pub fn const_reg_offset(&self) -> u32 {
|
||||
self.fetch_const as u32 * 6 + self.const_index_sel as u32 * 2
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct TextureFetch {
|
||||
/// Texture fetch constant index (0..=31).
|
||||
@@ -54,23 +101,47 @@ pub mod op {
|
||||
}
|
||||
|
||||
pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction {
|
||||
// Fetch dword0 bitfields (Xenos `ucode.h:740-749` vfetch / `844-845`
|
||||
// tfetch): opcode_value:5, src_reg:6, src_reg_am:1, dst_reg:6,
|
||||
// dst_reg_am:1, (fetch_valid_only|must_be_one):1, const_index:5 @ bit20,
|
||||
// ... The prior decoder read `const_index` from bit 5 (which is actually
|
||||
// `src_reg`), so every fetch reported the wrong fetch-constant slot — the
|
||||
// logo `tfetch2D ..., tf0` was read as `tf1`, and slot 1's empty constant
|
||||
// failed to decode → no texture. The texture-fetch `dimension` lives in
|
||||
// dword2 bits 14..15, not dword1.
|
||||
let w0 = words[0];
|
||||
let w1 = words[1];
|
||||
let w2 = words[2];
|
||||
let opcode = (w0 & 0x1F) as u8;
|
||||
match opcode {
|
||||
op::VERTEX_FETCH => FetchInstruction::Vertex(VertexFetch {
|
||||
fetch_const: ((w0 >> 5) & 0x1F) as u8,
|
||||
src_register: ((w0 >> 17) & 0x7F) as u8,
|
||||
dest_register: ((w0 >> 10) & 0x7F) as u8,
|
||||
dest_write_mask: ((w1 >> 23) & 0xF) as u8,
|
||||
fetch_const: ((w0 >> 20) & 0x1F) as u8,
|
||||
const_index_sel: ((w0 >> 25) & 0x3) as u8,
|
||||
src_register: ((w0 >> 5) & 0x3F) as u8,
|
||||
dest_register: ((w0 >> 12) & 0x3F) as u8,
|
||||
dest_write_mask: (w1 & 0xF) as u8,
|
||||
// dword1[16:21] = VertexFormat. dword2: stride[0:7],
|
||||
// offset (in dwords) [8:?] — empirically the attribute offset of
|
||||
// the textured logo VS lands in dword2[8:15] (pos@4, UV@3,
|
||||
// 3-float@0 in a 6-dword vertex). signed/normalized live higher.
|
||||
format: ((w1 >> 16) & 0x3F) as u8,
|
||||
stride: (w2 & 0xFF) as u8,
|
||||
offset: (w2 >> 8) & 0xFF,
|
||||
// GPUBUG-113: canary ucode.h:757-758,764 — signed=fomat_comp_all
|
||||
// (w1 bit12), normalized=(num_format_all==0) (w1 bit13),
|
||||
// mini-fetch=(w1 bit30). The previous bit24/25 reads landed inside
|
||||
// `exp_adjust`, so signedness/normalization were effectively random.
|
||||
is_signed: ((w1 >> 12) & 1) != 0,
|
||||
is_normalized: ((w1 >> 13) & 1) == 0,
|
||||
is_mini_fetch: ((w1 >> 30) & 1) != 0,
|
||||
raw: words,
|
||||
}),
|
||||
op::TEXTURE_FETCH => FetchInstruction::Texture(TextureFetch {
|
||||
fetch_const: ((w0 >> 5) & 0x1F) as u8,
|
||||
src_register: ((w0 >> 17) & 0x7F) as u8,
|
||||
dest_register: ((w0 >> 10) & 0x7F) as u8,
|
||||
dest_write_mask: ((w1 >> 23) & 0xF) as u8,
|
||||
dimension: ((w1 >> 29) & 0x3) as u8,
|
||||
fetch_const: ((w0 >> 20) & 0x1F) as u8,
|
||||
src_register: ((w0 >> 5) & 0x3F) as u8,
|
||||
dest_register: ((w0 >> 12) & 0x3F) as u8,
|
||||
dest_write_mask: (w1 & 0xF) as u8,
|
||||
dimension: ((w2 >> 14) & 0x3) as u8,
|
||||
raw: words,
|
||||
}),
|
||||
_ => FetchInstruction::Unknown { opcode, raw: words },
|
||||
@@ -83,8 +154,9 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn decode_vertex_fetch() {
|
||||
// opcode=0 (vertex), fetch_const=5, src=2, dest=7.
|
||||
let w0 = 0u32 | (5 << 5) | (7 << 10) | (2 << 17);
|
||||
// opcode=0 (vertex). Xenos dword0: src_reg@bit5, dst_reg@bit12,
|
||||
// const_index@bit20. fetch_const=5, src=2, dest=7.
|
||||
let w0 = 0u32 | (2 << 5) | (7 << 12) | (5 << 20);
|
||||
let v = decode_fetch([w0, 0, 0]);
|
||||
match v {
|
||||
FetchInstruction::Vertex(vf) => {
|
||||
@@ -96,13 +168,69 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vertex_fetch_const_index_sel_and_reg_offset() {
|
||||
// iterate-3X (GPUBUG-110): the real publisher-logo vfetch (w0 =
|
||||
// 0x2DF82000) encodes const_index=31, const_index_sel=2. Its fetch
|
||||
// constant lives at dword offset `31*6 + 2*2 = 190` (reg 0x48BE), not
|
||||
// `31*6 = 186` (reg 0x48BA, which held the unused 0x1 slot). Dropping
|
||||
// the sel field made the logo geometry resolve as "no vertex buffer".
|
||||
let v = decode_fetch([0x2DF8_2000, 0, 0]);
|
||||
match v {
|
||||
FetchInstruction::Vertex(vf) => {
|
||||
assert_eq!(vf.fetch_const, 31, "const_index");
|
||||
assert_eq!(vf.const_index_sel, 2, "const_index_sel");
|
||||
assert_eq!(vf.const_reg_offset(), 190, "reg offset = 31*6 + 2*2");
|
||||
}
|
||||
other => panic!("expected Vertex, got {other:?}"),
|
||||
}
|
||||
// sel=0 collapses to the legacy `fetch_const*6` offset (back-compat).
|
||||
let v0 = decode_fetch([0u32 | (5 << 20), 0, 0]);
|
||||
if let FetchInstruction::Vertex(vf) = v0 {
|
||||
assert_eq!(vf.const_index_sel, 0);
|
||||
assert_eq!(vf.const_reg_offset(), 30);
|
||||
} else {
|
||||
panic!("expected Vertex");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vertex_fetch_signed_normalized_mini_bits() {
|
||||
// GPUBUG-113: canary ucode.h:757-758,764 — is_signed=fomat_comp_all
|
||||
// (w1 bit12), is_normalized=(num_format_all==0) (w1 bit13),
|
||||
// is_mini_fetch=(w1 bit30). Validate each bit independently.
|
||||
let mk = |w1: u32| match decode_fetch([0, w1, 0]) {
|
||||
FetchInstruction::Vertex(vf) => vf,
|
||||
_ => panic!("vertex"),
|
||||
};
|
||||
// No bits: unsigned, normalized, full fetch.
|
||||
let v = mk(0);
|
||||
assert!(!v.is_signed);
|
||||
assert!(v.is_normalized);
|
||||
assert!(!v.is_mini_fetch);
|
||||
// bit12 → signed.
|
||||
assert!(mk(1 << 12).is_signed);
|
||||
// bit13 (num_format_all=1) → NOT normalized.
|
||||
assert!(!mk(1 << 13).is_normalized);
|
||||
// bit30 → mini fetch.
|
||||
assert!(mk(1 << 30).is_mini_fetch);
|
||||
// The old (wrong) bits 24/25 must NOT affect signed/normalized.
|
||||
assert!(!mk(1 << 24).is_signed);
|
||||
assert!(mk(1 << 25).is_normalized);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn decode_texture_fetch() {
|
||||
let w0 = 1u32 | (3 << 5) | (4 << 10) | (1 << 17);
|
||||
let t = decode_fetch([w0, (2u32 << 29), 0]);
|
||||
// opcode=1 (texture). const_index@bit20=3, src@bit5=1, dst@bit12=4.
|
||||
// dimension lives in dword2 bits 14..15.
|
||||
let w0 = 1u32 | (1 << 5) | (4 << 12) | (3 << 20);
|
||||
let w2 = 2u32 << 14;
|
||||
let t = decode_fetch([w0, 0, w2]);
|
||||
match t {
|
||||
FetchInstruction::Texture(tf) => {
|
||||
assert_eq!(tf.fetch_const, 3);
|
||||
assert_eq!(tf.src_register, 1);
|
||||
assert_eq!(tf.dest_register, 4);
|
||||
assert_eq!(tf.dimension, 2);
|
||||
}
|
||||
other => panic!("expected Texture, got {other:?}"),
|
||||
|
||||
@@ -48,6 +48,9 @@ pub mod cf_kind {
|
||||
pub const COND_JMP: u32 = 6;
|
||||
pub const COND_CALL: u32 = 7;
|
||||
pub const RETURN: u32 = 8;
|
||||
/// Non-executing CF clause: `kNop` padding or `kMarkVsFetchDone` hint.
|
||||
/// The WGSL CF walker treats this as a no-op (advance, do not reject).
|
||||
pub const NOP: u32 = 9;
|
||||
pub const UNKNOWN: u32 = 15;
|
||||
}
|
||||
|
||||
@@ -136,6 +139,7 @@ fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) {
|
||||
}
|
||||
CondCall { target } => (cf_kind::COND_CALL, target, 0),
|
||||
Return => (cf_kind::RETURN, 0, 0),
|
||||
Nop | MarkVsFetchDone => (cf_kind::NOP, 0, 0),
|
||||
Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0),
|
||||
}
|
||||
}
|
||||
@@ -164,9 +168,11 @@ pub struct ParsedShader {
|
||||
}
|
||||
|
||||
/// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire
|
||||
/// microcode buffer (control flow + instructions). Heuristic: CF dword count
|
||||
/// is encoded in the first word's low 12 bits of the last exec clause —
|
||||
/// canary iterates until it hits a clause of kind `Exit`. We do the same.
|
||||
/// microcode buffer (control flow + instructions). The CF block is implicitly
|
||||
/// bounded: we walk clause-pair rows until one terminates the shader (an
|
||||
/// `Exec`/`CondExec` clause with the END bit set, per Xenos). Everything after
|
||||
/// that row is the instruction block; exec/loop addresses are then rebased to
|
||||
/// be relative to it.
|
||||
pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
|
||||
let mut cf = Vec::new();
|
||||
// CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's
|
||||
@@ -175,22 +181,50 @@ pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
|
||||
while i + 2 < raw_dwords.len() {
|
||||
let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]);
|
||||
let (first, second) = a;
|
||||
let seen_exit = matches!(
|
||||
first,
|
||||
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
||||
) || matches!(
|
||||
second,
|
||||
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
||||
);
|
||||
// The CF block ends after the clause that terminates the shader: an
|
||||
// `Exec` with the END bit set (Xenos `kExecEnd`/`kCondExec*End`), a
|
||||
// synthetic `Exit`, or an `Unknown` opcode (decode ran off the CF
|
||||
// block into instruction data — stop defensively). `Nop` padding
|
||||
// does NOT terminate. (Previously this stopped on the first `Exit`,
|
||||
// but with the corrected opcode table opcode 1 is `kExec`, not exit,
|
||||
// so real exec clauses kept the parse going as intended.)
|
||||
let terminates = |cf: &ControlFlowInstruction| {
|
||||
matches!(
|
||||
cf,
|
||||
ControlFlowInstruction::Exec { is_end: true, .. }
|
||||
| ControlFlowInstruction::Exit
|
||||
| ControlFlowInstruction::Unknown { .. }
|
||||
)
|
||||
};
|
||||
let seen_end = terminates(&first) || terminates(&second);
|
||||
cf.push(first);
|
||||
cf.push(second);
|
||||
i += 3;
|
||||
if seen_exit {
|
||||
if seen_end {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Everything after `i` dwords is the instruction block.
|
||||
let instructions = raw_dwords[i..].to_vec();
|
||||
// Xenos exec/loop `address` fields are absolute instruction-triple indices
|
||||
// counted from shader dword 0, but `instructions` here begins *after* the
|
||||
// CF block. Rebase those addresses to be relative to the instruction block
|
||||
// (subtract the CF triple count) so `address * 3` indexes `instructions`
|
||||
// directly. (Without this, every exec read 3 dwords too far per CF triple —
|
||||
// the publisher-logo `tfetch` triple was skipped → flat splash.)
|
||||
let cf_triples = (i / 3) as u32;
|
||||
for clause in cf.iter_mut() {
|
||||
match clause {
|
||||
ControlFlowInstruction::Exec { address, .. } => {
|
||||
*address = address.saturating_sub(cf_triples);
|
||||
}
|
||||
ControlFlowInstruction::LoopStart { address, .. }
|
||||
| ControlFlowInstruction::LoopEnd { address, .. } => {
|
||||
*address = address.saturating_sub(cf_triples);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
ParsedShader { cf, instructions }
|
||||
}
|
||||
|
||||
@@ -235,15 +269,19 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn trivial_exit_clause_stops_parsing() {
|
||||
// Two clauses: [NOP (kind=0), EXIT (kind=1)] encoded per canary.
|
||||
// Exit clause is opcode 1 in the top 4 bits of the upper 16 bits.
|
||||
let w0 = 0u32; // clause A body
|
||||
let w1 = (1u32 << 12) << 16; // upper 16 bits = 0x1000 → opcode=1 (EXIT) for clause A
|
||||
let w2 = 0u32;
|
||||
let p = parse_shader(&[w0, w1, w2, 0xDEAD_BEEF]);
|
||||
fn exec_end_clause_stops_parsing() {
|
||||
// Row: clause B = kExecEnd (opcode 2) terminates the CF block.
|
||||
// 48-bit payload of B occupies hi16(word1) + word2; opcode lives in
|
||||
// bits 44..47 of that payload. Put opcode 2 there: payload bit 44 set
|
||||
// for the `2` → (2 << 44). In B's framing, bits 16..47 come from
|
||||
// word2, so word2 bit (44-16)=28 region holds the opcode nibble.
|
||||
let b_payload: u64 = 2u64 << 44; // kExecEnd
|
||||
// B = lo16 from hi16(word1), hi from word2. Reconstruct word1/word2.
|
||||
let word1 = ((b_payload & 0xFFFF) as u32) << 16; // B's low 16 bits → hi16(word1)
|
||||
let word2 = ((b_payload >> 16) & 0xFFFF_FFFF) as u32;
|
||||
let p = parse_shader(&[0, word1, word2, 0xDEAD_BEEF]);
|
||||
assert!(!p.cf.is_empty());
|
||||
// Exit detected → remaining dword is instruction data.
|
||||
// ExecEnd detected in the first row → remaining dword is instruction data.
|
||||
assert_eq!(p.instructions, vec![0xDEAD_BEEF]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@ xenia-cpu = { workspace = true }
|
||||
xenia-vfs = { workspace = true }
|
||||
xenia-hid = { workspace = true }
|
||||
xenia-gpu = { workspace = true }
|
||||
xenia-apu = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
metrics = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
|
||||
@@ -182,7 +182,7 @@ pub fn register_exports(state: &mut KernelState) {
|
||||
state.register_export(Xboxkrnl, 0x01F7, "XAudioGetVoiceCategoryVolumeChangeMask", stub_return_zero);
|
||||
state.register_export(Xboxkrnl, 0x01F8, "XAudioGetVoiceCategoryVolume", stub_success);
|
||||
state.register_export(Xboxkrnl, 0x0224, "XMACreateContext", xma_create_context);
|
||||
state.register_export(Xboxkrnl, 0x0226, "XMAReleaseContext", stub_success);
|
||||
state.register_export(Xboxkrnl, 0x0226, "XMAReleaseContext", xma_release_context);
|
||||
|
||||
// Crypto
|
||||
state.register_export(Xboxkrnl, 0x0192, "XeCryptSha", stub_success);
|
||||
@@ -486,12 +486,20 @@ fn ke_query_performance_frequency(ctx: &mut PpcContext, _mem: &GuestMemory, _sta
|
||||
ctx.gpr[3] = 50_000_000; // 50 MHz
|
||||
}
|
||||
|
||||
fn ke_query_system_time(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
|
||||
fn ke_query_system_time(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
let time_ptr = ctx.gpr[3] as u32;
|
||||
if time_ptr != 0 {
|
||||
let fake_time: u64 = 132_500_000_000_000_000; // ~2021 FILETIME
|
||||
mem.write_u32(time_ptr, (fake_time >> 32) as u32);
|
||||
mem.write_u32(time_ptr + 4, fake_time as u32);
|
||||
// ITERATE-2J — advance with the same deterministic clock the
|
||||
// KeTimeStampBundle uses (1 global_clock unit ≈ 100 ns) so a guest
|
||||
// that polls KeQuerySystemTime for elapsed time also sees forward
|
||||
// progress instead of a frozen constant. FILETIME base (~2021) +
|
||||
// 100-ns-unit clock.
|
||||
const FILETIME_BASE: u64 = 132_500_000_000_000_000;
|
||||
let hw_id = state.scheduler.current_hw_id().unwrap_or(0);
|
||||
let now = state.now_basis_at(hw_id);
|
||||
let system_time = FILETIME_BASE.wrapping_add(now);
|
||||
mem.write_u32(time_ptr, (system_time >> 32) as u32);
|
||||
mem.write_u32(time_ptr + 4, system_time as u32);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1644,6 +1652,79 @@ fn nt_set_information_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut
|
||||
return;
|
||||
}
|
||||
|
||||
// XFileRenameInformation (10): move the backing file to a new path.
|
||||
// Sylpheed's asset-cache decompresses each packed resource to a staging
|
||||
// `cache:\<hash><tail>.tmp` then renames it into its final nested path
|
||||
// `cache:\<hash>\<dir>\<file>`. Without an actual host-FS rename the
|
||||
// nested target stays empty, the later read-back of the decompressed
|
||||
// asset (e.g. the title logo texture `\69d8e45c\e\534ffea`) misses, and
|
||||
// the logo never loads. Mirror canary `xboxkrnl_io_info.cc:226`
|
||||
// (`X_FILE_RENAME_INFORMATION{ replace_existing@0, root_dir_handle@4,
|
||||
// ansi_string@8 }` → `file->Rename(TranslateAnsiPath(ansi_string))`).
|
||||
if info_class == 10 {
|
||||
// Read the target path from the embedded ANSI_STRING at info_ptr+8.
|
||||
let target_raw = match crate::path::read_ansi_string(mem, info_ptr + 8) {
|
||||
Some(s) if !s.is_empty() => s,
|
||||
_ => {
|
||||
const STATUS_OBJECT_NAME_INVALID: u64 = 0xC000_0033;
|
||||
ctx.gpr[3] = STATUS_OBJECT_NAME_INVALID;
|
||||
return;
|
||||
}
|
||||
};
|
||||
// Resolve the destination against the host cache backing dir. We only
|
||||
// support renames within the writable `cache:` mount (the only place
|
||||
// a guest can create files); disc/synth entries are read-only.
|
||||
let new_host = state.resolve_cache_path(&target_raw);
|
||||
// Current backing host path of the handle.
|
||||
let old_host = match state.objects.get(&handle) {
|
||||
Some(KernelObject::File { host_path: Some(hp), .. }) => Some(hp.clone()),
|
||||
Some(KernelObject::File { .. }) => None,
|
||||
_ => {
|
||||
ctx.gpr[3] = STATUS_INVALID_HANDLE;
|
||||
return;
|
||||
}
|
||||
};
|
||||
let status: u64 = match (old_host, new_host) {
|
||||
(Some(old), Some(new)) => {
|
||||
if let Some(parent) = new.parent() {
|
||||
let _ = std::fs::create_dir_all(parent);
|
||||
}
|
||||
match std::fs::rename(&old, &new) {
|
||||
Ok(()) => {
|
||||
// Update the handle so subsequent I/O targets the new
|
||||
// host path + guest path.
|
||||
if let Some(KernelObject::File { path, host_path, .. }) =
|
||||
state.objects.get_mut(&handle)
|
||||
{
|
||||
*path = crate::path::normalize_path(&target_raw);
|
||||
*host_path = Some(new.clone());
|
||||
}
|
||||
tracing::info!(
|
||||
"NtSetInformationFile rename cache {:?} -> {:?} ({:?})",
|
||||
old, new, target_raw
|
||||
);
|
||||
STATUS_SUCCESS
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"NtSetInformationFile rename {:?} -> {:?} failed: {}",
|
||||
old, new, e
|
||||
);
|
||||
STATUS_UNSUCCESSFUL
|
||||
}
|
||||
}
|
||||
}
|
||||
// Non-cache (read-only VFS) source/target: acknowledge without a
|
||||
// host move, matching the prior permissive behaviour.
|
||||
_ => STATUS_SUCCESS,
|
||||
};
|
||||
if iosb_ptr != 0 {
|
||||
write_io_status_block(mem, iosb_ptr, status as u32, info_length);
|
||||
}
|
||||
ctx.gpr[3] = status;
|
||||
return;
|
||||
}
|
||||
|
||||
// Handle lookup.
|
||||
let Some(KernelObject::File { size, position, host_path, .. }) = state.objects.get_mut(&handle) else {
|
||||
ctx.gpr[3] = STATUS_INVALID_HANDLE;
|
||||
@@ -2875,10 +2956,12 @@ fn vd_initialize_ring_buffer(ctx: &mut PpcContext, _mem: &GuestMemory, state: &m
|
||||
// packets directly into ring memory at the current WPTR (the GPU
|
||||
// backend lives on a worker thread under `--gpu-thread` so we can't
|
||||
// read its `ring.base` from the kernel side without a channel hop).
|
||||
// Per canary: size_log2 is log2(size in BYTES), so size in dwords =
|
||||
// 2^size_log2 / 4 = 1 << (size_log2 - 2).
|
||||
// Per canary `CommandProcessor::InitializeRingBuffer`: the ring is
|
||||
// `1 << (size_log2 + 3)` bytes = `1 << (size_log2 + 1)` dwords (`r4` is
|
||||
// log2 of the size in quadwords). Kept in sync with
|
||||
// `GpuSystem::initialize_ring_buffer`. (Currently bookkeeping-only.)
|
||||
state.ring_base = ptr;
|
||||
state.ring_size_dwords = if size_log2 >= 2 { 1u32 << (size_log2 - 2) } else { 0 };
|
||||
state.ring_size_dwords = 1u32 << (size_log2 + 1);
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
@@ -2989,52 +3072,86 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
// xboxkrnl_video.cc:479. Currently skipped (see below).
|
||||
let _ = fetch_dwords; // silence unused — will be live again under the deferred path
|
||||
|
||||
// The original M2b path zero-filled buffer_ptr (in the system command
|
||||
// buffer) and bumped WPTR by 64 to expose the game's own ring writes.
|
||||
// Keep that untouched — the game still expects buffer_ptr to be a
|
||||
// skippable scratch area, and the bump still exposes any game-batched
|
||||
// PM4 packets for the drain.
|
||||
// iterate-2V: mirror xenia-canary `VdSwap_entry` (xboxkrnl_video.cc:518-548)
|
||||
// FAITHFULLY. The game reserves 64 dwords (256 bytes) in the primary ring
|
||||
// at `buffer_ptr`; canary writes a `PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0)`
|
||||
// fetch-constant patch followed by `PM4_TYPE3(PM4_XE_SWAP)`, then pads with
|
||||
// NOPs — and **NEVER touches `CP_RB_WPTR`**. The game advances the primary
|
||||
// ring write-pointer itself via its own doorbell once it has finished
|
||||
// populating the reserved slot, so VdSwap only fills the bytes.
|
||||
//
|
||||
// iterate-2V FIX (the bug this removes): a prior revision bumped the
|
||||
// primary ring `CP_RB_WPTR` out-of-band here (`extend_write_ptr_by(64)`).
|
||||
// But `buffer_ptr` (~0x4add6efc) is NOT inside the primary ring (base
|
||||
// ~0x4adcd000, 8192 dwords) — it lives ~10k dwords past it, in the
|
||||
// renderer indirect-buffer region. The bogus WPTR bump pushed the GPU
|
||||
// read-pointer PAST the guest's real write-pointer, the drain treated the
|
||||
// overshoot as a circular wrap, and **re-executed the splash's draw
|
||||
// indirect-buffers ~2×** — inflating draws to 78 (real splash ≈ 28; 12
|
||||
// INDIRECT_BUFFERs vs the real 6). Canary's `VdSwap_entry` writes the
|
||||
// block and returns; the swap-complete CP interrupt comes only from the
|
||||
// game's own in-stream `PM4_INTERRUPT` packets, never from VdSwap.
|
||||
if buffer_ptr != 0 {
|
||||
for i in 0..64u32 {
|
||||
mem.write_u32(buffer_ptr + i * 4, xenia_gpu::pm4::make_packet_type2());
|
||||
let mut off = 0u32;
|
||||
let mut put = |i: &mut u32, v: u32| {
|
||||
mem.write_u32(buffer_ptr + *i * 4, v);
|
||||
*i += 1;
|
||||
};
|
||||
// PM4_TYPE0 fetch-constant slot-0 patch (6 dwords payload). The
|
||||
// base_address field is patched to the physical frontbuffer so the
|
||||
// bloom/blur "sample frame N for frame N+1" path reads the right page.
|
||||
let mut patched = fetch_dwords;
|
||||
patched[1] = (patched[1] & 0x0000_0FFF) | ((frontbuffer_addr >> 12) << 12);
|
||||
put(
|
||||
&mut off,
|
||||
xenia_gpu::pm4::make_packet_type0(
|
||||
xenia_gpu::gpu_system::CONST_BASE_FETCH as u16,
|
||||
6,
|
||||
),
|
||||
);
|
||||
for d in patched {
|
||||
put(&mut off, d);
|
||||
}
|
||||
// PM4_TYPE3(PM4_XE_SWAP, 4 dwords): signature, frontbuffer_phys, w, h.
|
||||
put(
|
||||
&mut off,
|
||||
xenia_gpu::pm4::make_packet_type3(xenia_gpu::pm4::PM4_XE_SWAP, 4),
|
||||
);
|
||||
put(&mut off, xenia_gpu::pm4::SWAP_SIGNATURE);
|
||||
put(&mut off, frontbuffer_addr);
|
||||
put(&mut off, width);
|
||||
put(&mut off, height);
|
||||
// Pad the remainder with NOP (Type-2) packets.
|
||||
while off < 64 {
|
||||
put(&mut off, xenia_gpu::pm4::make_packet_type2());
|
||||
}
|
||||
}
|
||||
state.gpu.extend_write_ptr_by(64);
|
||||
// NOTE: We deliberately do NOT bump `CP_RB_WPTR` here (see the iterate-2V
|
||||
// comment above). The drain below consumes only the packets the game has
|
||||
// legitimately advanced the write-pointer over.
|
||||
|
||||
// GPUBUG-DRAIN-001: notify the swap directly.
|
||||
//
|
||||
// Per xenia-canary `VdSwap_entry` (xboxkrnl_video.cc:438-521), the
|
||||
// textbook approach is to inject `PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0)`
|
||||
// (fetch-constant slot-0 patch for the Sylpheed bloom/blur "frame N+1"
|
||||
// sample) followed by `PM4_TYPE3(PM4_XE_SWAP)` directly into the
|
||||
// primary ring at WPTR, then let the natural drain consume them.
|
||||
//
|
||||
// That works in **pure lockstep** (drain runs at every kernel callback
|
||||
// boundary, ring has at most a few hundred packets pending). It
|
||||
// **does not** work under `--parallel` (CPU + GPU ring contention) —
|
||||
// observed empirically: vd_swap's `drain_to_current_wptr` consumes
|
||||
// 8-10 million game-batched IB packets in the 900 ms inline-deadline
|
||||
// window without reaching our tail-injected PM4_XE_SWAP. Under
|
||||
// threaded backend the worker has the same deadline. Either:
|
||||
// (a) the safety-net direct notify (below) fires and gets the swap
|
||||
// counted — but if the worker *eventually* drains past our
|
||||
// injected packet later it would double-count,
|
||||
// (b) we extend the deadline so far that vd_swap blocks for many
|
||||
// seconds — unreasonable for a kernel callback.
|
||||
//
|
||||
// Skip the ring injection unconditionally and post `notify_xe_swap`
|
||||
// directly. The drain still runs (game packets execute as normal).
|
||||
// **Trade-off**: the slot-0 fetch-constant patch is deferred —
|
||||
// tracked as GPUBUG-FETCH-PATCH-001. Sylpheed currently has draws=0,
|
||||
// so a stale slot 0 has no observable effect.
|
||||
// Drain the ring up to whatever the game has actually submitted; any
|
||||
// in-stream `PM4_INTERRUPT` / draw packets execute in order. The
|
||||
// reserved-slot PM4_XE_SWAP is consumed by the GPU only once the game
|
||||
// advances its own doorbell over it. The swap-counter safety net below
|
||||
// keeps host swap bookkeeping live in the meantime.
|
||||
let drained = state.gpu.drain_to_current_wptr(mem);
|
||||
tracing::debug!(drained, "VdSwap: drained PM4 packets");
|
||||
|
||||
// Direct swap notification. Inline mode bumps `swaps_seen`
|
||||
// synchronously; threaded mode posts a `GpuCommand::NotifyXeSwap`
|
||||
// and the worker bumps it asynchronously.
|
||||
// Safety net: if the drain did NOT reach our PM4_XE_SWAP this call (e.g.
|
||||
// an undersized inline deadline left game-batched packets pending), still
|
||||
// bump the host swap counter so the UI present + swap stats stay live.
|
||||
// Skip when the in-stream PM4_XE_SWAP already recorded this frontbuffer
|
||||
// (avoids double-counting). This path does NOT raise a CP interrupt.
|
||||
if frontbuffer_addr != 0 && width > 0 && height > 0 {
|
||||
state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
|
||||
let already_swapped = state
|
||||
.gpu
|
||||
.as_inline_mut()
|
||||
.map(|g| g.last_swap.map(|s| s.frontbuffer_phys) == Some(frontbuffer_addr))
|
||||
.unwrap_or(false);
|
||||
if !already_swapped {
|
||||
state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
|
||||
}
|
||||
}
|
||||
|
||||
// The remaining vd_swap work (UI publish: shader blobs, constants,
|
||||
@@ -3072,27 +3189,34 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
);
|
||||
ui.publish_assets(blobs, constants);
|
||||
|
||||
// P5: try to decode the primary texture (fetch constant slot 0).
|
||||
// Slot 0 is the convention most games use for their main bound
|
||||
// texture at draw time; full N-slot binding waits for P6+. If the
|
||||
// slot is unset or the format isn't supported (magenta stub kicks
|
||||
// in host-side), we skip.
|
||||
//
|
||||
// Texture fetch constants live at `CONST_BASE_FETCH + slot*6` in
|
||||
// the register file; we read the 6 dwords, decode the key, hit
|
||||
// the CPU cache (with page-version freshness), and clone the
|
||||
// decoded bytes across the bridge.
|
||||
const TEX_SLOT: u32 = 0;
|
||||
let mut fetch6 = [0u32; 6];
|
||||
for (i, slot) in fetch6.iter_mut().enumerate() {
|
||||
*slot = gpu_inline
|
||||
.register_file
|
||||
.read(xenia_gpu::gpu_system::CONST_BASE_FETCH + TEX_SLOT * 6 + i as u32);
|
||||
}
|
||||
let published = if let Some(key) = xenia_gpu::texture_cache::decode_fetch_constant(fetch6)
|
||||
{
|
||||
// Span over the entire tiled texture footprint to pick the
|
||||
// max page version covering it.
|
||||
// P5b: publish the texture the last draw's *active pixel shader*
|
||||
// actually sampled. The GPU draw handler decodes the PS's real
|
||||
// `tfetch` fetch-constant slots into `last_draw_textures`; we publish
|
||||
// the first (the UI binds a single texture today). When the last draw
|
||||
// used a flat (no-tfetch) shader the list is empty, so we fall back to
|
||||
// the legacy slot-0 probe to preserve behavior on flat-only frames.
|
||||
// The legacy single-texture `publish_texture` bridge wants
|
||||
// `(TextureKey, bytes)`; `last_draw_textures` now also carries the
|
||||
// content version (for the per-draw host-cache re-upload). Drop it here.
|
||||
let published = gpu_inline
|
||||
.last_draw_textures
|
||||
.first()
|
||||
.map(|(k, _v, b)| (*k, b.clone()))
|
||||
.or_else(|| {
|
||||
// Fallback: probe fetch constant slot 0 directly. Texture fetch
|
||||
// constants live at `CONST_BASE_FETCH + slot*6` in the register
|
||||
// file; read 6 dwords, decode the key, hit the CPU cache with
|
||||
// page-version freshness, clone the bytes across the bridge.
|
||||
const TEX_SLOT: u32 = 0;
|
||||
let mut fetch6 = [0u32; 6];
|
||||
for (i, slot) in fetch6.iter_mut().enumerate() {
|
||||
*slot = gpu_inline
|
||||
.register_file
|
||||
.read(xenia_gpu::gpu_system::CONST_BASE_FETCH + TEX_SLOT * 6 + i as u32);
|
||||
}
|
||||
let key = xenia_gpu::texture_cache::decode_fetch_constant(fetch6)?;
|
||||
// Span over the entire tiled texture footprint to pick the max
|
||||
// page version covering it.
|
||||
let bi = key.format.block_info();
|
||||
let span_bytes = (key.pitch_texels as u32)
|
||||
* (key.height as u32)
|
||||
@@ -3110,12 +3234,20 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
});
|
||||
metrics::gauge!("gpu.texture_cache.entries")
|
||||
.set(gpu_inline.texture_cache.len() as f64);
|
||||
ui.publish_texture(published);
|
||||
|
||||
// iterate-3O: publish this frame's captured per-draw geometry and
|
||||
// reset the accumulator for the next frame. The UI replays these as
|
||||
// real guest draws (real vertices + prim type) instead of synthetic
|
||||
// placeholder shapes. `frame_captures` is `Some` only under `--ui`.
|
||||
if let Some(caps) = gpu_inline.frame_captures.as_mut() {
|
||||
let drained = std::mem::take(caps);
|
||||
metrics::counter!("gpu.geometry.published").increment(drained.len() as u64);
|
||||
ui.publish_geometry(drained);
|
||||
}
|
||||
}
|
||||
// Notify the UI.
|
||||
if let Some(ui) = state.ui.clone() {
|
||||
@@ -3161,13 +3293,18 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
// safer to cap the read at the known total size to avoid OOB.
|
||||
let mut tiled = Vec::with_capacity(total_tiled_bytes);
|
||||
let mut ok = true;
|
||||
// The frontbuffer is a guest *physical* address; project onto the
|
||||
// committed backing window (see `xenia_gpu::physical_to_backing`)
|
||||
// so the present reads the pixels the GPU resolved, not a stale /
|
||||
// zero mirror page.
|
||||
let fb_backing = xenia_gpu::physical_to_backing(swap.frontbuffer_phys);
|
||||
for i in 0..total_tiled_bytes {
|
||||
// read_u8 is cheap — the VirtualMemory handler returns 0
|
||||
// for unmapped pages so we get a recognisable dark frame
|
||||
// rather than a crash if the address turned out bogus.
|
||||
let addr = swap.frontbuffer_phys.wrapping_add(i as u32);
|
||||
let addr = fb_backing.wrapping_add(i as u32);
|
||||
tiled.push(mem.read_u8(addr));
|
||||
if addr < swap.frontbuffer_phys {
|
||||
if addr < fb_backing {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
@@ -3261,6 +3398,7 @@ fn xaudio_register_render_driver(ctx: &mut PpcContext, mem: &GuestMemory, state:
|
||||
callback_pc,
|
||||
callback_arg,
|
||||
wrapped_callback_arg: wrapped,
|
||||
submitted_frames: 0,
|
||||
};
|
||||
let Some(index) = state.xaudio.register(client) else {
|
||||
tracing::warn!("XAudioRegisterRenderDriverClient: client table full");
|
||||
@@ -3369,18 +3507,75 @@ fn xaudio_unregister_render_driver(ctx: &mut PpcContext, _mem: &GuestMemory, sta
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
/// Mirrors canary `XAudioSubmitRenderDriverFrame_entry` →
|
||||
/// `AudioSystem::SubmitFrame(driver_ptr & 0xFFFF, samples)`:
|
||||
/// the guest render-driver mixer (`sub_824DC350`) calls this once per audio
|
||||
/// frame with `r3 = driver_id` (`0x4155_xxxx`) and `r4 = sample buffer`.
|
||||
/// Canary forwards `samples` to the client's `AudioDriver`; the driver's
|
||||
/// playback-completion callback later releases the client semaphore, which is
|
||||
/// the buffer-consumed pacing our XAudio callback ticker
|
||||
/// (`tick_instr` + `try_inject_audio_callback`) already drives. SubmitFrame
|
||||
/// returns void and the caller discards r3 / reads no field SubmitFrame
|
||||
/// writes, so faithfully we validate the client index and account the frame
|
||||
/// (observational; never read back by the guest). Always returns
|
||||
/// `X_ERROR_SUCCESS`, matching canary. Deterministic: only this guest-driven
|
||||
/// export mutates state; no wall-clock, no host thread.
|
||||
fn xaudio_submit_render_driver_frame(
|
||||
ctx: &mut PpcContext,
|
||||
_mem: &GuestMemory,
|
||||
_state: &mut KernelState,
|
||||
state: &mut KernelState,
|
||||
) {
|
||||
let driver_id = ctx.gpr[3] as u32;
|
||||
let index = (driver_id & XAUDIO_DRIVER_INDEX_MASK) as usize;
|
||||
let registered = state.xaudio.record_submit(index);
|
||||
if !registered {
|
||||
// Canary logs and submits silence to keep the callback chain alive
|
||||
// for an unregistered/invalid index; our ticker keeps the chain
|
||||
// alive independently, so a debug log suffices.
|
||||
tracing::debug!(
|
||||
driver_id = format_args!("{driver_id:#010x}"),
|
||||
index,
|
||||
"XAudioSubmitRenderDriverFrame: unregistered client index"
|
||||
);
|
||||
} else if state.xaudio.submitted_frames(index) == 1 {
|
||||
tracing::info!(
|
||||
driver_id = format_args!("{driver_id:#010x}"),
|
||||
index,
|
||||
"XAudioSubmitRenderDriverFrame: first frame submitted by guest mixer"
|
||||
);
|
||||
}
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
fn xma_create_context(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
|
||||
let handle = state.alloc_handle();
|
||||
tracing::info!("XMACreateContext: handle={:#x}", handle);
|
||||
ctx.gpr[3] = handle as u64;
|
||||
/// Mirrors xenia-canary `XMACreateContext_entry(lpdword_t context_out_ptr)`:
|
||||
/// allocate a context from the register-mapped array, write its guest pointer
|
||||
/// to `*context_out_ptr`, and return `X_STATUS_SUCCESS` (or `X_STATUS_NO_MEMORY`
|
||||
/// when the 320-slot array is exhausted).
|
||||
fn xma_create_context(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
let out_ptr = ctx.gpr[3] as u32;
|
||||
let context_ptr = state.xma.lock().unwrap().allocate_context();
|
||||
if out_ptr != 0 {
|
||||
mem.write_u32(out_ptr, context_ptr);
|
||||
}
|
||||
tracing::info!(
|
||||
out_ptr = format_args!("{out_ptr:#010x}"),
|
||||
context_ptr = format_args!("{context_ptr:#010x}"),
|
||||
"XMACreateContext"
|
||||
);
|
||||
ctx.gpr[3] = if context_ptr == 0 {
|
||||
0xC000_0017 // X_STATUS_NO_MEMORY
|
||||
} else {
|
||||
0 // X_STATUS_SUCCESS
|
||||
};
|
||||
}
|
||||
|
||||
/// Mirrors xenia-canary `XMAReleaseContext_entry(lpvoid_t context_ptr)`:
|
||||
/// free the context slot and return 0.
|
||||
fn xma_release_context(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
|
||||
let context_ptr = ctx.gpr[3] as u32;
|
||||
state.xma.lock().unwrap().release_context(context_ptr);
|
||||
tracing::info!(context_ptr = format_args!("{context_ptr:#010x}"), "XMAReleaseContext");
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
// ===== Xex =====
|
||||
@@ -4276,7 +4471,8 @@ fn nt_yield_execution(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut Ker
|
||||
}
|
||||
|
||||
fn ke_resume_thread(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
|
||||
let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
|
||||
let raw = ctx.gpr[3] as u32;
|
||||
let handle = resolve_pseudo_handle(state, raw);
|
||||
match state.scheduler.find_by_handle(handle) {
|
||||
Some(r) => {
|
||||
state.scheduler.resume_ref(r);
|
||||
@@ -4292,13 +4488,18 @@ fn nt_resume_thread(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelS
|
||||
// r3 = handle, r4 = prev_suspend_count_ptr
|
||||
let handle = ctx.gpr[3] as u32;
|
||||
let prev_ptr = ctx.gpr[4] as u32;
|
||||
let prev = state
|
||||
.scheduler
|
||||
.find_by_handle(handle)
|
||||
.map(|r| state.scheduler.resume_ref(r))
|
||||
.unwrap_or(0);
|
||||
if prev_ptr != 0 {
|
||||
mem.write_u32(prev_ptr, prev);
|
||||
match state.scheduler.find_by_handle(handle) {
|
||||
Some(r) => {
|
||||
let prev = state.scheduler.resume_ref(r);
|
||||
if prev_ptr != 0 {
|
||||
mem.write_u32(prev_ptr, prev);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
if prev_ptr != 0 {
|
||||
mem.write_u32(prev_ptr, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
ctx.gpr[3] = STATUS_SUCCESS;
|
||||
}
|
||||
@@ -5534,6 +5735,67 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// `NtSetInformationFile` class 10 (`XFileRenameInformation`) must move
|
||||
/// the backing host file to the new `cache:` path and update the handle.
|
||||
/// Mirrors Sylpheed's asset-cache `.tmp` → `\<hash>\<dir>\<file>` move;
|
||||
/// without it the nested target stays empty and the decompressed asset
|
||||
/// (logo texture) never reads back. Faithful to canary `file->Rename`.
|
||||
#[test]
|
||||
fn nt_set_information_file_rename_moves_cache_file() {
|
||||
let (mut ctx, mut mem, mut state) = fresh();
|
||||
// Real temp cache root + a staging `.tmp` file with known bytes.
|
||||
let root = std::env::temp_dir().join(format!("xenia-rs-rename-test-{}", std::process::id()));
|
||||
let _ = std::fs::remove_dir_all(&root);
|
||||
std::fs::create_dir_all(&root).unwrap();
|
||||
let old_host = root.join("69d8e45ce534ffea.tmp");
|
||||
std::fs::write(&old_host, b"LOGOTEX!").unwrap();
|
||||
state.cache_root = Some(root.clone());
|
||||
// Open handle whose backing host_path is the staging file.
|
||||
let handle = state.alloc_handle_for(KernelObject::File {
|
||||
path: "69d8e45ce534ffea.tmp".to_string(),
|
||||
size: 8,
|
||||
position: 0,
|
||||
data: Arc::new(Vec::new()),
|
||||
dir_enum_pos: None,
|
||||
host_path: Some(old_host.clone()),
|
||||
});
|
||||
// X_FILE_RENAME_INFORMATION { replace@0, root_dir@4, ANSI_STRING@8 }.
|
||||
// ANSI_STRING { len u16, max u16, buf u32 } at info_ptr+8; buffer holds
|
||||
// the target path "cache:\69d8e45c\e\534ffea".
|
||||
let info_ptr = SCRATCH_BASE + 0x100;
|
||||
let str_buf = SCRATCH_BASE + 0x200;
|
||||
let target = b"cache:\\69d8e45c\\e\\534ffea";
|
||||
for (i, b) in target.iter().enumerate() {
|
||||
mem.write_u8(str_buf + i as u32, *b);
|
||||
}
|
||||
mem.write_u32(info_ptr, 0); // replace_existing
|
||||
mem.write_u32(info_ptr + 4, 0); // root_dir_handle
|
||||
mem.write_u16(info_ptr + 8, target.len() as u16); // ANSI_STRING.Length
|
||||
mem.write_u16(info_ptr + 10, target.len() as u16); // MaximumLength
|
||||
mem.write_u32(info_ptr + 12, str_buf); // Buffer
|
||||
let iosb_ptr = SCRATCH_BASE + 0x140;
|
||||
ctx.gpr[3] = handle as u64;
|
||||
ctx.gpr[4] = iosb_ptr as u64;
|
||||
ctx.gpr[5] = info_ptr as u64;
|
||||
ctx.gpr[6] = 16;
|
||||
ctx.gpr[7] = 10; // XFileRenameInformation
|
||||
nt_set_information_file(&mut ctx, &mut mem, &mut state);
|
||||
assert_eq!(ctx.gpr[3], STATUS_SUCCESS);
|
||||
// Staging file gone; nested target exists with the same bytes.
|
||||
let new_host = root.join("69d8e45c").join("e").join("534ffea");
|
||||
assert!(!old_host.exists(), "staging .tmp should be moved away");
|
||||
assert_eq!(std::fs::read(&new_host).unwrap(), b"LOGOTEX!");
|
||||
// Handle now points at the new host + guest path.
|
||||
match state.objects.get(&handle) {
|
||||
Some(KernelObject::File { host_path: Some(hp), path, .. }) => {
|
||||
assert_eq!(hp, &new_host);
|
||||
assert_eq!(path, "cache:/69d8e45c/e/534ffea");
|
||||
}
|
||||
_ => panic!("file handle lost or host_path missing"),
|
||||
}
|
||||
let _ = std::fs::remove_dir_all(&root);
|
||||
}
|
||||
|
||||
/// Read-only VFS — truncating to a different size must fail with
|
||||
/// `STATUS_UNSUCCESSFUL`, matching Canary's error path when
|
||||
/// `file->SetLength(...)` can't honour the request.
|
||||
|
||||
@@ -30,6 +30,12 @@ use xenia_cpu::ThreadRef;
|
||||
pub const INTERRUPT_SOURCE_VSYNC: u32 = 0;
|
||||
pub const INTERRUPT_SOURCE_CP: u32 = 1;
|
||||
|
||||
/// The processor the graphics ISR impersonates for a v-sync interrupt.
|
||||
/// Canary hard-codes this: `MarkVblank` → `DispatchInterruptCallback(0, 2)`
|
||||
/// (graphics_system.cc:478). CP interrupts instead use the bit index of the
|
||||
/// `PM4_INTERRUPT` `cpu_mask`.
|
||||
pub const VSYNC_TARGET_CPU: u8 = 2;
|
||||
|
||||
/// Guest-registered V-sync / graphics-interrupt callback (from
|
||||
/// `VdSetGraphicsInterruptCallback`).
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
@@ -145,9 +151,16 @@ pub type PendingLocalIrq = [std::sync::atomic::AtomicU8;
|
||||
pub struct InterruptState {
|
||||
/// Registered callback (set by `VdSetGraphicsInterruptCallback`).
|
||||
pub callback: Option<GraphicsInterruptCallback>,
|
||||
/// Bounded FIFO of pending interrupt sources awaiting injection.
|
||||
/// Push-back on queue, pop-front on inject. Over-cap pushes drop.
|
||||
pub pending: VecDeque<u32>,
|
||||
/// Bounded FIFO of pending interrupts awaiting injection, as
|
||||
/// `(source, target_cpu)`. Push-back on queue, pop-front on inject.
|
||||
/// Over-cap pushes drop. `target_cpu` is the processor the graphics
|
||||
/// ISR must impersonate (canary `XThread::SetActiveCpu` / the
|
||||
/// `DispatchInterruptCallback(source, cpu)` argument): the bit index
|
||||
/// of the CP `PM4_INTERRUPT` `cpu_mask` for source=1, and a fixed `2`
|
||||
/// for vsync (canary `DispatchInterruptCallback(0, 2)`). The ISR reads
|
||||
/// it from the PCR (`[r13+268]`) to clear the matching per-CPU bit of
|
||||
/// the swap-acknowledge fence.
|
||||
pub pending: VecDeque<(u32, u8)>,
|
||||
/// When `Some`, some HW thread is currently running a callback; on
|
||||
/// return-to-sentinel we restore this and clear the flag.
|
||||
pub saved: Option<SavedCallbackCtx>,
|
||||
@@ -170,6 +183,28 @@ pub struct InterruptState {
|
||||
/// ticker. `tick_vsync_instr` diffs against this to advance
|
||||
/// `vsync_accumulator`.
|
||||
pub last_instr_count: u64,
|
||||
/// **iterate-3AJ — present-anchored vsync.** Set `true` once the guest
|
||||
/// has presented at least one frame (a `VdSwap`). Before this, the
|
||||
/// vsync ticker uses the legacy fixed instruction-quantum cadence so
|
||||
/// the boot present-loop bootstrap (iterate-2W) still gets the vsyncs
|
||||
/// it needs *before* the first present. After this, vsync is anchored
|
||||
/// to the guest's real present rate (≈1 vblank per present, as on real
|
||||
/// hardware where the title double-buffers at vblank), with only a
|
||||
/// small capped instruction-quantum *fallback* for frames where the
|
||||
/// guest genuinely stops presenting (heavy asset load). This stops the
|
||||
/// proxy from firing ~66 vsyncs during one heavy load frame, which
|
||||
/// collapsed the splash-logo intro fade-in (the guest's vsync counter
|
||||
/// jumped 0→66 in one frame instead of ramping smoothly).
|
||||
pub vsync_present_anchored: bool,
|
||||
/// Last observed guest present (`VdSwap`) count. `tick_vsync_instr`
|
||||
/// diffs the live count against this each call to emit one vblank per
|
||||
/// new present once `vsync_present_anchored` is set.
|
||||
pub last_present_count: u64,
|
||||
/// How many *fallback* (non-present-driven) vsyncs have fired in the
|
||||
/// current dry (no-present) window. Reset to 0 whenever a present
|
||||
/// occurs. Capped at [`DRY_FALLBACK_CAP`] so one heavy non-presenting
|
||||
/// frame cannot fire a long burst of vsyncs (the fade-in regression).
|
||||
pub dry_fallback_fired: u32,
|
||||
/// Wall-clock anchor for the production v-sync ticker. `None` until
|
||||
/// the first `tick_vsync_wallclock` call (lazy init so unit tests
|
||||
/// that never invoke that function don't construct an Instant).
|
||||
@@ -195,6 +230,21 @@ pub struct InterruptState {
|
||||
/// determinism.
|
||||
pub const VSYNC_INSTR_PERIOD: u64 = 150_000;
|
||||
|
||||
/// **iterate-3AJ — present-anchored vsync fallback.**
|
||||
///
|
||||
/// Once the guest is in its present loop (`vsync_present_anchored`), each
|
||||
/// guest present emits exactly one vblank — vsync *is* the present cadence,
|
||||
/// as on real Xbox 360 hardware where the title double-buffers at vblank.
|
||||
/// For a frame where the guest stops presenting (e.g. the ~1.1 s splash
|
||||
/// asset-load), we still need *some* vsyncs to keep timers / the present
|
||||
/// loop alive, but firing one per [`VSYNC_INSTR_PERIOD`] would reproduce the
|
||||
/// ~66-vsync spike that collapsed the fade-in. So the fallback fires one
|
||||
/// vblank per `VSYNC_INSTR_PERIOD` of *non-presenting* instructions, but at
|
||||
/// most [`DRY_FALLBACK_CAP`] per dry window (the counter resets on each
|
||||
/// present). A heavy load frame therefore advances the guest vsync counter
|
||||
/// by ≤ `DRY_FALLBACK_CAP` (a small ramp like canary's 0/5/10/2/1…), not 66.
|
||||
pub const DRY_FALLBACK_CAP: u32 = 4;
|
||||
|
||||
/// Wall-clock period for the **production** v-sync ticker. 16.667 ms
|
||||
/// targets exactly 60 Hz. KRNBUG-D08 — converting from the
|
||||
/// instruction-count proxy fixes the `--parallel` rate drop while
|
||||
@@ -211,8 +261,9 @@ impl InterruptState {
|
||||
});
|
||||
}
|
||||
|
||||
/// Queue an interrupt for the next safe injection point.
|
||||
pub fn queue_interrupt(&mut self, source: u32) {
|
||||
/// Queue an interrupt for the next safe injection point. `cpu` is the
|
||||
/// processor the ISR must impersonate (see `pending`).
|
||||
pub fn queue_interrupt(&mut self, source: u32, cpu: u8) {
|
||||
if self.callback.is_none() {
|
||||
self.dropped += 1;
|
||||
return;
|
||||
@@ -221,37 +272,102 @@ impl InterruptState {
|
||||
self.dropped += 1;
|
||||
return;
|
||||
}
|
||||
self.pending.push_back(source);
|
||||
self.pending.push_back((source, cpu));
|
||||
}
|
||||
|
||||
/// Peek at the next pending source without removing it.
|
||||
pub fn peek_next(&self) -> Option<u32> {
|
||||
self.pending.front().copied()
|
||||
self.pending.front().map(|&(source, _)| source)
|
||||
}
|
||||
|
||||
/// Peek at the target CPU of the next pending interrupt.
|
||||
pub fn peek_next_cpu(&self) -> Option<u8> {
|
||||
self.pending.front().map(|&(_, cpu)| cpu)
|
||||
}
|
||||
|
||||
/// Pop the next pending source (called by the injector after it has
|
||||
/// committed to dispatching it).
|
||||
pub fn take_next(&mut self) -> Option<u32> {
|
||||
self.pending.pop_front()
|
||||
self.pending.pop_front().map(|(source, _)| source)
|
||||
}
|
||||
|
||||
/// **Legacy** — instruction-count v-sync ticker. Kept for unit tests
|
||||
/// that need a deterministic clock source. Production code calls
|
||||
/// `tick_vsync_wallclock` instead. Returns `true` if at least one
|
||||
/// v-sync was queued.
|
||||
pub fn tick_vsync_instr(&mut self, current_instr_count: u64) -> bool {
|
||||
/// **Present-anchored** instruction-paced v-sync ticker (the lockstep
|
||||
/// production path; also used by unit tests for a deterministic clock).
|
||||
///
|
||||
/// `current_instr_count` is the running retired-instruction count.
|
||||
/// `present_count` is the guest's running `VdSwap` count (monotonic).
|
||||
///
|
||||
/// Two regimes:
|
||||
///
|
||||
/// 1. **Bootstrap** (`!vsync_present_anchored`, i.e. before the guest's
|
||||
/// first present): legacy fixed-quantum cadence — one vsync per
|
||||
/// [`VSYNC_INSTR_PERIOD`] retired instructions. The boot present loop
|
||||
/// (iterate-2W) needs vsyncs delivered *before* it can present, so
|
||||
/// this regime is unchanged from the original ticker. The first
|
||||
/// observed present flips `vsync_present_anchored`.
|
||||
///
|
||||
/// 2. **Present-anchored** (after the first present): one vblank per
|
||||
/// guest present (vsync *is* the present cadence on real hardware),
|
||||
/// plus a small capped instruction-quantum fallback ([`DRY_FALLBACK_CAP`]
|
||||
/// per dry window) so a frame where the guest stops presenting (heavy
|
||||
/// asset load) still ticks a *few* vsyncs — not ~66, which collapsed
|
||||
/// the splash fade-in.
|
||||
///
|
||||
/// Returns `true` if at least one v-sync was queued.
|
||||
pub fn tick_vsync_instr(&mut self, current_instr_count: u64, present_count: u64) -> bool {
|
||||
let delta = current_instr_count.saturating_sub(self.last_instr_count);
|
||||
self.last_instr_count = current_instr_count;
|
||||
self.vsync_accumulator = self.vsync_accumulator.saturating_add(delta);
|
||||
if self.vsync_accumulator < VSYNC_INSTR_PERIOD {
|
||||
return false;
|
||||
|
||||
let new_presents = present_count.saturating_sub(self.last_present_count);
|
||||
self.last_present_count = present_count;
|
||||
if new_presents > 0 {
|
||||
self.vsync_present_anchored = true;
|
||||
}
|
||||
let periods = self.vsync_accumulator / VSYNC_INSTR_PERIOD;
|
||||
self.vsync_accumulator %= VSYNC_INSTR_PERIOD;
|
||||
for _ in 0..periods {
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
|
||||
// Regime 1 — bootstrap: legacy fixed instruction quantum. Preserves
|
||||
// the iterate-2W present-loop bootstrap exactly (vsyncs must fire
|
||||
// before the guest can present).
|
||||
if !self.vsync_present_anchored {
|
||||
if self.vsync_accumulator < VSYNC_INSTR_PERIOD {
|
||||
return false;
|
||||
}
|
||||
let periods = self.vsync_accumulator / VSYNC_INSTR_PERIOD;
|
||||
self.vsync_accumulator %= VSYNC_INSTR_PERIOD;
|
||||
for _ in 0..periods {
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
true
|
||||
|
||||
// Regime 2 — present-anchored.
|
||||
let mut queued = false;
|
||||
|
||||
if new_presents > 0 {
|
||||
// One vblank per guest present. `queue_interrupt` caps the FIFO,
|
||||
// so a burst of presents in one round can't flood. A fresh
|
||||
// present resets the dry-window state.
|
||||
for _ in 0..new_presents {
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
}
|
||||
self.vsync_accumulator = 0;
|
||||
self.dry_fallback_fired = 0;
|
||||
queued = true;
|
||||
} else if self.vsync_accumulator >= VSYNC_INSTR_PERIOD
|
||||
&& self.dry_fallback_fired < DRY_FALLBACK_CAP
|
||||
{
|
||||
// Dry frame (no present this tick): the guest stopped presenting
|
||||
// (heavy load). Tick a *capped* number of fallback vsyncs so
|
||||
// timers/the present loop stay alive without re-introducing the
|
||||
// ~66-vsync spike. Consume one period per fired vsync so the
|
||||
// accumulator paces the few fallbacks.
|
||||
self.vsync_accumulator -= VSYNC_INSTR_PERIOD;
|
||||
self.dry_fallback_fired += 1;
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
queued = true;
|
||||
}
|
||||
|
||||
queued
|
||||
}
|
||||
|
||||
/// **Production** — wall-clock v-sync ticker. Fires
|
||||
@@ -288,7 +404,7 @@ impl InterruptState {
|
||||
self.last_vsync_instant = Some(anchor + advance);
|
||||
let to_queue = (periods as usize).min(INTERRUPT_QUEUE_CAP);
|
||||
for _ in 0..to_queue {
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
}
|
||||
true
|
||||
}
|
||||
@@ -306,7 +422,7 @@ mod tests {
|
||||
#[test]
|
||||
fn queue_interrupt_drops_without_callback() {
|
||||
let mut s = InterruptState::default();
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
assert_eq!(s.dropped, 1);
|
||||
assert!(s.pending.is_empty());
|
||||
}
|
||||
@@ -315,9 +431,9 @@ mod tests {
|
||||
fn queue_interrupt_fifo_preserves_order() {
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_CP);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_CP, 2);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
assert_eq!(s.dropped, 0);
|
||||
// FIFO: take_next hands them out in push order.
|
||||
assert_eq!(s.take_next(), Some(INTERRUPT_SOURCE_VSYNC));
|
||||
@@ -331,11 +447,11 @@ mod tests {
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
for _ in 0..INTERRUPT_QUEUE_CAP {
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
}
|
||||
// Over-cap: drops rather than evicting the oldest.
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
assert_eq!(s.dropped, 2);
|
||||
assert_eq!(s.pending.len(), INTERRUPT_QUEUE_CAP);
|
||||
}
|
||||
@@ -345,9 +461,10 @@ mod tests {
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
assert_eq!(VSYNC_INSTR_PERIOD, 150_000);
|
||||
assert!(!s.tick_vsync_instr(VSYNC_INSTR_PERIOD - 1));
|
||||
// present_count = 0 → bootstrap regime (legacy fixed quantum).
|
||||
assert!(!s.tick_vsync_instr(VSYNC_INSTR_PERIOD - 1, 0));
|
||||
assert!(s.pending.is_empty());
|
||||
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD));
|
||||
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD, 0));
|
||||
assert_eq!(s.peek_next(), Some(INTERRUPT_SOURCE_VSYNC));
|
||||
}
|
||||
|
||||
@@ -357,10 +474,59 @@ mod tests {
|
||||
// be delivered, not lost.
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD * 3 + 10));
|
||||
// present_count = 0 → bootstrap regime drains all 3 periods at once.
|
||||
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD * 3 + 10, 0));
|
||||
assert_eq!(s.pending.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tick_vsync_instr_present_anchors_after_first_present() {
|
||||
// iterate-3AJ: once the guest presents, vsync tracks presents (one
|
||||
// vblank per present), NOT the fixed instruction quantum.
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
// Bootstrap: instruction quantum fires (present_count still 0).
|
||||
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD, 0));
|
||||
assert_eq!(s.pending.len(), 1);
|
||||
let _ = s.take_next();
|
||||
// First present flips to anchored: exactly one vblank for the present.
|
||||
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD * 2, 1));
|
||||
assert!(s.vsync_present_anchored);
|
||||
assert_eq!(s.pending.len(), 1);
|
||||
let _ = s.take_next();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tick_vsync_instr_heavy_dry_frame_capped_not_spiking() {
|
||||
// iterate-3AJ: the regression. A heavy non-presenting frame retires
|
||||
// ~10M instructions; the OLD ticker fired ~66 vsyncs (10M/150k) in
|
||||
// that single frame, jumping the guest vsync counter 0→66 and
|
||||
// skipping the fade-in. The present-anchored ticker caps the dry
|
||||
// window at DRY_FALLBACK_CAP.
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
// Enter anchored mode via one present.
|
||||
let mut instr: u64 = VSYNC_INSTR_PERIOD;
|
||||
assert!(s.tick_vsync_instr(instr, 1));
|
||||
while s.take_next().is_some() {}
|
||||
// Simulate a 10M-instruction frame with NO new present, ticked in
|
||||
// chunks (as coord_pre_round would). Count fallback vsyncs queued.
|
||||
let mut fallback = 0usize;
|
||||
for _ in 0..100 {
|
||||
instr += 100_000; // 100 chunks × 100k = 10M instructions
|
||||
if s.tick_vsync_instr(instr, 1) {
|
||||
while s.take_next().is_some() {
|
||||
fallback += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
assert_eq!(
|
||||
fallback, DRY_FALLBACK_CAP as usize,
|
||||
"a heavy dry frame must cap fallback vsyncs at DRY_FALLBACK_CAP, \
|
||||
not fire ~66"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tick_vsync_wallclock_first_call_sets_anchor() {
|
||||
// First call seeds the anchor and never fires. KRNBUG-D08:
|
||||
|
||||
@@ -13,7 +13,7 @@ use xenia_memory::{GuestMemory, MemoryAccess};
|
||||
/// u16 Length
|
||||
/// u16 MaximumLength
|
||||
/// u32 Buffer (guest pointer)
|
||||
fn read_ansi_string(mem: &GuestMemory, ptr: u32) -> Option<String> {
|
||||
pub fn read_ansi_string(mem: &GuestMemory, ptr: u32) -> Option<String> {
|
||||
if ptr == 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
@@ -17,6 +17,16 @@ impl PcrWriter for GuestMemoryPcr<'_> {
|
||||
// `GuestMemory::write_u32` takes `&self` post-M2 trait flip; the
|
||||
// wrapping `&'a GuestMemory` is sufficient.
|
||||
self.0.write_u32(pcr_base + 0x2C, hw_id as u32);
|
||||
// PRCB.current_cpu byte at PCR+0x10C (prcb_data@0x100 + current_cpu@0xC).
|
||||
// Canary writes `GetFakeCpuNumber(affinity)` here (xthread.cc:847
|
||||
// `pcr->prcb_data.current_cpu = cpu_index`), which equals the HW thread
|
||||
// id we already compute. Guest spin-barriers (e.g. sub_824D1328, used by
|
||||
// the audio/update pump threads at entries 0x824D2878/0x824D2940) index a
|
||||
// per-HW-thread occupancy array by `lbz r11, 268(r13)` = this byte. Left
|
||||
// unwritten it stayed 0 for every thread, so all threads collided on
|
||||
// slot 0 and the multi-thread rendezvous signature never assembled —
|
||||
// the pump threads spun forever and never fired their KeSetEvent loops.
|
||||
self.0.write_u8(pcr_base + 0x10C, hw_id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -151,6 +161,11 @@ pub struct KernelState {
|
||||
/// graphics interrupts is enforced by the injector's
|
||||
/// `is_in_callback()` guard.
|
||||
pub xaudio: crate::xaudio::XAudioState,
|
||||
/// Register-mapped XMA context array (apu stage 1). Shared with the
|
||||
/// `0x7FEA0000` MMIO region installed by the app and with the
|
||||
/// `XMACreateContext`/`XMAReleaseContext` exports, so it lives behind an
|
||||
/// `Arc<Mutex<…>>`. Stage 1 records kicks; stage 3 will decode them.
|
||||
pub xma: std::sync::Arc<std::sync::Mutex<xenia_apu::XmaDecoder>>,
|
||||
/// AUDIT-032 Plan B (default true). When true, the round prologue
|
||||
/// runs the XAudio ticker + `try_inject_audio_callback`. Pre-fix this
|
||||
/// was off by default because injection used random-victim selection
|
||||
@@ -209,6 +224,17 @@ pub struct KernelState {
|
||||
/// only). Used by `xex_get_procedure_address` to resolve ordinals back
|
||||
/// to callable thunks.
|
||||
thunks_by_ordinal: HashMap<(ModuleId, u16), u32>,
|
||||
|
||||
/// Perf (Tier-A #4): inclusive [min, max] guest-address band that
|
||||
/// contains every registered import thunk. Import thunks sit in a
|
||||
/// small contiguous region of the XEX; almost every executing PC is
|
||||
/// ordinary guest code OUTSIDE this band. The per-slot-visit prologue
|
||||
/// looks up `thunk_map.get(&pc)` (a `HashMap<u32,…>` → `hash_one` per
|
||||
/// call, ~3.2M visits boot-to-splash). Range-rejecting against this
|
||||
/// band first turns the common (non-thunk) case into a pair of integer
|
||||
/// compares and skips the hash entirely. `None` until the first thunk
|
||||
/// is registered (no band → reject everything, matching an empty map).
|
||||
thunk_addr_band: Option<(u32, u32)>,
|
||||
/// First-Pixels diagnostic latch. Set the first time
|
||||
/// `RtlRaiseException` fires with code `0xE06D7363` (MSVC C++ throw)
|
||||
/// so the deep stack-walk + `runtime_error` decode in
|
||||
@@ -354,6 +380,25 @@ pub struct KernelState {
|
||||
/// [`Self::fire_due_silph_autosignals`] on the first visit where
|
||||
/// the pending queue is non-empty but no entry is due yet.
|
||||
pub silph_autosignal_diag_logged: bool,
|
||||
/// ITERATE-2J — guest VA of the `KeTimeStampBundle` block (xboxkrnl
|
||||
/// data export ordinal 0x00AD). Set during the import-patch pass in
|
||||
/// `xenia-app`. Zero until then. The guest's worker-hub channel
|
||||
/// dispatch loop polls `[block+0x10]` (`tick_count`, milliseconds) and
|
||||
/// gates dispatch on a `tick_count + 66` deadline; if the block is
|
||||
/// never re-written that deadline never elapses and the hub spins
|
||||
/// forever (the tid14 0x109c starvation gate). The run loop ticks this
|
||||
/// block every round from the deterministic `global_clock` via
|
||||
/// [`Self::update_timestamp_bundle`].
|
||||
pub timestamp_bundle_addr: u32,
|
||||
|
||||
/// Perf (Tier-B #5) throttle state for [`Self::update_timestamp_bundle`].
|
||||
/// Holds the `clock` value at which the bundle was last actually written;
|
||||
/// `u64::MAX` is the "never written" sentinel (forces the first write).
|
||||
/// `AtomicU64` (not `Cell`) so the `&self` update path stays `Sync` for
|
||||
/// the parallel `Arc<Mutex<KernelState>>` usage. Only ever advanced
|
||||
/// forward under the kernel lock, so `Relaxed` ordering is sufficient and
|
||||
/// the sequence is deterministic.
|
||||
timestamp_bundle_last_clock: std::sync::atomic::AtomicU64,
|
||||
}
|
||||
|
||||
/// ITERATE-2C Phase D — one queued auto-signal. `deadline_cycle` is
|
||||
@@ -409,6 +454,9 @@ impl KernelState {
|
||||
ui: None,
|
||||
interrupts: crate::interrupts::InterruptState::default(),
|
||||
xaudio: crate::xaudio::XAudioState::default(),
|
||||
// apu stage 1 — un-initialized until the app reserves the context
|
||||
// array and calls `xma.lock().init(va, phys)`.
|
||||
xma: std::sync::Arc::new(std::sync::Mutex::new(xenia_apu::XmaDecoder::new())),
|
||||
// AUDIT-032: dedicated audio worker per client (Plan B in
|
||||
// `xaudio_register_render_driver`) — not victim hijack, so safe
|
||||
// to enable by default. Previously gated off because the
|
||||
@@ -419,6 +467,7 @@ impl KernelState {
|
||||
audit: HandleAudit::default(),
|
||||
reservations,
|
||||
thunks_by_ordinal: HashMap::new(),
|
||||
thunk_addr_band: None,
|
||||
cxx_throw_logged: false,
|
||||
ring_base: 0,
|
||||
ring_size_dwords: 0,
|
||||
@@ -444,6 +493,8 @@ impl KernelState {
|
||||
silph_autosignal_pending: Vec::new(),
|
||||
last_cycle_hint: 0,
|
||||
silph_autosignal_diag_logged: false,
|
||||
timestamp_bundle_addr: 0,
|
||||
timestamp_bundle_last_clock: std::sync::atomic::AtomicU64::new(u64::MAX),
|
||||
};
|
||||
crate::exports::register_exports(&mut state);
|
||||
crate::xam::register_exports(&mut state);
|
||||
@@ -563,6 +614,25 @@ impl KernelState {
|
||||
/// emits each ordinal once per module).
|
||||
pub fn register_thunk(&mut self, module: ModuleId, ordinal: u16, address: u32) {
|
||||
self.thunks_by_ordinal.insert((module, ordinal), address);
|
||||
// Widen the thunk address band (Tier-A #4) so the hot prologue can
|
||||
// range-reject non-thunk PCs before hashing the thunk map.
|
||||
self.thunk_addr_band = Some(match self.thunk_addr_band {
|
||||
Some((lo, hi)) => (lo.min(address), hi.max(address)),
|
||||
None => (address, address),
|
||||
});
|
||||
}
|
||||
|
||||
/// Perf (Tier-A #4). Cheap pre-filter for the per-slot-visit import-thunk
|
||||
/// dispatch: `false` guarantees `pc` is NOT a registered thunk (so the
|
||||
/// caller can skip the `thunk_map.get(&pc)` hash). `true` means `pc` lies
|
||||
/// within the registered thunk address band and the map must be consulted
|
||||
/// for an exact match. Conservative — never a false negative.
|
||||
#[inline]
|
||||
pub fn pc_in_thunk_band(&self, pc: u32) -> bool {
|
||||
match self.thunk_addr_band {
|
||||
Some((lo, hi)) => pc >= lo && pc <= hi,
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolve a `(module, ordinal)` to its registered thunk address.
|
||||
@@ -862,6 +932,82 @@ impl KernelState {
|
||||
self.last_cycle_hint = now_cycle;
|
||||
}
|
||||
|
||||
/// ITERATE-2J — tick the `KeTimeStampBundle` block (xboxkrnl ordinal
|
||||
/// 0x00AD) from the deterministic monotonic clock so the guest sees a
|
||||
/// clock that *advances*.
|
||||
///
|
||||
/// `clock` is the scheduler's `global_clock` — a pure function of
|
||||
/// retired guest instructions (see [`Self::now_basis_at`] /
|
||||
/// `Scheduler::global_clock`). Lockstep floors it up to
|
||||
/// `stats.instruction_count` each round; parallel sums per-block
|
||||
/// retired counts. Using it (rather than wall-clock) keeps every
|
||||
/// guest-visible time value a deterministic function of guest progress,
|
||||
/// so lockstep stays byte-reproducible.
|
||||
///
|
||||
/// ## Cadence
|
||||
/// The existing kernel time math (`parse_timeout` in `exports.rs`)
|
||||
/// already treats **1 `global_clock` unit ≈ 100 ns**: it converts a
|
||||
/// signed 100-ns `LARGE_INTEGER` timeout to a deadline by dividing the
|
||||
/// magnitude by 100 and adding it to `now` (= `global_clock`). To stay
|
||||
/// coherent with that, this method uses the same scale:
|
||||
///
|
||||
/// * `interrupt_time` / `system_time` (100-ns units): `clock` (with a
|
||||
/// FILETIME epoch base added to `system_time`).
|
||||
/// * `tick_count` (milliseconds): `clock / INSTRUCTIONS_PER_MS` where
|
||||
/// `INSTRUCTIONS_PER_MS = 10_000` (10_000 × 100 ns = 1 ms).
|
||||
///
|
||||
/// At 10_000 clock-units/ms, the guest's `tick_count + 66` ms hub
|
||||
/// deadline elapses by ~660_000 retired instructions — very early in a
|
||||
/// ~1 B-instruction boot — while a 16 ms `KeWait` timeout
|
||||
/// (`parse_timeout`: 160_000 units) still resolves to 16 ms of
|
||||
/// tick_count, so no timeout collapses to "instant". The two readers
|
||||
/// share one scale.
|
||||
pub fn update_timestamp_bundle(&self, mem: &GuestMemory, clock: u64) {
|
||||
let block = self.timestamp_bundle_addr;
|
||||
if block == 0 {
|
||||
return;
|
||||
}
|
||||
const INSTRUCTIONS_PER_MS: u64 = 10_000;
|
||||
// Perf (Tier-B #5): the bundle is updated once per scheduler round
|
||||
// (~every 7 retired instructions), but the four guest BE memory
|
||||
// writes are ~8.6% of boot-to-splash. `clock` is the retired-
|
||||
// instruction count, so consecutive rounds rewrite essentially the
|
||||
// same staircase. Throttle to a 0.25 ms quantum: only re-write when
|
||||
// `clock` advanced by >= INSTRUCTIONS_PER_MS/4 (2500 units) since the
|
||||
// last write. This keeps `tick_count` (ms, changes every 10_000
|
||||
// units) ALWAYS fresh and `interrupt_time`/`system_time` monotone at
|
||||
// 0.25 ms granularity — finer than any guest deadline math needs
|
||||
// (`parse_timeout` works in whole ms; the hub gate is `+66 ms`). The
|
||||
// fade-in (3AH-proven vsync-counter driven, NOT this bundle) is
|
||||
// untouched. Throttle threshold is well below 1 ms so no guest-
|
||||
// visible ms boundary is ever skipped.
|
||||
const BUNDLE_QUANTUM: u64 = INSTRUCTIONS_PER_MS / 4; // 2500 units = 0.25 ms
|
||||
{
|
||||
use std::sync::atomic::Ordering;
|
||||
let last = self.timestamp_bundle_last_clock.load(Ordering::Relaxed);
|
||||
// Always allow the first write (last == u64::MAX sentinel) and any
|
||||
// write that crosses the quantum. Never go backwards.
|
||||
if last != u64::MAX && clock < last.saturating_add(BUNDLE_QUANTUM) {
|
||||
return;
|
||||
}
|
||||
self.timestamp_bundle_last_clock
|
||||
.store(clock, Ordering::Relaxed);
|
||||
}
|
||||
// FILETIME epoch base (~2021) so `system_time` is a plausible
|
||||
// absolute wall-clock; matches the constant used by
|
||||
// `ke_query_system_time`. interrupt_time is "since boot" so it
|
||||
// starts at the clock origin (no epoch offset).
|
||||
const FILETIME_BASE: u64 = 132_500_000_000_000_000;
|
||||
let interrupt_time: u64 = clock;
|
||||
let system_time: u64 = FILETIME_BASE.wrapping_add(clock);
|
||||
let tick_count: u32 = (clock / INSTRUCTIONS_PER_MS) as u32;
|
||||
// BE writes (write_u64/write_u32 use to_be_bytes) — guest is BE.
|
||||
mem.write_u64(block, interrupt_time); // +0x00 interrupt_time
|
||||
mem.write_u64(block + 0x08, system_time); // +0x08 system_time
|
||||
mem.write_u32(block + 0x10, tick_count); // +0x10 tick_count (ms)
|
||||
mem.write_u32(block + 0x14, 0); // +0x14 padding
|
||||
}
|
||||
|
||||
/// ITERATE-2C Phase D — register a freshly-allocated event for
|
||||
/// auto-signal after the configured delay, **iff** the creating
|
||||
/// thread matches the silph::UImpl tid=13 chain that wedges in
|
||||
@@ -970,6 +1116,24 @@ impl KernelState {
|
||||
}
|
||||
}
|
||||
|
||||
/// Perf gate (Tier-A quick-win #3). `true` iff any of the four
|
||||
/// per-slot-visit diagnostic probe registries
|
||||
/// (`ctor_probe_pcs` / `branch_probe_pcs` / `audit_pc_probe_pcs`
|
||||
/// / `lr_trace_pcs`) holds at least one PC. The common headless
|
||||
/// run leaves all four empty, so the prologue can skip the four
|
||||
/// `fire_*_if_match` calls entirely with this single predicted
|
||||
/// branch — avoiding 4× call overhead per slot-visit (~3.2M
|
||||
/// visits over boot-to-splash) when no probe is configured.
|
||||
/// Purely a fast-path guard; each `fire_*` still re-checks its own
|
||||
/// set, so behaviour is identical whether or not the caller gates.
|
||||
#[inline]
|
||||
pub fn any_probe_active(&self) -> bool {
|
||||
!self.ctor_probe_pcs.is_empty()
|
||||
|| !self.branch_probe_pcs.is_empty()
|
||||
|| !self.audit_pc_probe_pcs.is_empty()
|
||||
|| !self.lr_trace_pcs.is_empty()
|
||||
}
|
||||
|
||||
/// Diagnostic. If the live PC for HW slot `hw_id` is in
|
||||
/// `self.ctor_probe_pcs`, emit a single `CTOR-PROBE` line with
|
||||
/// the current cycle, tid, hw_id, sp, r3, lr, plus an 8-frame
|
||||
|
||||
@@ -57,6 +57,11 @@ pub fn allocate_thread_image(
|
||||
mem.write_u32(pcr_base, tls_base);
|
||||
mem.write_u32(pcr_base + 0x2C, hw_thread_id as u32);
|
||||
mem.write_u32(pcr_base + 0x100, 0x1000);
|
||||
// +0x10C prcb_data.current_cpu — canary `pcr->prcb_data.current_cpu`
|
||||
// (PRCB@0x100 + current_cpu@0xC). Guest spin-barriers index a
|
||||
// per-HW-thread slot array by `lbz r11, 268(r13)` = this byte; it
|
||||
// must equal the HW thread id (== PCR+0x2C). See state.rs PcrWriter.
|
||||
mem.write_u8(pcr_base + 0x10C, hw_thread_id);
|
||||
mem.write_u32(pcr_base + 0x150, 0);
|
||||
|
||||
Some(ThreadImage {
|
||||
|
||||
@@ -14,6 +14,7 @@ use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, AtomicU64};
|
||||
|
||||
use xenia_gpu::draw_capture::DrawCapture;
|
||||
use xenia_gpu::texture_cache::TextureKey;
|
||||
use xenia_gpu::xenos_constants::XenosConstantsBlock;
|
||||
use xenia_hid::GamepadState;
|
||||
@@ -133,6 +134,14 @@ pub struct UiBridge {
|
||||
/// reverts to its magenta stub.
|
||||
pub publish_texture:
|
||||
Arc<dyn Fn(Option<(TextureKey, Vec<u8>)>) + Send + Sync>,
|
||||
/// iterate-3O real-render slice: at each `VdSwap`, the kernel hands the
|
||||
/// UI the per-draw geometry captured this frame (one [`DrawCapture`] per
|
||||
/// `PM4_DRAW_INDX*`), including the real guest vertex window. The UI
|
||||
/// replays them through the Xenos wgpu pipeline so the splash renders its
|
||||
/// actual geometry instead of synthetic placeholder shapes. Empty in the
|
||||
/// degenerate case (no draws or capture disabled).
|
||||
pub publish_geometry:
|
||||
Arc<dyn Fn(Vec<DrawCapture>) + Send + Sync>,
|
||||
}
|
||||
|
||||
impl UiBridge {
|
||||
@@ -182,4 +191,9 @@ impl UiBridge {
|
||||
pub fn publish_texture(&self, tex: Option<(TextureKey, Vec<u8>)>) {
|
||||
(self.publish_texture)(tex);
|
||||
}
|
||||
|
||||
/// Hand this frame's captured per-draw geometry to the UI.
|
||||
pub fn publish_geometry(&self, caps: Vec<DrawCapture>) {
|
||||
(self.publish_geometry)(caps);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,6 +35,14 @@ pub const XAUDIO_MAX_CLIENTS: usize = 8;
|
||||
/// no-op anyway).
|
||||
pub const XAUDIO_SYNTHETIC_HANDLE_BASE: u32 = 0xF000_0000;
|
||||
|
||||
/// The scheduler's deadlock force-wake skips waiters parked solely on
|
||||
/// handles at/above [`xenia_cpu::scheduler::SYNTHETIC_PARK_HANDLE_FLOOR`]
|
||||
/// so it never destroys a parked audio worker. Keep these in lockstep:
|
||||
/// every `synthetic_park_handle` must fall inside that protected range.
|
||||
const _: () = assert!(
|
||||
XAUDIO_SYNTHETIC_HANDLE_BASE >= xenia_cpu::scheduler::SYNTHETIC_PARK_HANDLE_FLOOR
|
||||
);
|
||||
|
||||
/// Compute the synthetic park-handle for client slot `i`.
|
||||
pub const fn synthetic_park_handle(i: usize) -> u32 {
|
||||
XAUDIO_SYNTHETIC_HANDLE_BASE | (i as u32)
|
||||
@@ -68,6 +76,16 @@ pub struct XAudioClient {
|
||||
/// [audio_system.cc:225-228](../../../../xenia-canary/src/xenia/apu/audio_system.cc#L225-L228)
|
||||
/// + [audio_system.cc:139-141](../../../../xenia-canary/src/xenia/apu/audio_system.cc#L139-L141).
|
||||
pub wrapped_callback_arg: u32,
|
||||
/// Count of frames the guest has handed us via
|
||||
/// `XAudioSubmitRenderDriverFrame` for this client. Canary's
|
||||
/// `AudioSystem::SubmitFrame` forwards the sample buffer to the client's
|
||||
/// driver, whose playback completion later releases the client semaphore
|
||||
/// — the pacing our callback ticker emulates. The guest mixer
|
||||
/// (`sub_824DC350`) discards SubmitFrame's return and reads no field it
|
||||
/// writes, so this counter is purely observational (logging / liveness),
|
||||
/// never read back by the guest. Deterministic: incremented only inside
|
||||
/// the guest-driven export call.
|
||||
pub submitted_frames: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -138,6 +156,35 @@ impl XAudioState {
|
||||
self.clients.get(index).copied().flatten()
|
||||
}
|
||||
|
||||
/// Faithful counterpart to canary `AudioSystem::SubmitFrame`: the guest
|
||||
/// driver client `index` handed us one frame of samples. Canary forwards
|
||||
/// `samples` to the client's `AudioDriver`, whose playback-completion
|
||||
/// callback later releases the client semaphore — the buffer-consumed
|
||||
/// pacing our [`tick_instr`]/[`try_inject_audio_callback`] path already
|
||||
/// emulates. SubmitFrame itself returns void and the guest mixer
|
||||
/// (`sub_824DC350`) reads no field from it, so all we faithfully need to
|
||||
/// do is validate the client and account the frame. Returns `true` iff
|
||||
/// `index` is a registered client (canary submits silence / warns
|
||||
/// otherwise). Deterministic — only the guest-driven export mutates this.
|
||||
pub fn record_submit(&mut self, index: usize) -> bool {
|
||||
match self.clients.get_mut(index) {
|
||||
Some(Some(c)) => {
|
||||
c.submitted_frames = c.submitted_frames.saturating_add(1);
|
||||
true
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn submitted_frames(&self, index: usize) -> u64 {
|
||||
self.clients
|
||||
.get(index)
|
||||
.copied()
|
||||
.flatten()
|
||||
.map(|c| c.submitted_frames)
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
pub fn any_registered(&self) -> bool {
|
||||
self.clients.iter().any(|c| c.is_some())
|
||||
}
|
||||
@@ -230,6 +277,7 @@ mod tests {
|
||||
callback_pc: 0x8200_0000 + arg,
|
||||
callback_arg: arg,
|
||||
wrapped_callback_arg: 0x4000_0000 + arg,
|
||||
submitted_frames: 0,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -89,6 +89,14 @@ pub struct GuestMemory {
|
||||
mem_watch_addrs: Vec<u32>,
|
||||
/// Count of fires observed (for tests / hand-off telemetry).
|
||||
mem_watch_count: AtomicU64,
|
||||
/// Monotonic count of MMIO accesses (every scalar load/store that
|
||||
/// resolves to a registered MMIO region bumps this by 1). A pure,
|
||||
/// deterministic function of guest execution — the superblock runner
|
||||
/// samples it before/after each block to detect an MMIO touch and
|
||||
/// end the run there (so MMIO ordering vs other HW threads stays at
|
||||
/// the same fine lockstep granularity as before). Relaxed because the
|
||||
/// lockstep path is single-threaded and only needs monotonicity.
|
||||
mmio_access_count: AtomicU64,
|
||||
}
|
||||
|
||||
/// Greatest common bit-mask such that `(a & m) == (b & m)` for every bit
|
||||
@@ -133,9 +141,26 @@ impl GuestMemory {
|
||||
writes_total: AtomicU64::new(0),
|
||||
mem_watch_addrs: Vec::new(),
|
||||
mem_watch_count: AtomicU64::new(0),
|
||||
mmio_access_count: AtomicU64::new(0),
|
||||
})
|
||||
}
|
||||
|
||||
/// Monotonic count of MMIO accesses since boot. Used by the superblock
|
||||
/// runner to detect that a just-executed block touched MMIO (so it can
|
||||
/// end the superblock there and keep MMIO ordering at lockstep
|
||||
/// granularity). Deterministic function of guest execution.
|
||||
#[inline]
|
||||
pub fn mmio_access_count(&self) -> u64 {
|
||||
self.mmio_access_count
|
||||
.load(std::sync::atomic::Ordering::Relaxed)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn bump_mmio_access(&self) {
|
||||
self.mmio_access_count
|
||||
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Current version watermark for the page containing `addr`. Bumped by
|
||||
/// any write through `write_u8/16/32/64`. Not affected by MMIO writes
|
||||
/// (those don't touch the backing texture memory).
|
||||
@@ -357,7 +382,8 @@ impl GuestMemory {
|
||||
/// from `GuestMemory` without a wider plumbing change).
|
||||
pub fn write_bulk(&self, addr: u32, buf: &[u8]) {
|
||||
let len = buf.len() as u32;
|
||||
let old_lane = self.capture_mem_watch_old(addr, len);
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, len) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(buf.as_ptr(), ptr, buf.len());
|
||||
@@ -374,7 +400,7 @@ impl GuestMemory {
|
||||
// the page works.
|
||||
self.bump_page_version(page * PAGE_SIZE);
|
||||
}
|
||||
self.check_mem_watch(addr, len, old_lane);
|
||||
if watch { self.check_mem_watch(addr, len, old_lane); }
|
||||
}
|
||||
|
||||
/// Check if a guest address has been allocated/committed. Acquire load
|
||||
@@ -487,6 +513,7 @@ impl MemoryAccess for GuestMemory {
|
||||
// MMIO dispatch must come first — a byte read at an MMIO-mapped
|
||||
// address should invoke the callback, not the backing memory.
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
return (mmio.read_callback)(addr) as u8;
|
||||
}
|
||||
if !self.is_mapped(addr) { return 0; }
|
||||
@@ -497,6 +524,7 @@ impl MemoryAccess for GuestMemory {
|
||||
#[inline]
|
||||
fn read_u16(&self, addr: u32) -> u16 {
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
(mmio.read_callback)(addr) as u16
|
||||
} else if !self.is_mapped(addr) {
|
||||
0
|
||||
@@ -509,6 +537,7 @@ impl MemoryAccess for GuestMemory {
|
||||
#[inline]
|
||||
fn read_u32(&self, addr: u32) -> u32 {
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
(mmio.read_callback)(addr)
|
||||
} else if !self.is_mapped(addr) {
|
||||
0
|
||||
@@ -521,6 +550,7 @@ impl MemoryAccess for GuestMemory {
|
||||
#[inline]
|
||||
fn read_u64(&self, addr: u32) -> u64 {
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
let hi = (mmio.read_callback)(addr) as u64;
|
||||
let lo = (mmio.read_callback)(addr.wrapping_add(4)) as u64;
|
||||
(hi << 32) | lo
|
||||
@@ -536,23 +566,31 @@ impl MemoryAccess for GuestMemory {
|
||||
// MMIO dispatch first — a byte write at an MMIO-mapped address
|
||||
// must invoke the callback, not the backing memory.
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
(mmio.write_callback)(addr, val as u32);
|
||||
return;
|
||||
}
|
||||
if !self.is_mapped(addr) { return; }
|
||||
let old_lane = self.capture_mem_watch_old(addr, 1);
|
||||
// Perf (Tier-A #1): the mem-watch capture/report pair are out-of-line
|
||||
// calls; on the common (no-watch) path each was a real call that
|
||||
// immediately returned. Gate both behind one predicted branch so the
|
||||
// hot store does no call work unless a watch is actually armed.
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, 1) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe { *ptr = val };
|
||||
self.bump_page_version(addr);
|
||||
self.check_mem_watch(addr, 1, old_lane);
|
||||
if watch { self.check_mem_watch(addr, 1, old_lane); }
|
||||
}
|
||||
|
||||
fn write_u16(&self, addr: u32, val: u16) {
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
(mmio.write_callback)(addr, val as u32);
|
||||
} else if !self.is_mapped(addr) {
|
||||
} else {
|
||||
let old_lane = self.capture_mem_watch_old(addr, 2);
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, 2) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 2);
|
||||
@@ -564,16 +602,18 @@ impl MemoryAccess for GuestMemory {
|
||||
if (addr & 0xFFF) >= (PAGE_SIZE - 1) {
|
||||
self.bump_page_version(addr.wrapping_add(1));
|
||||
}
|
||||
self.check_mem_watch(addr, 2, old_lane);
|
||||
if watch { self.check_mem_watch(addr, 2, old_lane); }
|
||||
}
|
||||
}
|
||||
|
||||
fn write_u32(&self, addr: u32, val: u32) {
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
(mmio.write_callback)(addr, val);
|
||||
} else if !self.is_mapped(addr) {
|
||||
} else {
|
||||
let old_lane = self.capture_mem_watch_old(addr, 4);
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, 4) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 4);
|
||||
@@ -582,17 +622,19 @@ impl MemoryAccess for GuestMemory {
|
||||
if (addr & 0xFFF) >= (PAGE_SIZE - 3) {
|
||||
self.bump_page_version(addr.wrapping_add(3));
|
||||
}
|
||||
self.check_mem_watch(addr, 4, old_lane);
|
||||
if watch { self.check_mem_watch(addr, 4, old_lane); }
|
||||
}
|
||||
}
|
||||
|
||||
fn write_u64(&self, addr: u32, val: u64) {
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
(mmio.write_callback)(addr, (val >> 32) as u32);
|
||||
(mmio.write_callback)(addr.wrapping_add(4), val as u32);
|
||||
} else if !self.is_mapped(addr) {
|
||||
} else {
|
||||
let old_lane = self.capture_mem_watch_old(addr, 8);
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, 8) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 8);
|
||||
@@ -601,7 +643,7 @@ impl MemoryAccess for GuestMemory {
|
||||
if (addr & 0xFFF) >= (PAGE_SIZE - 7) {
|
||||
self.bump_page_version(addr.wrapping_add(7));
|
||||
}
|
||||
self.check_mem_watch(addr, 8, old_lane);
|
||||
if watch { self.check_mem_watch(addr, 8, old_lane); }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -181,10 +181,11 @@ impl App {
|
||||
y += line_h;
|
||||
let (fbw, fbh) = rs.frontbuffer_size();
|
||||
let render_line = format!(
|
||||
"Render: xdispatch: xlated={:>5} interp={:>5} xlated-pipelines={:>3} tex-cache={:>3} fb={}x{}",
|
||||
"Render: xdispatch: xlated={:>5} interp={:>5} xlated-pipelines={:>3} real-geo={:>5} tex-cache={:>3} fb={}x{}",
|
||||
rs.xenos_dispatches_translator,
|
||||
rs.xenos_dispatches_interpreter,
|
||||
rs.translated_pipeline_count(),
|
||||
rs.real_geometry_draws(),
|
||||
rs.host_texture_count(),
|
||||
fbw,
|
||||
fbh,
|
||||
@@ -368,53 +369,28 @@ impl ApplicationHandler<SwapEvent> for App {
|
||||
.map(|s| s.frame_index)
|
||||
.unwrap_or(0);
|
||||
if frame_idx != self.last_xenos_swap_frame {
|
||||
rs.clear_frontbuffer([0.04, 0.04, 0.06, 1.0]);
|
||||
// iterate-3AE: clear to BLACK, matching canary's
|
||||
// splash background. The old navy `[0.04,0.04,0.06]`
|
||||
// was an iterate-3S debug placeholder never matched
|
||||
// to the guest. The splash background-fill draw is a
|
||||
// full-screen Xbox-360 RectangleList (3 verts → a HW
|
||||
// rectangle covering the whole screen); the UI replay
|
||||
// draws it as a single triangle (the 4th implied
|
||||
// corner isn't synthesized), so only the diagonal
|
||||
// half is covered. With a navy clear the uncovered
|
||||
// half showed a navy diagonal in the brief
|
||||
// pre/inter-logo transition frames (where that fill
|
||||
// is the only coverage). Canary's background there is
|
||||
// black, and the guest's fill itself resolves to
|
||||
// black, so a black clear makes the uncovered half
|
||||
// match — the transition is uniformly black like the
|
||||
// oracle. (Full RectangleList→rectangle expansion is
|
||||
// the deeper fix and a separate follow-up; under a
|
||||
// black clear the half-coverage is invisible.)
|
||||
rs.clear_frontbuffer([0.0, 0.0, 0.0, 1.0]);
|
||||
self.last_xenos_swap_frame = frame_idx;
|
||||
}
|
||||
let delta = (draws_total - already) as u32;
|
||||
let (verts_hint, prim_kind, vs_key, ps_key) = self
|
||||
.last_swap_info
|
||||
.map(|s| {
|
||||
(
|
||||
s.last_draw_vertex_count.max(3),
|
||||
s.last_draw_prim,
|
||||
s.vs_blob_key,
|
||||
s.ps_blob_key,
|
||||
)
|
||||
})
|
||||
.unwrap_or((3, 4, 0, 0));
|
||||
// Look up blobs + constants from the bridge and
|
||||
// pack into the WGSL-interpreter layout. Empty
|
||||
// slices produce zero-clause packed buffers — the
|
||||
// WGSL walker short-circuits and the placeholder
|
||||
// export path still renders.
|
||||
let raw_vs: Vec<u32> = self
|
||||
.handles
|
||||
.shader_blobs
|
||||
.lock()
|
||||
.ok()
|
||||
.and_then(|g| g.get(&vs_key).cloned())
|
||||
.unwrap_or_default();
|
||||
let raw_ps: Vec<u32> = self
|
||||
.handles
|
||||
.shader_blobs
|
||||
.lock()
|
||||
.ok()
|
||||
.and_then(|g| g.get(&ps_key).cloned())
|
||||
.unwrap_or_default();
|
||||
let parsed_vs = xenia_gpu::ucode::parse_shader(&raw_vs);
|
||||
let parsed_ps = xenia_gpu::ucode::parse_shader(&raw_ps);
|
||||
// First time we see a blob key, run the static
|
||||
// metrics analyzer. Keyed on (stage_tag, blob_key)
|
||||
// because the guest can reuse a key across stages.
|
||||
if self.seen_shader_blobs.insert((0u8, vs_key)) {
|
||||
xenia_gpu::shader_metrics::emit_for(&parsed_vs, "vs");
|
||||
}
|
||||
if self.seen_shader_blobs.insert((1u8, ps_key)) {
|
||||
xenia_gpu::shader_metrics::emit_for(&parsed_ps, "ps");
|
||||
}
|
||||
let vs_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_vs);
|
||||
let ps_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_ps);
|
||||
let constants = self
|
||||
.handles
|
||||
.xenos_constants
|
||||
@@ -431,19 +407,72 @@ impl ApplicationHandler<SwapEvent> for App {
|
||||
.ok()
|
||||
.and_then(|g| g.clone());
|
||||
rs.bind_primary_texture(tex_payload);
|
||||
rs.dispatch_xenos_draws(
|
||||
already,
|
||||
delta,
|
||||
verts_hint,
|
||||
prim_kind,
|
||||
vs_key,
|
||||
ps_key,
|
||||
&parsed_vs,
|
||||
&parsed_ps,
|
||||
&vs_packed,
|
||||
&ps_packed,
|
||||
&constants,
|
||||
);
|
||||
|
||||
// iterate-3O real-render slice: prefer replaying the
|
||||
// *real* captured guest geometry. The kernel publishes
|
||||
// one `DrawCapture` per `PM4_DRAW_INDX*` this frame
|
||||
// (real vertices + prim type + shader keys). Fall back
|
||||
// to the legacy synthetic dispatch only when no capture
|
||||
// is available (e.g. capture disabled), so we never
|
||||
// regress to a blank screen.
|
||||
let captures: Vec<xenia_gpu::draw_capture::DrawCapture> = self
|
||||
.handles
|
||||
.geometry
|
||||
.lock()
|
||||
.map(|g| g.clone())
|
||||
.unwrap_or_default();
|
||||
let blobs: std::collections::HashMap<u32, Vec<u32>> = self
|
||||
.handles
|
||||
.shader_blobs
|
||||
.lock()
|
||||
.map(|g| g.clone())
|
||||
.unwrap_or_default();
|
||||
if !captures.is_empty() {
|
||||
rs.dispatch_xenos_captures(
|
||||
&captures,
|
||||
&blobs,
|
||||
&constants,
|
||||
&mut self.seen_shader_blobs,
|
||||
);
|
||||
} else {
|
||||
// Legacy synthetic-geometry fallback (placeholder).
|
||||
let (verts_hint, prim_kind, vs_key, ps_key) = self
|
||||
.last_swap_info
|
||||
.map(|s| {
|
||||
(
|
||||
s.last_draw_vertex_count.max(3),
|
||||
s.last_draw_prim,
|
||||
s.vs_blob_key,
|
||||
s.ps_blob_key,
|
||||
)
|
||||
})
|
||||
.unwrap_or((3, 4, 0, 0));
|
||||
let raw_vs = blobs.get(&vs_key).cloned().unwrap_or_default();
|
||||
let raw_ps = blobs.get(&ps_key).cloned().unwrap_or_default();
|
||||
let parsed_vs = xenia_gpu::ucode::parse_shader(&raw_vs);
|
||||
let parsed_ps = xenia_gpu::ucode::parse_shader(&raw_ps);
|
||||
if self.seen_shader_blobs.insert((0u8, vs_key)) {
|
||||
xenia_gpu::shader_metrics::emit_for(&parsed_vs, "vs");
|
||||
}
|
||||
if self.seen_shader_blobs.insert((1u8, ps_key)) {
|
||||
xenia_gpu::shader_metrics::emit_for(&parsed_ps, "ps");
|
||||
}
|
||||
let vs_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_vs);
|
||||
let ps_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_ps);
|
||||
rs.dispatch_xenos_draws(
|
||||
already,
|
||||
delta,
|
||||
verts_hint,
|
||||
prim_kind,
|
||||
vs_key,
|
||||
ps_key,
|
||||
&parsed_vs,
|
||||
&parsed_ps,
|
||||
&vs_packed,
|
||||
&ps_packed,
|
||||
&constants,
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Self::ingest_frontbuffer(
|
||||
|
||||
@@ -18,6 +18,7 @@ use std::sync::Mutex;
|
||||
|
||||
use crossbeam_utils::atomic::AtomicCell;
|
||||
use winit::event_loop::EventLoopProxy;
|
||||
use xenia_gpu::draw_capture::DrawCapture;
|
||||
use xenia_gpu::texture_cache::TextureKey;
|
||||
use xenia_gpu::xenos_constants::XenosConstantsBlock;
|
||||
use xenia_hid::GamepadState;
|
||||
@@ -66,6 +67,10 @@ pub struct UiHandles {
|
||||
/// fetch-constant slot 0 into linear bytes that the UI should
|
||||
/// upload into the host cache and bind at `@group(1) @binding(0)`.
|
||||
pub primary_texture: Arc<Mutex<Option<(TextureKey, Vec<u8>)>>>,
|
||||
/// iterate-3O: the most recent frame's captured per-draw geometry. The
|
||||
/// redraw path drains this to replay real guest draws. Replaced wholesale
|
||||
/// each `VdSwap`.
|
||||
pub geometry: Arc<Mutex<Vec<DrawCapture>>>,
|
||||
}
|
||||
|
||||
/// Swap event posted by the CPU-side `VdSwap` handler via
|
||||
@@ -89,6 +94,7 @@ pub fn build(proxy: EventLoopProxy<SwapEvent>) -> (UiHandles, UiBridge) {
|
||||
let xenos_constants = Arc::new(Mutex::new(XenosConstantsBlock::default()));
|
||||
let primary_texture: Arc<Mutex<Option<(TextureKey, Vec<u8>)>>> =
|
||||
Arc::new(Mutex::new(None));
|
||||
let geometry: Arc<Mutex<Vec<DrawCapture>>> = Arc::new(Mutex::new(Vec::new()));
|
||||
|
||||
let kernel_bridge = UiBridge {
|
||||
gamepad: {
|
||||
@@ -144,6 +150,14 @@ pub fn build(proxy: EventLoopProxy<SwapEvent>) -> (UiHandles, UiBridge) {
|
||||
}
|
||||
})
|
||||
},
|
||||
publish_geometry: {
|
||||
let geo = Arc::clone(&geometry);
|
||||
Arc::new(move |caps| {
|
||||
if let Ok(mut lock) = geo.lock() {
|
||||
*lock = caps;
|
||||
}
|
||||
})
|
||||
},
|
||||
};
|
||||
|
||||
let handles = UiHandles {
|
||||
@@ -155,6 +169,7 @@ pub fn build(proxy: EventLoopProxy<SwapEvent>) -> (UiHandles, UiBridge) {
|
||||
shader_blobs,
|
||||
xenos_constants,
|
||||
primary_texture,
|
||||
geometry,
|
||||
};
|
||||
(handles, kernel_bridge)
|
||||
}
|
||||
|
||||
@@ -84,6 +84,9 @@ pub struct RenderState {
|
||||
/// the shader, or (c) we're running the slow interpreter path.
|
||||
pub xenos_dispatches_translator: u64,
|
||||
pub xenos_dispatches_interpreter: u64,
|
||||
/// iterate-3O: running total of replayed draws that carried a real guest
|
||||
/// vertex window (vs. the procedural fallback). Surfaced on the HUD.
|
||||
real_geometry_draws: u64,
|
||||
/// One-shot latch so we emit a tracing::info! on the **first** real
|
||||
/// draw dispatch rather than spamming every frame. Pairs with the
|
||||
/// "first translator compile" latch below.
|
||||
@@ -447,6 +450,7 @@ impl RenderState {
|
||||
fallback_rgb: [0.06, 0.06, 0.09],
|
||||
xenos_pipeline,
|
||||
xenos_draws_rendered: 0,
|
||||
real_geometry_draws: 0,
|
||||
xenos_dispatches_translator: 0,
|
||||
xenos_dispatches_interpreter: 0,
|
||||
first_dispatch_logged: false,
|
||||
@@ -657,26 +661,39 @@ impl RenderState {
|
||||
draw_index: idx,
|
||||
vertex_count: vertex_count_hint.max(3),
|
||||
prim_kind,
|
||||
// Synthetic fallback path: no real vertex window.
|
||||
vertex_base_dwords: 0,
|
||||
// No real geometry → no NDC transform (procedural positions are
|
||||
// already in clip space).
|
||||
ndc_scale: [0.0, 0.0],
|
||||
ndc_offset: [0.0, 0.0],
|
||||
};
|
||||
// Synthetic visualizer path (legacy): no captured render state, so
|
||||
// use the opaque default.
|
||||
let rstate = crate::xenos_pipeline::RenderState::OPAQUE;
|
||||
if use_translated
|
||||
&& let Some(p) = self.xenos_pipeline.translated_pipeline(vs_key, ps_key) {
|
||||
self.xenos_pipeline.render_one_with_pipeline(
|
||||
&self.queue,
|
||||
&mut encoder,
|
||||
&self.frontbuffer_view,
|
||||
req,
|
||||
p,
|
||||
);
|
||||
metrics::counter!("gpu.shader.use", "path" => "translator")
|
||||
.increment(1);
|
||||
served_translator += 1;
|
||||
continue;
|
||||
}
|
||||
&& self.xenos_pipeline.render_one_translated(
|
||||
&self.device,
|
||||
&self.queue,
|
||||
&mut encoder,
|
||||
&self.frontbuffer_view,
|
||||
req,
|
||||
vs_key,
|
||||
ps_key,
|
||||
rstate,
|
||||
)
|
||||
{
|
||||
metrics::counter!("gpu.shader.use", "path" => "translator").increment(1);
|
||||
served_translator += 1;
|
||||
continue;
|
||||
}
|
||||
self.xenos_pipeline.render_one(
|
||||
&self.device,
|
||||
&self.queue,
|
||||
&mut encoder,
|
||||
&self.frontbuffer_view,
|
||||
req,
|
||||
rstate,
|
||||
);
|
||||
metrics::counter!("gpu.shader.use", "path" => "interpreter").increment(1);
|
||||
served_interpreter += 1;
|
||||
@@ -707,12 +724,201 @@ impl RenderState {
|
||||
}
|
||||
}
|
||||
|
||||
/// iterate-3O real-render slice: replay a batch of *real* captured guest
|
||||
/// draws. Unlike [`dispatch_xenos_draws`] (synthetic placeholder geometry),
|
||||
/// each [`DrawCapture`] carries the actual guest vertex window, primitive
|
||||
/// type, host vertex count, and the real (vs, ps) keys. Per capture we:
|
||||
/// 1. upload the captured guest vertex bytes into `vertex_buffer` (b4),
|
||||
/// 2. upload the matching VS/PS microcode + per-frame constants,
|
||||
/// 3. render through the translated (P7) pipeline if it compiled, else
|
||||
/// the interpreter — with `vertex_base_dwords` set so the shader
|
||||
/// rebases its absolute fetch address into the uploaded window.
|
||||
///
|
||||
/// Returns the number of captures that had a real vertex window (vs. the
|
||||
/// procedural fallback), for HUD reporting. `shader_blobs` / `constants`
|
||||
/// come from the bridge; `seen` records which blobs have had static
|
||||
/// metrics emitted (one-shot per blob, matching the legacy path).
|
||||
pub fn dispatch_xenos_captures(
|
||||
&mut self,
|
||||
captures: &[xenia_gpu::draw_capture::DrawCapture],
|
||||
shader_blobs: &std::collections::HashMap<u32, Vec<u32>>,
|
||||
constants: &xenia_gpu::xenos_constants::XenosConstantsBlock,
|
||||
seen: &mut std::collections::HashSet<(u8, u32)>,
|
||||
) -> u32 {
|
||||
if captures.is_empty() {
|
||||
return 0;
|
||||
}
|
||||
let mut real_count = 0u32;
|
||||
// iterate-3X (GPUBUG-111): each captured draw uploads its OWN vertex
|
||||
// window + per-draw constants + shader via `queue.write_buffer`. In
|
||||
// wgpu all `write_buffer` calls staged before a single `queue.submit`
|
||||
// are applied *before any* command in that submit executes — so a single
|
||||
// encoder for the whole batch made every draw read only the LAST draw's
|
||||
// vertex buffer / uniforms (the splash logo quad sampled the fullscreen
|
||||
// background quad's vertices → nothing rendered where the logo was).
|
||||
// Submit ONE encoder PER draw so each draw's writes land before its own
|
||||
// pass. The frontbuffer uses `LoadOp::Load`, so per-draw submits still
|
||||
// composite over each other exactly like before.
|
||||
for cap in captures {
|
||||
// iterate-3T: bind this draw's REAL decoded texture (keyed off the
|
||||
// active PS's tfetch slot, attached in `gpu_system`) so the textured
|
||||
// logo samples the artwork. `None` reverts to the magenta stub for
|
||||
// flat draws. Each `set_texture_view` rebuilds the tex bind group;
|
||||
// the subsequent `render_one*` reads it, so per-draw binding works
|
||||
// even though all draws share one encoder.
|
||||
{
|
||||
let Self {
|
||||
device,
|
||||
queue,
|
||||
xenos_pipeline,
|
||||
host_texture_cache,
|
||||
..
|
||||
} = self;
|
||||
match cap.textures.first() {
|
||||
Some((key, version, bytes)) => {
|
||||
// iterate-3AD: use the decoder's real content `version`
|
||||
// (from `span_max_version`) so the host cache re-uploads
|
||||
// when the guest fills MORE of an evolving atlas. The
|
||||
// publisher and the 2nd splash logo share one K8888
|
||||
// surface (base 0x4dbee000); the 2nd logo's texels land
|
||||
// AFTER the first upload. With the old hardcoded
|
||||
// `version_when_uploaded = 1`, the same `TextureKey`
|
||||
// never re-uploaded, so the 2nd logo sampled its (then
|
||||
// still-zero) atlas region as black. The real version
|
||||
// increases as the guest writes, triggering re-upload.
|
||||
let cached = xenia_gpu::texture_cache::CachedTexture {
|
||||
key: *key,
|
||||
version_when_uploaded: *version,
|
||||
bytes: bytes.clone(),
|
||||
};
|
||||
host_texture_cache.upload(device, queue, &cached);
|
||||
if let Some(view) = host_texture_cache.view_for(key) {
|
||||
xenos_pipeline.set_texture_view(device, Some(view));
|
||||
}
|
||||
}
|
||||
None => xenos_pipeline.set_texture_view(device, None),
|
||||
}
|
||||
}
|
||||
let raw_vs = shader_blobs.get(&cap.vs_key).cloned().unwrap_or_default();
|
||||
let raw_ps = shader_blobs.get(&cap.ps_key).cloned().unwrap_or_default();
|
||||
let parsed_vs = xenia_gpu::ucode::parse_shader(&raw_vs);
|
||||
let parsed_ps = xenia_gpu::ucode::parse_shader(&raw_ps);
|
||||
if seen.insert((0u8, cap.vs_key)) {
|
||||
xenia_gpu::shader_metrics::emit_for(&parsed_vs, "vs");
|
||||
}
|
||||
if seen.insert((1u8, cap.ps_key)) {
|
||||
xenia_gpu::shader_metrics::emit_for(&parsed_ps, "ps");
|
||||
}
|
||||
let vs_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_vs);
|
||||
let ps_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_ps);
|
||||
// Upload this draw's shader + constants + real vertex window.
|
||||
self.xenos_pipeline.upload_shader_and_constants(
|
||||
&self.queue,
|
||||
&vs_packed,
|
||||
&ps_packed,
|
||||
constants,
|
||||
);
|
||||
if cap.has_real_vertices && !cap.vertex_dwords.is_empty() {
|
||||
self.xenos_pipeline
|
||||
.upload_vertex_data(&self.queue, &cap.vertex_dwords);
|
||||
real_count += 1;
|
||||
}
|
||||
let use_translated = cap.vs_key != 0
|
||||
&& cap.ps_key != 0
|
||||
&& ensure_translated_pipeline(
|
||||
&mut self.xenos_pipeline,
|
||||
&self.device,
|
||||
cap.vs_key,
|
||||
cap.ps_key,
|
||||
&parsed_vs,
|
||||
&parsed_ps,
|
||||
);
|
||||
let base = if cap.has_real_vertices {
|
||||
cap.window_base_dwords
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let req = DrawRequest {
|
||||
draw_index: cap.draw_index,
|
||||
vertex_count: cap.host_vertex_count.max(3),
|
||||
prim_kind: cap.prim_code,
|
||||
vertex_base_dwords: base,
|
||||
// iterate-3S: apply the per-draw guest viewport → host NDC
|
||||
// transform only when we have real geometry (otherwise the
|
||||
// procedural fallback already emits clip-space positions).
|
||||
ndc_scale: if cap.has_real_vertices { cap.ndc_scale } else { [0.0, 0.0] },
|
||||
ndc_offset: if cap.has_real_vertices { cap.ndc_offset } else { [0.0, 0.0] },
|
||||
};
|
||||
// iterate-3Y: replay this draw's real color/blend/write-mask state
|
||||
// (captured from `RB_BLENDCONTROL0` / `RB_COLOR_MASK`) so overlays
|
||||
// composite the way the guest intends instead of opaquely
|
||||
// overwriting the logo.
|
||||
let rstate = crate::xenos_pipeline::RenderState {
|
||||
blend_control: cap.blend_control,
|
||||
color_mask: cap.color_mask,
|
||||
};
|
||||
let mut encoder = self
|
||||
.device
|
||||
.create_command_encoder(&wgpu::CommandEncoderDescriptor {
|
||||
label: Some("xenos capture replay (per-draw)"),
|
||||
});
|
||||
let served_translated = use_translated
|
||||
&& self.xenos_pipeline.render_one_translated(
|
||||
&self.device,
|
||||
&self.queue,
|
||||
&mut encoder,
|
||||
&self.frontbuffer_view,
|
||||
req,
|
||||
cap.vs_key,
|
||||
cap.ps_key,
|
||||
rstate,
|
||||
);
|
||||
if served_translated {
|
||||
self.xenos_dispatches_translator =
|
||||
self.xenos_dispatches_translator.saturating_add(1);
|
||||
} else {
|
||||
self.xenos_pipeline.render_one(
|
||||
&self.device,
|
||||
&self.queue,
|
||||
&mut encoder,
|
||||
&self.frontbuffer_view,
|
||||
req,
|
||||
rstate,
|
||||
);
|
||||
self.xenos_dispatches_interpreter =
|
||||
self.xenos_dispatches_interpreter.saturating_add(1);
|
||||
}
|
||||
self.queue.submit(std::iter::once(encoder.finish()));
|
||||
}
|
||||
self.xenos_draws_rendered = self
|
||||
.xenos_draws_rendered
|
||||
.saturating_add(captures.len() as u64);
|
||||
self.real_geometry_draws = self
|
||||
.real_geometry_draws
|
||||
.saturating_add(real_count as u64);
|
||||
if !self.first_dispatch_logged {
|
||||
self.first_dispatch_logged = true;
|
||||
tracing::info!(
|
||||
captures = captures.len(),
|
||||
real_vertex_draws = real_count,
|
||||
"first Xenos capture batch replayed (real geometry)"
|
||||
);
|
||||
}
|
||||
real_count
|
||||
}
|
||||
|
||||
/// Count of distinct translator pipelines compiled so far. Surfaced
|
||||
/// on the HUD as `xlated=N` to make "is P7 working?" observable.
|
||||
pub fn translated_pipeline_count(&self) -> usize {
|
||||
self.xenos_pipeline.translated_pipeline_count()
|
||||
}
|
||||
|
||||
/// Running count of captured draws that carried a real vertex window
|
||||
/// (surfaced on the HUD). Updated by [`dispatch_xenos_captures`].
|
||||
pub fn real_geometry_draws(&self) -> u64 {
|
||||
self.real_geometry_draws
|
||||
}
|
||||
|
||||
/// Clear the frontbuffer to `[r,g,b,a]` in linear space. Matches the
|
||||
/// fallback clear the outer swapchain render does so the two stages
|
||||
/// agree on "no draws yet = dark navy".
|
||||
|
||||
@@ -36,7 +36,142 @@ struct DrawConstants {
|
||||
draw_index: u32,
|
||||
vertex_count: u32,
|
||||
prim_kind: u32,
|
||||
_pad: u32,
|
||||
/// iterate-3O: guest dword base of the uploaded `vertex_buffer` window.
|
||||
/// The WGSL subtracts this from the absolute vertex-fetch address.
|
||||
vertex_base_dwords: u32,
|
||||
/// iterate-3S: guest→host NDC XY transform (mirrors canary
|
||||
/// `GetHostViewportInfo`). `clip.xy = pos.xy * ndc_scale + ndc_offset*pos.w`.
|
||||
/// Y is pre-flipped for wgpu. 16 bytes so the block stays 16-byte aligned.
|
||||
ndc_scale: [f32; 2],
|
||||
ndc_offset: [f32; 2],
|
||||
}
|
||||
|
||||
/// iterate-3Y: the per-draw host color/blend/write-mask render state, decoded
|
||||
/// from the guest registers (`RB_BLENDCONTROL0` / `RB_COLOR_MASK`). Used both
|
||||
/// as part of the pipeline-cache key and to build the `wgpu::ColorTargetState`.
|
||||
/// Mirrors canary's `GetColorBlendStateForRenderTarget` (D3D12
|
||||
/// `pipeline_cache.cc`): the factors come straight from `RB_BLENDCONTROL`,
|
||||
/// and a zero write-mask forces the no-blend `One,Zero` equation.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
|
||||
pub struct RenderState {
|
||||
/// `RB_BLENDCONTROL0` raw value (RT0). `0x00010001` (One,Zero / One,Zero,
|
||||
/// Add) is the opaque case.
|
||||
pub blend_control: u32,
|
||||
/// RT0 nibble of `RB_COLOR_MASK` (bit0=R … bit3=A). 0 = write nothing.
|
||||
pub color_mask: u8,
|
||||
}
|
||||
|
||||
impl RenderState {
|
||||
/// Fully-opaque, all-channels state (the legacy fixed behaviour). Used for
|
||||
/// procedural/synthetic draws that have no captured guest state.
|
||||
pub const OPAQUE: RenderState = RenderState {
|
||||
blend_control: 0x0001_0001,
|
||||
color_mask: 0xF,
|
||||
};
|
||||
|
||||
/// Map a Xenos `BlendFactor` (5-bit field) to a wgpu `BlendFactor`,
|
||||
/// mirroring canary `kBlendFactorMap` (D3D12 `pipeline_cache.cc:1504`).
|
||||
fn map_factor(f: u32) -> wgpu::BlendFactor {
|
||||
match f {
|
||||
0 => wgpu::BlendFactor::Zero,
|
||||
1 => wgpu::BlendFactor::One,
|
||||
4 => wgpu::BlendFactor::Src,
|
||||
5 => wgpu::BlendFactor::OneMinusSrc,
|
||||
6 => wgpu::BlendFactor::SrcAlpha,
|
||||
7 => wgpu::BlendFactor::OneMinusSrcAlpha,
|
||||
8 => wgpu::BlendFactor::Dst,
|
||||
9 => wgpu::BlendFactor::OneMinusDst,
|
||||
10 => wgpu::BlendFactor::DstAlpha,
|
||||
11 => wgpu::BlendFactor::OneMinusDstAlpha,
|
||||
12 => wgpu::BlendFactor::Constant,
|
||||
13 => wgpu::BlendFactor::OneMinusConstant,
|
||||
14 => wgpu::BlendFactor::Constant,
|
||||
15 => wgpu::BlendFactor::OneMinusConstant,
|
||||
16 => wgpu::BlendFactor::SrcAlphaSaturated,
|
||||
// 2/3 and >16 are undefined on Xenos; canary maps to Zero.
|
||||
_ => wgpu::BlendFactor::Zero,
|
||||
}
|
||||
}
|
||||
|
||||
/// Map a Xenos `BlendFactor` for the *alpha* channel, mirroring canary
|
||||
/// `kBlendFactorAlphaMap` (color-mode factors collapse to alpha).
|
||||
fn map_factor_alpha(f: u32) -> wgpu::BlendFactor {
|
||||
match f {
|
||||
4 => wgpu::BlendFactor::SrcAlpha,
|
||||
5 => wgpu::BlendFactor::OneMinusSrcAlpha,
|
||||
8 => wgpu::BlendFactor::DstAlpha,
|
||||
9 => wgpu::BlendFactor::OneMinusDstAlpha,
|
||||
other => Self::map_factor(other),
|
||||
}
|
||||
}
|
||||
|
||||
fn map_op(o: u32) -> wgpu::BlendOperation {
|
||||
match o {
|
||||
0 => wgpu::BlendOperation::Add,
|
||||
1 => wgpu::BlendOperation::Subtract,
|
||||
2 => wgpu::BlendOperation::Min,
|
||||
3 => wgpu::BlendOperation::Max,
|
||||
4 => wgpu::BlendOperation::ReverseSubtract,
|
||||
_ => wgpu::BlendOperation::Add,
|
||||
}
|
||||
}
|
||||
|
||||
/// Build the `wgpu::ColorTargetState` for this draw.
|
||||
fn color_target(&self, format: wgpu::TextureFormat) -> wgpu::ColorTargetState {
|
||||
let bc = self.blend_control;
|
||||
let color_src = bc & 0x1F;
|
||||
let color_op = (bc >> 5) & 0x7;
|
||||
let color_dst = (bc >> 8) & 0x1F;
|
||||
let alpha_src = (bc >> 16) & 0x1F;
|
||||
let alpha_op = (bc >> 21) & 0x7;
|
||||
let alpha_dst = (bc >> 24) & 0x1F;
|
||||
|
||||
// wgpu requires `blend: None` when nothing would be written; also the
|
||||
// `One,Zero,Add` identity is the opaque case (canary's no-blend), which
|
||||
// we express as `blend: None` so it's a plain overwrite.
|
||||
let is_opaque = color_src == 1
|
||||
&& color_dst == 0
|
||||
&& color_op == 0
|
||||
&& alpha_src == 1
|
||||
&& alpha_dst == 0
|
||||
&& alpha_op == 0;
|
||||
let blend = if is_opaque {
|
||||
None
|
||||
} else {
|
||||
Some(wgpu::BlendState {
|
||||
color: wgpu::BlendComponent {
|
||||
src_factor: Self::map_factor(color_src),
|
||||
dst_factor: Self::map_factor(color_dst),
|
||||
operation: Self::map_op(color_op),
|
||||
},
|
||||
alpha: wgpu::BlendComponent {
|
||||
src_factor: Self::map_factor_alpha(alpha_src),
|
||||
dst_factor: Self::map_factor_alpha(alpha_dst),
|
||||
operation: Self::map_op(alpha_op),
|
||||
},
|
||||
})
|
||||
};
|
||||
|
||||
let mut write_mask = wgpu::ColorWrites::empty();
|
||||
if self.color_mask & 0x1 != 0 {
|
||||
write_mask |= wgpu::ColorWrites::RED;
|
||||
}
|
||||
if self.color_mask & 0x2 != 0 {
|
||||
write_mask |= wgpu::ColorWrites::GREEN;
|
||||
}
|
||||
if self.color_mask & 0x4 != 0 {
|
||||
write_mask |= wgpu::ColorWrites::BLUE;
|
||||
}
|
||||
if self.color_mask & 0x8 != 0 {
|
||||
write_mask |= wgpu::ColorWrites::ALPHA;
|
||||
}
|
||||
|
||||
wgpu::ColorTargetState {
|
||||
format,
|
||||
blend,
|
||||
write_mask,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Submitted to [`XenosPipeline::render_one`] to render one captured draw.
|
||||
@@ -48,6 +183,13 @@ pub struct DrawRequest {
|
||||
pub vertex_count: u32,
|
||||
/// Xenos primitive-type code; shader may branch on it in P3b+.
|
||||
pub prim_kind: u32,
|
||||
/// iterate-3O: guest dword base of the per-draw vertex window uploaded to
|
||||
/// `vertex_buffer` (b4). 0 = no real vertex window (procedural fallback).
|
||||
pub vertex_base_dwords: u32,
|
||||
/// iterate-3S: guest→host NDC XY transform (Y pre-flipped). When all-zero
|
||||
/// the shader leaves the position untransformed (procedural fallback).
|
||||
pub ndc_scale: [f32; 2],
|
||||
pub ndc_offset: [f32; 2],
|
||||
}
|
||||
|
||||
/// Reasonable upper bound on a single shader blob (dwords). Most Xbox 360
|
||||
@@ -57,7 +199,16 @@ const UCODE_BUFFER_MAX_DWORDS: u64 = 16 * 1024; // 64 KB each for VS & PS
|
||||
const VERTEX_BUFFER_MAX_BYTES: u64 = 16 * 1024 * 1024;
|
||||
|
||||
pub struct XenosPipeline {
|
||||
/// Interpreter pipeline with the legacy fixed (alpha-blend) state. Kept as
|
||||
/// the default; per-state variants are built lazily in `interp_cache`.
|
||||
pipeline: wgpu::RenderPipeline,
|
||||
/// iterate-3Y: the interpreter WGSL module, retained so per-render-state
|
||||
/// interpreter pipelines can be compiled on demand.
|
||||
interp_shader: wgpu::ShaderModule,
|
||||
/// iterate-3Y: interpreter pipelines keyed on the per-draw `RenderState`
|
||||
/// (blend + write mask), so flat/alpha/opaque draws composite correctly
|
||||
/// even when their (vs,ps) didn't translate.
|
||||
interp_cache: std::collections::HashMap<RenderState, wgpu::RenderPipeline>,
|
||||
draw_ctx_buffer: wgpu::Buffer,
|
||||
constants_buffer: wgpu::Buffer,
|
||||
vs_ucode_buffer: wgpu::Buffer,
|
||||
@@ -78,7 +229,12 @@ pub struct XenosPipeline {
|
||||
/// so every (vs, ps) pair gets compiled once and re-used for every
|
||||
/// subsequent draw. Interpreter pipeline remains the fallback.
|
||||
pipeline_layout: wgpu::PipelineLayout,
|
||||
translated_cache: std::collections::HashMap<(u32, u32), wgpu::RenderPipeline>,
|
||||
/// iterate-3Y: cached translator pipelines keyed on the shader pair AND the
|
||||
/// per-draw render state, so the same (vs,ps) with different blend/mask
|
||||
/// composites correctly. The translated WGSL module is itself cached per
|
||||
/// (vs,ps) so re-translation only happens once.
|
||||
translated_cache: std::collections::HashMap<(u32, u32, RenderState), wgpu::RenderPipeline>,
|
||||
translated_modules: std::collections::HashMap<(u32, u32), wgpu::ShaderModule>,
|
||||
pub target_format: wgpu::TextureFormat,
|
||||
}
|
||||
|
||||
@@ -193,7 +349,9 @@ impl XenosPipeline {
|
||||
draw_index: 0,
|
||||
vertex_count: 3,
|
||||
prim_kind: 4,
|
||||
_pad: 0,
|
||||
vertex_base_dwords: 0,
|
||||
ndc_scale: [0.0, 0.0],
|
||||
ndc_offset: [0.0, 0.0],
|
||||
};
|
||||
let draw_ctx_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
|
||||
label: Some("xenos draw ctx"),
|
||||
@@ -242,8 +400,13 @@ impl XenosPipeline {
|
||||
usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST,
|
||||
view_formats: &[],
|
||||
});
|
||||
// Magenta (255, 0, 255, 255) so a missing-texture read visibly stands
|
||||
// out on-screen when the interpreter does sample it.
|
||||
// iterate-3Y: transparent black (0,0,0,0). When a textured draw's
|
||||
// real texture can't be resolved (e.g. its sampler slot is shadowed by
|
||||
// a vertex-fetch constant), sampling a *transparent* texel makes the
|
||||
// draw a no-op under its real premultiplied-alpha blend — instead of
|
||||
// fabricating an opaque magenta that overpaints everything (the old
|
||||
// debug stub). This removes a fake rather than adding one: we never
|
||||
// invent visible pixels for an unresolved texture.
|
||||
queue.write_texture(
|
||||
wgpu::ImageCopyTexture {
|
||||
texture: &dummy_tex,
|
||||
@@ -251,7 +414,7 @@ impl XenosPipeline {
|
||||
origin: wgpu::Origin3d::ZERO,
|
||||
aspect: wgpu::TextureAspect::All,
|
||||
},
|
||||
&[0xFFu8, 0x00, 0xFF, 0xFF],
|
||||
&[0x00u8, 0x00, 0x00, 0x00],
|
||||
wgpu::ImageDataLayout {
|
||||
offset: 0,
|
||||
bytes_per_row: Some(4),
|
||||
@@ -359,6 +522,8 @@ impl XenosPipeline {
|
||||
|
||||
Self {
|
||||
pipeline,
|
||||
interp_shader: shader,
|
||||
interp_cache: std::collections::HashMap::new(),
|
||||
draw_ctx_buffer,
|
||||
constants_buffer,
|
||||
vs_ucode_buffer,
|
||||
@@ -371,31 +536,22 @@ impl XenosPipeline {
|
||||
dummy_view,
|
||||
pipeline_layout: layout,
|
||||
translated_cache: std::collections::HashMap::new(),
|
||||
translated_modules: std::collections::HashMap::new(),
|
||||
target_format,
|
||||
}
|
||||
}
|
||||
|
||||
/// P7 — does the translator cache already have a pipeline for this
|
||||
/// (vs, ps) pair?
|
||||
/// P7 — has the translator already produced a WGSL *module* for this
|
||||
/// (vs, ps) pair? (A per-render-state pipeline may still need building.)
|
||||
pub fn has_translated(&self, vs_blob_key: u32, ps_blob_key: u32) -> bool {
|
||||
self.translated_cache
|
||||
self.translated_modules
|
||||
.contains_key(&(vs_blob_key, ps_blob_key))
|
||||
}
|
||||
|
||||
/// P7 — fetch a cached translator pipeline. `None` if not yet built.
|
||||
pub fn translated_pipeline(
|
||||
&self,
|
||||
vs_blob_key: u32,
|
||||
ps_blob_key: u32,
|
||||
) -> Option<&wgpu::RenderPipeline> {
|
||||
self.translated_cache
|
||||
.get(&(vs_blob_key, ps_blob_key))
|
||||
}
|
||||
|
||||
/// P7 — compile a translator-produced WGSL module into a
|
||||
/// `wgpu::RenderPipeline` and insert it into the cache keyed on
|
||||
/// `(vs_blob_key, ps_blob_key)`. Returns `true` on success. Duplicate
|
||||
/// inserts are no-ops. Emits `gpu.shader.compile_ok` on success.
|
||||
/// P7 — compile a translator-produced WGSL module and cache it keyed on
|
||||
/// `(vs_blob_key, ps_blob_key)`. The actual `RenderPipeline` (which also
|
||||
/// depends on the per-draw blend/mask state) is built lazily by
|
||||
/// [`render_one_translated`]. Returns `true` on success.
|
||||
pub fn insert_translated(
|
||||
&mut self,
|
||||
device: &wgpu::Device,
|
||||
@@ -404,7 +560,7 @@ impl XenosPipeline {
|
||||
wgsl: &str,
|
||||
) -> bool {
|
||||
let key = (vs_blob_key, ps_blob_key);
|
||||
if self.translated_cache.contains_key(&key) {
|
||||
if self.translated_modules.contains_key(&key) {
|
||||
return true;
|
||||
}
|
||||
let shader = match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
||||
@@ -420,31 +576,42 @@ impl XenosPipeline {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
self.translated_modules.insert(key, shader);
|
||||
metrics::counter!("gpu.shader.compile_ok").increment(1);
|
||||
true
|
||||
}
|
||||
|
||||
/// iterate-3Y: ensure a translator pipeline exists for `(vs,ps,rstate)`,
|
||||
/// building it from the cached module + the per-draw color/blend target.
|
||||
fn ensure_translated_for_state(
|
||||
&mut self,
|
||||
device: &wgpu::Device,
|
||||
vs_key: u32,
|
||||
ps_key: u32,
|
||||
rstate: RenderState,
|
||||
) -> bool {
|
||||
let pkey = (vs_key, ps_key, rstate);
|
||||
if self.translated_cache.contains_key(&pkey) {
|
||||
return true;
|
||||
}
|
||||
let Some(module) = self.translated_modules.get(&(vs_key, ps_key)) else {
|
||||
return false;
|
||||
};
|
||||
let target = rstate.color_target(self.target_format);
|
||||
let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
|
||||
label: Some("xenos translated pipeline"),
|
||||
layout: Some(&self.pipeline_layout),
|
||||
vertex: wgpu::VertexState {
|
||||
module: &shader,
|
||||
module,
|
||||
entry_point: "vs_main",
|
||||
compilation_options: Default::default(),
|
||||
buffers: &[],
|
||||
},
|
||||
fragment: Some(wgpu::FragmentState {
|
||||
module: &shader,
|
||||
module,
|
||||
entry_point: "fs_main",
|
||||
compilation_options: Default::default(),
|
||||
targets: &[Some(wgpu::ColorTargetState {
|
||||
format: self.target_format,
|
||||
blend: Some(wgpu::BlendState {
|
||||
color: wgpu::BlendComponent {
|
||||
src_factor: wgpu::BlendFactor::SrcAlpha,
|
||||
dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha,
|
||||
operation: wgpu::BlendOperation::Add,
|
||||
},
|
||||
alpha: wgpu::BlendComponent::OVER,
|
||||
}),
|
||||
write_mask: wgpu::ColorWrites::ALL,
|
||||
})],
|
||||
targets: &[Some(target)],
|
||||
}),
|
||||
primitive: wgpu::PrimitiveState {
|
||||
topology: wgpu::PrimitiveTopology::TriangleList,
|
||||
@@ -460,30 +627,78 @@ impl XenosPipeline {
|
||||
multiview: None,
|
||||
cache: None,
|
||||
});
|
||||
self.translated_cache.insert(key, pipeline);
|
||||
metrics::counter!("gpu.shader.compile_ok").increment(1);
|
||||
self.translated_cache.insert(pkey, pipeline);
|
||||
true
|
||||
}
|
||||
|
||||
/// Render one draw with the translator-produced pipeline instead of
|
||||
/// the interpreter. Mirrors [`render_one`] except the bound pipeline
|
||||
/// is swapped for `pipeline`.
|
||||
pub fn render_one_with_pipeline(
|
||||
&self,
|
||||
/// iterate-3Y: ensure an interpreter pipeline exists for `rstate`.
|
||||
fn ensure_interp_for_state(&mut self, device: &wgpu::Device, rstate: RenderState) {
|
||||
if self.interp_cache.contains_key(&rstate) {
|
||||
return;
|
||||
}
|
||||
let target = rstate.color_target(self.target_format);
|
||||
let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
|
||||
label: Some("xenos interp pipeline (per-state)"),
|
||||
layout: Some(&self.pipeline_layout),
|
||||
vertex: wgpu::VertexState {
|
||||
module: &self.interp_shader,
|
||||
entry_point: "vs_main",
|
||||
compilation_options: Default::default(),
|
||||
buffers: &[],
|
||||
},
|
||||
fragment: Some(wgpu::FragmentState {
|
||||
module: &self.interp_shader,
|
||||
entry_point: "fs_main",
|
||||
compilation_options: Default::default(),
|
||||
targets: &[Some(target)],
|
||||
}),
|
||||
primitive: wgpu::PrimitiveState {
|
||||
topology: wgpu::PrimitiveTopology::TriangleList,
|
||||
strip_index_format: None,
|
||||
front_face: wgpu::FrontFace::Ccw,
|
||||
cull_mode: None,
|
||||
polygon_mode: wgpu::PolygonMode::Fill,
|
||||
unclipped_depth: false,
|
||||
conservative: false,
|
||||
},
|
||||
depth_stencil: None,
|
||||
multisample: wgpu::MultisampleState::default(),
|
||||
multiview: None,
|
||||
cache: None,
|
||||
});
|
||||
self.interp_cache.insert(rstate, pipeline);
|
||||
}
|
||||
|
||||
/// iterate-3Y: render one draw through the translator pipeline built for
|
||||
/// this draw's render state. Returns `false` if no module is cached for
|
||||
/// `(vs,ps)` (caller should fall back to the interpreter).
|
||||
pub fn render_one_translated(
|
||||
&mut self,
|
||||
device: &wgpu::Device,
|
||||
queue: &wgpu::Queue,
|
||||
encoder: &mut wgpu::CommandEncoder,
|
||||
target_view: &wgpu::TextureView,
|
||||
req: DrawRequest,
|
||||
pipeline: &wgpu::RenderPipeline,
|
||||
) {
|
||||
vs_key: u32,
|
||||
ps_key: u32,
|
||||
rstate: RenderState,
|
||||
) -> bool {
|
||||
if !self.ensure_translated_for_state(device, vs_key, ps_key, rstate) {
|
||||
return false;
|
||||
}
|
||||
let cb = DrawConstants {
|
||||
draw_index: req.draw_index,
|
||||
vertex_count: req.vertex_count.max(3),
|
||||
prim_kind: req.prim_kind,
|
||||
_pad: 0,
|
||||
vertex_base_dwords: req.vertex_base_dwords,
|
||||
ndc_scale: req.ndc_scale,
|
||||
ndc_offset: req.ndc_offset,
|
||||
};
|
||||
queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb));
|
||||
|
||||
let pipeline = self
|
||||
.translated_cache
|
||||
.get(&(vs_key, ps_key, rstate))
|
||||
.expect("just ensured");
|
||||
let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
|
||||
label: Some("xenos translated draw"),
|
||||
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
|
||||
@@ -503,6 +718,7 @@ impl XenosPipeline {
|
||||
pass.set_bind_group(1, &self.tex_bind_group, &[]);
|
||||
let rounded = req.vertex_count.div_ceil(3) * 3;
|
||||
pass.draw(0..rounded.max(3), 0..1);
|
||||
true
|
||||
}
|
||||
|
||||
/// Number of distinct translator pipelines cached. Surfaced to the HUD.
|
||||
@@ -594,22 +810,34 @@ impl XenosPipeline {
|
||||
queue.write_buffer(&self.vertex_buffer, 0, &bytes[..bytes.len().min(max)]);
|
||||
}
|
||||
|
||||
/// Render one captured draw.
|
||||
/// Render one captured draw through the interpreter, using the per-draw
|
||||
/// `rstate` (blend/write-mask) so flat draws composite correctly even
|
||||
/// when their (vs,ps) didn't translate. `RenderState::OPAQUE` reproduces
|
||||
/// the legacy fixed behaviour for procedural/synthetic draws.
|
||||
pub fn render_one(
|
||||
&self,
|
||||
&mut self,
|
||||
device: &wgpu::Device,
|
||||
queue: &wgpu::Queue,
|
||||
encoder: &mut wgpu::CommandEncoder,
|
||||
target_view: &wgpu::TextureView,
|
||||
req: DrawRequest,
|
||||
rstate: RenderState,
|
||||
) {
|
||||
self.ensure_interp_for_state(device, rstate);
|
||||
let cb = DrawConstants {
|
||||
draw_index: req.draw_index,
|
||||
vertex_count: req.vertex_count.max(3),
|
||||
prim_kind: req.prim_kind,
|
||||
_pad: 0,
|
||||
vertex_base_dwords: req.vertex_base_dwords,
|
||||
ndc_scale: req.ndc_scale,
|
||||
ndc_offset: req.ndc_offset,
|
||||
};
|
||||
queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb));
|
||||
|
||||
let pipeline = self
|
||||
.interp_cache
|
||||
.get(&rstate)
|
||||
.expect("just ensured");
|
||||
let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
|
||||
label: Some("xenos draw"),
|
||||
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
|
||||
@@ -624,7 +852,7 @@ impl XenosPipeline {
|
||||
timestamp_writes: None,
|
||||
occlusion_query_set: None,
|
||||
});
|
||||
pass.set_pipeline(&self.pipeline);
|
||||
pass.set_pipeline(pipeline);
|
||||
pass.set_bind_group(0, &self.bind_group, &[]);
|
||||
pass.set_bind_group(1, &self.tex_bind_group, &[]);
|
||||
let rounded = req.vertex_count.div_ceil(3) * 3;
|
||||
@@ -638,6 +866,6 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn draw_constants_layout_matches_wgsl_uniform() {
|
||||
assert_eq!(std::mem::size_of::<DrawConstants>(), 16);
|
||||
assert_eq!(std::mem::size_of::<DrawConstants>(), 32);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user