Compare commits
47 Commits
audit-2BF/
...
iterate-4A
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
23189b95af | ||
|
|
acb29db444 | ||
|
|
dc1320cd4b | ||
|
|
9d24dd0eaa | ||
|
|
c62a355418 | ||
|
|
3f8d3b6f1c | ||
|
|
c0c6088e4d | ||
|
|
f6f3aac673 | ||
|
|
2a992db47b | ||
|
|
89b5c39d8a | ||
|
|
39723dfe37 | ||
|
|
da7c29b6d2 | ||
|
|
1b9918450f | ||
|
|
80fbff8bd1 | ||
|
|
6d8a2817a3 | ||
|
|
a3aa3cc7d6 | ||
|
|
6ff184694d | ||
|
|
504592ac13 | ||
|
|
6bb4355e3d | ||
|
|
3f5d5cf5f7 | ||
|
|
2f55d1fd7d | ||
|
|
a91f4c550b | ||
|
|
66bd805726 | ||
|
|
ad9c8e4cb8 | ||
|
|
873c197ff1 | ||
|
|
1ae472bd2b | ||
|
|
034ec8b47f | ||
|
|
93f60a3ba0 | ||
|
|
2bdb93e51e | ||
|
|
ed2e0e72fd | ||
|
|
f75bc96d17 | ||
|
|
de21c7a544 | ||
|
|
f3b7e8b760 | ||
|
|
7e2603a9e5 | ||
|
|
5aaadfec36 | ||
|
|
0332d1990d | ||
|
|
6271ba1f55 | ||
|
|
48b19e490f | ||
|
|
341196a111 | ||
|
|
b20c99f141 | ||
|
|
db90ad0f7d | ||
|
|
481591fdb2 | ||
|
|
52c30d82a7 | ||
|
|
229b46c765 | ||
|
|
40f208ea4e | ||
|
|
8683fb59ed | ||
|
|
b5885b8560 |
9
.gitignore
vendored
9
.gitignore
vendored
@@ -11,3 +11,12 @@ audit-*.md
|
||||
*.stdout
|
||||
*.stderr
|
||||
*.log
|
||||
|
||||
# Runtime cache artifacts (vkd3d-proton / DXVK shader caches dropped into the
|
||||
# working dir by the Wine canary build)
|
||||
vkd3d-proton.cache*
|
||||
*.dxvk-cache
|
||||
|
||||
# local analysis-DB backups (regenerable; too large to track)
|
||||
*.db.bak*
|
||||
sylpheed.db.bak-*
|
||||
|
||||
101
Cargo.lock
generated
101
Cargo.lock
generated
@@ -418,6 +418,26 @@ version = "0.22.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
|
||||
|
||||
[[package]]
|
||||
name = "bindgen"
|
||||
version = "0.64.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"cexpr",
|
||||
"clang-sys",
|
||||
"lazy_static",
|
||||
"lazycell",
|
||||
"peeking_take_while",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"regex",
|
||||
"rustc-hash 1.1.0",
|
||||
"shlex",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bit-set"
|
||||
version = "0.6.0"
|
||||
@@ -600,6 +620,15 @@ dependencies = [
|
||||
"shlex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cexpr"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
|
||||
dependencies = [
|
||||
"nom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.4"
|
||||
@@ -639,6 +668,17 @@ dependencies = [
|
||||
"inout",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clang-sys"
|
||||
version = "1.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
|
||||
dependencies = [
|
||||
"glob",
|
||||
"libc",
|
||||
"libloading",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.6.0"
|
||||
@@ -1076,6 +1116,20 @@ version = "2.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
|
||||
|
||||
[[package]]
|
||||
name = "ffmpeg-sys-next"
|
||||
version = "6.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c2529ad916d08c3562c754c21bc9b17a26c7882c0f5706cc2cd69472175f1620"
|
||||
dependencies = [
|
||||
"bindgen",
|
||||
"cc",
|
||||
"libc",
|
||||
"num_cpus",
|
||||
"pkg-config",
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "filetime"
|
||||
version = "0.2.27"
|
||||
@@ -1317,6 +1371,12 @@ dependencies = [
|
||||
"xml-rs",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "glob"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
|
||||
|
||||
[[package]]
|
||||
name = "glow"
|
||||
version = "0.13.1"
|
||||
@@ -1898,6 +1958,12 @@ version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
||||
|
||||
[[package]]
|
||||
name = "lazycell"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
||||
|
||||
[[package]]
|
||||
name = "lexical-core"
|
||||
version = "1.0.6"
|
||||
@@ -2139,6 +2205,12 @@ dependencies = [
|
||||
"sketches-ddsketch",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "minimal-lexical"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.8.9"
|
||||
@@ -2262,6 +2334,16 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "7.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
"minimal-lexical",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.50.3"
|
||||
@@ -2325,6 +2407,16 @@ dependencies = [
|
||||
"libm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_cpus"
|
||||
version = "1.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_enum"
|
||||
version = "0.7.6"
|
||||
@@ -2657,6 +2749,12 @@ version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
|
||||
|
||||
[[package]]
|
||||
name = "peeking_take_while"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
||||
|
||||
[[package]]
|
||||
name = "percent-encoding"
|
||||
version = "2.3.2"
|
||||
@@ -4961,8 +5059,10 @@ dependencies = [
|
||||
name = "xenia-apu"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"ffmpeg-sys-next",
|
||||
"thiserror 2.0.18",
|
||||
"tracing",
|
||||
"xenia-memory",
|
||||
"xenia-types",
|
||||
]
|
||||
|
||||
@@ -5025,6 +5125,7 @@ dependencies = [
|
||||
"metrics",
|
||||
"thiserror 2.0.18",
|
||||
"tracing",
|
||||
"xenia-apu",
|
||||
"xenia-cpu",
|
||||
"xenia-gpu",
|
||||
"xenia-hid",
|
||||
|
||||
133
HANDOFF-iterate-4A-milestone2.md
Normal file
133
HANDOFF-iterate-4A-milestone2.md
Normal file
@@ -0,0 +1,133 @@
|
||||
# Handoff — branch `iterate-4A/apu-xma-stage1` (Milestone 2: intro-video / XMA audio + RE tooling)
|
||||
|
||||
Reverse-engineering Project Sylpheed under this Rust Xbox-360 emulator (`xenia-rs`), using Wine
|
||||
xenia-canary as the ground-truth oracle. This branch carries **Milestone 2** work plus major
|
||||
RE-tooling improvements, on top of the (uncommitted-until-now) Milestone-1 renderer history.
|
||||
|
||||
> Method: first-divergence vs canary · fix causes not symptoms · NO faking/masking · measure the
|
||||
> oracle, never infer · refute before believing · ground every claim in evidence.
|
||||
|
||||
---
|
||||
|
||||
## 0. SET UP ON A NEW MACHINE (do this first)
|
||||
|
||||
### a) FFmpeg system libraries — **REQUIRED to build** (crate `xenia-apu` links them via pkg-config)
|
||||
The XMA audio decoder uses `ffmpeg-sys-next` (`crates/xenia-apu/Cargo.toml`:
|
||||
`ffmpeg-sys-next = { version = "6.1", default-features = false, features = ["avcodec"] }`),
|
||||
which links the **system** FFmpeg dev libraries. Install them:
|
||||
|
||||
```bash
|
||||
sudo apt update
|
||||
sudo apt install -y libavcodec-dev libavformat-dev libavutil-dev libswresample-dev pkg-config ffmpeg
|
||||
```
|
||||
|
||||
Verify the toolchain (the XMA path needs the `xma1`/`xma2` decoders — present in distro FFmpeg ≥ ~2015):
|
||||
```bash
|
||||
pkg-config --modversion libavcodec # expect 60.x (this branch built against 60.31)
|
||||
ffmpeg -hide_banner -decoders | grep -iE 'xma1|xma2' # expect: A....D xma1 / A....D xma2
|
||||
```
|
||||
(Decoder note: distro FFmpeg has **no** `AV_CODEC_ID_XMAFRAMES`; we use `AV_CODEC_ID_XMA2` — see
|
||||
`crates/xenia-apu/src/xma2_codec.rs`.) On non-Debian distros install the equivalent `-dev` packages.
|
||||
|
||||
### b) The game ISO (gitignored — `*.iso`)
|
||||
Not in the repo. Place the Project Sylpheed ISO somewhere and create a `sylpheed.iso` symlink to it
|
||||
in the repo root (the run/test commands use `sylpheed.iso`):
|
||||
```bash
|
||||
ln -s "/path/to/Project Sylpheed - Arc of Deception (USA, Europe) (En,Ja).iso" sylpheed.iso
|
||||
```
|
||||
⚠️ For **canary** runs, point at the REAL ISO path, not the symlink (Wine can't resolve the symlink).
|
||||
|
||||
### c) Build — **always cap parallelism** (a default `-j` build OOM-crashed a 15 GB box)
|
||||
```bash
|
||||
export CARGO_BUILD_JOBS=4 # NEVER default -j12; check `free -h` first, drop to -j2 if <4GB free
|
||||
cargo build --release
|
||||
```
|
||||
|
||||
### d) Regenerate the static-analysis DB `sylpheed.db` (gitignored — `*.db`, ~586 MB, ~1h35m)
|
||||
Used by the RE/analysis queries (NOT needed to run the emulator). Rebuild from the ISO:
|
||||
```bash
|
||||
cargo run --release -- dis "/path/to/<the ISO>" --db sylpheed.db
|
||||
# analysis passes run in <1s; the ~1h35m is DuckDB persisting ~1.8M dispatch rows. Be patient.
|
||||
```
|
||||
This branch's analyzer fix (see §3) makes the regenerated DB include the previously-missing XMV
|
||||
engine vtables (`0x8200a1e8`/`0x8200a908`). A local pre-fix backup may exist as
|
||||
`sylpheed.db.bak-pre-vtablefix` (gitignored, not pushed).
|
||||
|
||||
---
|
||||
|
||||
## 1. WHAT'S ON THIS BRANCH (all in this one commit, on top of `acb29db` = iterate-3AL)
|
||||
**Milestone-1 renderer history** (publisher/dev splash renders) is in the ancestry (iterate-2x → 3M →
|
||||
3O → 3AL); pushing this branch carries it. **Milestone 2** + tooling added here:
|
||||
|
||||
### ✅ XMA AUDIO path — BUILT, WORKING, deterministic, tested
|
||||
- `crates/xenia-apu/src/xma.rs` — register-mapped XMA context system (MMIO `0x7FEA0000`, 320×64B
|
||||
context array, Kick/Lock/Clear decode). `xma_decode.rs` + `xma2_codec.rs` — the real FFmpeg
|
||||
`xma2` decoder (XMA_CONTEXT_DATA bitfields, BitStream packet parse, planar-f32→S16BE PCM).
|
||||
Decode runs synchronously on the CPU thread (deterministic, no host thread). Wired via
|
||||
`KernelState.xma` (`state.rs`), exports (`exports.rs`), `xaudio.rs` (`XAudioSubmitRenderDriverFrame`
|
||||
made faithful), `main.rs` (MMIO install + per-round pump).
|
||||
- **Audio-worker scheduler fix** (`main.rs` LR_HALT restore + `scheduler.rs`): the XAudio render
|
||||
callback worker was wrongly exited after ~2 deliveries → fixed → the guest now drives XMA decode.
|
||||
- Verified: real PCM out; golden `sylpheed_n50m` **re-baselined** (`crates/xenia-app/tests/golden/`)
|
||||
and PASSES; milestone-1 splash intact; apu/cpu/kernel tests pass.
|
||||
|
||||
### 🛠️ RE TOOLING (this branch's lasting wins)
|
||||
- **Runtime dispatch-recorder** `crates/xenia-cpu/src/dispatch_rec.rs` — records `(call-site → target,
|
||||
r3, lr)` for every indirect (`bcctr`-family) call. Off by default; enable with `XENIA_DISPATCH_REC=1`,
|
||||
optional filters `XENIA_DISPATCH_REC_TARGETS=<hex,…>` / `_SITES=<hex,…>`, dumps to
|
||||
`XENIA_DISPATCH_REC_OUT` (default `/tmp/dispatch_rec.txt`). Deterministic, observe-only.
|
||||
- **Repaired static analyzer** `crates/xenia-analysis/src/vtables.rs` — the vtable extractor silently
|
||||
**fragmented vtables with non-function head slots** (missed the XMV engine vtable entirely →
|
||||
blocked ~6 investigations). Fixed via **vptr-write-anchoring** (find `addis/addi → stw rX,0(rThis)`
|
||||
constant-vptr installs; read the fnptr run from each anchor). Result on rebuild: vtables 722→1150,
|
||||
dispatch candidates 688K→1.83M, engine fully typed. (Requires the §0d DB rebuild to take effect.)
|
||||
- **Probe Heisenbug FIXED** (`main.rs run_superblock`) — `--audit-pc-probe-hex` / `--mem-watch` used to
|
||||
**disable superblock chaining**, which changed thread scheduling and *starved the movie subsystem*
|
||||
so the probes couldn't observe it. Now probes fire *inside* the chain loop → scheduling is identical
|
||||
armed-vs-unarmed (verified byte-identical golden) → the probe suite is finally usable on the movie
|
||||
subsystem. Also fixed a `--quiet` bug that swallowed armed `--trace-handles`/`--dump-addr` reports.
|
||||
|
||||
---
|
||||
|
||||
## 2. CURRENT STATE & WHERE TO CONTINUE (the video still doesn't play)
|
||||
**Audio works; the intro VIDEO doesn't play yet.** Root cause, runtime-pinned: a 2000ms readiness timeout
|
||||
(`sub_821B66B8`) abandons because the XMV engine (`0x40d101c0`, runtime vtable `0x8200a1e8`) never
|
||||
**primes** — engine begin-playback `sub_825076F0` (slot 21) is **never dispatched** (0×), so the
|
||||
per-frame full-start always takes its skip branch and the playback clock never starts.
|
||||
- **Classification: (B) guest-side state machine.** The gate fields are the engine's *correct* reset
|
||||
defaults → there is **NO honest our-side fix at the gate** (forcing them = masking, forbidden). The
|
||||
defect is upstream: the guest SM reaches "create decoder (success)" but never issues begin-playback.
|
||||
- **Latest narrowing (evidence, fixed probes):** ARM2-setup `sub_821B55D8` runs once, create-decoder
|
||||
`sub_824F8398` succeeds, and ARM2 then calls engine-setup wrappers
|
||||
**`sub_824F7778` / `sub_824F7630` / `sub_824F7558` / `sub_824F7538` / `sub_824FCB68`** (on
|
||||
`[movie+104]`=engine) — the begin-playback dispatch is gated **inside one of these**. Tracing them
|
||||
(now possible with the fixed probes) for the begin-playback gate + why ours never satisfies it is
|
||||
**the next step**. The likely ultimate unlock is **measuring canary** (same XEX reaches begin-playback)
|
||||
to find the upstream state/signal we don't produce.
|
||||
|
||||
Full, evidence-grounded detail (engine/vtable/slot map, the eliminations, the investigation arc, the
|
||||
method lessons) lives in the agent-memory grounding file referenced in the project memory index
|
||||
(`milestone2_xma_grounding`). Key anchors: engine `0x40d101c0` vtable `0x8200a1e8` — PUMP slot19
|
||||
`sub_825078D8`, begin-playback slot21 `sub_825076F0`, submit slot27 `sub_82505C08`, full-start slot40
|
||||
`sub_825061E0`; movie host `0x40bb0440` (engine at `[host+104]`); SM ARM1 `sub_821B4C98` → ARM2
|
||||
`sub_821B55D8` → ARM3 `sub_821B5FB8` → poll `sub_821B66B8`.
|
||||
|
||||
### Useful commands
|
||||
```bash
|
||||
# Headless run to the video state (~30-40s, ~1B instr); add diagnostic flags as needed:
|
||||
./target/release/xenia-rs exec sylpheed.iso -n 6000000000 --quiet
|
||||
# Non-perturbing PC probes (now usable on the movie subsystem):
|
||||
RUST_LOG=warn,xenia_apu=info XENIA_AUDIT_PC_PROBE=0x825078d8,0x82505c08 \
|
||||
./target/release/xenia-rs exec sylpheed.iso -n 6000000000 --quiet
|
||||
# Dispatch recorder (filtered):
|
||||
XENIA_DISPATCH_REC=1 XENIA_DISPATCH_REC_TARGETS=0x825076f0,0x82505c08 \
|
||||
./target/release/xenia-rs exec sylpheed.iso -n 6000000000 --quiet
|
||||
# Golden / determinism check:
|
||||
CARGO_BUILD_JOBS=4 cargo test -p xenia-app --release --test sylpheed_oracles -- --ignored sylpheed_n50m
|
||||
# Visual (watch the splash; ASK a human to watch — never self-screenshot):
|
||||
./target/release/xenia-rs exec sylpheed.iso --ui
|
||||
```
|
||||
⚠️ Probe/run discipline: kill background runs by pid or `pkill -x xenia-rs` (NEVER `pkill -f`, it
|
||||
self-matches the launcher). Runs are deterministic (instruction-count clock).
|
||||
|
||||
🤖 Generated with [Claude Code](https://claude.com/claude-code)
|
||||
0
audit-runs/audit-009/branch-probe.trace
Normal file
0
audit-runs/audit-009/branch-probe.trace
Normal file
131
audit-runs/audit-059-handle-disambiguation/FINDINGS.md
Normal file
131
audit-runs/audit-059-handle-disambiguation/FINDINGS.md
Normal file
@@ -0,0 +1,131 @@
|
||||
# AUDIT-059 — handle disambiguation (iterate 2.BD)
|
||||
|
||||
**Date:** 2026-06-06. **Engines:** ours `target/release/xenia-rs -n 50M` (3.9 s wall, 50M instr, 40k import calls), canary Wine `xenia_canary.exe --mute=true --audit_handle_lifecycle=true` (~35 s wall, 34k log lines, 0 fatals).
|
||||
|
||||
## Verdict — HANDOFF's wedge handles are stale
|
||||
|
||||
HANDOFF said: *"opt_callback signals 0x108c, tid=1 wedges on 0x10e8."* Both IDs are now `<UNCREATED>` in ours, along with `0x1090 / 0x10dc / 0x10fc / 0x1104` (also in HANDOFF's adjacent list). The allocation order shifted since that snapshot.
|
||||
|
||||
## Real wedges, current code state
|
||||
|
||||
| Handle | Kind | Engine state | Waiter | Notes |
|
||||
|---|---|---|---|---|
|
||||
| **0x12a4** | `<UNCREATED>` | `<AUDIT_BLIND>`, waiters=1 | **tid=1 main**, pc=0x824ac578 | Wait went via `do_wait_single` but creation never hit `NtCreateEvent` — `KeInitializeEvent` path. **This is the iterate-2.BC wedge** (recorded as "0x10e8" in HANDOFF — same site, different ID). |
|
||||
| **0x12ac** | Event/Auto | `<NO_SIGNALS_DESPITE_WAITS>`, waiters=1 | **tid=13** silph UI cluster, pc=0x824ac578 lr=0x821cb1e0 | Frame trail: `0x821cb1e0 → 0x821cbae0 → 0x821cc454 → 0x821c4f18 → 0x82174a80`. Frames 3-5 carry `silph::UImpl@GamePart_Title` / `silph::VGamePart_Title` vtables — **audit-049's cluster, unchanged**. |
|
||||
| 0x12b8 | Event/Auto | NO_SIGNALS, waiters=1 | (tid TBD) | Sibling, 0xC bytes from 0x12ac. |
|
||||
| 0x1020 | Event/Manual | NO_SIGNALS, waiters=1 | — | γ-class. |
|
||||
| 0x1040 | Event/Auto | NO_SIGNALS, waits=32 (hot poll) | — | Heavy wait, no signal. |
|
||||
| 0x10a8 | Event/Auto | NO_SIGNALS, waits=7 | — | γ-class. |
|
||||
| 0x10e4 | Event/Manual | NO_SIGNALS, waiters=1, waits=2 | — | γ-class. |
|
||||
|
||||
**Working handles** (sanity baseline): 0x1028 (Sema, 8 waits / 7 signals / 7 wakes), 0x10d0 (Sema, 2 waits / 1 signal / 1 wake), 0x10f0 (Event/Auto, 1 wait / 1 signal / 1 wake ✓ — marked `<SUSPECT>` but actually fine), 0x10e0 (Event/Manual, 32 primary signals from somewhere).
|
||||
|
||||
## GPU interrupt delivery — the iterate-2.BC delta confirmed
|
||||
|
||||
| Engine | gpu.interrupt.delivered (vsync) | EmulateCPInterruptDPC / vblank pump |
|
||||
|---|---:|---:|
|
||||
| **ours** | 54 (source=0) + 1 (source=1) | — |
|
||||
| **canary** | — | **4712** in 30 s ≈ 157 Hz |
|
||||
|
||||
**~87× ratio.** Confirms HANDOFF's diagnosis: ours' victim-thread injector dies once guest threads all park; canary's host frame-limiter thread keeps firing regardless.
|
||||
|
||||
## Canary signaler attribution
|
||||
|
||||
Top KeSetEvent guest_ptrs in canary (30 s window):
|
||||
|
||||
| guest_ptr | KeSetEvent fires | Inferred role |
|
||||
|---|---:|---|
|
||||
| `0x828A3254` | 5729 | Audio host-pump worker (per AUDIT-032: `r3=0x828A3230` region) |
|
||||
| `0x828A3244` | 5728 | Audio host-pump sibling |
|
||||
| `0x828A3244` + 16-byte stride | — | Static XEX-image audio event struct |
|
||||
| `0xBCE25234` | 1301 | **silph UI cluster PKEVENT** (heap-allocated, 0x10 stride). Likely ours' 0x12ac analog. |
|
||||
| `0xBCE25214 / 0xBCE25244 / 0xBCE25224` | 648 / 603 / 603 | Sibling silph UI PKEVENTs (0x10 stride struct). Likely ours' 0x12a4 / 0x12b8 / 0x1040 analogs. |
|
||||
|
||||
Ours signals every one of those equivalents **0 times**.
|
||||
|
||||
## Round 2 — LR-extended probes name the producer
|
||||
|
||||
Extended the canary probes with guest-LR capture (5 sites in `xboxkrnl_threading.cc`, 10 LOC). Re-ran the harness. Now each `KeSetEvent` line carries the guest function that signaled the event. Result for the silph UI cluster:
|
||||
|
||||
| PKEVENT | KeSetEvent count | Producer LR(s) |
|
||||
|---|---:|---|
|
||||
| `0xBCE25214` | 574 | `0x82508510` (single producer) |
|
||||
| `0xBCE25224` | 565 | `0x82508358` (single producer) |
|
||||
| `0xBCE25234` | 1153 | `0x82506C90` (579) + `0x82508524` (574) |
|
||||
| `0xBCE25244` | 570 | `0x82506F9C` (single producer) |
|
||||
| `0xBCE25284` | 1 | `0x82507ABC` (one-shot 5th-worker init?) |
|
||||
|
||||
All 6 producer LRs sit in `0x82506000–0x82509000`. **This is exactly the `sub_825070F0` worker thread cluster** that audit-057/058 already named:
|
||||
|
||||
> *audit-057: "sub_825070F0 (4 missing, initializes 4 workers w/ shared ctx 0xBCE25340, entries 0x82506528/58/88/B8)"*
|
||||
|
||||
The 4 worker entries (`0x82506528/58/88/B8`) are inside `sub_82506xxx` — exactly where the producer LRs `0x82506C90`/`0x82506F9C` live. The other producer LRs `0x825083xx` / `0x825085xx` are in downstream callees (workers call deeper code which itself calls KeSetEvent).
|
||||
|
||||
For comparison the audio host-pump pair gets a single sharp producer too:
|
||||
- `0x828A3254` × 5271 ← `lr=0x824D2A44`
|
||||
- `0x828A3244` × 5271 ← `lr=0x824D292C`
|
||||
|
||||
(These match AUDIT-032's PC `0x824D229C / r3=0x828A3230` region — already-understood audio host-pump.)
|
||||
|
||||
## Verdict — 2.BE is INSUFFICIENT for the silph UI wedge
|
||||
|
||||
The silph UI PKEVENTs are signaled exclusively by threads spawned by `sub_825070F0`. Per audit-057/058, **`sub_825070F0` fires 0× in ours** — those 4 worker threads never spawn. Therefore the PKEVENTs are never signaled. Therefore tid=13 (`0x12ac` in ours) wedges forever.
|
||||
|
||||
**`sub_825070F0`'s call chain is gated by the audit-009 "unreachability island"** — a CRT-driven fnptr-array bootstrap that ours fails to enumerate. VSync delivery is irrelevant to that bootstrap; the host frame-limiter thread does not drive CRT initializers.
|
||||
|
||||
Therefore:
|
||||
- **2.BE alone CANNOT unwedge tid=13.** It will close the 54-vs-4712 VSync delivery gap and may unblock things downstream of vsync, but the silph UI wedge has an independent missing-signaler root cause.
|
||||
- **2.BE may still unwedge tid=1 main on `0x12a4`** — that wait went via `KeInitializeEvent` (handle never hit `NtCreateEvent` in ours, hence `<AUDIT_BLIND>`). Whether `0x12a4`'s signaler depends on VSync is unknown without further probing.
|
||||
|
||||
## Implications for next moves
|
||||
|
||||
A single fix won't take us to draws > 0. We need at least two:
|
||||
|
||||
1. **2.BE (VSync delivery)** — still worth landing for the architectural correctness it brings, AND because it's the only fix that can unwedge tid=1 main's `0x12a4` if that's vsync-derived. ~60–80 LOC per Agent C's plan.
|
||||
2. **2.BF (sub_825070F0 activation)** — this is the audit-058 unfinished business. Options:
|
||||
- (a) **Static work:** trace canary's CRT-driven fnptr-array path that activates the silph UI bootstrap; backport the missing init into ours. High info, slow. Requires more probing.
|
||||
- (b) **Direct synthetic spawn:** ours injects host-side `ExCreateThread` calls for the 4 worker entries at boot completion, mirroring AUDIT-048's audio-host-pump precedent. Pragmatic; ~40 LOC; risks getting context (`0xBCE25340`) wrong.
|
||||
|
||||
A possible third move:
|
||||
|
||||
3. **Re-probe with LR on Wait paths** (the LR capture is already in the probes; we just haven't grepped the log for it) — this would tell us whether tid=1's wait on `0x12a4` is served by the same `sub_825070F0` chain or by a totally different signaler. If different, it's a 3rd missing producer.
|
||||
|
||||
## Round 4 — wait-side guest LR via one-frame back-chain walk
|
||||
|
||||
After fixing the PPC stack-walk offset (Xbox 360 stores saved LR at `[prev_sp - 8]`, not at `[prev_sp + 4]` as in the 32-bit PowerPC SysV convention), wait-side LR comes through cleanly.
|
||||
|
||||
**Canary's top wait sites:**
|
||||
|
||||
| canary handle | wait count | guest_lr | LR region | mapping |
|
||||
|---|---:|---|---|---|
|
||||
| `F800005C` | 1635 | `0x8216EE14` | kernel early-boot infra | unrelated |
|
||||
| `F800000C` | 1597 | `0x824AFFC4` | xboxkrnl wrapper (scheduler / work-queue?) | unrelated |
|
||||
| **`F80000DC`** | **476** | **`0x821C7D3C`** | **silph::UImpl/GamePart** | **= ours' 0x12ac silph UI wedge** |
|
||||
| `F80000B0` | 6 across | `0x821CBAE0` + `0x821CC19C` + `0x822DFE2x/D0` | **exact match with audit-049's frame trail** | sibling silph UI wait |
|
||||
|
||||
Identity proof: ours' audit-049 frame trail for the silph UI wedge was `0x821cb1e0 / 0x821cbae0 / 0x821cc454 / 0x821c4f18 / 0x82174a80`. Round 4 captures `0x821CBAE0` and `0x821CC19C` (adjacent PCs) as wait LRs in canary — same cluster, same code.
|
||||
|
||||
**Refined verdict.** ours' `0x12a4` (tid=1 main, AUDIT_BLIND) and `0x12ac` (tid=13 silph UI) are 8 bytes apart — likely sibling KEVENT fields in the same silph UI struct. canary's analogs are in the `F80000xx` namespace, similarly clustered. The single fix that addresses both:
|
||||
|
||||
> **2.BF (b)** — synthetic host-side spawn of `sub_825070F0`'s 4 workers at the audit-058-identified context (`0xBCE25340`), entries `0x82506528/58/88/B8`. Once those workers run, they signal the silph UI PKEVENT cluster, unwedging BOTH tid=1 main and tid=13 silph UI in one shot.
|
||||
|
||||
2.BE (host-driven VSync ISR delivery) becomes follow-on work after the UI bootstrap completes and frame pacing actually matters.
|
||||
|
||||
## Open questions for iterate 2.BD′ / 2.BE planning
|
||||
|
||||
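1. **Does 2.BE alone unwedge tid=13?** Cheapest verification path: land 2.BE and re-run audit-059, see whether `0x12ac` signal count goes 0 → non-zero. *(The Round 2 verdict above argues it will not — the silph UI PKEVENTs are signaled only by the `sub_825070F0` workers, which never spawn in ours — but this run would confirm it empirically.)*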
|
||||
2. **What is the LR-pattern of canary's `KeSetEvent guest_ptr=0xBCE25234` callers?** The current probe doesn't capture LR — extending the cvar to do so on a filtered subset would let us name the producer function in canary's namespace. *(Answered by Round 2 above: producer LRs `0x82506C90` and `0x82508524`.)*
|
||||
3. **Does the GPU frame-limiter's CP interrupt actually walk into the silph UI cluster?** I.e., does `EmulateCPInterruptDPC` → `interrupt_callback` → guest code ever hit `sub_821CB030` or its callees? An LR probe inside `EmulateCPInterruptDPC` would answer this.
|
||||
|
||||
## Artifacts
|
||||
|
||||
- `canary.log` 2.2 MB / 34,095 lines / 32,977 AUDIT-HLC lines
|
||||
- `canary.stdout` 2.2 MB (duplicate of canary.log due to log_file fallback)
|
||||
- `canary.stderr` 8.4 KB (Wine diagnostics)
|
||||
- `ours.log` 479 lines (focus ledger + thread diagnostics + final state)
|
||||
- `ours.stderr` 317 lines (kernel-call counters)
|
||||
- `vkd3d-proton.cache.write` 15 KB (build artifact, ignored)
|
||||
|
||||
Commits in play (xenia-canary, fork-local only):
|
||||
- `03362b59f` cross-build-wine (cross-compile toolchain)
|
||||
- `d031d7c51` audit-handle-lifecycle-probes (this audit's probes)
|
||||
116
audit-runs/audit-059-handle-disambiguation/ROUND_34_PLAN.md
Normal file
116
audit-runs/audit-059-handle-disambiguation/ROUND_34_PLAN.md
Normal file
@@ -0,0 +1,116 @@
|
||||
# Round 34 — silph_ui_synth.rs (cluster B sibling) — DEFERRED PLAN
|
||||
|
||||
## Background
|
||||
|
||||
Rounds 23-33 drove γ-cluster #2 down to the actual gate: **`sub_821741C8`** (silph worker-dispatch loop) fires 0× in ours / 471× in canary (tid=6). It's invoked via dynamic vtable slot 9 from `sub_821752C0` thunk. The vtable writer is in the audit-050 unreachability island — there's no static caller chain to hook into.
|
||||
|
||||
The fix shape is a synth module analogous to `silph_synth.rs` (rounds 18-21):
|
||||
- Synthesize a singleton-like object with the right vtable
|
||||
- Spawn a guest thread at the right entry with this object as r3
|
||||
- Let the dispatch chain do the rest
|
||||
|
||||
Rounds 18-21 took 4 rounds to land cluster A's analog and ended at "workers run live but idle" because of missing foreign-pointer fields. Cluster B will face similar challenges.
|
||||
|
||||
## Sub-round breakdown (estimated 7-9 rounds)
|
||||
|
||||
### 34.α — Probe canary's dispatcher singleton (1 round)
|
||||
Capture canary's runtime state at `sub_821741C8` entry:
|
||||
- `r3 = 0xBCA44C00` (canary tid=6's dispatcher singleton)
|
||||
- Dump `r3..r3+0x80` to identify all fields
|
||||
- Note vtable address at `[r3+0]`
|
||||
|
||||
```bash
|
||||
WINEDEBUG=-all wine xenia_canary.exe --mute=true --audit_handle_lifecycle=true \
|
||||
--audit_jit_prolog_pc=0x821741C8 --audit_jit_prolog_r3_bytes=128 \
|
||||
--audit_jit_prolog_mem_dump=<vtable_va_from_r3+0> \
|
||||
...
|
||||
```
|
||||
|
||||
### 34.β — Probe full vtable layout (1 round)
|
||||
Read the vtable bytes statically from the PE (canary's `[r3+0]` IS a static XEX VA — same trick as round 21):
|
||||
- Read 32-64 slots from PE at file offset = vtable VA - 0x82000000
|
||||
- Confirm slot 9 = `sub_821C7CB8` and `vtable+0x24` thunk to `sub_821741C8`
|
||||
- Look at all other slots — do any reference deep guest code that needs more init?
|
||||
|
||||
Cross-reference each slot's DB reach. If a slot is the dispatcher's own method body, it'll be called from within the chain — needs to exist.
|
||||
|
||||
### 34.γ — Skeleton synth + thread spawn (1 round)
|
||||
Create `crates/xenia-kernel/src/silph_ui_synth.rs` mirroring `silph_synth.rs` structure:
|
||||
```rust
|
||||
pub fn spawn_silph_ui_dispatcher(state: &mut KernelState, mem: &GuestMemory, scheduler: &mut Scheduler) -> Result<u32, &'static str> {
|
||||
if state.silph_ui_synth_done { return Ok(state.silph_ui_synth_ctx); }
|
||||
|
||||
// Allocate ~0x100-0x200 bytes for the dispatcher singleton
|
||||
let ctx = state.heap_alloc(0x200, 16)?;
|
||||
mem.write_zeros(ctx, 0x200);
|
||||
|
||||
// Install static-XEX vtable at [+0]
|
||||
mem.write_u32(ctx + 0x00, VTABLE_VA); // discovered in 34.β
|
||||
|
||||
// Other init fields from 34.α dump
|
||||
// ...
|
||||
|
||||
// Spawn dispatcher thread at sub_821748F0 with r3=ctx
|
||||
scheduler.spawn(SpawnParams{
|
||||
entry: 0x821748F0,
|
||||
start_context: ctx,
|
||||
create_suspended: false,
|
||||
...
|
||||
})?;
|
||||
|
||||
state.silph_ui_synth_done = true;
|
||||
state.silph_ui_synth_ctx = ctx;
|
||||
Ok(ctx)
|
||||
}
|
||||
```
|
||||
|
||||
Hook point: first reach of `sub_821CB030` in the existing silph factory chain (the call site that should normally trigger this dispatcher's creation in canary).
|
||||
|
||||
Add 3-mode env gate: `XENIA_SILPH_UI_SYNTH={unset|=suspend|=1}`.
|
||||
|
||||
### 34.δ — Run + diagnose first crash (1 round)
|
||||
Almost certainly crashes on a NULL deref of one of the singleton's fields. Use round 19's pattern:
|
||||
- Probe at thread entry + early BB heads
|
||||
- Identify the offset that's accessed
|
||||
- Compare to canary's value at that offset
|
||||
|
||||
### 34.ε..η — Iterate on field fills (2-4 rounds)
|
||||
Each crash identifies one more required field. Fill it. Re-run. Continue until workers idle (verdict D analog).
|
||||
|
||||
### 34.θ — Producer-side seeding (1 round)
|
||||
Even with the dispatcher running, work-items may not flow. Per round 32 it's pool 3 that's starved (271 fires in canary). The producers are `sub_821CBEA8 / sub_821D24A0 / sub_821CD458` — they may need their own bootstrap. Probe what triggers them in canary.
|
||||
|
||||
## Verification at each stage
|
||||
|
||||
After every commit:
|
||||
- `cargo test --release --workspace` — 765/765 must pass
|
||||
- `XENIA_CACHE_PERSIST=1 XENIA_SILPH_UI_SYNTH=1 ./target/release/xenia-rs exec <ISO> -n 50000000 --trace-handles-focus=0x1218,0x1224,0x12a4,0x12ac`
|
||||
- Check:
|
||||
- No crash
|
||||
- `sub_821741C8` fires
|
||||
- `sub_82450b68` r4=3 fires increase
|
||||
- Handle 0x1224 / 0x1218 transition out of NO_SIGNALS_DESPITE_WAITS
|
||||
- Eventually: `VdSwap > 1, draws > 0`
|
||||
|
||||
## Risk register
|
||||
|
||||
- **High**: dispatcher singleton may require many more fields than the analog WorkerCtx (rounds 18-21 needed 8 KEVENTs + ring + descriptors + index table; UI dispatcher likely has similar scope)
|
||||
- **High**: foreign-arena pointers in canary's heap (similar to round 19's `[+0x28/+0x2C/+0x30]`) may need their own synthesis
|
||||
- **Medium**: cluster B's worker may itself spawn threads which need contexts which need... cascading scope
|
||||
- **Low**: workspace tests breaking (probe infrastructure is solid)
|
||||
- **Low**: existing iterate-2BE work regressing (it's on a separate branch)
|
||||
|
||||
## Off-ramps
|
||||
|
||||
If we hit a wall at any sub-round, the off-ramps are:
|
||||
1. Land the infrastructure as opt-in (rounds 18-21 pattern) and ship cluster A + cluster B both as opt-in env vars
|
||||
2. Drop cluster B entirely and PR the iterate-2BE work to master (production-ready architectural fix)
|
||||
3. Pivot to lockstep diff of inflate function (round 30 hypothesis (i)) if cluster B keeps producing crash-fix layers
|
||||
|
||||
## Branch plan
|
||||
|
||||
New branch: `iterate-2BF/silph-ui-synth` off `iterate-2BF/synthetic-silph-spawn` HEAD `40f208e`. Each sub-round = 1 commit. All commits opt-in via env var; default behavior unchanged.
|
||||
|
||||
## When ready to execute
|
||||
|
||||
Dispatch using the prompt recommended by the round-33 agent, starting at sub-round 34.α.
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,66 @@
|
||||
AUDIT-PC-PROBE pc=0x8216ea68 tid=1 hw=0 cycle=5362918 lr=0x824ab8e0 r3=0x00000000 r11=0x00000000 [r3+0]=0x00000000 [[r3+0]+24]=0x00000000 [r3+0x0C]=0x00000000 [r3+0x30]=0x00000000
|
||||
AUDIT-PC-PROBE pc=0x822f1aa8 tid=1 hw=0 cycle=6181256 lr=0x8216ee14 r3=0x40d09a40 r11=0x40111910 [r3+0]=0x00000021 [[r3+0]+24]=0x00000000 [r3+0x0C]=0x40541a40 [r3+0x30]=0x00000000
|
||||
AUDIT-PC-PROBE pc=0x822f1b38 tid=1 hw=0 cycle=6181641 lr=0x822f1b38 r3=0x00000001 r11=0x824b0000 [r3+0]=0x00000000 [[r3+0]+24]=0x00000000 [r3+0x0C]=0x00000000 [r3+0x30]=0x00000000
|
||||
AUDIT-PC-PROBE pc=0x821746b0 tid=1 hw=0 cycle=9229300 lr=0x82173c38 r3=0x40ba9a80 r11=0x00000000 [r3+0]=0x40111910 [[r3+0]+24]=0x00000000 [r3+0x0C]=0x00000000 [r3+0x30]=0x00000000
|
||||
AUDIT-PC-PROBE pc=0x821748f0 tid=13 hw=1 cycle=0 lr=0xbcbcbcbc r3=0x4024a840 r11=0x00000000 [r3+0]=0x40ba9a80 [[r3+0]+24]=0x00000000 [r3+0x0C]=0x00000000 [r3+0x30]=0x4250dec0
|
||||
|
||||
=== Final State ===
|
||||
PC: 0x00000000
|
||||
LR: 0xbcbcbcbc
|
||||
CTR: 0x00000000
|
||||
CR: 0x00000000
|
||||
XER: CA=0 OV=0 SO=0
|
||||
|
||||
=== Thread diagnostics ===
|
||||
hw=0 idx=0 tid=1 state=Blocked(WaitAny { handles: [4208], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x700ff6e0
|
||||
r0=0x82153bf0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x03a72328
|
||||
r8=0x43b77284 r9=0x43b77328 r10=0x00000001 r11=0x00000103 r12=0x82173c64 r13=0x7fff0000
|
||||
hw=0 idx=1 tid=11 state=Blocked(WaitAny { handles: [2190094916, 2190094880], deadline: None }) pc=0x824d2a94 lr=0x824d2a94 sp=0x71497d90
|
||||
r0=0x00000000 r3=0x00000000 r4=0x71497de0 r5=0x00000001 r6=0x00000003 r7=0x00000001
|
||||
r8=0x00000000 r9=0x00000000 r10=0x71497df0 r11=0x828a3244 r12=0xbcbcbcbc r13=0x4b9f1000
|
||||
hw=1 idx=0 tid=2 state=Blocked(WaitAny { handles: [2189887804], deadline: None }) pc=0x824a95f8 lr=0x824a95f8 sp=0x710ffd20
|
||||
r0=0x0000030c r3=0x00000000 r4=0x00000003 r5=0x00000001 r6=0x00000000 r7=0x00000000
|
||||
r8=0x00000001 r9=0x6f000000 r10=0x824a9178 r11=0x82870000 r12=0x824a94f0 r13=0x4acc3000
|
||||
hw=1 idx=1 tid=13 state=Blocked(WaitAny { handles: [4216], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x715a7a20
|
||||
r0=0x821511d0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x03a723d0
|
||||
r8=0x43b77334 r9=0x43b77334 r10=0x40541f80 r11=0x00000001 r12=0x821cb1e0 r13=0x4d1d4000
|
||||
hw=2 idx=0 tid=7 state=Blocked(WaitAny { handles: [1111821148], deadline: Some(42946672) }) pc=0x824cd4f4 lr=0x824cd4f4 sp=0x71187e60
|
||||
r0=0x00000000 r3=0x00000000 r4=0x00000003 r5=0x00000001 r6=0x00000000 r7=0x71187eb0
|
||||
r8=0x00000000 r9=0x00000000 r10=0x00000002 r11=0x00000002 r12=0xbcbcbcbc r13=0x4b1d6000
|
||||
hw=2 idx=1 tid=8 state=Blocked(WaitAny { handles: [4176, 4128], deadline: None }) pc=0x824ab214 lr=0x824ab214 sp=0x71287c90
|
||||
r0=0x00000000 r3=0x00000000 r4=0x71287cf0 r5=0x00000001 r6=0x00000001 r7=0x00000000
|
||||
r8=0x00000000 r9=0x00009030 r10=0x00000002 r11=0x00000020 r12=0x822f1ff0 r13=0x4b90a000
|
||||
hw=3 idx=0 tid=4 state=Blocked(WaitAny { handles: [4120], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x7112fb80
|
||||
r0=0x821511a0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x03a723d0
|
||||
r8=0x43b7732c r9=0x828f0000 r10=0x00000008 r11=0x00000000 r12=0x8245a660 r13=0x4adc6000
|
||||
hw=3 idx=1 tid=5 state=Blocked(WaitAny { handles: [4224], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x7116fbe0
|
||||
r0=0x821511a0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x03a723d0
|
||||
r8=0x43b7732c r9=0x828f0000 r10=0x00000001 r11=0x00000000 r12=0x82458b34 r13=0x4adc8000
|
||||
hw=4 idx=0 tid=9 state=Ready pc=0x824d140c lr=0x824d22b4 sp=0x71387df0
|
||||
r0=0x00000000 r3=0x4250dedc r4=0x4250e040 r5=0x00000001 r6=0x00000000 r7=0x00000000
|
||||
r8=0x4b9ec000 r9=0x01010000 r10=0x01010000 r11=0x00000000 r12=0x824d22a8 r13=0x4b9ec000
|
||||
hw=5 idx=0 tid=3 state=Blocked(WaitAny { handles: [4112], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x7111fdf0
|
||||
r0=0x82153bf0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x00000a10
|
||||
r8=0x00000010 r9=0x00000000 r10=0x00009030 r11=0x00000000 r12=0x82181988 r13=0x4adc4000
|
||||
hw=5 idx=1 tid=6 state=Ready pc=0x824ab214 lr=0x824ab214 sp=0x7117fc60
|
||||
r0=0x821511a0 r3=0x00000001 r4=0x7117fcc0 r5=0x00000001 r6=0x00000001 r7=0x00000000
|
||||
r8=0x7117fcb0 r9=0x00009030 r10=0x00000002 r11=0x00000020 r12=0x82458d68 r13=0x4adca000
|
||||
hw=5 idx=2 tid=10 state=Ready pc=0x824d1404 lr=0x824d22b4 sp=0x71487e00
|
||||
r0=0x00000000 r3=0x4250dedc r4=0x4250e040 r5=0x00000001 r6=0x00000000 r7=0x00000000
|
||||
r8=0x4b9ee000 r9=0x01010000 r10=0x01010000 r11=0x00000000 r12=0x824d22a8 r13=0x4b9ee000
|
||||
hw=5 idx=3 tid=12 state=Ready pc=0x824aa6a4 lr=0x824aa6a4 sp=0x714a7da0
|
||||
r0=0x00000000 r3=0x000000ff r4=0x00000020 r5=0x714a7df4 r6=0x00000000 r7=0x00000000
|
||||
r8=0x00000000 r9=0x00000000 r10=0x00000000 r11=0x00000001 r12=0x8217898c r13=0x4d1d2000
|
||||
|
||||
-- Handle waiter lists --
|
||||
handle=0x00001020 Semaphore(0/2147483647) waiters(tid)=[8]
|
||||
handle=0x42450b5c Event(sig=false, mr=true) waiters(tid)=[7]
|
||||
handle=0x828a3244 Event(sig=false, mr=false) waiters(tid)=[11]
|
||||
handle=0x00001018 Semaphore(0/2147483647) waiters(tid)=[4]
|
||||
handle=0x8287093c Event(sig=false, mr=false) waiters(tid)=[2]
|
||||
handle=0x00001070 Thread(id=13, exit=None) waiters(tid)=[1]
|
||||
handle=0x00001080 Event(sig=false, mr=false) waiters(tid)=[5]
|
||||
handle=0x00001078 Event(sig=false, mr=false) waiters(tid)=[13]
|
||||
handle=0x828a3220 Event(sig=false, mr=true) waiters(tid)=[11]
|
||||
handle=0x00001050 Event(sig=false, mr=true) waiters(tid)=[8]
|
||||
handle=0x00001010 Event(sig=false, mr=true) waiters(tid)=[3]
|
||||
@@ -0,0 +1,167 @@
|
||||
# Round-A1..A4 findings — canary tid=6 spawn chain & divergence frontier
|
||||
|
||||
## Anchor reframe (round-37 misread corrected)
|
||||
|
||||
The "factory/registry layer divergence at [0x828E1F08]" framing is falsified.
|
||||
Both engines install the SAME static-XEX `.rdata` vtable `0x820A183C` at the
|
||||
singleton's `[+0]`. The instance VAs differ only because of ε-class allocator
|
||||
divergence (audit-043).
|
||||
|
||||
| Probe | Canary | Ours |
|
||||
|----------------------------|----------------------|----------------------|
|
||||
| `[0x828E1F08]` | 0xBC22C910 (heap) | 0x40111910 (heap) |
|
||||
| `[[0x828E1F08]+0]` vtable | 0x820A183C | 0x820A183C (SAME) |
|
||||
| `vtable[+0]` thunk | 0x82175330 | 0x82175330 (SAME) |
|
||||
| `vtable[+8]` thunk | 0x82175340 → b sub_821741C8 | SAME (vtable bytes from XEX `.rdata`) |
|
||||
|
||||
The thunks at 0x82175330+ are 8-byte `lwz r3, 8(r3); b <real_method>`
|
||||
trampolines. Slot 2 (`+0x08`) is the worker dispatch entry that round 33
|
||||
identified as 471× in canary tid=6 / 0× in ours.
|
||||
|
||||
## A.1 — Canary dispatcher loop is in sub_822F1AA8 on tid=6
|
||||
|
||||
Probe `--audit_jit_prolog_pc=0x821741C8 --audit_jit_prolog_r3_bytes=256` on
|
||||
canary (35 s):
|
||||
|
||||
- ~1678 fires of sub_821741C8 on **tid=6**
|
||||
- r3 at entry = `0xBCCC4A80` (the inner sub-object of the silph::UImpl
|
||||
singleton — extracted via the thunk's `lwz r3, 8(r3)`)
|
||||
- LR at entry = `0x822F1D5C` (return PC after the `bctrl` at 0x822F1D58 inside
|
||||
sub_822F1AA8)
|
||||
- Singleton's `[+C0..+D0]` UTF-16 spells "HF Frequency" (a UI label)
|
||||
|
||||
The dispatch site in canary (the `bctrl`) is at PC 0x822F1D58 inside
|
||||
sub_822F1AA8:
|
||||
```
|
||||
0x822F1D40: lwz r3, 7944(r25) ; r3 = [r25+0x1F08] = [0x828E1F08]
|
||||
0x822F1D4C: lwz r11, 0(r3) ; vtable
|
||||
0x822F1D50: lwz r11, 8(r11) ; vtable[+8] = thunk 0x82175340
|
||||
0x822F1D54: mtctr r11
|
||||
0x822F1D58: bctrl ; → 0x82175340 → b 0x821741C8
|
||||
```
|
||||
|
||||
## A.2 — Canary tid=6 spawn site is sub_821746B0 at PC 0x82174824
|
||||
|
||||
Enumeration of `ExCreateThread` calls in canary (35 s, 21 unique tuples):
|
||||
|
||||
```
|
||||
entry=821748F0 start_ctx=BC365700 lr=824AC5F0 guest_lr=82174828 ← silph dispatcher #1
|
||||
entry=821748F0 start_ctx=BC366DA0 lr=824AC5F0 guest_lr=82174828 ← silph dispatcher #2
|
||||
```
|
||||
|
||||
PC `0x82174824` is the `bl 0x82172370` (the `ExCreateThread` thunk) inside
|
||||
`sub_821746B0`. The setup is:
|
||||
```
|
||||
0x8217480C: lis r11, 0x8217
|
||||
0x82174810: li r7, 0
|
||||
0x82174814: li r6, 4 ; priority
|
||||
0x82174818: mr r5, r29 ; start_ctx
|
||||
0x8217481C: addi r4, r11, 18672 ; r4 = 0x821748F0 (entry)
|
||||
0x82174820: li r3, 0
|
||||
0x82174824: bl 0x82172370 ; ExCreateThread
|
||||
```
|
||||
|
||||
The entry `0x821748F0` is a thread main that calls `bl 0x821749C0` (the
|
||||
inner dispatch).
|
||||
|
||||
## A.3 — sub_822F1AA8 spawns a SECOND thread at 0x822F1B08
|
||||
|
||||
The dispatch-loop function `sub_822F1AA8` itself ALSO spawns a thread at
|
||||
PC 0x822F1B08 with entry=`sub_822F1EE0` and `start_ctx=BCE24A40`:
|
||||
```
|
||||
0x822F1AEC: lis r11, 0x822F
|
||||
0x822F1AFC: addi r4, r11, 7904 ; r4 = 0x822F1EE0
|
||||
0x822F1B08: bl 0x82172370 ; ExCreateThread
|
||||
```
|
||||
|
||||
sub_822F1EE0 → sub_822F1F20 contains its own atomic state-machine + wait loop.
|
||||
|
||||
## A.3' — sub_822F1AA8 has exactly 2 callers, both in sub_8216EA68
|
||||
|
||||
```
|
||||
source=0x8216ECCC source_func=0x8216EA68 kind=call
|
||||
source=0x8216EE10 source_func=0x8216EA68 kind=call
|
||||
```
|
||||
|
||||
So sub_8216EA68 is the only function that drives sub_822F1AA8.
|
||||
|
||||
## A.4 — Ours' divergence is INSIDE the spawned thread, NOT at the spawn
|
||||
|
||||
Mirror-probed ours at `sub_821746B0` body BB heads (parallel mode, 50M
|
||||
instructions, XENIA_CACHE_PERSIST=1):
|
||||
|
||||
| PC | Fires | Notes |
|
||||
|-------------|-------|------------------------------------------------|
|
||||
| 0x821746B0 | 1 | Entry. r3=0x40ba9a80 |
|
||||
| 0x821746E0 | 1 | After `bl 0x8284DCFC` (critical-section) |
|
||||
| 0x82174798 | 1 | After the early `beq` (r28==0 branch) |
|
||||
| 0x821747B8 | 1 | **Past the gate**: `[0x828E2B14]=0x40105000` non-NULL; `bl 0x82150EF8` returned r3=0x4024a840 (NON-NULL) |
|
||||
| 0x821747D8 | 1 | After the inner `bl 0x821723F0` |
|
||||
| 0x8217480C | 1 | Enters the spawn block |
|
||||
| 0x82174828 | 1 | **Post-`bl ExCreateThread`**, r3=0x1070 = thread handle |
|
||||
|
||||
**OURS DOES SPAWN THE THREAD VIA THIS SITE.** The returned handle 0x1070 is
|
||||
**tid=13's thread handle** (per round 37 final state). So **ours' tid=13 IS
|
||||
the same logical thread as canary's tid=6** — spawned by the identical call
|
||||
site with the same entry (0x821748F0).
|
||||
|
||||
## A.4' — Divergence is INSIDE the spawned thread's body
|
||||
|
||||
Round 37's frame trail for ours' tid=13 wedge:
|
||||
`0x821CB1E0 → 0x821CBAE0 → 0x821CC454 → 0x821C4F18 → 0x82174A80`
|
||||
|
||||
The LAST frame `0x82174A80` is **inside sub_821749C0** (= the inner dispatch
|
||||
called from sub_821748F0). It's right after the vtable dispatch at
|
||||
0x82174A7C (`bctrl` through `vtable[+16]` of the object in `r30`; the preceding `mtctr` is at 0x82174A78):
|
||||
|
||||
```
|
||||
0x82174a64: mr r3, r30 ; r3 = some object
|
||||
0x82174a68: lwz r11, 0(r30)
|
||||
0x82174a6c: lwz r4, 4(r29)
|
||||
0x82174a70: lwz r5, 8(r31)
|
||||
0x82174a74: lwz r11, 16(r11) ; r11 = vtable[+0x10]
|
||||
0x82174a78: mtctr r11
|
||||
0x82174a7c: bctrl ; dispatch
|
||||
0x82174a80: lwz r3, 0(r29) ; ← wedge frame top (LR after bctrl)
|
||||
```
|
||||
|
||||
So `sub_821749C0`'s vtable[+0x10] dispatch on tid=13/tid=6's `r30` object
|
||||
lands at audit-049 territory in ours (chain through sub_821CB030+0x128 that
|
||||
ends waiting forever on handle 0x1078). In canary, the same dispatch on the
|
||||
same object SHOULD land somewhere that ultimately reaches sub_822F1AA8's
|
||||
dispatch loop and runs sub_821741C8 1678× via vtable[+8].
|
||||
|
||||
**The object `r30` is the result of `bl 0x821CF3F0`** at PC 0x821749DC. So
|
||||
sub_821CF3F0 returns a registry-lookup object; the vtable on this object's
|
||||
slot +0x10 method's body determines whether the thread wedges or runs.
|
||||
|
||||
## Phase B classification
|
||||
|
||||
Class 3 — **Missing init-time precondition**. Ours reaches the spawn site,
|
||||
ours' tid=13 enters the chain, ours' tid=13 enters sub_821749C0, but the
|
||||
vtable[+0x10] dispatch at PC 0x82174A7C in ours lands in audit-049 territory
|
||||
(wait forever on 0x1078) rather than continuing through the canonical chain
|
||||
toward sub_822F1AA8's outer dispatch loop.
|
||||
|
||||
Possible classes to refine in next round:
|
||||
- **3a**: same vtable but state-dependent — `r30`'s field at a specific offset
|
||||
differs in ours vs canary, causing the method body to take a different
|
||||
branch.
|
||||
- **3b**: the vtable in `r30` is DIFFERENT in ours vs canary (e.g., ours has
|
||||
a base-class vtable but canary has a derived-class vtable).
|
||||
- **4**: synthesis fallback — spawn a SECOND thread that runs sub_822F1AA8's
|
||||
dispatch loop directly, bypassing the wedged sub_821749C0 chain.
|
||||
|
||||
## Next probe (A.4.5)
|
||||
|
||||
Probe both engines at sub_821749C0 entry filtering tid=13 (ours) / tid=6
|
||||
(canary), capturing:
|
||||
- `r3` and `r4` at entry (the factory-output object and the ctx)
|
||||
- After the `bl 0x821CF3F0` at 0x821749DC: capture r30 (= sub_821CF3F0
|
||||
return — the object whose vtable is dispatched at 0x82174A7C)
|
||||
- At PC 0x82174A7C (the divergent `bctrl`): r30, `[r30+0]` (vtable), and `vtable[+0x10]`
|
||||
(the dispatch target)
|
||||
|
||||
If ours and canary have IDENTICAL `vtable[+0x10]` targets but the method
|
||||
body's behavior differs → class 3a (state divergence). If targets differ →
|
||||
class 3b (vtable identity divergence).
|
||||
@@ -0,0 +1,91 @@
|
||||
AUDIT-PC-PROBE pc=0x821746b0 tid=1 hw=0 cycle=9228833 lr=0x82173c38 r3=0x40ba9a80 r11=0x00000000 [r3+0]=0x40111910 [[r3+0]+24]=0x00000000 [r3+0x0C]=0x00000000 [r3+0x30]=0x00000000
|
||||
AUDIT-MEM-READ addr=0x828e2b14 val=0x40105000 vtable=0x40105004 vtable[0]=0x40105008 vtable[24]=0x40105020 pc=0x821746b0 tid=1 cycle=9228833
|
||||
AUDIT-PC-PROBE pc=0x821746e0 tid=1 hw=0 cycle=9228856 lr=0x821746e0 r3=0x00000000 r11=0x00000000 [r3+0]=0x00000000 [[r3+0]+24]=0x00000000 [r3+0x0C]=0x00000000 [r3+0x30]=0x00000000
|
||||
AUDIT-MEM-READ addr=0x828e2b14 val=0x40105000 vtable=0x40105004 vtable[0]=0x40105008 vtable[24]=0x40105020 pc=0x821746e0 tid=1 cycle=9228856
|
||||
AUDIT-PC-PROBE pc=0x82174798 tid=1 hw=0 cycle=9228859 lr=0x821746e0 r3=0x00000000 r11=0x00000000 [r3+0]=0x00000000 [[r3+0]+24]=0x00000000 [r3+0x0C]=0x00000000 [r3+0x30]=0x00000000
|
||||
AUDIT-MEM-READ addr=0x828e2b14 val=0x40105000 vtable=0x40105004 vtable[0]=0x40105008 vtable[24]=0x40105020 pc=0x82174798 tid=1 cycle=9228859
|
||||
AUDIT-PC-PROBE pc=0x821747b8 tid=1 hw=0 cycle=9229012 lr=0x821747ac r3=0x4024a840 r11=0x4024a840 [r3+0]=0x4024ace0 [[r3+0]+24]=0x43777290 [r3+0x0C]=0x4024a820 [r3+0x30]=0x4250dec0
|
||||
AUDIT-MEM-READ addr=0x828e2b14 val=0x40105000 vtable=0x40105004 vtable[0]=0x40105008 vtable[24]=0x40105020 pc=0x821747b8 tid=1 cycle=9229012
|
||||
AUDIT-PC-PROBE pc=0x821747d8 tid=1 hw=0 cycle=9229440 lr=0x821747cc r3=0x4024a840 r11=0xffffffff [r3+0]=0x40ba9a80 [[r3+0]+24]=0x00000000 [r3+0x0C]=0x00000000 [r3+0x30]=0x4250dec0
|
||||
AUDIT-MEM-READ addr=0x828e2b14 val=0x40105000 vtable=0x40105004 vtable[0]=0x40105008 vtable[24]=0x40105020 pc=0x821747d8 tid=1 cycle=9229440
|
||||
AUDIT-PC-PROBE pc=0x8217480c tid=1 hw=0 cycle=9229443 lr=0x821747cc r3=0x4024a840 r11=0xffffffff [r3+0]=0x40ba9a80 [[r3+0]+24]=0x00000000 [r3+0x0C]=0x00000000 [r3+0x30]=0x4250dec0
|
||||
AUDIT-MEM-READ addr=0x828e2b14 val=0x40105000 vtable=0x40105004 vtable[0]=0x40105008 vtable[24]=0x40105020 pc=0x8217480c tid=1 cycle=9229443
|
||||
AUDIT-PC-PROBE pc=0x82174828 tid=1 hw=0 cycle=9229509 lr=0x82174828 r3=0x00001070 r11=0x824b0000 [r3+0]=0x00000000 [[r3+0]+24]=0x00000000 [r3+0x0C]=0x00000000 [r3+0x30]=0x00000000
|
||||
AUDIT-MEM-READ addr=0x828e2b14 val=0x40105000 vtable=0x40105004 vtable[0]=0x40105008 vtable[24]=0x40105020 pc=0x82174828 tid=1 cycle=9229509
|
||||
|
||||
=== Final State ===
|
||||
PC: 0x824ac578
|
||||
LR: 0x824ac578
|
||||
CTR: 0x82153bf0
|
||||
CR: 0x24000028
|
||||
XER: CA=0 OV=0 SO=0
|
||||
r0 : 0x0000000082153bf0
|
||||
r1 : 0x00000000700ff6e0
|
||||
r2 : 0x0000000020000000
|
||||
r4 : 0x0000000000000001
|
||||
r7 : 0x0000000003a72328
|
||||
r8 : 0x0000000043b77284
|
||||
r9 : 0x0000000043b77328
|
||||
r10: 0x0000000000000001
|
||||
r11: 0x0000000000000103
|
||||
r12: 0x0000000082173c64
|
||||
r13: 0x000000007fff0000
|
||||
r18: 0x0000000040d09a7c
|
||||
r23: 0x00000000828f3844
|
||||
r26: 0x000000004024a620
|
||||
r27: 0x00000000820a17a8
|
||||
r31: 0x0000000000001070
|
||||
|
||||
=== Thread diagnostics ===
|
||||
hw=0 idx=0 tid=1 state=Blocked(WaitAny { handles: [4208], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x700ff6e0
|
||||
r0=0x82153bf0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x03a72328
|
||||
r8=0x43b77284 r9=0x43b77328 r10=0x00000001 r11=0x00000103 r12=0x82173c64 r13=0x7fff0000
|
||||
hw=0 idx=1 tid=11 state=Blocked(WaitAny { handles: [2190094916, 2190094880], deadline: None }) pc=0x824d2a94 lr=0x824d2a94 sp=0x71497d90
|
||||
r0=0x00000000 r3=0x00000000 r4=0x71497de0 r5=0x00000001 r6=0x00000003 r7=0x00000001
|
||||
r8=0x00000000 r9=0x00000000 r10=0x71497df0 r11=0x828a3244 r12=0xbcbcbcbc r13=0x4b9f1000
|
||||
hw=1 idx=0 tid=2 state=Blocked(WaitAny { handles: [2189887804], deadline: None }) pc=0x824a95f8 lr=0x824a95f8 sp=0x710ffd20
|
||||
r0=0x0000030c r3=0x00000000 r4=0x00000003 r5=0x00000001 r6=0x00000000 r7=0x00000000
|
||||
r8=0x00000001 r9=0x6f000000 r10=0x824a9178 r11=0x82870000 r12=0x824a94f0 r13=0x4acc3000
|
||||
hw=1 idx=1 tid=13 state=Blocked(WaitAny { handles: [4216], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x715a7a20
|
||||
r0=0x821511d0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x03a723d0
|
||||
r8=0x43b77334 r9=0x43b77334 r10=0x40541f80 r11=0x00000001 r12=0x821cb1e0 r13=0x4d1d4000
|
||||
hw=2 idx=0 tid=7 state=Blocked(WaitAny { handles: [1111821148], deadline: Some(42946672) }) pc=0x824cd4f4 lr=0x824cd4f4 sp=0x71187e60
|
||||
r0=0x00000000 r3=0x00000000 r4=0x00000003 r5=0x00000001 r6=0x00000000 r7=0x71187eb0
|
||||
r8=0x00000000 r9=0x00000000 r10=0x00000002 r11=0x00000002 r12=0xbcbcbcbc r13=0x4b1d6000
|
||||
hw=2 idx=1 tid=8 state=Blocked(WaitAny { handles: [4176, 4132], deadline: None }) pc=0x824ab214 lr=0x824ab214 sp=0x71287c90
|
||||
r0=0x00000000 r3=0x00000000 r4=0x71287cf0 r5=0x00000001 r6=0x00000001 r7=0x00000000
|
||||
r8=0x00000000 r9=0x00009030 r10=0x00000002 r11=0x00000020 r12=0x822f1ff0 r13=0x4b90a000
|
||||
hw=3 idx=0 tid=4 state=Blocked(WaitAny { handles: [4120], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x7112fb80
|
||||
r0=0x821511a0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x03a723d0
|
||||
r8=0x43b7732c r9=0x828f0000 r10=0x00000008 r11=0x00000000 r12=0x8245a660 r13=0x4adc6000
|
||||
hw=3 idx=1 tid=5 state=Blocked(WaitAny { handles: [4224], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x7116fbe0
|
||||
r0=0x821511a0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x03a723d0
|
||||
r8=0x43b7732c r9=0x828f0000 r10=0x00000001 r11=0x00000000 r12=0x82458b34 r13=0x4adc8000
|
||||
hw=4 idx=0 tid=9 state=Ready pc=0x824d140c lr=0x824d22b4 sp=0x71387df0
|
||||
r0=0x00000000 r3=0x4250dedc r4=0x4250e040 r5=0x00000001 r6=0x00000000 r7=0x00000000
|
||||
r8=0x4b9ec000 r9=0x01010000 r10=0x01010000 r11=0x00000000 r12=0x824d22a8 r13=0x4b9ec000
|
||||
hw=5 idx=0 tid=3 state=Blocked(WaitAny { handles: [4112], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x7111fdf0
|
||||
r0=0x82153bf0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x00000a10
|
||||
r8=0x00000010 r9=0x00000000 r10=0x00009030 r11=0x00000000 r12=0x82181988 r13=0x4adc4000
|
||||
hw=5 idx=1 tid=6 state=Ready pc=0x824ab214 lr=0x824ab214 sp=0x7117fc60
|
||||
r0=0x821511a0 r3=0x00000001 r4=0x7117fcc0 r5=0x00000001 r6=0x00000001 r7=0x00000000
|
||||
r8=0x7117fcb0 r9=0x00009030 r10=0x00000002 r11=0x00000020 r12=0x82458d68 r13=0x4adca000
|
||||
hw=5 idx=2 tid=10 state=Ready pc=0x824d140c lr=0x824d22b4 sp=0x71487e00
|
||||
r0=0x00000000 r3=0x4250dedc r4=0x4250e040 r5=0x00000001 r6=0x00000000 r7=0x00000000
|
||||
r8=0x4b9ee000 r9=0x01010000 r10=0x01010000 r11=0x00000000 r12=0x824d22a8 r13=0x4b9ee000
|
||||
hw=5 idx=3 tid=12 state=Ready pc=0x824aa6a4 lr=0x824aa6a4 sp=0x714a7da0
|
||||
r0=0x00000000 r3=0x000000ff r4=0x00000020 r5=0x714a7df4 r6=0x00000000 r7=0x00000000
|
||||
r8=0x00000000 r9=0x00000000 r10=0x00000000 r11=0x00000001 r12=0x8217898c r13=0x4d1d2000
|
||||
|
||||
-- Handle waiter lists --
|
||||
handle=0x00001024 Semaphore(0/2147483647) waiters(tid)=[8]
|
||||
handle=0x00001010 Event(sig=false, mr=true) waiters(tid)=[3]
|
||||
handle=0x00001070 Thread(id=13, exit=None) waiters(tid)=[1]
|
||||
handle=0x00001080 Event(sig=false, mr=false) waiters(tid)=[5]
|
||||
handle=0x828a3244 Event(sig=false, mr=false) waiters(tid)=[11]
|
||||
handle=0x00001018 Semaphore(0/2147483647) waiters(tid)=[4]
|
||||
handle=0x00001050 Event(sig=false, mr=true) waiters(tid)=[8]
|
||||
handle=0x00001078 Event(sig=false, mr=false) waiters(tid)=[13]
|
||||
handle=0x8287093c Event(sig=false, mr=false) waiters(tid)=[2]
|
||||
handle=0x828a3220 Event(sig=false, mr=true) waiters(tid)=[11]
|
||||
handle=0x42450b5c Event(sig=false, mr=true) waiters(tid)=[7]
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,136 @@
|
||||
# Phase A synthesis — canary tid=6 IS the main thread; the wedge is sub_822F1AA8's loop exit
|
||||
|
||||
## Top-line finding
|
||||
|
||||
**Canary's `tid=6` is canary's main thread.** Confirmed by probing `entry_point`
|
||||
(`sub_824AB748`) with `--audit_jit_prolog_pc=0x824AB748`: fires 1× on
|
||||
`tid=00000006` with `lr=BCBCBCBC` (= OS-initial / no caller). Ours numbers
|
||||
its main thread `tid=1`. Same logical thread; different label.
|
||||
|
||||
Therefore "tid=6 fires sub_821741C8 471×" (round 33) means **the main thread**
|
||||
loops inside `sub_822F1AA8` firing `sub_821741C8` ~1678×/30s in canary. In
|
||||
ours, the main thread (tid=1) runs `sub_822F1AA8` ONCE, exits the loop, and
|
||||
proceeds to thread-join on the spawned init thread (handle 0x1070 = tid=13),
|
||||
which is itself blocked forever on handle 0x1078.
|
||||
|
||||
## Call chain (identical in both engines, different runtime behavior)
|
||||
|
||||
```
|
||||
entry_point (sub_824AB748)
|
||||
│
|
||||
├─ sub_824ACB38 CRT-driven fnptr-array iterator (audit-050 region)
|
||||
├─ ...
|
||||
└─ sub_8216EA68 Many local calls including:
|
||||
├─ ExCreateThread(entry=sub_8217F0F8 ...) ; sibling thread
|
||||
├─ sub_822F1AA8(controller=...) ; FIRST call (PC 0x8216ECCC)
|
||||
└─ sub_822F1AA8(controller=0xBCE24A40 canary / ; SECOND call (PC 0x8216EE10)
|
||||
0x40d09a40 ours) ↑ this is the loop
|
||||
```
|
||||
|
||||
The SECOND call is what runs the dispatcher loop. Its LR = 0x8216EE14.
|
||||
Confirmed in both engines.
|
||||
|
||||
## sub_822F1AA8 loop structure
|
||||
|
||||
```
|
||||
0x822F1AA8: entry, r30 = r3 (controller)
|
||||
0x822F1AEC-0x822F1B08: ExCreateThread(entry=sub_822F1EE0, ctx=r30) → r29 = handle
|
||||
0x822F1B30-0x822F1B34: bl 0x824AA8B0(r3=r29) ; ?
|
||||
0x822F1B38-0x822F1B4C: first bctrl → vtable[+0] of [0x828E1F08]
|
||||
0x822F1B50-0x822F1B74: setup, bl 0x824AA330 INFINITE wait on [r22+32]
|
||||
0x822F1B80-0x822F1BA8: post-wait setup; [r30+0] |= 0x2
|
||||
0x822F1BB0-0x822F1BBC: TOP-OF-LOOP CHECK: if [r30+0] & 0x10000000 → goto 0x822F1E10 (exit)
|
||||
0x822F1BCC..0x822F1DEC: loop body (includes the vtable[+8] bctrl → sub_821741C8 at PC 0x822F1D58)
|
||||
0x822F1DEC-0x822F1DFC: bl 0x824AA330 INFINITE wait on [r23+0]
|
||||
0x822F1E00-0x822F1E0C: END-OF-ITERATION CHECK: if [r30+0] & 0x10000000 == 0 → goto 0x822F1BCC (re-loop)
|
||||
0x822F1E10-0x822F1E18: EXIT: [r30+0] |= 0x02000000 (set MSB-6 = LSB-25)
|
||||
0x822F1E1C-0x822F1E24: release something via bl 0x824AA2F0
|
||||
0x822F1E28-0x822F1E30: bl 0x824AA330 INFINITE on [r30+28] = SPAWNED THREAD HANDLE (thread join!)
|
||||
0x822F1E40: bl 0x824AA3E0
|
||||
0x822F1E44-0x822F1E5C: final cleanup: vtable[+24] bctrl on [0x828E1F08]
|
||||
0x822F1E60-0x822F1E78: [r30+0] = 0, then [r30+0] |= 1; bl 0x824567E0
|
||||
0x822F1E7C-0x822F1E88: epilogue
|
||||
```
|
||||
|
||||
**Loop exit gate**: `[r30+0] & 0x10000000` (bit 28 LSB / bit 3 MSB). Set →
|
||||
exit. Both top-of-loop check (0x822F1BBC) and end-of-iteration check
|
||||
(0x822F1E0C) gate on the same bit.
|
||||
|
||||
## What's different between engines
|
||||
|
||||
| Engine | [r30+0] at entry | Loop iterations | Exits sub_822F1AA8? |
|
||||
|--------|------------------|------------------|----------------------|
|
||||
| canary | 0x21 (per probe) | ~1678+ in 30s | NO (stays in loop) |
|
||||
| ours | 0x21 (per probe) | 0 (probes show none of the loop-body PCs fire after entry) | YES (exits quickly) |
|
||||
|
||||
Both engines have `[r30+0]=0x21` at entry — bit 28 NOT set. After the `ori
|
||||
r11, r11, 0x2` at 0x822F1B90, both should have `[r30+0]=0x23`. Bit 28 still
|
||||
not set.
|
||||
|
||||
So **some code sets bit 28 on [r30+0] between sub_822F1AA8 entry and the
|
||||
loop check** in ours but not in canary.
|
||||
|
||||
Mem-watch on 0x40d09a40 (ours' controller VA) shows **zero guest writes** in
|
||||
my 50M-instruction parallel run. Possible reasons:
|
||||
- The setter writes from kernel/runtime code that mem-watch doesn't capture
|
||||
(kernel-host store, not guest JIT store)
|
||||
- The setter writes via a computed alias (different VA but same backing)
|
||||
- The bit IS set via a probe-quantum-elided JIT store
|
||||
|
||||
## Phase B classification
|
||||
|
||||
**Class 3a — state-divergence on the controller object**. The vtable
|
||||
identity is the same (round-37 confirmed `0x820A183C` in both). The
|
||||
controller object's bit 28 of `[+0]` evolves differently during the setup
|
||||
between sub_822F1AA8 entry and the loop check.
|
||||
|
||||
Class 4 (synthesis) is now LESS attractive: ours' main thread DOES reach
|
||||
sub_822F1AA8 with the right controller. We don't need to spawn the
|
||||
dispatcher — we need to PREVENT the main thread from exiting the loop.
|
||||
|
||||
## Pragmatic next step — JIT instrumentation to find bit-28 setter
|
||||
|
||||
Most direct diagnostic: add a JIT hook in xenia-cpu that, for guest stores
|
||||
in the range [0x822F1AA8, 0x822F1E10), captures the guest PC + the written
|
||||
value when the store would set bit 28 of any address. This identifies the
|
||||
exact PC that sets the loop-exit bit.
|
||||
|
||||
Alternative: extend `--mem-watch` to also capture kernel-side stores by
|
||||
hooking the GuestMemory write path at the kernel-state level.
|
||||
|
||||
Even simpler: add a one-shot `--bit-watch=ADDR:MASK` cvar that fires when
|
||||
the value at ADDR has any bit in MASK transition from 0→1, regardless of
|
||||
who wrote it. This is the cleanest diagnostic for this exact pattern.
|
||||
|
||||
## Fix shape (when bit-28 setter is identified)
|
||||
|
||||
If the bit-28 setter is inside the vtable[+0] dispatch chain at 0x822F1B4C
|
||||
(target sub_82173990), then the fix might be a state-init issue in the
|
||||
kernel/runtime.
|
||||
|
||||
If the bit-28 setter is inside the inner wait or one of the kernel calls
|
||||
(`bl 0x824AA8B0`, `bl 0x824AA330`), the fix might be a missing event signal
|
||||
or a wrong handle-state evolution.
|
||||
|
||||
If we can't identify the setter cleanly, the synthesis fallback is to
|
||||
**inject a kernel-side hook that clears bit 28 of [r30+0] on every entry to
|
||||
sub_822F1AA8's bit-check site (0x822F1BB0)**. Crude but should keep the
|
||||
main thread in the loop.
|
||||
|
||||
## Why this is a clearer wedge picture than rounds 22-33
|
||||
|
||||
Rounds 22-33 chased the audit-049 wedge from various angles. The diagnoses
|
||||
landed on different layers:
|
||||
- R22: "wrong cluster targeted" (cluster A vs B)
|
||||
- R26-30: "state-machine progression bug"
|
||||
- R32-33: "pool 3 starvation; bootstrap walk-back"
|
||||
|
||||
This round establishes the simplest possible framing:
|
||||
|
||||
> **Canary's main thread loops forever in a dispatcher; ours' main thread
|
||||
> exits the loop after one setup phase. The exit is gated by a single bit
|
||||
> on the controller's flag word.**
|
||||
|
||||
If bit 28 of `[controller+0]` could be permanently cleared, ours' main
|
||||
thread would stay in the loop, sub_821741C8 would dispatch, signals would
|
||||
flow, tid=13 would complete, draws would happen.
|
||||
@@ -0,0 +1,79 @@
|
||||
AUDIT-PC-PROBE pc=0x822f1aa8 tid=1 hw=0 cycle=6180796 lr=0x8216ee14 r3=0x40d09a40 r11=0x40111910 [r3+0]=0x00000021 [[r3+0]+24]=0x00000000 [r3+0x0C]=0x40541a40 [r3+0x30]=0x00000000
|
||||
AUDIT-PC-PROBE pc=0x822f1b38 tid=1 hw=0 cycle=6181181 lr=0x822f1b38 r3=0x00000001 r11=0x824b0000 [r3+0]=0x00000000 [[r3+0]+24]=0x00000000 [r3+0x0C]=0x00000000 [r3+0x30]=0x00000000
|
||||
|
||||
=== Final State ===
|
||||
PC: 0x824ac578
|
||||
LR: 0x824ac578
|
||||
CTR: 0x82153bf0
|
||||
CR: 0x24000028
|
||||
XER: CA=0 OV=0 SO=0
|
||||
r0 : 0x0000000082153bf0
|
||||
r1 : 0x00000000700ff6e0
|
||||
r2 : 0x0000000020000000
|
||||
r4 : 0x0000000000000001
|
||||
r7 : 0x0000000003a72328
|
||||
r8 : 0x0000000043b77284
|
||||
r9 : 0x0000000043b77328
|
||||
r10: 0x0000000000000001
|
||||
r11: 0x0000000000000103
|
||||
r12: 0x0000000082173c64
|
||||
r13: 0x000000007fff0000
|
||||
r18: 0x0000000040d09a7c
|
||||
r23: 0x00000000828f3844
|
||||
r26: 0x000000004024a4e0
|
||||
r27: 0x00000000820a17a8
|
||||
r31: 0x0000000000001070
|
||||
|
||||
=== Thread diagnostics ===
|
||||
hw=0 idx=0 tid=1 state=Blocked(WaitAny { handles: [4208], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x700ff6e0
|
||||
r0=0x82153bf0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x03a72328
|
||||
r8=0x43b77284 r9=0x43b77328 r10=0x00000001 r11=0x00000103 r12=0x82173c64 r13=0x7fff0000
|
||||
hw=0 idx=1 tid=11 state=Blocked(WaitAny { handles: [2190094916, 2190094880], deadline: None }) pc=0x824d2a94 lr=0x824d2a94 sp=0x71497d90
|
||||
r0=0x00000000 r3=0x00000000 r4=0x71497de0 r5=0x00000001 r6=0x00000003 r7=0x00000001
|
||||
r8=0x00000000 r9=0x00000000 r10=0x71497df0 r11=0x828a3244 r12=0xbcbcbcbc r13=0x4b9f1000
|
||||
hw=1 idx=0 tid=2 state=Blocked(WaitAny { handles: [2189887804], deadline: None }) pc=0x824a95f8 lr=0x824a95f8 sp=0x710ffd20
|
||||
r0=0x0000030c r3=0x00000000 r4=0x00000003 r5=0x00000001 r6=0x00000000 r7=0x00000000
|
||||
r8=0x00000001 r9=0x6f000000 r10=0x824a9178 r11=0x82870000 r12=0x824a94f0 r13=0x4acc3000
|
||||
hw=1 idx=1 tid=13 state=Blocked(WaitAny { handles: [4216], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x715a7a20
|
||||
r0=0x821511d0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x03a723d0
|
||||
r8=0x43b77334 r9=0x43b77334 r10=0x40541f80 r11=0x00000001 r12=0x821cb1e0 r13=0x4d1d4000
|
||||
hw=2 idx=0 tid=7 state=Blocked(WaitAny { handles: [1111821148], deadline: Some(42946672) }) pc=0x824cd4f4 lr=0x824cd4f4 sp=0x71187e60
|
||||
r0=0x00000000 r3=0x00000000 r4=0x00000003 r5=0x00000001 r6=0x00000000 r7=0x71187eb0
|
||||
r8=0x00000000 r9=0x00000000 r10=0x00000002 r11=0x00000002 r12=0xbcbcbcbc r13=0x4b1d6000
|
||||
hw=2 idx=1 tid=8 state=Blocked(WaitAny { handles: [4176, 4132], deadline: None }) pc=0x824ab214 lr=0x824ab214 sp=0x71287c90
|
||||
r0=0x00000000 r3=0x00000000 r4=0x71287cf0 r5=0x00000001 r6=0x00000001 r7=0x00000000
|
||||
r8=0x00000000 r9=0x00009030 r10=0x00000002 r11=0x00000020 r12=0x822f1ff0 r13=0x4b90a000
|
||||
hw=3 idx=0 tid=4 state=Blocked(WaitAny { handles: [4120], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x7112fb80
|
||||
r0=0x821511a0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x03a723d0
|
||||
r8=0x43b7732c r9=0x828f0000 r10=0x00000008 r11=0x00000000 r12=0x8245a660 r13=0x4adc6000
|
||||
hw=3 idx=1 tid=5 state=Blocked(WaitAny { handles: [4224], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x7116fbe0
|
||||
r0=0x821511a0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x03a723d0
|
||||
r8=0x43b7732c r9=0x828f0000 r10=0x00000001 r11=0x00000000 r12=0x82458b34 r13=0x4adc8000
|
||||
hw=4 idx=0 tid=9 state=Ready pc=0x824d1404 lr=0x824d22b4 sp=0x71387df0
|
||||
r0=0x00000000 r3=0x4250dedc r4=0x4250e040 r5=0x00000001 r6=0x00000000 r7=0x00000000
|
||||
r8=0x4b9ec000 r9=0x01010000 r10=0x01010000 r11=0x00000000 r12=0x824d22a8 r13=0x4b9ec000
|
||||
hw=5 idx=0 tid=3 state=Blocked(WaitAny { handles: [4112], deadline: None }) pc=0x824ac578 lr=0x824ac578 sp=0x7111fdf0
|
||||
r0=0x82153bf0 r3=0x00000000 r4=0x00000001 r5=0x00000000 r6=0x00000000 r7=0x00000a10
|
||||
r8=0x00000010 r9=0x00000000 r10=0x00009030 r11=0x00000000 r12=0x82181988 r13=0x4adc4000
|
||||
hw=5 idx=1 tid=6 state=Ready pc=0x824ab214 lr=0x824ab214 sp=0x7117fc60
|
||||
r0=0x821511a0 r3=0x00000001 r4=0x7117fcc0 r5=0x00000001 r6=0x00000001 r7=0x00000000
|
||||
r8=0x7117fcb0 r9=0x00009030 r10=0x00000002 r11=0x00000020 r12=0x82458d68 r13=0x4adca000
|
||||
hw=5 idx=2 tid=10 state=Ready pc=0x824d1404 lr=0x824d22b4 sp=0x71487e00
|
||||
r0=0x00000000 r3=0x4250dedc r4=0x4250e040 r5=0x00000001 r6=0x00000000 r7=0x00000000
|
||||
r8=0x4b9ee000 r9=0x01010000 r10=0x01010000 r11=0x00000000 r12=0x824d22a8 r13=0x4b9ee000
|
||||
hw=5 idx=3 tid=12 state=Ready pc=0x824aa6a4 lr=0x824aa6a4 sp=0x714a7da0
|
||||
r0=0x00000000 r3=0x000000ff r4=0x00000020 r5=0x714a7df4 r6=0x00000000 r7=0x00000000
|
||||
r8=0x00000000 r9=0x00000000 r10=0x00000000 r11=0x00000001 r12=0x8217898c r13=0x4d1d2000
|
||||
|
||||
-- Handle waiter lists --
|
||||
handle=0x00001018 Semaphore(0/2147483647) waiters(tid)=[4]
|
||||
handle=0x8287093c Event(sig=false, mr=false) waiters(tid)=[2]
|
||||
handle=0x00001070 Thread(id=13, exit=None) waiters(tid)=[1]
|
||||
handle=0x42450b5c Event(sig=false, mr=true) waiters(tid)=[7]
|
||||
handle=0x00001078 Event(sig=false, mr=false) waiters(tid)=[13]
|
||||
handle=0x00001080 Event(sig=false, mr=false) waiters(tid)=[5]
|
||||
handle=0x828a3244 Event(sig=false, mr=false) waiters(tid)=[11]
|
||||
handle=0x00001024 Semaphore(0/2147483647) waiters(tid)=[8]
|
||||
handle=0x828a3220 Event(sig=false, mr=true) waiters(tid)=[11]
|
||||
handle=0x00001010 Event(sig=false, mr=true) waiters(tid)=[3]
|
||||
handle=0x00001050 Event(sig=false, mr=true) waiters(tid)=[8]
|
||||
@@ -0,0 +1,127 @@
|
||||
# Phase C.1 — Validation refutes Phase A's bit-28 setter hypothesis
|
||||
|
||||
## TL;DR
|
||||
|
||||
Phase A claimed: "bit 28 of `[0x40d09a40]` (controller word) gets set in ours, causing sub_822F1AA8's dispatcher loop to exit early; candidate setter is `sub_821B55D8` at PC `0x821B5DA4`."
|
||||
|
||||
**Phase C.1 falsifies this in 4 sub-rounds:**
|
||||
|
||||
1. **`sub_821B55D8` is dead code** in both engines — its `XamInputSetState` wrapper `sub_824AA858` fires 0× in both.
|
||||
2. **`[0x40d09a40]` is never set to anything with bit 28** — `--dump-addr` at end of run shows `+0x00 = 0x00000021`, the entry value. Bit 28 is NEVER set.
|
||||
3. **The actual wedge is at the `bcctrl` at PC `0x822F1B4C`** (inside sub_822F1AA8 setup, BEFORE the dispatcher loop). tid=1 never reaches the loop top-check.
|
||||
4. **The bcctrl calls `sub_82173990`** (vtable[0] of the dispatcher singleton at `[0x828E1F08]`), which eventually waits for tid=13 to terminate. tid=13 wedges in the audit-049 silph::UImpl@GamePart_Title chain on handle `0x1078`.
|
||||
|
||||
The C.2 force-clear POC (the planned next step) would have **zero effect** because bit 28 is never set. Skipped per plan stopping criterion.
|
||||
|
||||
## Probe-fire counts (ours, 50M-instr parallel)
|
||||
|
||||
| PC | sub-round | fires | meaning |
|
||||
|---|---|---|---|
|
||||
| `0x821B55D8` (Phase A candidate fn entry) | 1 | **0** | function never reached → β/γ |
|
||||
| `0x821B5D98,DA0,DAC,D48` (loop BB heads) | 1 | **0** | function never reached |
|
||||
| `0x822F1AA8` (sub_822F1AA8 entry) | 2,3,4 | 2-3 | reached |
|
||||
| `0x822F1B38` (post-`bl 0x824AA8B0`) | 4 | 2 | reached |
|
||||
| `0x822F1B50` (post-`bcctrl`) | 4 | **0** | **bcctrl never returns** |
|
||||
| `0x822F1B60,B78,B80,BBC` (loop setup/top) | 3 | 0 | unreachable past bcctrl |
|
||||
| `0x822F1E10` (loop exit cleanup) | 2 | 0 | loop never entered, never exited |
|
||||
| `0x822F1E34` (post-thread-join) | 2 | 0 | never reached |
|
||||
| `0x82173990` (vtable[0] target) | 4 | 2 | called via bcctrl, r3=singleton (LR=0x822F1B50) |
|
||||
| `0x821748F0` (tid=13 entry) | 4 | 2 | tid=13 runs |
|
||||
| `0x821C4EB0` (silph::UImpl@GamePart_Title) | 4 | 2 | audit-009/049 reached on tid=13 |
|
||||
| `0x82457388,0x824574C0,0x82457408,0x82457490` (other oris candidates) | 2 | 0 | unreachable |
|
||||
|
||||
## Canary probe results
|
||||
|
||||
| PC | fires | meaning |
|
||||
|---|---|---|
|
||||
| `0x824AA858` (XamInputSetState wrapper) | **0** | sub_821B55D8 chain is dead code in CANARY too |
|
||||
| `0x822F1B50` (post-bcctrl, attempted) | **0** | canary's JitProlog only fires at function entries, so not directly testable; but per audit round-33 sub_821741C8 fires 471× in canary → bcctrl DOES return in canary |
|
||||
|
||||
## Critical evidence: `--dump-addr=0x40d09a40` at end of run
|
||||
|
||||
```
|
||||
addr=0x40d09a40
|
||||
+0x00: 00 00 00 21 00 00 00 01 42 44 df 00 40 54 1a 40
|
||||
^^^^^^^^^^^ ^^^^^^^^^^^
|
||||
+0x10: 40 54 1b 40 40 54 1b 80 40 54 1b c0 00 00 10 54
|
||||
+0x20: 00 00 00 00 40 24 a8 20 00 00 00 08 00 00 00 00
|
||||
```
|
||||
|
||||
- `[+0x00] = 0x00000021` ← bit 28 (mask 0x10000000) is NOT SET. Same value as at sub_822F1AA8 entry.
|
||||
- `[+0x1c] = 0x00001054` ← spawned init thread handle (= tid=8's thread handle, NOT 0x1070)
|
||||
- Thread state: tid=1 waits on handle `0x1070`, tid=13 waits on handle `0x1078`.
|
||||
|
||||
Handle `0x1070` is **tid=13's thread handle** (per stderr: `ExCreateThread: tid=13 handle=0x1070 entry=0x821748f0 ctx=0x4024a840 suspended=true`). So tid=1's wait at the wedge point is a **thread-join on tid=13**, NOT a thread-join on the dispatcher init thread (tid=8, handle 0x1054).
|
||||
|
||||
## Wedge path (corrected)
|
||||
|
||||
```
|
||||
entry_point (sub_824AB748) [tid=1 main]
|
||||
└─ sub_8216EA68
|
||||
└─ sub_822F1AA8(controller=0x40d09a40) [LR=0x8216EE14]
|
||||
├─ ExCreateThread(entry=sub_822F1EE0, ctx=controller) [PC 0x822F1B08]
|
||||
│ ⇒ tid=8 spawn, handle=0x1054 (suspended)
|
||||
├─ bl 0x824AA8B0 (no-op probe) [PC 0x822F1B34]
|
||||
└─ bcctrl on vtable[+0] of [0x828E1F08] singleton [PC 0x822F1B4C]
|
||||
│
|
||||
└─ sub_82173990(r3=singleton) [r3=0x40ba9a80, vtable=0x40111910]
|
||||
└─ ... (768-byte function with ≥18 calls; calls sub_82448AA0, sub_824AA7A0,
|
||||
sub_82448BC8, sub_82448C50, sub_8216F218, sub_8217C850, sub_82178E50,
|
||||
sub_821835E0, ...)
|
||||
└─ ... → KeWaitForSingleObject INFINITE on handle 0x1070
|
||||
(= tid=13's thread handle, thread-join)
|
||||
⇒ WEDGE — tid=13 never exits
|
||||
|
||||
(Concurrently — spawned somewhere else, not from sub_822F1AA8:)
|
||||
[tid=13, spawn-handle=0x1070, ctx=0x4024a840]
|
||||
└─ sub_821748F0 (worker boilerplate, entry from ExCreateThread)
|
||||
├─ sub_82172798, sub_82172818
|
||||
└─ sub_821749C0
|
||||
└─ sub_821CF3F0
|
||||
└─ ... → sub_821C4EB0 (UImpl@GamePart_Title@silph) [audit-009/049!]
|
||||
└─ ... → sub_821CB030 (creates KEVENT at +0x128)
|
||||
⇒ KeWaitForSingleObject INFINITE on handle 0x1078
|
||||
⇒ WEDGE — handle 0x1078 is never signaled in ours
|
||||
```
|
||||
|
||||
## Why Phase A's hypothesis is wrong
|
||||
|
||||
Phase A:
|
||||
1. Disassembled sub_822F1AA8's body, observed the bit-28 loop-exit check at `0x822F1BB8` and end-of-iter check at `0x822F1E0C`.
|
||||
2. Mem-watch on `0x40d09a40` showed zero stores → inferred "the setter writes via some path mem-watch doesn't capture."
|
||||
3. DB-scanned `oris ?, ?, 0x1000` (49 sites), found `sub_821B55D8` at PC `0x821B5DA4` with pattern `bl sub_824AA858 ; if r3 == 0xAA: oris r11, 0x1000 ; stw`.
|
||||
4. Concluded `sub_821B55D8` was the setter.
|
||||
|
||||
What Phase A missed:
|
||||
- Mem-watch's 0-stores result was correct: **NO setter exists**. Bit 28 is never set in either engine. The mem-watch null-result was a hint that the bit-28 hypothesis itself was wrong, but Phase A interpreted it as "mem-watch misses something."
|
||||
- The disasm-based hypothesis was visually compelling (a loop iterating arrays and setting bit 28 when a kernel call returns 0xAA) but was never verified at runtime.
|
||||
- `sub_821B55D8` is itself dead code in both engines.
|
||||
|
||||
## Reading-error class #19: disasm-pattern-match without runtime verification
|
||||
|
||||
When scanning for a hypothesized signal source via DB pattern-match (`oris ?, ?, 0x1000`), the analyst must run a probe to verify the suspected site is *both reached* and *takes the suspected path* before declaring it the cause. Phase A bypassed both checks. The single `--dump-addr=0x40d09a40` flag in sub-round 2 (one extra flag appended to the existing probe command) revealed the central assumption was wrong.
|
||||
|
||||
## Real divergence (handed to next session)
|
||||
|
||||
This is the **same wedge as audit-049/058/059**: tid=13 wedges in the silph::UImpl@GamePart_Title cluster on handle `0x1078`. tid=1 wedges on tid=13's thread-handle (`0x1070`) inside `sub_82173990`'s call chain.
|
||||
|
||||
`sub_82173990` is vtable[0] of the dispatcher singleton at `[0x828E1F08]`. It's a 768-byte function with ≥18 calls; the actual wait site is somewhere down its tree. To localize where in `sub_82173990` the wait happens, probe its BB heads + the `KeWaitForSingleObject` thunks (`sub_824AA330`, `sub_824AA708`).
|
||||
|
||||
The fix-shape is **NOT** "force-clear bit 28." The fix-shape is **"signal handle 0x1078 in the audit-049 cluster, or short-circuit tid=13's wait."** Round 22 (silph_synth.rs) attempted the cluster-A version of this. Cluster B (silph::UImpl) needs its own synthesis or a kernel-side signal of handle 0x1078.
|
||||
|
||||
## Phase C verdict
|
||||
|
||||
- C.1: 4 sub-rounds executed (within budget).
|
||||
- C.2: **NOT EXECUTED** — POC would be no-op since bit 28 is never set. Per plan stopping criterion, do not proceed to C.2 blind when C.1 refutes the diagnosis.
|
||||
- C.3: not applicable.
|
||||
- Branch state: no source changes. Audit artifacts only.
|
||||
|
||||
## Files in this directory
|
||||
|
||||
- `ours-c1-probe.log/stderr` — sub-round 1, probe at sub_821B55D8 BB heads (0 fires)
|
||||
- `ours-sr2-confirm-bit28.log/stderr` — sub-round 2, probe loop top/exit + dump-addr (bit 28 NEVER SET)
|
||||
- `ours-sr3-wait-trace.log/stderr` — sub-round 3, probe wait site + handle 0x1070 trace
|
||||
- `ours-sr4-bcctrl-trace.log/stderr` — sub-round 4, probe pre/post bcctrl + sub_82173990 entry + tid=13 entry (decisive)
|
||||
- canary side in `../round-C1-setter-validation-canary/`:
|
||||
- `canary-824AA858.log` — XamInputSetState wrapper fires 0× in canary too
|
||||
- `canary-822F1B50.log` — JitProlog can't probe at BB-internal PCs (function-entry-only)
|
||||
@@ -0,0 +1,144 @@
|
||||
# Phase D — Audit-049 Auto-Signal POC — FINDINGS
|
||||
|
||||
**Branch**: `iterate-2C/silph-ui-spawn-trace` (extends Phase C `481591f`)
|
||||
**Date**: 2026-06-11
|
||||
**Sub-rounds**: D2.SR1 → D2.SR4 (4/4 used)
|
||||
**Verdict**: **B — partial unwedge**
|
||||
|
||||
## Mission
|
||||
|
||||
Phase C diagnosed the audit-049 wedge as tid=13 (silph::UImpl@GamePart_Title) waiting INFINITE on a KEVENT created at `sub_821CB030+0x128` (`lr=0x821cb15c`, post-bl PC). The Phase D POC tests this diagnosis by hooking `NtCreateEvent` from that exact call site and auto-signaling the resulting handle after a configurable delay (`XENIA_SILPH_UI_AUTOSIGNAL_DELAY` instructions).
|
||||
|
||||
If tid=13 unblocks, the diagnosis is confirmed. If new wedges or new threads appear downstream, even better — that's actual game progression past the wedge.
|
||||
|
||||
## Result summary
|
||||
|
||||
| Symptom | SR2/SR3 baseline | SR4 (POC firing) |
|
||||
|---|---|---|
|
||||
| `silph autosignal: scheduled handle=0x1078 caller_lr=0x821cb15c` | yes (SR2/SR3) | yes |
|
||||
| `silph autosignal: firing handle=0x1078` | NO | **yes (cycle 16326209)** |
|
||||
| handle 0x1078 final | `signaled=false waiters=1 <NO_SIGNALS_DESPITE_WAITS>` | `signal_attempts=1 waiters=0` |
|
||||
| tid=13 final state | `Blocked(WaitAny[0x1078])` | **`Ready` pc=0x824a9108** |
|
||||
| tid=1 final state | `Blocked(WaitAny[0x1070])` thread-join | `Blocked(WaitAny[0x1070])` (tid=13 not yet exited) |
|
||||
| ExCreateThread total | 10 | **12 (+tid=14, +tid=15)** |
|
||||
| New downstream wedges | none past 0x1078 | **0x1084 (Event/Auto), 0x1088 (Event/Manual)** |
|
||||
| `cxx_throw` runtime_error decoded | none | **yes, stack depth 6, top L0=0x82612b50 → L4=sub_82450B60+0x1A8 → L6=sub_82450a50** |
|
||||
| VdSwap | 1 | 1 |
|
||||
| gpu.interrupt.delivered{source=0} | 6393 | 4539 (different trajectory, no draws) |
|
||||
|
||||
**Conclusion**: tid=13 unwedged cleanly from the audit-049 wait, spawned two follow-on threads (tid=14 entry=`silph` ctx=`0x40929c00`, tid=15 a worker), and progressed deep enough into the silph::UImpl state machine to throw a `runtime_error` from sub_82450a50 → sub_82450B60+0x1A8 (the dispatcher cluster from round 26). The auto-signal **is not** the proper signaler — it lets tid=13 proceed but downstream state-machine invariants the missing real signaler would have established are not in place, so the dispatcher trips on a "not-registered instance" lookup.
|
||||
|
||||
This is a **clean confirmation** of the Phase C diagnosis: the wedge handle, the wait site, and the LR filter are all correct. The fix shape is:
|
||||
- Either: synthesize the missing signaler properly (cluster-B silph_ui_synth.rs analogue from R33's deferred plan)
|
||||
- Or: track what the auto-signal needed to write into the work-item state (`[+8]` field per R26) BEFORE signaling, so the dispatcher's BST lookup succeeds
|
||||
|
||||
## Sub-round detail
|
||||
|
||||
### D2.SR1 — initial run, hook never fires (wrong LR filter)
|
||||
|
||||
Filter checked `creator_lr ∈ [0x821CB15C, 0x821CB160]` against `ctx.lr` at `nt_create_event` entry. But `ctx.lr` is the **thunk wrapper return slot** (`0x824a9f6c`), not the guest caller's post-bl PC. Confirmed via handle-audit `created stack` dump: frame 0 lr=`0x824a9f6c`, frame 1 lr=`0x821cb15c`. The guest caller's LR lives one frame up the PPC EABI back-chain.
|
||||
|
||||
Diagnosis classification: **D (filter mismatch)**. Reading-error class #20 (new).
|
||||
|
||||
### D2.SR2 — frame-1-LR fix; hook schedules, never fires
|
||||
|
||||
Refactored `maybe_register_silph_autosignal` to take `(ctx, mem)`, walk back-chain via existing `walk_guest_back_chain` (1 step), match the saved LR. Hook now fires:
|
||||
|
||||
```
|
||||
silph autosignal: scheduled handle=0x1078 caller_lr=0x821cb15c for cycle 10000 (now=0, delay=10000)
|
||||
```
|
||||
|
||||
But no "firing" log appears, and tid=13 stays Blocked. Classification: **D (drain site never reached)**.
|
||||
|
||||
### D2.SR3 — diagnostic added; confirms drain site never visited
|
||||
|
||||
Added a one-shot info-level "tick (first visit, none due)" log inside `fire_due_silph_autosignals` when pending is non-empty but nothing due. Re-ran. **The tick-diagnostic never fired either** — proving the function isn't being called at all in `--parallel` mode.
|
||||
|
||||
Root cause: `--parallel` dispatches to `run_execution_parallel` (line 2928 of main.rs), which has its own outer loop at line 3186. My Phase D wiring only touched the lockstep path at line 2763. Classification: **D (wrong code path wired)**.
|
||||
|
||||
### D2.SR4 — parallel-path wiring added; hook fires; tid=13 unblocks
|
||||
|
||||
Added the same `set_now_cycle_hint` + `fire_due_silph_autosignals` calls inside the parallel outer loop, right after `coord_pre_round` (and under the same `kernel_arc` guard, so no extra locking). Re-built, re-ran.
|
||||
|
||||
Now all three log lines appear:
|
||||
|
||||
```
|
||||
silph autosignal: scheduled handle=0x1078 caller_lr=0x821cb15c for cycle 16326202 (now=16316202, delay=10000)
|
||||
silph autosignal: tick (first visit, none due) now=16316213 pending=1 first_deadline=16326202
|
||||
silph autosignal: firing handle=0x1078 prev_signaled=Some(false) at cycle 16326209
|
||||
```
|
||||
|
||||
`now=16316202` at schedule time confirms `set_now_cycle_hint` is wired through correctly (the parallel path was simply never visited in SR2/SR3). Fire at cycle 16326209 = deadline 16326202 + 7-cycle scheduler granularity. Diagnostic classification: **B (partial unwedge — new waits and cxx_throw downstream)**.
|
||||
|
||||
## Code shape
|
||||
|
||||
POC is ~130 LOC (+128 lines per the table below) across four files, all env-gated. Default off.
|
||||
|
||||
| File | Change | Lines |
|
||||
|---|---|---|
|
||||
| `crates/xenia-cpu/src/scheduler.rs` | `GuestThread.start_entry/start_context` fields; `spawn()` populates; `current_thread_entry_and_ctx()` helper | +18 |
|
||||
| `crates/xenia-kernel/src/state.rs` | `AutoSignalPending` struct; `silph_autosignal_*` fields; `set_now_cycle_hint`, `maybe_register_silph_autosignal`, `fire_due_silph_autosignals` methods | +95 |
|
||||
| `crates/xenia-kernel/src/exports.rs` | Hook in `nt_create_event` | +3 |
|
||||
| `crates/xenia-app/src/main.rs` | Fire-site wiring in lockstep loop (line 2788) **and** parallel loop (line 3215) | +12 |
|
||||
|
||||
Tests stay green at **655/655**.
|
||||
|
||||
## Reading-error class #20 (new)
|
||||
|
||||
**`ctx.lr` at kernel export entry ≠ guest caller's post-bl PC.** When a guest `bl` calls an export thunk, the thunk-wrapper has its own frame between the guest caller and the export body. At export-body entry, `ctx.lr` holds the *wrapper's* return slot, not the guest caller's post-bl PC.
|
||||
|
||||
To match a specific guest call site by LR, the export must walk one step up the back-chain (`walk_guest_back_chain(ctx.gpr[1], ctx.lr, mem, 2)`) and use `frames[1].lr`.
|
||||
|
||||
SR1 burned one full sub-round on this. Detect early in future POCs by comparing `ctx.lr` against the handle-audit's `created stack` frame dump for a known-good event (e.g. one created from a labelled site).
|
||||
|
||||
## Reading-error class #21 (new)
|
||||
|
||||
**`--parallel` and lockstep have separate outer loops in main.rs.** They share `coord_pre_round` (carved out exactly for this reason), but anything wired adjacent to that call site only takes effect on the path it's wired on. Lockstep is `run_execution` (line 2706, outer loop at 2763). Parallel is `run_execution_parallel` (line 2928, outer loop at 3186).
|
||||
|
||||
Per-round hooks must be wired in **both** paths, even when they are first developed and tested against only one execution mode. SR2/SR3 burned two sub-rounds on this.
|
||||
|
||||
## Files modified + LR mapping (for follow-up sessions)
|
||||
|
||||
**Wedge handle creation** (confirmed by handle-audit dump):
|
||||
```
|
||||
created cycle=0 tid=13 lr=0x824a9f6c [src=NtCreateEvent thunk return]
|
||||
created stack (6 frames):
|
||||
[ 0] fp=0x715a7a10 lr=0x824a9f6c ← ctx.lr at nt_create_event
|
||||
[ 1] fp=0x715a7aa0 lr=0x821cb15c ← guest caller's post-bl PC (filter on this)
|
||||
[ 2] fp=0x715a7bd0 lr=0x821cbae0 ← sub_821CBA08 frame
|
||||
[ 3] fp=0x715a7cd0 lr=0x821cc454 ← sub_821CC3F8 frame
|
||||
[ 4] fp=0x715a7d60 lr=0x821c4f18 ← sub_821C4EB0 frame (silph::UImpl@GamePart_Title)
|
||||
[ 5] fp=0x715a7e00 lr=0x82174a80 ← sub_821748F0 trampoline frame
|
||||
```
|
||||
|
||||
**Downstream cxx_throw stack** (after auto-signal fires, tid=5 throws runtime_error):
|
||||
```
|
||||
L0 lr=0x82612b50 std::exception throw path
|
||||
L1 lr=0x825f2444
|
||||
L2 lr=0x824547e8
|
||||
L3 lr=0x82451418
|
||||
L4 lr=0x82450d08 ← sub_82450B60+0x1A8 (dispatcher, audit-059 R26)
|
||||
L5 lr=0x82450b34
|
||||
L6 lr=0x82450a50 ← sub_82450a50 (worker dispatch)
|
||||
|
||||
cxx_throw runtime_error decoded magic=0x19930520
|
||||
cxx_throw BST ceil search candidate_key=0x828e2b2c match_found=false
|
||||
cxx_throw lhs (not-registered instance) lhs=0x715a7af0
|
||||
```
|
||||
|
||||
This confirms the dispatcher reached audit-049 territory (R26's `sub_82450B60+0x1A8` PC `0x82450D08`), looked up a runtime instance in its BST keyed by VA, and the instance was never registered. **The auto-signal bypassed an upstream registration step** the real signaler would have driven.
|
||||
|
||||
## Recommendation
|
||||
|
||||
Ship the POC env-gated (default off; no behavior change unless opted in). The verdict-B success makes it a useful diagnostic flag for future audit-049 work: future investigations can set `XENIA_SILPH_UI_AUTOSIGNAL_DELAY=10000` to skip the wedge and probe downstream behavior without first writing the proper signaler.
|
||||
|
||||
Long-term fix path remains the R33 silph_ui_synth.rs analogue: synthesize the missing signaler + its precondition state (BST instance registration at `0x715a7af0`-equivalent, work-item state `[+8]` per R26). The auto-signal POC is **not** the final fix — it confirms diagnosis but doesn't honor the dispatcher's BST registry invariant.
|
||||
|
||||
## Artifacts
|
||||
|
||||
- `poc-sr1.log`, `poc-sr1.stderr` — initial run, filter mismatch (D)
|
||||
- `poc-sr2.log`, `poc-sr2.stderr` — frame-1-LR fix, no fire (D)
|
||||
- `poc-sr3.log`, `poc-sr3.stderr` — diagnostic added, no fire (D, parallel path unwired)
|
||||
- `poc-sr4.log`, `poc-sr4.stderr` — parallel-path wired, **fires + partial unwedge (B)**
|
||||
|
||||
All `.log`/`.stderr` files are `.gitignore`d; this `FINDINGS.md` is the only artifact-side commit.
|
||||
@@ -0,0 +1,200 @@
|
||||
0x82450b60: lwz r18, 9792(r31)
|
||||
0x82450b64: lwz r16, 13880(r14)
|
||||
0x82450b68: mflr r12
|
||||
0x82450b6c: bl 0x825F0F74
|
||||
0x82450b70: subi r31, r1, 176
|
||||
0x82450b74: stwu r1, -176(r1)
|
||||
0x82450b78: mr r29, r4
|
||||
0x82450b7c: mr r27, r3
|
||||
0x82450b80: cmpwi cr6, r29, 5
|
||||
0x82450b84: bne cr6, 0x82450B94
|
||||
0x82450b88: addi r28, r27, 196
|
||||
0x82450b8c: addi r26, r27, 28
|
||||
0x82450b90: b 0x82450BAC
|
||||
0x82450b94: slwi r11, r29, 2
|
||||
0x82450b98: mr r26, r27
|
||||
0x82450b9c: add r11, r29, r11
|
||||
0x82450ba0: slwi r11, r11, 2
|
||||
0x82450ba4: add r11, r11, r27
|
||||
0x82450ba8: addi r28, r11, 96
|
||||
0x82450bac: addi r23, r27, 56
|
||||
0x82450bb0: mr r3, r23
|
||||
0x82450bb4: stw r23, 84(r31)
|
||||
0x82450bb8: bl 0x8284DCFC
|
||||
0x82450bbc: mr r3, r26
|
||||
0x82450bc0: bl 0x8284DCFC
|
||||
0x82450bc4: lwz r7, 16(r28)
|
||||
0x82450bc8: cntlzw r11, r7
|
||||
0x82450bcc: extrwi r11, r11, 1, 26
|
||||
0x82450bd0: cmplwi cr6, r11, 0x0
|
||||
0x82450bd4: beq cr6, 0x82450BEC
|
||||
0x82450bd8: mr r3, r26
|
||||
0x82450bdc: bl 0x8284DD0C
|
||||
0x82450be0: mr r3, r23
|
||||
0x82450be4: bl 0x8284DD0C
|
||||
0x82450be8: b 0x82450EE8
|
||||
0x82450bec: lwz r11, 12(r28)
|
||||
0x82450bf0: lwz r9, 8(r28)
|
||||
0x82450bf4: srwi r10, r11, 2
|
||||
0x82450bf8: clrlwi r8, r11, 30
|
||||
0x82450bfc: cmplw cr6, r9, r10
|
||||
0x82450c00: bgt cr6, 0x82450C08
|
||||
0x82450c04: sub r10, r10, r9
|
||||
0x82450c08: lwz r9, 4(r28)
|
||||
0x82450c0c: slwi r10, r10, 2
|
||||
0x82450c10: slwi r8, r8, 2
|
||||
0x82450c14: lwz r6, 8(r28)
|
||||
0x82450c18: addi r11, r11, 1
|
||||
0x82450c1c: slwi r6, r6, 2
|
||||
0x82450c20: li r24, 0
|
||||
0x82450c24: lwzx r10, r10, r9
|
||||
0x82450c28: cmplw cr6, r6, r11
|
||||
0x82450c2c: lwzx r30, r10, r8
|
||||
0x82450c30: stw r11, 12(r28)
|
||||
0x82450c34: stw r30, 80(r31)
|
||||
0x82450c38: bgt cr6, 0x82450C40
|
||||
0x82450c3c: stw r24, 12(r28)
|
||||
0x82450c40: subic. r11, r7, 1
|
||||
0x82450c44: stw r11, 16(r28)
|
||||
0x82450c48: bne 0x82450C50
|
||||
0x82450c4c: stw r24, 12(r28)
|
||||
0x82450c50: addi r25, r27, 28
|
||||
0x82450c54: mr r3, r25
|
||||
0x82450c58: bl 0x8284DCFC
|
||||
0x82450c5c: mr r3, r25
|
||||
0x82450c60: stw r30, 216(r27)
|
||||
0x82450c64: bl 0x8284DD0C
|
||||
0x82450c68: mr r3, r26
|
||||
0x82450c6c: bl 0x8284DD0C
|
||||
0x82450c70: lwz r11, 28(r30)
|
||||
0x82450c74: clrlwi r11, r11, 31
|
||||
0x82450c78: cmplwi cr6, r11, 0x0
|
||||
0x82450c7c: bne cr6, 0x82450D30
|
||||
0x82450c80: lwz r11, 8(r30)
|
||||
0x82450c84: cmplwi cr6, r11, 0x1
|
||||
0x82450c88: blt cr6, 0x82450CE4
|
||||
0x82450c8c: bne cr6, 0x82450D3C
|
||||
0x82450c90: lwz r11, 28(r30)
|
||||
0x82450c94: rlwinm r11, r11, 0, 29, 29
|
||||
0x82450c98: cmplwi cr6, r11, 0x0
|
||||
0x82450c9c: beq cr6, 0x82450CB0
|
||||
0x82450ca0: mr r4, r30
|
||||
0x82450ca4: mr r3, r27
|
||||
0x82450ca8: bl 0x824510E0
|
||||
0x82450cac: b 0x82450CBC
|
||||
0x82450cb0: mr r4, r30
|
||||
0x82450cb4: mr r3, r27
|
||||
0x82450cb8: bl 0x824517B0
|
||||
0x82450cbc: stw r29, 220(r27)
|
||||
0x82450cc0: bl 0x824AA830
|
||||
0x82450cc4: mr r11, r3
|
||||
0x82450cc8: lwz r3, 92(r27)
|
||||
0x82450ccc: li r5, 0
|
||||
0x82450cd0: addi r11, r11, 66
|
||||
0x82450cd4: li r4, 1
|
||||
0x82450cd8: stw r11, 224(r27)
|
||||
0x82450cdc: bl 0x824AB158
|
||||
0x82450ce0: b 0x82450D3C
|
||||
0x82450ce4: lwz r11, 28(r30)
|
||||
0x82450ce8: mr r4, r30
|
||||
0x82450cec: mr r3, r27
|
||||
0x82450cf0: rlwinm r11, r11, 0, 29, 29
|
||||
0x82450cf4: cmplwi cr6, r11, 0x0
|
||||
0x82450cf8: beq cr6, 0x82450D04
|
||||
0x82450cfc: bl 0x82450F68
|
||||
0x82450d00: b 0x82450D08
|
||||
0x82450d04: bl 0x82451238
|
||||
0x82450d08: stw r29, 220(r27)
|
||||
0x82450d0c: bl 0x824AA830
|
||||
0x82450d10: mr r11, r3
|
||||
0x82450d14: lwz r3, 92(r27)
|
||||
0x82450d18: li r5, 0
|
||||
0x82450d1c: addi r11, r11, 66
|
||||
0x82450d20: li r4, 1
|
||||
0x82450d24: stw r11, 224(r27)
|
||||
0x82450d28: bl 0x824AB158
|
||||
0x82450d2c: b 0x82450D3C
|
||||
0x82450d30: lwz r11, 28(r30)
|
||||
0x82450d34: ori r11, r11, 0x2
|
||||
0x82450d38: stw r11, 28(r30)
|
||||
0x82450d3c: lwz r11, 8(r30)
|
||||
0x82450d40: mr r29, r24
|
||||
0x82450d44: cmpwi cr6, r11, 2
|
||||
0x82450d48: blt cr6, 0x82450E08
|
||||
0x82450d4c: cmpwi cr6, r11, 3
|
||||
0x82450d50: ble cr6, 0x82450DA0
|
||||
0x82450d54: cmpwi cr6, r11, 4
|
||||
0x82450d58: bne cr6, 0x82450E08
|
||||
0x82450d5c: lwz r11, 28(r30)
|
||||
0x82450d60: rlwinm r11, r11, 0, 29, 29
|
||||
0x82450d64: cmplwi cr6, r11, 0x0
|
||||
0x82450d68: bne cr6, 0x82450D98
|
||||
0x82450d6c: lwz r29, 36(r30)
|
||||
0x82450d70: mr r3, r29
|
||||
0x82450d74: lwz r11, 0(r29)
|
||||
0x82450d78: lwz r11, 4(r11)
|
||||
0x82450d7c: mtctr r11
|
||||
0x82450d80: bctrl
|
||||
0x82450d84: clrlwi r11, r3, 24
|
||||
0x82450d88: cmplwi cr6, r11, 0x0
|
||||
0x82450d8c: beq cr6, 0x82450D98
|
||||
0x82450d90: mr r3, r29
|
||||
0x82450d94: bl 0x8244FB38
|
||||
0x82450d98: li r29, 1
|
||||
0x82450d9c: b 0x82450E28
|
||||
0x82450da0: addi r3, r30, 40
|
||||
0x82450da4: bl 0x82451DB8
|
||||
0x82450da8: lwz r11, 32(r30)
|
||||
0x82450dac: cmplwi cr6, r11, 0x0
|
||||
0x82450db0: beq cr6, 0x82450DCC
|
||||
0x82450db4: rlwinm r11, r11, 0, 0, 31
|
||||
0x82450db8: lwz r10, 4(r30)
|
||||
0x82450dbc: lwz r11, 4(r11)
|
||||
0x82450dc0: cmplw cr6, r10, r11
|
||||
0x82450dc4: li r11, 1
|
||||
0x82450dc8: beq cr6, 0x82450DD0
|
||||
0x82450dcc: mr r11, r24
|
||||
0x82450dd0: clrlwi r11, r11, 24
|
||||
0x82450dd4: cmplwi cr6, r11, 0x0
|
||||
0x82450dd8: beq cr6, 0x82450E00
|
||||
0x82450ddc: lwz r4, 8(r30)
|
||||
0x82450de0: lwz r5, 0(r30)
|
||||
0x82450de4: lwz r3, 32(r30)
|
||||
0x82450de8: cmpwi cr6, r4, 1
|
||||
0x82450dec: ble cr6, 0x82450DFC
|
||||
0x82450df0: bl 0x8245D9D8
|
||||
0x82450df4: li r29, 1
|
||||
0x82450df8: b 0x82450E28
|
||||
0x82450dfc: stw r4, 8(r3)
|
||||
0x82450e00: li r29, 1
|
||||
0x82450e04: b 0x82450E28
|
||||
0x82450e08: mr r3, r26
|
||||
0x82450e0c: stw r26, 88(r31)
|
||||
0x82450e10: bl 0x8284DCFC
|
||||
0x82450e14: addi r4, r31, 80
|
||||
0x82450e18: mr r3, r28
|
||||
0x82450e1c: bl 0x823232C0
|
||||
0x82450e20: mr r3, r26
|
||||
0x82450e24: bl 0x8284DD0C
|
||||
0x82450e28: clrlwi r11, r29, 24
|
||||
0x82450e2c: cmplwi cr6, r11, 0x0
|
||||
0x82450e30: beq cr6, 0x82450ECC
|
||||
0x82450e34: lwz r11, 28(r30)
|
||||
0x82450e38: rlwinm r11, r11, 0, 30, 30
|
||||
0x82450e3c: cmplwi cr6, r11, 0x0
|
||||
0x82450e40: beq cr6, 0x82450E68
|
||||
0x82450e44: mr r3, r26
|
||||
0x82450e48: stw r26, 88(r31)
|
||||
0x82450e4c: bl 0x8284DCFC
|
||||
0x82450e50: addi r4, r31, 80
|
||||
0x82450e54: mr r3, r28
|
||||
0x82450e58: bl 0x823232C0
|
||||
0x82450e5c: mr r3, r26
|
||||
0x82450e60: bl 0x8284DD0C
|
||||
0x82450e64: b 0x82450ECC
|
||||
0x82450e68: lwz r11, 40(r30)
|
||||
0x82450e6c: cmplwi cr6, r11, 0x0
|
||||
0x82450e70: beq cr6, 0x82450EA4
|
||||
0x82450e74: rlwinm r3, r11, 0, 0, 31
|
||||
0x82450e78: bl 0x82458A70
|
||||
0x82450e7c: lwz r29, 40(r30)
|
||||
@@ -0,0 +1,80 @@
|
||||
0x82451238: mflr r12
|
||||
0x8245123c: li r0, 0
|
||||
0x82451240: stw r0, 4(r1)
|
||||
0x82451244: bl 0x825F0F80
|
||||
0x82451248: subi r31, r1, 160
|
||||
0x8245124c: stwu r1, -160(r1)
|
||||
0x82451250: mr r30, r4
|
||||
0x82451254: li r9, 1
|
||||
0x82451258: lwz r10, 32(r30)
|
||||
0x8245125c: stw r30, 188(r31)
|
||||
0x82451260: stw r9, 8(r30)
|
||||
0x82451264: cmplwi cr6, r10, 0x0
|
||||
0x82451268: beq cr6, 0x82451288
|
||||
0x8245126c: lwz r11, 4(r30)
|
||||
0x82451270: lwz r8, 4(r10)
|
||||
0x82451274: cmplw cr6, r11, r8
|
||||
0x82451278: bne cr6, 0x82451288
|
||||
0x8245127c: mr r11, r9
|
||||
0x82451280: li r26, 0
|
||||
0x82451284: b 0x82451290
|
||||
0x82451288: li r26, 0
|
||||
0x8245128c: mr r11, r26
|
||||
0x82451290: clrlwi r11, r11, 24
|
||||
0x82451294: cmplwi cr6, r11, 0x0
|
||||
0x82451298: beq cr6, 0x824512A0
|
||||
0x8245129c: stw r9, 8(r10)
|
||||
0x824512a0: lwz r3, 36(r30)
|
||||
0x824512a4: lwz r11, 0(r3)
|
||||
0x824512a8: lwz r11, 32(r11)
|
||||
0x824512ac: mtctr r11
|
||||
0x824512b0: bctrl
|
||||
0x824512b4: mr r27, r3
|
||||
0x824512b8: stw r26, 84(r31)
|
||||
0x824512bc: stw r27, 96(r31)
|
||||
0x824512c0: bl 0x82454498
|
||||
0x824512c4: addi r4, r31, 84
|
||||
0x824512c8: bl 0x82454580
|
||||
0x824512cc: stw r26, 92(r31)
|
||||
0x824512d0: addi r11, r27, 2047
|
||||
0x824512d4: lis r10, 0x2
|
||||
0x824512d8: clrrwi r11, r11, 11
|
||||
0x824512dc: cmplw cr6, r11, r10
|
||||
0x824512e0: stw r11, 100(r31)
|
||||
0x824512e4: ble cr6, 0x824512F4
|
||||
0x824512e8: lis r11, 0x8207
|
||||
0x824512ec: addi r11, r11, 6724
|
||||
0x824512f0: b 0x824512F8
|
||||
0x824512f4: addi r11, r31, 100
|
||||
0x824512f8: addi r3, r31, 84
|
||||
0x824512fc: lwz r4, 0(r11)
|
||||
0x82451300: bl 0x82454B08
|
||||
0x82451304: mr r8, r8
|
||||
0x82451308: mr r28, r3
|
||||
0x8245130c: stw r28, 92(r31)
|
||||
0x82451310: b 0x82451324
|
||||
0x82451314: lwz r30, 188(r31)
|
||||
0x82451318: lwz r27, 96(r31)
|
||||
0x8245131c: li r26, 0
|
||||
0x82451320: lwz r28, 92(r31)
|
||||
0x82451324: addi r3, r31, 84
|
||||
0x82451328: bl 0x82454AA0
|
||||
0x8245132c: mr r29, r3
|
||||
0x82451330: cmplwi cr6, r28, 0x0
|
||||
0x82451334: beq cr6, 0x82451684
|
||||
0x82451338: lwz r3, 36(r30)
|
||||
0x8245133c: li r8, 0
|
||||
0x82451340: addi r7, r31, 88
|
||||
0x82451344: mr r6, r29
|
||||
0x82451348: mr r5, r29
|
||||
0x8245134c: mr r4, r28
|
||||
0x82451350: lwz r11, 0(r3)
|
||||
0x82451354: lwz r11, 28(r11)
|
||||
0x82451358: mtctr r11
|
||||
0x8245135c: bctrl
|
||||
0x82451360: clrlwi r11, r3, 24
|
||||
0x82451364: cmplwi cr6, r11, 0x0
|
||||
0x82451368: beq cr6, 0x82451684
|
||||
0x8245136c: lwz r11, 28(r30)
|
||||
0x82451370: rlwinm r11, r11, 0, 28, 28
|
||||
0x82451374: cmplwi cr6, r11, 0x0
|
||||
@@ -0,0 +1,52 @@
|
||||
=== Fire counts ===
|
||||
ours: 3
|
||||
canary: 7
|
||||
|
||||
=== Per-LR breakdown ===
|
||||
ours:
|
||||
lr=0x82458674: 3
|
||||
canary:
|
||||
lr=0x82457bd4: 2
|
||||
lr=0x82458674: 5
|
||||
|
||||
=== Side-by-side first 5 fires (entry registers) ===
|
||||
|
||||
--- fire #0 ---
|
||||
ours: tid=6 cycle=363 lr=0x82458674 r3=0x40ba9ac0
|
||||
dump: 419fecda 000007f6 00000000 41d7dd10 00001688 00000000 00000000 41f5dd80 82457958 823f53f0 00000000 00000000 00000001 00000000 00000000 4024a5c0
|
||||
canary: tid=11 cycle=<unk> lr=0x82458674 r3=0xbccc4ac0 r4=0x00000000 r5=0x00000001 r6=0x00000001 r7=0x00000000
|
||||
dump: bdb19cda 000007f6 00000000 bde98d10 00001688 00000000 00000000 be078d80 82457958 823f53f0 00000000 00000000 00000001 00000000 00000000 bc365760
|
||||
|
||||
--- fire #1 ---
|
||||
ours: tid=6 cycle=140548 lr=0x82458674 r3=0x40ba9b80
|
||||
dump: 42c0f09a 00018ff6 00000000 43777210 0004d055 00000000 00000000 41f60d80 82457958 823f53f0 00000000 00000000 00000001 00000000 00000000 4024a960
|
||||
canary: tid=11 cycle=<unk> lr=0x82458674 r3=0xbccc4b80 r4=0x00000000 r5=0x00000001 r6=0x00000001 r7=0x00000000
|
||||
dump: bed2a09a 00018ff6 00000000 bf892210 0004d055 00000000 00000000 be07bd80 82457958 823f53f0 00000000 00000000 00000001 00000000 00000000 bc365840
|
||||
|
||||
--- fire #2 ---
|
||||
ours: tid=6 cycle=5957876 lr=0x82458674 r3=0x40ba9b80
|
||||
dump: 419fecda 000007f6 00000000 414f5f70 000003b9 00000000 00000000 41f60d80 82457958 823f53f0 00000000 00000040 00000001 00000000 00000000 4024a980
|
||||
canary: tid=11 cycle=<unk> lr=0x82458674 r3=0xbccc4b80 r4=0x00000000 r5=0x00000001 r6=0x00000001 r7=0x00000000
|
||||
dump: bdb19cda 000007f6 00000000 bd610b90 000003b9 00000000 00000000 be07bd80 82457958 823f53f0 00000000 00000040 00000001 00000000 00000000 bc365860
|
||||
|
||||
--- fire #3 ---
|
||||
ours: <no fire>
|
||||
canary: tid=11 cycle=<unk> lr=0x82458674 r3=0xbccc5300 r4=0x00000000 r5=0x00000001 r6=0x00000001 r7=0x00000000
|
||||
dump: bdb1acda 000007f6 00000000 bce24ed0 00000167 00000000 00000000 be07bd80 82457958 823f53f0 00000000 00000000 00000001 00000000 00000000 bc365f40
|
||||
|
||||
--- fire #4 ---
|
||||
ours: <no fire>
|
||||
canary: tid=6 cycle=<unk> lr=0x82457bd4 r3=0x701cf3c0 r4=0x00000004 r5=0x00002530 r6=0x00008000 r7=0x00000001
|
||||
dump: be95af9a 0000c170 00000000 b2050010 000681e9 00000000 00000000 be07bd80 82457958 823f53f0 00000000 0000c17a 00000001 701cf4e0 00000000 be95af90
|
||||
|
||||
=== Equivalence check: u32 lanes at +0x04 and +0x10 (work-item magic + counter) ===
|
||||
Both fields are stable identifiers across engines (host VAs differ but data should match).
|
||||
|
||||
Index of fields:
|
||||
[+0x04] = work-item 'size?' (looks like a length field)
|
||||
[+0x10] = state counter (per round 30, this is [+128/4 ?]) — but in dump it's u32[4]
|
||||
|
||||
ours [+04,+10]: [(2038, 5768), (102390, 315477), (2038, 953)]
|
||||
canary [+04,+10]: [(2038, 5768), (102390, 315477), (2038, 953), (2038, 359), (49520, 426473), (232195, 999643), (6134, 13763)]
|
||||
|
||||
ours fires whose [+04,+10] match a canary fire: 3/3
|
||||
@@ -0,0 +1,175 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Round 35 lockstep diff: align sub_8280AD40 entry fires between
|
||||
ours (--audit-pc-probe-hex AUDIT-PC-PROBE / AUDIT-R3-DUMP) and
|
||||
canary (AUDIT-HLC JitProlog).
|
||||
|
||||
Outputs side-by-side rendering of:
|
||||
- per-fire entry register snapshot (r3..r10, lr)
|
||||
- 64-byte r3 dump (u32 lanes, big-endian)
|
||||
Alignment is by invocation order only (the i-th fire in each log is paired; tid is shown but not used for pairing, and no input-equivalence is required).
|
||||
"""
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Directory containing this script; log paths below are resolved against it.
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
# Log produced by our engine, expected next to this script.
OURS_LOG = os.path.join(THIS_DIR, "ours.log")
|
||||
# Log produced by the canary run, kept in a sibling directory of THIS_DIR.
CANARY_LOG = os.path.join(
|
||||
os.path.dirname(THIS_DIR), "round35-lockstep-inflate-canary", "canary.log"
|
||||
)
|
||||
|
||||
# Guest PC whose probe fires are compared (sub_8280AD40 per the module docstring).
PC_TARGET = 0x8280AD40
|
||||
|
||||
|
||||
def parse_ours(path, pc_target=None):
    """Parse our engine's log into the list of probe fires at the target PC.

    Every ``AUDIT-PC-PROBE`` line whose ``pc`` equals the target starts a new
    fire. The first ``AUDIT-R3-DUMP`` line that follows it (before any other
    probe line) supplies that fire's r3 dump lanes.

    Args:
        path: Path of the log file to read.
        pc_target: Guest PC to keep fires for. ``None`` (the default) means
            the module-level ``PC_TARGET``.

    Returns:
        A list of dicts in log order with integer keys ``tid``, ``cycle``,
        ``lr`` and ``r3``, plus ``dump`` (list of ints; empty when no dump
        line followed the probe).
    """
    if pc_target is None:
        pc_target = PC_TARGET

    # Compiled once instead of once per log line.
    probe_re = re.compile(
        r"pc=0x([0-9a-f]+) tid=(\d+) hw=\d+ cycle=(\d+) lr=0x([0-9a-f]+) r3=0x([0-9a-f]+) r11=0x([0-9a-f]+)"
    )
    lane_re = re.compile(r"\+0x[0-9a-f]+=0x([0-9a-f]+)")

    fires = []
    pending = None  # fire still waiting for its AUDIT-R3-DUMP line, if any
    with open(path) as f:
        for raw in f:
            line = raw.strip()
            if line.startswith("AUDIT-PC-PROBE"):
                # Any probe line closes the previous fire's dump window, even
                # one we cannot parse; otherwise that probe's dump would be
                # attached to an older, unrelated fire.
                pending = None
                m = probe_re.search(line)
                if m is None or int(m.group(1), 16) != pc_target:
                    continue
                pending = {
                    "tid": int(m.group(2)),
                    "cycle": int(m.group(3)),
                    "lr": int(m.group(4), 16),
                    "r3": int(m.group(5), 16),
                    "dump": [],
                }
                fires.append(pending)
            elif line.startswith("AUDIT-R3-DUMP") and pending is not None:
                pending["dump"] = [int(x, 16) for x in lane_re.findall(line)]
                # Only the first dump after a probe belongs to that fire.
                pending = None
    return fires
|
||||
|
||||
|
||||
def parse_canary(path, pc_target=None):
    """Parse the canary log into the list of JitProlog fires at the target PC.

    A header line (``AUDIT-HLC JitProlog pc=<PC> tid=.. r3=.. .. r10=.. lr=..``)
    starts a new fire. Each following ``r3+NN: w w w w`` line for the same PC
    stores four words into that fire's dump at word index ``NN // 4``.

    Args:
        path: Path of the log file to read.
        pc_target: Guest PC to keep fires for. ``None`` (the default) means
            the module-level ``PC_TARGET``.

    Returns:
        A list of dicts in log order with integer keys ``tid``, ``r3`` ..
        ``r10`` and ``lr`` (all parsed as hex), plus ``dump`` (list of ints,
        zero-filled for any words that were never logged).
    """
    if pc_target is None:
        pc_target = PC_TARGET

    # Build both patterns from the target PC so the two parsers cannot
    # disagree about which function is being probed.
    tag = re.escape(f"AUDIT-HLC JitProlog pc={pc_target:08X} ")
    fields = ("tid", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr")
    hdr_re = re.compile(tag + " ".join(f"{name}=([0-9A-F]+)" for name in fields))
    dump_re = re.compile(
        tag + r"r3\+([0-9A-F]+): ([0-9A-F]+) ([0-9A-F]+) ([0-9A-F]+) ([0-9A-F]+)"
    )

    fires = []
    current = None  # most recent fire; dump lines carry no tid of their own
    with open(path) as f:
        for raw in f:
            line = raw.strip()

            hdr = hdr_re.search(line)
            if hdr:
                current = {name: int(text, 16) for name, text in zip(fields, hdr.groups())}
                current["dump"] = []
                fires.append(current)
                continue

            row = dump_re.search(line)
            if row and current is not None:
                first = int(row.group(1), 16) // 4
                words = [int(text, 16) for text in row.groups()[1:]]
                dump = current["dump"]
                # Zero-fill so rows may arrive with gaps or out of order.
                shortfall = first + len(words) - len(dump)
                if shortfall > 0:
                    dump.extend([0] * shortfall)
                dump[first:first + len(words)] = words
    return fires
|
||||
|
||||
|
||||
def fmt_dump(d, max_lanes=16):
    """Render dump words as space-separated, zero-padded 8-digit lowercase hex.

    Args:
        d: Sequence of integer words.
        max_lanes: Maximum number of leading words to render (default 16).

    Returns:
        The formatted string; empty when ``d`` is empty.
    """
    return " ".join(f"{w:08x}" for w in d[:max_lanes])
|
||||
|
||||
|
||||
def main():
    """Parse both logs and print the round-35 comparison report.

    Sections printed, in order: fire counts, per-LR fire counts for each
    engine, a side-by-side listing of the first five fires (paired purely by
    index in each log), and a cross-match of the (+0x04, +0x10) dump lanes.

    Fires whose dump has fewer than five lanes get the key ``None``; such
    fires are always reported as unmatched rather than being allowed to
    "match" an equally incomplete fire on the other side.
    """
    ours = parse_ours(OURS_LOG)
    canary = parse_canary(CANARY_LOG)

    print("=== Fire counts ===")
    print(f" ours: {len(ours)}")
    print(f" canary: {len(canary)}")
    print()

    print("=== Per-LR breakdown ===")
    for label, fires in (("ours", ours), ("canary", canary)):
        lr_counts = {}
        for fire in fires:
            lr_counts[fire["lr"]] = lr_counts.get(fire["lr"], 0) + 1
        print(f" {label}:")
        for lr, count in sorted(lr_counts.items()):
            print(f" lr=0x{lr:08x}: {count}")
    print()

    print("=== Side-by-side first 5 fires (entry registers) ===")
    shown = min(max(len(ours), len(canary)), 5)
    for i in range(shown):
        print(f"\n--- fire #{i} ---")
        if i < len(ours):
            f = ours[i]
            print(
                f" ours: tid={f['tid']:<3} cycle={f['cycle']:<10} lr=0x{f['lr']:08x} r3=0x{f['r3']:08x}"
            )
            print(f" dump: {fmt_dump(f['dump'])}")
        else:
            print(" ours: <no fire>")
        if i < len(canary):
            f = canary[i]
            print(
                f" canary: tid={f['tid']:<3} cycle=<unk> lr=0x{f['lr']:08x} r3=0x{f['r3']:08x} "
                f"r4=0x{f['r4']:08x} r5=0x{f['r5']:08x} r6=0x{f['r6']:08x} r7=0x{f['r7']:08x}"
            )
            print(f" dump: {fmt_dump(f['dump'])}")
        else:
            print(" canary: <no fire>")

    print()
    print("=== Equivalence check: u32 lanes at +0x04 and +0x10 (work-item magic + counter) ===")
    print(" Both fields are stable identifiers across engines (host VAs differ but data should match).")
    print()
    print(" Index of fields:")
    print(" [+0x04] = work-item 'size?' (looks like a length field)")
    print(" [+0x10] = state counter (per round 30, this is [+128/4 ?]) — but in dump it's u32[4]")
    print()

    def lane_key(fire):
        # +0x04 is dump[1], +0x10 is dump[4]; None when the dump is too short.
        lanes = fire["dump"]
        return (lanes[1], lanes[4]) if len(lanes) > 4 else None

    ours_keys = [lane_key(f) for f in ours]
    canary_keys = [lane_key(f) for f in canary]
    print(f" ours [+04,+10]: {ours_keys}")
    print(f" canary [+04,+10]: {canary_keys}")
    print()

    # Cross-match: every ours key should appear in canary (canary is a
    # superset). None keys carry no data, so they never count as a match.
    canary_known = {k for k in canary_keys if k is not None}
    matched = []
    unmatched_ours = []
    for k in ours_keys:
        if k is not None and k in canary_known:
            matched.append(k)
        else:
            unmatched_ours.append(k)
    print(f" ours fires whose [+04,+10] match a canary fire: {len(matched)}/{len(ours)}")
    if unmatched_ours:
        print(f" ours fires with NO canary match: {unmatched_ours}")
|
||||
|
||||
|
||||
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,17 @@
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 tid=00000006 r3=BCCC4A80 r4=00000018 r5=828F3888 r6=701CF924 r7=82456F00 r8=00000000 r9=00000000 r10=00000018 lr=822F1D5C
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+00: BC22C910 00010004 00000000 000003E8
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+10: 0101FFFF 00000000 00000000 01010000
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+20: FFFFFFFF 00000000 00000000 00000000
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+30: 00000000 BC365BC0 00000000 00000000
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+40: 00000000 00000000 00000000 BDE9A398
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+50: BC365560 00000000 00000000 00000000
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+60: 00000000 00000000 00000000 01010040
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+70: 00000000 00000000 00000000 FFFFFFFF
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+80: 00000000 00000000 00000000 BC22C930
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+90: 00000000 00000001 00000800 00000000
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+A0: F800004C 00000000 00000000 BC365220
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+B0: BC3655C0 00000000 00000000 00000000
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+C0: 00CC0048 00460020 00460072 00650071
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+D0: 00750065 006E0063 00790000 01010000
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+E0: 00000000 00000000 00000000 FFFFFFFF
|
||||
K> F8000008 AUDIT-HLC JitProlog pc=821741C8 r3+F0: 00000000 00000000 00000000 BD610B80
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,89 @@
|
||||
warn: CreateDXGIFactory2: Ignoring flags
|
||||
info: Game: xenia_canary.exe
|
||||
info: DXVK: v2.7.1
|
||||
info: Build: x86_64 gcc 15.1.0
|
||||
info: Vulkan: Found vkGetInstanceProcAddr in winevulkan.dll @ 0x6ffffbd84000
|
||||
info: Extension providers:
|
||||
info: Platform WSI
|
||||
info: OpenVR
|
||||
info: OpenVR: could not open registry key, status 2
|
||||
info: OpenVR: Failed to locate module
|
||||
info: OpenXR
|
||||
info: Enabled instance extensions:
|
||||
info: VK_EXT_surface_maintenance1
|
||||
info: VK_KHR_get_surface_capabilities2
|
||||
info: VK_KHR_surface
|
||||
info: VK_KHR_win32_surface
|
||||
info: Found device: NVIDIA GeForce GTX 1070 Ti (NVIDIA 580.159.3)
|
||||
info: Found device: llvmpipe (LLVM 20.1.2, 256 bits) (llvmpipe 25.2.8)
|
||||
info: Skipping: Software driver
|
||||
info: DXGI: Hiding actual GPU, reporting:
|
||||
info: vendor ID: 0x1002
|
||||
info: device ID: 0x73df
|
||||
warn: DxgiAdapter::QueryInterface: Unknown interface query
|
||||
warn: f0db4c7f-fe5a-42a2-bd62-f2a6cf6fc83e
|
||||
564.236:00dc:013c:info:vkd3d-proton:vkd3d_instance_apply_application_workarounds: Program name: "xenia_canary.exe" (hash: c099ade372da5277)
|
||||
564.236:00dc:013c:info:vkd3d-proton:vkd3d_instance_deduce_config_flags_from_environment: shader_cache is used, global_pipeline_cache is enforced.
|
||||
564.236:00dc:013c:info:vkd3d-proton:vkd3d_config_flags_init_once: VKD3D_CONFIG=''.
|
||||
564.240:00dc:013c:info:vkd3d-proton:vkd3d_get_vk_version: vkd3d-proton - applicationVersion: 3.0.1.
|
||||
564.240:00dc:013c:info:vkd3d-proton:vkd3d_instance_init: vkd3d-proton - build: 3b10bd7a7ec6a73.
|
||||
564.399:00dc:013c:info:vkd3d-proton:vkd3d_init_device_caps: Not all relevant pipeline stages are supported by EXT_dgc. Skipping.
|
||||
564.825:00dc:013c:info:vkd3d-proton:vkd3d_memory_info_decide_hvv_usage: Topology: Device heaps are split. Assuming small BAR situation.
|
||||
564.825:00dc:013c:info:vkd3d-proton:vkd3d_memory_info_upload_hvv_memory_properties: Topology: HVV usage is not allowed, using HOST_COHERENT for UPLOAD.
|
||||
564.827:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_get_bindless_flags: Device does not support VK_EXT_mutable_descriptor_type (or VALVE).
|
||||
564.827:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
564.827:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
564.827:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
564.827:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
564.827:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
564.827:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
564.827:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
564.839:00dc:013c:info:vkd3d-proton:d3d12_device_caps_init_shader_model: Enabling support for SM 6.6.
|
||||
564.839:00dc:013c:fixme:vkd3d-proton:d3d12_device_caps_init_feature_options1: TotalLaneCount = 2432, may be inaccurate.
|
||||
564.839:00dc:013c:info:vkd3d-proton:d3d12_device_determine_ray_tracing_tier: DXR support enabled.
|
||||
564.840:00dc:013c:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Remapping VKD3D_SHADER_CACHE to: vkd3d-proton.cache.
|
||||
564.840:00dc:013c:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Attempting to load disk cache from: vkd3d-proton.cache.
|
||||
564.843:00dc:0154:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Performing async setup of stream archive ...
|
||||
564.844:00dc:0154:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_merge: Promoting write cache to read cache. No need to merge any disk caches.
|
||||
564.844:00dc:0154:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Merging pipeline libraries took 1.012 ms.
|
||||
564.845:00dc:0154:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Mapping read-only cache took 0.607 ms.
|
||||
564.845:00dc:0154:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Parsing stream archive took 0.370 ms.
|
||||
564.845:00dc:0154:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Done performing async setup of stream archive.
|
||||
564.903:00dc:013c:info:vkd3d-proton:vkd3d_get_vk_version: vkd3d-proton - applicationVersion: 3.0.1.
|
||||
564.903:00dc:013c:info:vkd3d-proton:vkd3d_instance_init: vkd3d-proton - build: 3b10bd7a7ec6a73.
|
||||
564.946:00dc:013c:info:vkd3d-proton:vkd3d_init_device_caps: Not all relevant pipeline stages are supported by EXT_dgc. Skipping.
|
||||
565.065:00dc:013c:info:vkd3d-proton:vkd3d_memory_info_decide_hvv_usage: Topology: Device heaps are split. Assuming small BAR situation.
|
||||
565.065:00dc:013c:info:vkd3d-proton:vkd3d_memory_info_upload_hvv_memory_properties: Topology: HVV usage is not allowed, using HOST_COHERENT for UPLOAD.
|
||||
565.066:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_get_bindless_flags: Device does not support VK_EXT_mutable_descriptor_type (or VALVE).
|
||||
565.066:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
565.066:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
565.066:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
565.066:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
565.066:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
565.066:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
565.066:00dc:013c:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
565.067:00dc:013c:info:vkd3d-proton:d3d12_device_caps_init_shader_model: Enabling support for SM 6.6.
|
||||
565.067:00dc:013c:fixme:vkd3d-proton:d3d12_device_caps_init_feature_options1: TotalLaneCount = 2432, may be inaccurate.
|
||||
565.067:00dc:013c:info:vkd3d-proton:d3d12_device_determine_ray_tracing_tier: DXR support enabled.
|
||||
565.067:00dc:013c:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Remapping VKD3D_SHADER_CACHE to: vkd3d-proton.cache.
|
||||
565.067:00dc:013c:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Attempting to load disk cache from: vkd3d-proton.cache.
|
||||
565.068:00dc:015c:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Performing async setup of stream archive ...
|
||||
565.068:00dc:015c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_merge: No write cache exists. No need to merge any disk caches.
|
||||
565.068:00dc:015c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Merging pipeline libraries took 0.136 ms.
|
||||
565.068:00dc:015c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Mapping read-only cache took 0.221 ms.
|
||||
565.069:00dc:015c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Parsing stream archive took 0.031 ms.
|
||||
565.069:00dc:015c:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Done performing async setup of stream archive.
|
||||
565.075:00dc:013c:fixme:vkd3d-proton:d3d12_command_queue_init: Ignoring priority 0x64.
|
||||
warn: DXGIGetDebugInterface1: Stub
|
||||
info: DXGI: Hiding actual GPU, reporting:
|
||||
info: vendor ID: 0x1002
|
||||
info: device ID: 0x73df
|
||||
565.173:00dc:00e0:info:vkd3d-proton:dxgi_vk_swap_chain_init: Creating swapchain (1280 x 720), BufferCount = 3.
|
||||
565.194:00dc:00e0:info:vkd3d-proton:dxgi_vk_swap_chain_init_sync_objects: Ensure maximum latency of 3 frames with KHR_present_wait.
|
||||
565.195:00dc:00e0:info:vkd3d-proton:dxgi_vk_swap_chain_init_sleep_state: Timer interval is 1.0 ms.
|
||||
warn: DXGI: MakeWindowAssociation: Ignoring flags
|
||||
warn: DxgiOutput::WaitForVBlank: Inaccurate
|
||||
info: Setting timer interval to 1000 us
|
||||
565.773:00dc:0164:info:vkd3d-proton:dxgi_vk_swap_chain_recreate_swapchain_in_present_task: Got 3 swapchain images.
|
||||
566.349:00dc:016c:fixme:vkd3d-proton:vkd3d_texture_view_desc_fixup: Remapping 2D to 2D_ARRAY. Needs Vulkan spec tightening to match D3D12 properly.
|
||||
566.387:00dc:0164:info:vkd3d-proton:dxgi_vk_swap_chain_recreate_swapchain_in_present_task: Got 3 swapchain images.
|
||||
@@ -0,0 +1,89 @@
|
||||
warn: CreateDXGIFactory2: Ignoring flags
|
||||
info: Game: xenia_canary.exe
|
||||
info: DXVK: v2.7.1
|
||||
info: Build: x86_64 gcc 15.1.0
|
||||
info: Vulkan: Found vkGetInstanceProcAddr in winevulkan.dll @ 0x6ffffbfb4000
|
||||
info: Extension providers:
|
||||
info: Platform WSI
|
||||
info: OpenVR
|
||||
info: OpenVR: could not open registry key, status 2
|
||||
info: OpenVR: Failed to locate module
|
||||
info: OpenXR
|
||||
info: Enabled instance extensions:
|
||||
info: VK_EXT_surface_maintenance1
|
||||
info: VK_KHR_get_surface_capabilities2
|
||||
info: VK_KHR_surface
|
||||
info: VK_KHR_win32_surface
|
||||
info: Found device: NVIDIA GeForce GTX 1070 Ti (NVIDIA 580.159.3)
|
||||
info: Found device: llvmpipe (LLVM 20.1.2, 256 bits) (llvmpipe 25.2.8)
|
||||
info: Skipping: Software driver
|
||||
info: DXGI: Hiding actual GPU, reporting:
|
||||
info: vendor ID: 0x1002
|
||||
info: device ID: 0x73df
|
||||
warn: DxgiAdapter::QueryInterface: Unknown interface query
|
||||
warn: f0db4c7f-fe5a-42a2-bd62-f2a6cf6fc83e
|
||||
805.907:00d0:0124:info:vkd3d-proton:vkd3d_instance_apply_application_workarounds: Program name: "xenia_canary.exe" (hash: c099ade372da5277)
|
||||
805.907:00d0:0124:info:vkd3d-proton:vkd3d_instance_deduce_config_flags_from_environment: shader_cache is used, global_pipeline_cache is enforced.
|
||||
805.907:00d0:0124:info:vkd3d-proton:vkd3d_config_flags_init_once: VKD3D_CONFIG=''.
|
||||
805.910:00d0:0124:info:vkd3d-proton:vkd3d_get_vk_version: vkd3d-proton - applicationVersion: 3.0.1.
|
||||
805.910:00d0:0124:info:vkd3d-proton:vkd3d_instance_init: vkd3d-proton - build: 3b10bd7a7ec6a73.
|
||||
805.955:00d0:0124:info:vkd3d-proton:vkd3d_init_device_caps: Not all relevant pipeline stages are supported by EXT_dgc. Skipping.
|
||||
806.100:00d0:0124:info:vkd3d-proton:vkd3d_memory_info_decide_hvv_usage: Topology: Device heaps are split. Assuming small BAR situation.
|
||||
806.100:00d0:0124:info:vkd3d-proton:vkd3d_memory_info_upload_hvv_memory_properties: Topology: HVV usage is not allowed, using HOST_COHERENT for UPLOAD.
|
||||
806.101:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_get_bindless_flags: Device does not support VK_EXT_mutable_descriptor_type (or VALVE).
|
||||
806.101:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
806.101:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
806.101:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
806.101:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
806.101:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
806.101:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
806.101:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
806.105:00d0:0124:info:vkd3d-proton:d3d12_device_caps_init_shader_model: Enabling support for SM 6.6.
|
||||
806.105:00d0:0124:fixme:vkd3d-proton:d3d12_device_caps_init_feature_options1: TotalLaneCount = 2432, may be inaccurate.
|
||||
806.105:00d0:0124:info:vkd3d-proton:d3d12_device_determine_ray_tracing_tier: DXR support enabled.
|
||||
806.105:00d0:0124:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Remapping VKD3D_SHADER_CACHE to: vkd3d-proton.cache.
|
||||
806.105:00d0:0124:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Attempting to load disk cache from: vkd3d-proton.cache.
|
||||
806.106:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Performing async setup of stream archive ...
|
||||
806.106:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_merge: No write cache exists. No need to merge any disk caches.
|
||||
806.106:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Merging pipeline libraries took 0.161 ms.
|
||||
806.107:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Mapping read-only cache took 0.185 ms.
|
||||
806.107:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Parsing stream archive took 0.028 ms.
|
||||
806.107:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Done performing async setup of stream archive.
|
||||
806.154:00d0:0124:info:vkd3d-proton:vkd3d_get_vk_version: vkd3d-proton - applicationVersion: 3.0.1.
|
||||
806.154:00d0:0124:info:vkd3d-proton:vkd3d_instance_init: vkd3d-proton - build: 3b10bd7a7ec6a73.
|
||||
806.197:00d0:0124:info:vkd3d-proton:vkd3d_init_device_caps: Not all relevant pipeline stages are supported by EXT_dgc. Skipping.
|
||||
806.310:00d0:0124:info:vkd3d-proton:vkd3d_memory_info_decide_hvv_usage: Topology: Device heaps are split. Assuming small BAR situation.
|
||||
806.310:00d0:0124:info:vkd3d-proton:vkd3d_memory_info_upload_hvv_memory_properties: Topology: HVV usage is not allowed, using HOST_COHERENT for UPLOAD.
|
||||
806.310:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_get_bindless_flags: Device does not support VK_EXT_mutable_descriptor_type (or VALVE).
|
||||
806.310:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
806.310:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
806.310:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
806.310:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
806.310:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
806.310:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
806.310:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
806.312:00d0:0124:info:vkd3d-proton:d3d12_device_caps_init_shader_model: Enabling support for SM 6.6.
|
||||
806.312:00d0:0124:fixme:vkd3d-proton:d3d12_device_caps_init_feature_options1: TotalLaneCount = 2432, may be inaccurate.
|
||||
806.312:00d0:0124:info:vkd3d-proton:d3d12_device_determine_ray_tracing_tier: DXR support enabled.
|
||||
806.312:00d0:0124:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Remapping VKD3D_SHADER_CACHE to: vkd3d-proton.cache.
|
||||
806.312:00d0:0124:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Attempting to load disk cache from: vkd3d-proton.cache.
|
||||
806.313:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Performing async setup of stream archive ...
|
||||
806.313:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_merge: No write cache exists. No need to merge any disk caches.
|
||||
806.313:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Merging pipeline libraries took 0.156 ms.
|
||||
806.314:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Mapping read-only cache took 0.659 ms.
|
||||
806.314:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Parsing stream archive took 0.035 ms.
|
||||
806.314:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Done performing async setup of stream archive.
|
||||
806.319:00d0:0124:fixme:vkd3d-proton:d3d12_command_queue_init: Ignoring priority 0x64.
|
||||
warn: DXGIGetDebugInterface1: Stub
|
||||
info: DXGI: Hiding actual GPU, reporting:
|
||||
info: vendor ID: 0x1002
|
||||
info: device ID: 0x73df
|
||||
806.408:00d0:00d4:info:vkd3d-proton:dxgi_vk_swap_chain_init: Creating swapchain (1280 x 720), BufferCount = 3.
|
||||
806.422:00d0:00d4:info:vkd3d-proton:dxgi_vk_swap_chain_init_sync_objects: Ensure maximum latency of 3 frames with KHR_present_wait.
|
||||
806.423:00d0:00d4:info:vkd3d-proton:dxgi_vk_swap_chain_init_sleep_state: Timer interval is 1.0 ms.
|
||||
warn: DXGI: MakeWindowAssociation: Ignoring flags
|
||||
warn: DxgiOutput::WaitForVBlank: Inaccurate
|
||||
info: Setting timer interval to 1000 us
|
||||
806.948:00d0:014c:info:vkd3d-proton:dxgi_vk_swap_chain_recreate_swapchain_in_present_task: Got 3 swapchain images.
|
||||
807.499:00d0:0154:fixme:vkd3d-proton:vkd3d_texture_view_desc_fixup: Remapping 2D to 2D_ARRAY. Needs Vulkan spec tightening to match D3D12 properly.
|
||||
807.521:00d0:014c:info:vkd3d-proton:dxgi_vk_swap_chain_recreate_swapchain_in_present_task: Got 3 swapchain images.
|
||||
@@ -0,0 +1,89 @@
|
||||
warn: CreateDXGIFactory2: Ignoring flags
|
||||
info: Game: xenia_canary.exe
|
||||
info: DXVK: v2.7.1
|
||||
info: Build: x86_64 gcc 15.1.0
|
||||
info: Vulkan: Found vkGetInstanceProcAddr in winevulkan.dll @ 0x6ffffbfb4000
|
||||
info: Extension providers:
|
||||
info: Platform WSI
|
||||
info: OpenVR
|
||||
info: OpenVR: could not open registry key, status 2
|
||||
info: OpenVR: Failed to locate module
|
||||
info: OpenXR
|
||||
info: Enabled instance extensions:
|
||||
info: VK_EXT_surface_maintenance1
|
||||
info: VK_KHR_get_surface_capabilities2
|
||||
info: VK_KHR_surface
|
||||
info: VK_KHR_win32_surface
|
||||
info: Found device: NVIDIA GeForce GTX 1070 Ti (NVIDIA 580.159.3)
|
||||
info: Found device: llvmpipe (LLVM 20.1.2, 256 bits) (llvmpipe 25.2.8)
|
||||
info: Skipping: Software driver
|
||||
info: DXGI: Hiding actual GPU, reporting:
|
||||
info: vendor ID: 0x1002
|
||||
info: device ID: 0x73df
|
||||
warn: DxgiAdapter::QueryInterface: Unknown interface query
|
||||
warn: f0db4c7f-fe5a-42a2-bd62-f2a6cf6fc83e
|
||||
893.096:00d4:0128:info:vkd3d-proton:vkd3d_instance_apply_application_workarounds: Program name: "xenia_canary.exe" (hash: c099ade372da5277)
|
||||
893.096:00d4:0128:info:vkd3d-proton:vkd3d_instance_deduce_config_flags_from_environment: shader_cache is used, global_pipeline_cache is enforced.
|
||||
893.096:00d4:0128:info:vkd3d-proton:vkd3d_config_flags_init_once: VKD3D_CONFIG=''.
|
||||
893.099:00d4:0128:info:vkd3d-proton:vkd3d_get_vk_version: vkd3d-proton - applicationVersion: 3.0.1.
|
||||
893.099:00d4:0128:info:vkd3d-proton:vkd3d_instance_init: vkd3d-proton - build: 3b10bd7a7ec6a73.
|
||||
893.145:00d4:0128:info:vkd3d-proton:vkd3d_init_device_caps: Not all relevant pipeline stages are supported by EXT_dgc. Skipping.
|
||||
893.308:00d4:0128:info:vkd3d-proton:vkd3d_memory_info_decide_hvv_usage: Topology: Device heaps are split. Assuming small BAR situation.
|
||||
893.308:00d4:0128:info:vkd3d-proton:vkd3d_memory_info_upload_hvv_memory_properties: Topology: HVV usage is not allowed, using HOST_COHERENT for UPLOAD.
|
||||
893.308:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_get_bindless_flags: Device does not support VK_EXT_mutable_descriptor_type (or VALVE).
|
||||
893.308:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
893.308:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
893.308:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
893.308:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
893.308:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
893.308:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
893.308:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
893.310:00d4:0128:info:vkd3d-proton:d3d12_device_caps_init_shader_model: Enabling support for SM 6.6.
|
||||
893.310:00d4:0128:fixme:vkd3d-proton:d3d12_device_caps_init_feature_options1: TotalLaneCount = 2432, may be inaccurate.
|
||||
893.310:00d4:0128:info:vkd3d-proton:d3d12_device_determine_ray_tracing_tier: DXR support enabled.
|
||||
893.310:00d4:0128:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Remapping VKD3D_SHADER_CACHE to: vkd3d-proton.cache.
|
||||
893.310:00d4:0128:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Attempting to load disk cache from: vkd3d-proton.cache.
|
||||
893.311:00d4:0140:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Performing async setup of stream archive ...
|
||||
893.311:00d4:0140:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_merge: No write cache exists. No need to merge any disk caches.
|
||||
893.311:00d4:0140:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Merging pipeline libraries took 0.187 ms.
|
||||
893.312:00d4:0140:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Mapping read-only cache took 0.161 ms.
|
||||
893.312:00d4:0140:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Parsing stream archive took 0.040 ms.
|
||||
893.312:00d4:0140:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Done performing async setup of stream archive.
|
||||
893.360:00d4:0128:info:vkd3d-proton:vkd3d_get_vk_version: vkd3d-proton - applicationVersion: 3.0.1.
|
||||
893.360:00d4:0128:info:vkd3d-proton:vkd3d_instance_init: vkd3d-proton - build: 3b10bd7a7ec6a73.
|
||||
893.405:00d4:0128:info:vkd3d-proton:vkd3d_init_device_caps: Not all relevant pipeline stages are supported by EXT_dgc. Skipping.
|
||||
893.520:00d4:0128:info:vkd3d-proton:vkd3d_memory_info_decide_hvv_usage: Topology: Device heaps are split. Assuming small BAR situation.
|
||||
893.520:00d4:0128:info:vkd3d-proton:vkd3d_memory_info_upload_hvv_memory_properties: Topology: HVV usage is not allowed, using HOST_COHERENT for UPLOAD.
|
||||
893.520:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_get_bindless_flags: Device does not support VK_EXT_mutable_descriptor_type (or VALVE).
|
||||
893.520:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
893.520:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
893.520:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
893.520:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
893.520:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
893.520:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
893.520:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
893.522:00d4:0128:info:vkd3d-proton:d3d12_device_caps_init_shader_model: Enabling support for SM 6.6.
|
||||
893.522:00d4:0128:fixme:vkd3d-proton:d3d12_device_caps_init_feature_options1: TotalLaneCount = 2432, may be inaccurate.
|
||||
893.522:00d4:0128:info:vkd3d-proton:d3d12_device_determine_ray_tracing_tier: DXR support enabled.
|
||||
893.522:00d4:0128:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Remapping VKD3D_SHADER_CACHE to: vkd3d-proton.cache.
|
||||
893.522:00d4:0128:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Attempting to load disk cache from: vkd3d-proton.cache.
|
||||
893.523:00d4:0148:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Performing async setup of stream archive ...
|
||||
893.523:00d4:0148:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_merge: No write cache exists. No need to merge any disk caches.
|
||||
893.523:00d4:0148:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Merging pipeline libraries took 0.153 ms.
|
||||
893.523:00d4:0148:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Mapping read-only cache took 0.199 ms.
|
||||
893.523:00d4:0148:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Parsing stream archive took 0.034 ms.
|
||||
893.523:00d4:0148:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Done performing async setup of stream archive.
|
||||
893.529:00d4:0128:fixme:vkd3d-proton:d3d12_command_queue_init: Ignoring priority 0x64.
|
||||
warn: DXGIGetDebugInterface1: Stub
|
||||
info: DXGI: Hiding actual GPU, reporting:
|
||||
info: vendor ID: 0x1002
|
||||
info: device ID: 0x73df
|
||||
893.622:00d4:00d8:info:vkd3d-proton:dxgi_vk_swap_chain_init: Creating swapchain (1280 x 720), BufferCount = 3.
|
||||
893.631:00d4:00d8:info:vkd3d-proton:dxgi_vk_swap_chain_init_sync_objects: Ensure maximum latency of 3 frames with KHR_present_wait.
|
||||
893.632:00d4:00d8:info:vkd3d-proton:dxgi_vk_swap_chain_init_sleep_state: Timer interval is 1.0 ms.
|
||||
warn: DXGI: MakeWindowAssociation: Ignoring flags
|
||||
warn: DxgiOutput::WaitForVBlank: Inaccurate
|
||||
info: Setting timer interval to 1000 us
|
||||
894.203:00d4:0150:info:vkd3d-proton:dxgi_vk_swap_chain_recreate_swapchain_in_present_task: Got 3 swapchain images.
|
||||
894.705:00d4:0158:fixme:vkd3d-proton:vkd3d_texture_view_desc_fixup: Remapping 2D to 2D_ARRAY. Needs Vulkan spec tightening to match D3D12 properly.
|
||||
894.727:00d4:0150:info:vkd3d-proton:dxgi_vk_swap_chain_recreate_swapchain_in_present_task: Got 3 swapchain images.
|
||||
@@ -0,0 +1,89 @@
|
||||
warn: CreateDXGIFactory2: Ignoring flags
|
||||
info: Game: xenia_canary.exe
|
||||
info: DXVK: v2.7.1
|
||||
info: Build: x86_64 gcc 15.1.0
|
||||
info: Vulkan: Found vkGetInstanceProcAddr in winevulkan.dll @ 0x6ffffbfb4000
|
||||
info: Extension providers:
|
||||
info: Platform WSI
|
||||
info: OpenVR
|
||||
info: OpenVR: could not open registry key, status 2
|
||||
info: OpenVR: Failed to locate module
|
||||
info: OpenXR
|
||||
info: Enabled instance extensions:
|
||||
info: VK_EXT_surface_maintenance1
|
||||
info: VK_KHR_get_surface_capabilities2
|
||||
info: VK_KHR_surface
|
||||
info: VK_KHR_win32_surface
|
||||
info: Found device: NVIDIA GeForce GTX 1070 Ti (NVIDIA 580.159.3)
|
||||
info: Found device: llvmpipe (LLVM 20.1.2, 256 bits) (llvmpipe 25.2.8)
|
||||
info: Skipping: Software driver
|
||||
info: DXGI: Hiding actual GPU, reporting:
|
||||
info: vendor ID: 0x1002
|
||||
info: device ID: 0x73df
|
||||
warn: DxgiAdapter::QueryInterface: Unknown interface query
|
||||
warn: f0db4c7f-fe5a-42a2-bd62-f2a6cf6fc83e
|
||||
956.778:00d0:0124:info:vkd3d-proton:vkd3d_instance_apply_application_workarounds: Program name: "xenia_canary.exe" (hash: c099ade372da5277)
|
||||
956.778:00d0:0124:info:vkd3d-proton:vkd3d_instance_deduce_config_flags_from_environment: shader_cache is used, global_pipeline_cache is enforced.
|
||||
956.778:00d0:0124:info:vkd3d-proton:vkd3d_config_flags_init_once: VKD3D_CONFIG=''.
|
||||
956.781:00d0:0124:info:vkd3d-proton:vkd3d_get_vk_version: vkd3d-proton - applicationVersion: 3.0.1.
|
||||
956.781:00d0:0124:info:vkd3d-proton:vkd3d_instance_init: vkd3d-proton - build: 3b10bd7a7ec6a73.
|
||||
956.826:00d0:0124:info:vkd3d-proton:vkd3d_init_device_caps: Not all relevant pipeline stages are supported by EXT_dgc. Skipping.
|
||||
956.983:00d0:0124:info:vkd3d-proton:vkd3d_memory_info_decide_hvv_usage: Topology: Device heaps are split. Assuming small BAR situation.
|
||||
956.983:00d0:0124:info:vkd3d-proton:vkd3d_memory_info_upload_hvv_memory_properties: Topology: HVV usage is not allowed, using HOST_COHERENT for UPLOAD.
|
||||
956.983:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_get_bindless_flags: Device does not support VK_EXT_mutable_descriptor_type (or VALVE).
|
||||
956.983:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
956.983:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
956.983:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
956.983:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
956.983:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
956.983:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
956.983:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
956.985:00d0:0124:info:vkd3d-proton:d3d12_device_caps_init_shader_model: Enabling support for SM 6.6.
|
||||
956.985:00d0:0124:fixme:vkd3d-proton:d3d12_device_caps_init_feature_options1: TotalLaneCount = 2432, may be inaccurate.
|
||||
956.985:00d0:0124:info:vkd3d-proton:d3d12_device_determine_ray_tracing_tier: DXR support enabled.
|
||||
956.985:00d0:0124:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Remapping VKD3D_SHADER_CACHE to: vkd3d-proton.cache.
|
||||
956.985:00d0:0124:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Attempting to load disk cache from: vkd3d-proton.cache.
|
||||
956.985:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Performing async setup of stream archive ...
|
||||
956.986:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_merge: No write cache exists. No need to merge any disk caches.
|
||||
956.986:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Merging pipeline libraries took 0.171 ms.
|
||||
956.986:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Mapping read-only cache took 0.269 ms.
|
||||
956.986:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Parsing stream archive took 0.028 ms.
|
||||
956.986:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Done performing async setup of stream archive.
|
||||
957.031:00d0:0124:info:vkd3d-proton:vkd3d_get_vk_version: vkd3d-proton - applicationVersion: 3.0.1.
|
||||
957.031:00d0:0124:info:vkd3d-proton:vkd3d_instance_init: vkd3d-proton - build: 3b10bd7a7ec6a73.
|
||||
957.075:00d0:0124:info:vkd3d-proton:vkd3d_init_device_caps: Not all relevant pipeline stages are supported by EXT_dgc. Skipping.
|
||||
957.186:00d0:0124:info:vkd3d-proton:vkd3d_memory_info_decide_hvv_usage: Topology: Device heaps are split. Assuming small BAR situation.
|
||||
957.186:00d0:0124:info:vkd3d-proton:vkd3d_memory_info_upload_hvv_memory_properties: Topology: HVV usage is not allowed, using HOST_COHERENT for UPLOAD.
|
||||
957.186:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_get_bindless_flags: Device does not support VK_EXT_mutable_descriptor_type (or VALVE).
|
||||
957.186:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
957.186:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
957.186:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
957.186:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
957.186:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
957.186:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
957.186:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
957.188:00d0:0124:info:vkd3d-proton:d3d12_device_caps_init_shader_model: Enabling support for SM 6.6.
|
||||
957.188:00d0:0124:fixme:vkd3d-proton:d3d12_device_caps_init_feature_options1: TotalLaneCount = 2432, may be inaccurate.
|
||||
957.188:00d0:0124:info:vkd3d-proton:d3d12_device_determine_ray_tracing_tier: DXR support enabled.
|
||||
957.188:00d0:0124:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Remapping VKD3D_SHADER_CACHE to: vkd3d-proton.cache.
|
||||
957.188:00d0:0124:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Attempting to load disk cache from: vkd3d-proton.cache.
|
||||
957.188:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Performing async setup of stream archive ...
|
||||
957.188:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_merge: No write cache exists. No need to merge any disk caches.
|
||||
957.189:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Merging pipeline libraries took 0.172 ms.
|
||||
957.189:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Mapping read-only cache took 0.231 ms.
|
||||
957.189:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Parsing stream archive took 0.029 ms.
|
||||
957.189:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Done performing async setup of stream archive.
|
||||
957.195:00d0:0124:fixme:vkd3d-proton:d3d12_command_queue_init: Ignoring priority 0x64.
|
||||
warn: DXGIGetDebugInterface1: Stub
|
||||
info: DXGI: Hiding actual GPU, reporting:
|
||||
info: vendor ID: 0x1002
|
||||
info: device ID: 0x73df
|
||||
957.285:00d0:00d4:info:vkd3d-proton:dxgi_vk_swap_chain_init: Creating swapchain (1280 x 720), BufferCount = 3.
|
||||
957.295:00d0:00d4:info:vkd3d-proton:dxgi_vk_swap_chain_init_sync_objects: Ensure maximum latency of 3 frames with KHR_present_wait.
|
||||
957.295:00d0:00d4:info:vkd3d-proton:dxgi_vk_swap_chain_init_sleep_state: Timer interval is 1.0 ms.
|
||||
warn: DXGI: MakeWindowAssociation: Ignoring flags
|
||||
warn: DxgiOutput::WaitForVBlank: Inaccurate
|
||||
info: Setting timer interval to 1000 us
|
||||
957.806:00d0:014c:info:vkd3d-proton:dxgi_vk_swap_chain_recreate_swapchain_in_present_task: Got 3 swapchain images.
|
||||
958.343:00d0:0154:fixme:vkd3d-proton:vkd3d_texture_view_desc_fixup: Remapping 2D to 2D_ARRAY. Needs Vulkan spec tightening to match D3D12 properly.
|
||||
958.382:00d0:014c:info:vkd3d-proton:dxgi_vk_swap_chain_recreate_swapchain_in_present_task: Got 3 swapchain images.
|
||||
@@ -0,0 +1,89 @@
|
||||
warn: CreateDXGIFactory2: Ignoring flags
|
||||
info: Game: xenia_canary.exe
|
||||
info: DXVK: v2.7.1
|
||||
info: Build: x86_64 gcc 15.1.0
|
||||
info: Vulkan: Found vkGetInstanceProcAddr in winevulkan.dll @ 0x6ffffbfb4000
|
||||
info: Extension providers:
|
||||
info: Platform WSI
|
||||
info: OpenVR
|
||||
info: OpenVR: could not open registry key, status 2
|
||||
info: OpenVR: Failed to locate module
|
||||
info: OpenXR
|
||||
info: Enabled instance extensions:
|
||||
info: VK_EXT_surface_maintenance1
|
||||
info: VK_KHR_get_surface_capabilities2
|
||||
info: VK_KHR_surface
|
||||
info: VK_KHR_win32_surface
|
||||
info: Found device: NVIDIA GeForce GTX 1070 Ti (NVIDIA 580.159.3)
|
||||
info: Found device: llvmpipe (LLVM 20.1.2, 256 bits) (llvmpipe 25.2.8)
|
||||
info: Skipping: Software driver
|
||||
info: DXGI: Hiding actual GPU, reporting:
|
||||
info: vendor ID: 0x1002
|
||||
info: device ID: 0x73df
|
||||
warn: DxgiAdapter::QueryInterface: Unknown interface query
|
||||
warn: f0db4c7f-fe5a-42a2-bd62-f2a6cf6fc83e
|
||||
1217.108:00d4:0128:info:vkd3d-proton:vkd3d_instance_apply_application_workarounds: Program name: "xenia_canary.exe" (hash: c099ade372da5277)
|
||||
1217.108:00d4:0128:info:vkd3d-proton:vkd3d_instance_deduce_config_flags_from_environment: shader_cache is used, global_pipeline_cache is enforced.
|
||||
1217.108:00d4:0128:info:vkd3d-proton:vkd3d_config_flags_init_once: VKD3D_CONFIG=''.
|
||||
1217.111:00d4:0128:info:vkd3d-proton:vkd3d_get_vk_version: vkd3d-proton - applicationVersion: 3.0.1.
|
||||
1217.111:00d4:0128:info:vkd3d-proton:vkd3d_instance_init: vkd3d-proton - build: 3b10bd7a7ec6a73.
|
||||
1217.160:00d4:0128:info:vkd3d-proton:vkd3d_init_device_caps: Not all relevant pipeline stages are supported by EXT_dgc. Skipping.
|
||||
1217.307:00d4:0128:info:vkd3d-proton:vkd3d_memory_info_decide_hvv_usage: Topology: Device heaps are split. Assuming small BAR situation.
|
||||
1217.307:00d4:0128:info:vkd3d-proton:vkd3d_memory_info_upload_hvv_memory_properties: Topology: HVV usage is not allowed, using HOST_COHERENT for UPLOAD.
|
||||
1217.307:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_get_bindless_flags: Device does not support VK_EXT_mutable_descriptor_type (or VALVE).
|
||||
1217.307:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1217.307:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1217.307:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1217.307:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1217.307:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1217.307:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1217.307:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1217.309:00d4:0128:info:vkd3d-proton:d3d12_device_caps_init_shader_model: Enabling support for SM 6.6.
|
||||
1217.309:00d4:0128:fixme:vkd3d-proton:d3d12_device_caps_init_feature_options1: TotalLaneCount = 2432, may be inaccurate.
|
||||
1217.309:00d4:0128:info:vkd3d-proton:d3d12_device_determine_ray_tracing_tier: DXR support enabled.
|
||||
1217.309:00d4:0128:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Remapping VKD3D_SHADER_CACHE to: vkd3d-proton.cache.
|
||||
1217.309:00d4:0128:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Attempting to load disk cache from: vkd3d-proton.cache.
|
||||
1217.310:00d4:0140:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Performing async setup of stream archive ...
|
||||
1217.310:00d4:0140:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_merge: No write cache exists. No need to merge any disk caches.
|
||||
1217.310:00d4:0140:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Merging pipeline libraries took 0.166 ms.
|
||||
1217.310:00d4:0140:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Mapping read-only cache took 0.173 ms.
|
||||
1217.310:00d4:0140:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Parsing stream archive took 0.031 ms.
|
||||
1217.310:00d4:0140:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Done performing async setup of stream archive.
|
||||
1217.360:00d4:0128:info:vkd3d-proton:vkd3d_get_vk_version: vkd3d-proton - applicationVersion: 3.0.1.
|
||||
1217.360:00d4:0128:info:vkd3d-proton:vkd3d_instance_init: vkd3d-proton - build: 3b10bd7a7ec6a73.
|
||||
1217.403:00d4:0128:info:vkd3d-proton:vkd3d_init_device_caps: Not all relevant pipeline stages are supported by EXT_dgc. Skipping.
|
||||
1217.515:00d4:0128:info:vkd3d-proton:vkd3d_memory_info_decide_hvv_usage: Topology: Device heaps are split. Assuming small BAR situation.
|
||||
1217.515:00d4:0128:info:vkd3d-proton:vkd3d_memory_info_upload_hvv_memory_properties: Topology: HVV usage is not allowed, using HOST_COHERENT for UPLOAD.
|
||||
1217.515:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_get_bindless_flags: Device does not support VK_EXT_mutable_descriptor_type (or VALVE).
|
||||
1217.515:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1217.515:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1217.515:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1217.515:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1217.515:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1217.515:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1217.515:00d4:0128:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1217.516:00d4:0128:info:vkd3d-proton:d3d12_device_caps_init_shader_model: Enabling support for SM 6.6.
|
||||
1217.516:00d4:0128:fixme:vkd3d-proton:d3d12_device_caps_init_feature_options1: TotalLaneCount = 2432, may be inaccurate.
|
||||
1217.516:00d4:0128:info:vkd3d-proton:d3d12_device_determine_ray_tracing_tier: DXR support enabled.
|
||||
1217.516:00d4:0128:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Remapping VKD3D_SHADER_CACHE to: vkd3d-proton.cache.
|
||||
1217.516:00d4:0128:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Attempting to load disk cache from: vkd3d-proton.cache.
|
||||
1217.517:00d4:0148:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Performing async setup of stream archive ...
|
||||
1217.517:00d4:0148:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_merge: No write cache exists. No need to merge any disk caches.
|
||||
1217.517:00d4:0148:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Merging pipeline libraries took 0.157 ms.
|
||||
1217.517:00d4:0148:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Mapping read-only cache took 0.208 ms.
|
||||
1217.518:00d4:0148:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Parsing stream archive took 0.032 ms.
|
||||
1217.518:00d4:0148:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Done performing async setup of stream archive.
|
||||
1217.524:00d4:0128:fixme:vkd3d-proton:d3d12_command_queue_init: Ignoring priority 0x64.
|
||||
warn: DXGIGetDebugInterface1: Stub
|
||||
info: DXGI: Hiding actual GPU, reporting:
|
||||
info: vendor ID: 0x1002
|
||||
info: device ID: 0x73df
|
||||
1217.612:00d4:00d8:info:vkd3d-proton:dxgi_vk_swap_chain_init: Creating swapchain (1280 x 720), BufferCount = 3.
|
||||
1217.622:00d4:00d8:info:vkd3d-proton:dxgi_vk_swap_chain_init_sync_objects: Ensure maximum latency of 3 frames with KHR_present_wait.
|
||||
1217.622:00d4:00d8:info:vkd3d-proton:dxgi_vk_swap_chain_init_sleep_state: Timer interval is 1.0 ms.
|
||||
warn: DXGI: MakeWindowAssociation: Ignoring flags
|
||||
warn: DxgiOutput::WaitForVBlank: Inaccurate
|
||||
info: Setting timer interval to 1000 us
|
||||
1218.136:00d4:0150:info:vkd3d-proton:dxgi_vk_swap_chain_recreate_swapchain_in_present_task: Got 3 swapchain images.
|
||||
1218.678:00d4:0158:fixme:vkd3d-proton:vkd3d_texture_view_desc_fixup: Remapping 2D to 2D_ARRAY. Needs Vulkan spec tightening to match D3D12 properly.
|
||||
1218.699:00d4:0150:info:vkd3d-proton:dxgi_vk_swap_chain_recreate_swapchain_in_present_task: Got 3 swapchain images.
|
||||
@@ -0,0 +1,89 @@
|
||||
warn: CreateDXGIFactory2: Ignoring flags
|
||||
info: Game: xenia_canary.exe
|
||||
info: DXVK: v2.7.1
|
||||
info: Build: x86_64 gcc 15.1.0
|
||||
info: Vulkan: Found vkGetInstanceProcAddr in winevulkan.dll @ 0x6ffffbfb4000
|
||||
info: Extension providers:
|
||||
info: Platform WSI
|
||||
info: OpenVR
|
||||
info: OpenVR: could not open registry key, status 2
|
||||
info: OpenVR: Failed to locate module
|
||||
info: OpenXR
|
||||
info: Enabled instance extensions:
|
||||
info: VK_EXT_surface_maintenance1
|
||||
info: VK_KHR_get_surface_capabilities2
|
||||
info: VK_KHR_surface
|
||||
info: VK_KHR_win32_surface
|
||||
info: Found device: NVIDIA GeForce GTX 1070 Ti (NVIDIA 580.159.3)
|
||||
info: Found device: llvmpipe (LLVM 20.1.2, 256 bits) (llvmpipe 25.2.8)
|
||||
info: Skipping: Software driver
|
||||
info: DXGI: Hiding actual GPU, reporting:
|
||||
info: vendor ID: 0x1002
|
||||
info: device ID: 0x73df
|
||||
warn: DxgiAdapter::QueryInterface: Unknown interface query
|
||||
warn: f0db4c7f-fe5a-42a2-bd62-f2a6cf6fc83e
|
||||
1413.916:00d0:0124:info:vkd3d-proton:vkd3d_instance_apply_application_workarounds: Program name: "xenia_canary.exe" (hash: c099ade372da5277)
|
||||
1413.916:00d0:0124:info:vkd3d-proton:vkd3d_instance_deduce_config_flags_from_environment: shader_cache is used, global_pipeline_cache is enforced.
|
||||
1413.916:00d0:0124:info:vkd3d-proton:vkd3d_config_flags_init_once: VKD3D_CONFIG=''.
|
||||
1413.919:00d0:0124:info:vkd3d-proton:vkd3d_get_vk_version: vkd3d-proton - applicationVersion: 3.0.1.
|
||||
1413.919:00d0:0124:info:vkd3d-proton:vkd3d_instance_init: vkd3d-proton - build: 3b10bd7a7ec6a73.
|
||||
1413.963:00d0:0124:info:vkd3d-proton:vkd3d_init_device_caps: Not all relevant pipeline stages are supported by EXT_dgc. Skipping.
|
||||
1414.109:00d0:0124:info:vkd3d-proton:vkd3d_memory_info_decide_hvv_usage: Topology: Device heaps are split. Assuming small BAR situation.
|
||||
1414.109:00d0:0124:info:vkd3d-proton:vkd3d_memory_info_upload_hvv_memory_properties: Topology: HVV usage is not allowed, using HOST_COHERENT for UPLOAD.
|
||||
1414.109:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_get_bindless_flags: Device does not support VK_EXT_mutable_descriptor_type (or VALVE).
|
||||
1414.109:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1414.109:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1414.109:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1414.109:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1414.109:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1414.109:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1414.109:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1414.111:00d0:0124:info:vkd3d-proton:d3d12_device_caps_init_shader_model: Enabling support for SM 6.6.
|
||||
1414.111:00d0:0124:fixme:vkd3d-proton:d3d12_device_caps_init_feature_options1: TotalLaneCount = 2432, may be inaccurate.
|
||||
1414.111:00d0:0124:info:vkd3d-proton:d3d12_device_determine_ray_tracing_tier: DXR support enabled.
|
||||
1414.111:00d0:0124:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Remapping VKD3D_SHADER_CACHE to: vkd3d-proton.cache.
|
||||
1414.111:00d0:0124:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Attempting to load disk cache from: vkd3d-proton.cache.
|
||||
1414.112:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Performing async setup of stream archive ...
|
||||
1414.112:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_merge: No write cache exists. No need to merge any disk caches.
|
||||
1414.112:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Merging pipeline libraries took 0.173 ms.
|
||||
1414.113:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Mapping read-only cache took 0.276 ms.
|
||||
1414.113:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Parsing stream archive took 0.029 ms.
|
||||
1414.113:00d0:013c:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Done performing async setup of stream archive.
|
||||
1414.157:00d0:0124:info:vkd3d-proton:vkd3d_get_vk_version: vkd3d-proton - applicationVersion: 3.0.1.
|
||||
1414.157:00d0:0124:info:vkd3d-proton:vkd3d_instance_init: vkd3d-proton - build: 3b10bd7a7ec6a73.
|
||||
1414.199:00d0:0124:info:vkd3d-proton:vkd3d_init_device_caps: Not all relevant pipeline stages are supported by EXT_dgc. Skipping.
|
||||
1414.310:00d0:0124:info:vkd3d-proton:vkd3d_memory_info_decide_hvv_usage: Topology: Device heaps are split. Assuming small BAR situation.
|
||||
1414.310:00d0:0124:info:vkd3d-proton:vkd3d_memory_info_upload_hvv_memory_properties: Topology: HVV usage is not allowed, using HOST_COHERENT for UPLOAD.
|
||||
1414.311:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_get_bindless_flags: Device does not support VK_EXT_mutable_descriptor_type (or VALVE).
|
||||
1414.311:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1414.311:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1414.311:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1414.311:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1414.311:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1414.311:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1414.311:00d0:0124:info:vkd3d-proton:vkd3d_bindless_state_add_binding: Device supports VK_EXT_descriptor_buffer!
|
||||
1414.312:00d0:0124:info:vkd3d-proton:d3d12_device_caps_init_shader_model: Enabling support for SM 6.6.
|
||||
1414.312:00d0:0124:fixme:vkd3d-proton:d3d12_device_caps_init_feature_options1: TotalLaneCount = 2432, may be inaccurate.
|
||||
1414.312:00d0:0124:info:vkd3d-proton:d3d12_device_determine_ray_tracing_tier: DXR support enabled.
|
||||
1414.312:00d0:0124:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Remapping VKD3D_SHADER_CACHE to: vkd3d-proton.cache.
|
||||
1414.312:00d0:0124:info:vkd3d-proton:vkd3d_pipeline_library_init_disk_cache: Attempting to load disk cache from: vkd3d-proton.cache.
|
||||
1414.313:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Performing async setup of stream archive ...
|
||||
1414.313:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_merge: No write cache exists. No need to merge any disk caches.
|
||||
1414.313:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Merging pipeline libraries took 0.158 ms.
|
||||
1414.313:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Mapping read-only cache took 0.256 ms.
|
||||
1414.313:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_cache_initial_setup: Parsing stream archive took 0.031 ms.
|
||||
1414.313:00d0:0144:info:vkd3d-proton:vkd3d_pipeline_library_disk_thread_main: Done performing async setup of stream archive.
|
||||
1414.319:00d0:0124:fixme:vkd3d-proton:d3d12_command_queue_init: Ignoring priority 0x64.
|
||||
warn: DXGIGetDebugInterface1: Stub
|
||||
info: DXGI: Hiding actual GPU, reporting:
|
||||
info: vendor ID: 0x1002
|
||||
info: device ID: 0x73df
|
||||
1414.406:00d0:00d4:info:vkd3d-proton:dxgi_vk_swap_chain_init: Creating swapchain (1280 x 720), BufferCount = 3.
|
||||
1414.416:00d0:00d4:info:vkd3d-proton:dxgi_vk_swap_chain_init_sync_objects: Ensure maximum latency of 3 frames with KHR_present_wait.
|
||||
1414.416:00d0:00d4:info:vkd3d-proton:dxgi_vk_swap_chain_init_sleep_state: Timer interval is 1.0 ms.
|
||||
warn: DXGI: MakeWindowAssociation: Ignoring flags
|
||||
warn: DxgiOutput::WaitForVBlank: Inaccurate
|
||||
info: Setting timer interval to 1000 us
|
||||
1414.927:00d0:014c:info:vkd3d-proton:dxgi_vk_swap_chain_recreate_swapchain_in_present_task: Got 3 swapchain images.
|
||||
1415.477:00d0:0154:fixme:vkd3d-proton:vkd3d_texture_view_desc_fixup: Remapping 2D to 2D_ARRAY. Needs Vulkan spec tightening to match D3D12 properly.
|
||||
1415.500:00d0:014c:info:vkd3d-proton:dxgi_vk_swap_chain_recreate_swapchain_in_present_task: Got 3 swapchain images.
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
47
audit-runs/iterate-2D-deferred-fixes/DEFERRED_FIXES.md
Normal file
47
audit-runs/iterate-2D-deferred-fixes/DEFERRED_FIXES.md
Normal file
@@ -0,0 +1,47 @@
|
||||
# iterate-2D Deferred Structural Fixes — Outcome
|
||||
|
||||
Branch `iterate-2D/subsystem-fixes`. After verification + the user's go-ahead:
|
||||
|
||||
## Issue 1 — 32-bit word-form ALU truncation (PPCBUG-020) — ✅ FIXED & LANDED
|
||||
Commit **341196a**. Confirmed load-bearing via runtime ours-vs-canary capture:
|
||||
Sylpheed's ms→LARGE_INTEGER converter `sub_824ACA88` (`clrldi; mulli r11,r11,-10000; std`)
|
||||
produced `0x00000000_FFFD8F00` in ours vs canary's correct `0xFFFFFFFF_FFFD8F00` for a 16 ms
|
||||
wait — a positive (absolute) timeout → ~26000× over-wait that froze the main frame loop.
|
||||
Fixed the 17 data-losing word-form ops (full 64-bit result, CA/OV/CR0 preserved byte-identical),
|
||||
updated 7 bug-asserting tests, re-baselined `sylpheed_n50m` (imports 40454→1790936), `sylpheed_n2m`
|
||||
unchanged. 660/660 + ignored oracle green; lockstep determinism preserved. Boot unwedged
|
||||
(parallel NtWaitForMultipleObjectsEx 94→30428; frozen worker/critical-section loops now run).
|
||||
VdSwap is still 1 — rendering progression needs the out-of-scope acd1656 fixes (nt_create_event
|
||||
polarity + 2.AF), not in this branch.
|
||||
|
||||
## Issue 2 — Memory page-size per-region collapse — DEFERRED (verified NOT load-bearing)
|
||||
Sylpheed requests `MmAllocatePhysicalMemoryEx` with flags=0, alignment(r8)=0 (default); ours returns
|
||||
self-consistent 4K-aligned addresses and boots. Ours has no 0xA0/0xC0/0xE0 physical-region model at
|
||||
all, so a faithful fix is a region-model rewrite that shifts every physical guest VA (golden-breaking,
|
||||
invalidates the audit-059 VA map) with no demonstrated boot benefit. A partial page-size-only change
|
||||
would shift VAs for zero correctness gain — do NOT do it piecemeal. Pursue only if a render-path
|
||||
struct is proven to depend on physical region/alignment.
|
||||
|
||||
## Issue 3 — Timing — LEFT (not load-bearing / determinism-coupled)
|
||||
- 3d DPC/APC: INERT — the only timer (NtSetTimerEx) passes a NULL APC routine; no
|
||||
NtQueueApcThread/KeInsertQueueDpc imported.
|
||||
- 3b timeout sign: was a SYMPTOM of Issue 1 (the "positive absolute" timeouts were mulli-corruption
|
||||
artifacts) — resolved by the Issue 1 fix.
|
||||
- 3a/3c timebase/skew: timebase = instruction-count IS the deterministic lockstep clock; must not
|
||||
become wallclock. 2.AF deadline-drain already present. Not load-bearing for Sylpheed.
|
||||
|
||||
## Issue 4 — VFS synthesized-success-on-miss — LEFT (risky / coupled to Issue 1 trajectory)
|
||||
The synthesis fallback handles a MIX (writable-partition probes partition0/Cache0 + a genuine disc
|
||||
miss dat/files.tbl, verified absent from the ISO). Canary doesn't fire XamShowDirtyDiscErrorUI during
|
||||
boot (the one "DirtyDisc" log hit is the import-table declaration). Not cleanly separable without
|
||||
heuristic disc-vs-partition routing. Re-verify on the corrected post-Issue-1 (and post-acd1656)
|
||||
trajectory before changing.
|
||||
|
||||
## Issue 5 — Mutant object — SKIPPED (verified unused)
|
||||
Sylpheed's XEX import table contains NO mutant symbols (NtCreateMutant/NtReleaseMutant/KeReleaseMutant/
|
||||
KeInitializeMutant/NtQueryMutant) — the game cannot call them; unimplemented=0 across boot. A correct
|
||||
implementation needs mutant hand-off semantics + an owner-type redesign (the existing
|
||||
`Mutex { owner: Option<u8> }` tracks a HW slot, not a thread) in the determinism-critical wait path,
|
||||
for code that never executes. Per the mandate's skip-if-unused criterion, left unimplemented. Can be
|
||||
added on request as a pure canary-parity / future-title feature (determinism-safe since no Sylpheed
|
||||
mutant ever exists at runtime).
|
||||
@@ -26,6 +26,14 @@ use xenia_xex::pe::PeSection;
|
||||
|
||||
use crate::demangle;
|
||||
|
||||
/// Maximum number of consecutive non-function slots tolerated inside an
|
||||
/// anchor-recovered vtable before the run is considered terminated. MSVC
|
||||
/// vtables can carry null / pure-virtual / unrecognised-thunk slots in their
|
||||
/// head or interior; a small budget lets those through without merging two
|
||||
/// physically-adjacent vtables. Kept small to avoid bridging the gap between
|
||||
/// distinct tables.
|
||||
const MAX_ANCHOR_GAP: usize = 2;
|
||||
|
||||
/// One detected vtable.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Vtable {
|
||||
@@ -56,6 +64,35 @@ pub fn analyze(
|
||||
image_base: u32,
|
||||
sections: &[PeSection],
|
||||
function_starts: &std::collections::BTreeSet<u32>,
|
||||
) -> Vec<Vtable> {
|
||||
analyze_with_anchors(pe, image_base, sections, function_starts, &std::collections::BTreeSet::new())
|
||||
}
|
||||
|
||||
/// Like [`analyze`], but additionally recovers vtables whose base address is
|
||||
/// known a-priori from a constructor vptr-write store (an "anchor"). The
|
||||
/// contiguity heuristic in pass 1 fragments any vtable whose head region
|
||||
/// contains words that don't resolve to recognised function entries (null /
|
||||
/// pure-virtual / unrecognised thunk slots); those vtables are never emitted
|
||||
/// and the downstream typed-dispatch resolver can't type objects of that
|
||||
/// class. An anchor is a *content-independent* vtable signal — the ctor
|
||||
/// literally installs `vtable_base` into `this+0` via
|
||||
/// `addis/addi (or lis/ori) → stw rX, 0(rThis)` — so for every anchor not
|
||||
/// already covered by a pass-1 run we synthesise a vtable starting at that
|
||||
/// base, reading the fnptr-array run while *tolerating* up to
|
||||
/// [`MAX_ANCHOR_GAP`] consecutive non-function slots before terminating.
|
||||
///
|
||||
/// `anchors` are absolute VAs of vtable bases (from
|
||||
/// [`scan_vptr_write_constants`]). Existing pass-1 vtables are kept unchanged
|
||||
/// (no regression): an anchor that already coincides with a detected vtable
|
||||
/// base is skipped, and an anchor that lands *inside* an existing run is also
|
||||
/// skipped (it's a sub-object pointer, not a fresh table).
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
|
||||
pub fn analyze_with_anchors(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
sections: &[PeSection],
|
||||
function_starts: &std::collections::BTreeSet<u32>,
|
||||
anchors: &std::collections::BTreeSet<u32>,
|
||||
) -> Vec<Vtable> {
|
||||
let started = std::time::Instant::now();
|
||||
// Sections we'll scan for vtable bodies.
|
||||
@@ -117,6 +154,120 @@ pub fn analyze(
|
||||
let _ = (va_start, va_end);
|
||||
}
|
||||
|
||||
// --- Anchor-driven recovery (vptr-write-anchored vtables) ---
|
||||
//
|
||||
// Build a coverage interval set from pass-1 runs so we don't re-emit a
|
||||
// table for an anchor that already lies within an extracted vtable.
|
||||
let mut covered: Vec<(u32, u32)> = candidates
|
||||
.iter()
|
||||
.map(|v| (v.address, v.address + v.length * 4))
|
||||
.collect();
|
||||
covered.sort_unstable();
|
||||
|
||||
let is_covered = |addr: u32, covered: &[(u32, u32)]| -> bool {
|
||||
covered.iter().any(|&(s, e)| addr >= s && addr < e)
|
||||
};
|
||||
|
||||
// Section lookup for "which scan target contains this VA?"
|
||||
let scan_targets_va: Vec<(u32, u32, usize, usize)> = sections
|
||||
.iter()
|
||||
.filter(|s| matches!(s.name.as_str(), ".rdata" | ".data"))
|
||||
.map(|s| {
|
||||
let va = image_base + s.virtual_address;
|
||||
(
|
||||
va,
|
||||
va + s.virtual_size,
|
||||
s.virtual_address as usize,
|
||||
(s.virtual_address + s.virtual_size) as usize,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Cap a recovered run at the *next anchor* so two physically-adjacent
|
||||
// anchored vtables don't merge. We deliberately do NOT cap at pass-1
|
||||
// fragments: a fragment is a sub-run the contiguity scan carved out of a
|
||||
// larger table, and the anchor legitimately re-absorbs it (subsumed
|
||||
// fragments are removed afterwards).
|
||||
let anchor_bases: std::collections::BTreeSet<u32> = anchors.iter().copied().collect();
|
||||
|
||||
let mut recovered = 0usize;
|
||||
let mut newly: Vec<Vtable> = Vec::new();
|
||||
for &anchor in anchors {
|
||||
if is_covered(anchor, &covered) { continue; }
|
||||
// Locate the containing .rdata/.data section.
|
||||
let Some(&(va_lo, va_hi, raw_lo, raw_hi)) =
|
||||
scan_targets_va.iter().find(|&&(lo, hi, _, _)| anchor >= lo && anchor < hi)
|
||||
else { continue };
|
||||
if anchor % 4 != 0 { continue; }
|
||||
let raw_hi = raw_hi.min(pe.len());
|
||||
// Read the fnptr-array run starting at the anchor. Tolerate small
|
||||
// gaps of non-function slots (null / pure-virtual / unrecognised),
|
||||
// but require the run to actually contain at least one real function
|
||||
// (otherwise it's just data, not a vtable).
|
||||
let next_base = anchor_bases.range((anchor + 4)..).next().copied();
|
||||
let mut methods: Vec<u32> = Vec::new();
|
||||
let mut gap = 0usize;
|
||||
let mut real_fns = 0usize;
|
||||
let mut off = (anchor - va_lo) as usize + raw_lo;
|
||||
let mut va = anchor;
|
||||
while off + 4 <= raw_hi && va < va_hi {
|
||||
if let Some(nb) = next_base && va >= nb { break; }
|
||||
let val = u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]]);
|
||||
if function_starts.contains(&val) {
|
||||
methods.push(val);
|
||||
real_fns += 1;
|
||||
gap = 0;
|
||||
} else {
|
||||
// A non-function slot. Keep the slot (so downstream slot
|
||||
// indexing stays aligned) but count toward the gap budget.
|
||||
gap += 1;
|
||||
if gap > MAX_ANCHOR_GAP {
|
||||
// Drop the trailing gap slots — they belong past the
|
||||
// table's end.
|
||||
methods.truncate(methods.len().saturating_sub(gap - 1));
|
||||
break;
|
||||
}
|
||||
methods.push(val);
|
||||
}
|
||||
off += 4;
|
||||
va += 4;
|
||||
}
|
||||
// Trim any trailing non-function slots (the table ends at its last
|
||||
// real method).
|
||||
while methods.last().is_some_and(|&m| !function_starts.contains(&m)) {
|
||||
methods.pop();
|
||||
}
|
||||
if real_fns == 0 || methods.is_empty() { continue; }
|
||||
let length = methods.len() as u32;
|
||||
newly.push(Vtable {
|
||||
address: anchor,
|
||||
length,
|
||||
col_address: None,
|
||||
class_name: synth_anon_name(&methods),
|
||||
rtti_present: false,
|
||||
base_classes_json: None,
|
||||
methods,
|
||||
});
|
||||
recovered += 1;
|
||||
}
|
||||
if recovered > 0 {
|
||||
// Drop pass-1 fragments fully subsumed by a recovered (anchored)
|
||||
// vtable — the anchor base is authoritative and the fragment was a
|
||||
// contiguity-scan artifact of the same table. Keep fragments that
|
||||
// only partially overlap (defensive; shouldn't happen for true
|
||||
// sub-runs) so we never lose method coverage.
|
||||
let recovered_spans: Vec<(u32, u32)> =
|
||||
newly.iter().map(|v| (v.address, v.address + v.length * 4)).collect();
|
||||
candidates.retain(|v| {
|
||||
!recovered_spans
|
||||
.iter()
|
||||
.any(|&(s, e)| v.address >= s && v.address + v.length * 4 <= e)
|
||||
});
|
||||
candidates.extend(newly);
|
||||
tracing::info!(recovered, "vtables recovered from vptr-write anchors");
|
||||
}
|
||||
let _ = &covered;
|
||||
|
||||
// RTTI walk: for each candidate, look at vtable[-1].
|
||||
let pe_image_base = image_base;
|
||||
for v in &mut candidates {
|
||||
@@ -268,6 +419,98 @@ fn read_class_hierarchy(
|
||||
serde_json::to_string(&names).ok()
|
||||
}
|
||||
|
||||
/// Pre-pass: discover candidate vtable *bases* from constructor vptr-write
|
||||
/// stores, independent of the static contiguity heuristic. A vptr install is
|
||||
/// the canonical `addis/addi` (or `lis/ori`) immediate build of a constant
|
||||
/// pointing into `.rdata` / `.data`, followed by `stw rX, 0(rThis)` — i.e. the
|
||||
/// ctor writing the vtable pointer to `this+0`. We return the set of such
|
||||
/// constants; these are fed to [`analyze_with_anchors`] so a vtable with
|
||||
/// non-function head words isn't lost.
|
||||
///
|
||||
/// We only consider stores at displacement 0 (the primary vptr; secondary
|
||||
/// MI vptrs land at non-zero offsets and are handled by the existing
|
||||
/// contiguity scan / typed-dispatch resolver well enough). The register
|
||||
/// tracker mirrors the lis+addi propagation used elsewhere and is reset at
|
||||
/// every basic-block boundary (`block_boundaries`).
|
||||
pub fn scan_vptr_write_constants(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
functions: &std::collections::BTreeMap<u32, (u32, bool)>, // start -> (end, is_saverestore)
|
||||
sections: &[PeSection],
|
||||
block_boundaries: &std::collections::HashSet<u32>,
|
||||
) -> std::collections::BTreeSet<u32> {
|
||||
// Ranges that a vtable base may legitimately live in.
|
||||
let data_ranges: Vec<(u32, u32)> = sections
|
||||
.iter()
|
||||
.filter(|s| matches!(s.name.as_str(), ".rdata" | ".data"))
|
||||
.map(|s| (image_base + s.virtual_address, image_base + s.virtual_address + s.virtual_size))
|
||||
.collect();
|
||||
let in_data = |a: u32| data_ranges.iter().any(|&(s, e)| a >= s && a < e);
|
||||
|
||||
const OP_ADDI: u32 = 14;
|
||||
const OP_ADDIS: u32 = 15;
|
||||
const OP_ORI: u32 = 24;
|
||||
const OP_STW: u32 = 36;
|
||||
const OP_X_FORM: u32 = 31;
|
||||
|
||||
let read = |addr: u32| -> Option<u32> {
|
||||
let off = addr.wrapping_sub(image_base) as usize;
|
||||
if off + 4 > pe.len() { return None; }
|
||||
Some(u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]]))
|
||||
};
|
||||
|
||||
let mut anchors: std::collections::BTreeSet<u32> = std::collections::BTreeSet::new();
|
||||
for (&fn_start, &(fn_end, is_saverestore)) in functions {
|
||||
if is_saverestore { continue; }
|
||||
let mut reg: [Option<u32>; 32] = [None; 32];
|
||||
let mut pc = fn_start;
|
||||
while pc < fn_end {
|
||||
if pc != fn_start && block_boundaries.contains(&pc) {
|
||||
reg = [None; 32];
|
||||
}
|
||||
let Some(instr) = read(pc) else { break };
|
||||
let op = instr >> 26;
|
||||
let rd = ((instr >> 21) & 0x1F) as usize;
|
||||
let ra = ((instr >> 16) & 0x1F) as usize;
|
||||
let simm = ((instr & 0xFFFF) as i16) as i32;
|
||||
let uimm = instr & 0xFFFF;
|
||||
match op {
|
||||
OP_ADDIS if ra == 0 => reg[rd] = Some(uimm << 16),
|
||||
OP_ADDIS => reg[rd] = reg[ra].map(|b| b.wrapping_add(uimm << 16)),
|
||||
OP_ADDI if ra != 0 => reg[rd] = reg[ra].map(|b| b.wrapping_add(simm as u32)),
|
||||
OP_ADDI => reg[rd] = Some(simm as u32),
|
||||
OP_ORI => {
|
||||
let rs = rd;
|
||||
reg[ra] = reg[rs].map(|b| b | uimm);
|
||||
}
|
||||
OP_STW => {
|
||||
// `stw rS, off(rA)` with displacement 0 = primary vptr install.
|
||||
if ra != 0
|
||||
&& simm == 0
|
||||
&& let Some(val) = reg[rd]
|
||||
&& in_data(val)
|
||||
{
|
||||
anchors.insert(val);
|
||||
}
|
||||
}
|
||||
32..=35 | 40..=43 | 48..=51 => reg[rd] = None,
|
||||
OP_X_FORM => {
|
||||
let xo = (instr >> 1) & 0x3FF;
|
||||
if xo != 444 && xo != 467 { reg[rd] = None; } // keep `or`(444=mr)/`mtspr`-ish
|
||||
}
|
||||
18 | 16 => {
|
||||
if (instr & 1) != 0 {
|
||||
for r in 0..=12 { reg[r] = None; }
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
pc = pc.wrapping_add(4);
|
||||
}
|
||||
}
|
||||
anchors
|
||||
}
|
||||
|
||||
/// Synthetic name for an RTTI-stripped vtable, derived from a stable hash of
|
||||
/// the sorted method-PC list. Two vtables with identical method ordering
|
||||
/// collapse to the same anonymous name.
|
||||
@@ -385,6 +628,112 @@ mod tests {
|
||||
assert!(!vtables[0].rtti_present);
|
||||
}
|
||||
|
||||
#[test]
fn anchor_recovers_vtable_with_nonfn_head() {
    // A vtable whose head has a null + an unrecognised word, so the
    // contiguity scan (≥3 contiguous known fns) cannot start the table at
    // its true base. The anchor (from a ctor vptr-write) must recover the
    // whole table from its base.
    let image_base = 0x82000000u32;
    let rdata_va = 0x1000u32;
    let text_va = 0x2000u32;
    let rdata_size = 0x40u32;
    let text_size = 0x100u32;
    let total = (text_va + text_size) as usize;
    let mut pe = vec![0u8; total];

    let f0 = image_base + text_va;
    let f1 = image_base + text_va + 0x10;
    let f2 = image_base + text_va + 0x20;
    // Slots: [null, NONFN(0xDEADBEEF), f0, f1, f2], stored big-endian.
    let slots: [u32; 5] = [0, 0xDEADBEEF, f0, f1, f2];
    for (i, val) in slots.iter().enumerate() {
        let at = rdata_va as usize + i * 4;
        pe[at..at + 4].copy_from_slice(&val.to_be_bytes());
    }

    let sections = vec![
        PeSection {
            name: ".rdata".into(),
            virtual_address: rdata_va,
            virtual_size: rdata_size,
            raw_offset: rdata_va,
            raw_size: rdata_size,
            flags: 0x4000_0040,
        },
        PeSection {
            name: ".text".into(),
            virtual_address: text_va,
            virtual_size: text_size,
            raw_offset: text_va,
            raw_size: text_size,
            flags: 0x6000_0020,
        },
    ];
    let mut function_starts = std::collections::BTreeSet::new();
    for &pc in &[f0, f1, f2] {
        function_starts.insert(pc);
    }

    // Without an anchor: the head gap (null + nonfn = 2 slots) means the
    // contiguous run is only [f0,f1,f2]=3 starting at +0x08, so pass-1
    // still finds it but at the WRONG base (0x...1008), not the true base.
    let no_anchor = analyze(&pe, image_base, &sections, &function_starts);
    assert!(
        !no_anchor.iter().any(|v| v.address == image_base + rdata_va),
        "without anchor the table is not recovered at its true base"
    );

    // With the anchor at the true base:
    let mut anchors = std::collections::BTreeSet::new();
    anchors.insert(image_base + rdata_va);
    let with_anchor =
        analyze_with_anchors(&pe, image_base, &sections, &function_starts, &anchors);
    let v = with_anchor
        .iter()
        .find(|v| v.address == image_base + rdata_va)
        .expect("anchor must recover vtable at its true base");
    // length spans through f2 (slot 4): 5 slots.
    assert_eq!(v.length, 5, "table spans null/nonfn head through last fn");
    assert_eq!(v.methods[2], f0);
    assert_eq!(v.methods[4], f2);
}
|
||||
|
||||
#[test]
fn scan_vptr_write_constants_finds_ctor_store() {
    // Encode a ctor: addis r11,r0,0x8201; addi r11,r11,lo; stw r11,0(r31)
    // installing vtable base 0x8200A908 into this+0.
    let image_base = 0x82000000u32;
    let ctor = 0x82001000u32;
    let mut pe = vec![0u8; 0x4000];
    // The `.rdata` section declared below covers the constant, so it counts
    // as an in-range anchor. Only the section table matters for that check;
    // no table bytes need to be written into `pe`.
    let vt_base = 0x8200A908u32; // 0x82010000 - 22264
    // Low half of the constant; read as a signed 16-bit immediate it is
    // -22264, which is why the `addis` loads 0x8201 rather than 0x8200.
    let lo = vt_base & 0xFFFF;
    let addis = (15u32 << 26) | (11 << 21) | 0x8201; // addis r11,r0,0x8201 (lis)
    let addi = (14u32 << 26) | (11 << 21) | (11 << 16) | lo; // addi r11,r11,lo
    let stw = (36u32 << 26) | (11 << 21) | (31 << 16); // stw r11,0(r31)
    let at = (ctor - image_base) as usize;
    for (i, word) in [addis, addi, stw].iter().enumerate() {
        let o = at + i * 4;
        pe[o..o + 4].copy_from_slice(&word.to_be_bytes());
    }

    let sections = vec![PeSection {
        name: ".rdata".into(),
        virtual_address: 0xA900,
        virtual_size: 0x200,
        raw_offset: 0xA900,
        raw_size: 0x200,
        flags: 0x4000_0040,
    }];
    let mut funcs: std::collections::BTreeMap<u32, (u32, bool)> = std::collections::BTreeMap::new();
    funcs.insert(ctor, (ctor + 0x40, false));
    let anchors = scan_vptr_write_constants(
        &pe, image_base, &funcs, &sections, &std::collections::HashSet::new(),
    );
    assert!(anchors.contains(&vt_base), "ctor vptr store must yield anchor {vt_base:#x}, got {anchors:?}");
}
|
||||
|
||||
#[test]
|
||||
fn rejects_2_method_run() {
|
||||
let image_base = 0x82000000u32;
|
||||
|
||||
@@ -415,6 +415,18 @@ fn main() -> Result<()> {
|
||||
// metrics summary.
|
||||
let _obs = observability::init(&config)?;
|
||||
|
||||
// Env-gated indirect-dispatch recorder (off by default). Resolve the env
|
||||
// once here; a scope guard dumps the recorded (call_site -> target) table
|
||||
// at end-of-run no matter how the run terminates.
|
||||
xenia_cpu::dispatch_rec::install();
|
||||
struct DispatchRecGuard;
|
||||
impl Drop for DispatchRecGuard {
|
||||
fn drop(&mut self) {
|
||||
xenia_cpu::dispatch_rec::dump();
|
||||
}
|
||||
}
|
||||
let _dispatch_rec_guard = DispatchRecGuard;
|
||||
|
||||
let result = match cli.command {
|
||||
Commands::Disasm { path, count, at } => cmd_disasm(&path, count, at),
|
||||
Commands::Exec {
|
||||
@@ -1301,6 +1313,29 @@ fn cmd_exec_inner(
|
||||
}
|
||||
}
|
||||
|
||||
// iterate-2E — pointer-chase probe. `XENIA_AUDIT_DEREF=<reg>:<off>`
|
||||
// (e.g. `4:36`). On each AUDIT-PC-PROBE fire, dumps gpr[reg] as a base
|
||||
// object, the sub-object at [base+off], and that sub-object's vtable.
|
||||
// Read-only; lockstep digest unaffected.
|
||||
if let Ok(spec) = std::env::var("XENIA_AUDIT_DEREF") {
|
||||
if !spec.is_empty() {
|
||||
let (rs, os) = spec
|
||||
.split_once(':')
|
||||
.ok_or_else(|| anyhow::anyhow!("XENIA_AUDIT_DEREF {spec:?}: expected <reg>:<off>"))?;
|
||||
let reg: u8 = rs.trim_start_matches('r').parse()
|
||||
.map_err(|e| anyhow::anyhow!("XENIA_AUDIT_DEREF reg {rs:?}: {e}"))?;
|
||||
let off: u32 = if let Some(h) = os.strip_prefix("0x") {
|
||||
u32::from_str_radix(h, 16)
|
||||
} else {
|
||||
os.parse::<u32>()
|
||||
}.map_err(|e| anyhow::anyhow!("XENIA_AUDIT_DEREF off {os:?}: {e}"))?;
|
||||
kernel.audit_deref = Some((reg, off));
|
||||
if !quiet {
|
||||
tracing::info!("audit-deref armed: r{} +0x{:x}", reg, off);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Diagnostic. Parse `--dump-addr=0x828F3D08,...` (or
|
||||
// `XENIA_DUMP_ADDR=...`) into `kernel.dump_addrs`. The contents
|
||||
// are dumped at end-of-run by `dump_thread_diagnostic`. Pure
|
||||
@@ -1414,6 +1449,45 @@ fn cmd_exec_inner(
|
||||
// atoms that live inside `kernel.gpu.mmio`.
|
||||
mem.add_mmio_region(xenia_gpu::build_mmio_region(kernel.gpu.mmio()));
|
||||
|
||||
// apu stage 1 — reserve the 320-entry XMA context array and install the
|
||||
// `0x7FEA0000` register aperture (mirrors canary's `XmaDecoder::Setup`).
|
||||
//
|
||||
// Physical placement: canary stores a *physical* address in
|
||||
// `ContextArrayAddress` (reg 0x600) — `PhysicalHeap::GetPhysicalAddress`
|
||||
// returns `va - heap_base` (== `va & 0x1FFFFFFF` for the physical heaps).
|
||||
// Our memory model is FLAT: `translate_virtual` is a raw `membase + addr`
|
||||
// with no separate physical-window mirror, and `translate_physical` masks
|
||||
// `& 0x1FFFFFFF` — so the two only coincide for low (`< 0x2000_0000`) VAs.
|
||||
// `heap_alloc` returns a `0x40000000`-region VA, so `va & 0x1FFFFFFF` would
|
||||
// be 0 (disagreeing with the context pointers `XMACreateContext` hands out
|
||||
// at `va + i*64`). The guest reads `ContextArrayAddress` and indexes it as
|
||||
// `base + i*64`; for that to equal the pointers it dereferences, the base
|
||||
// MUST equal the VA. So we advertise `va` itself — self-consistent in the
|
||||
// flat model (the guest reaches every context through the same VA space).
|
||||
// Stage 3's decoder will read the context structs via this VA directly
|
||||
// (not via `translate_physical`). The 20480-byte buffer is page-committed
|
||||
// by `heap_alloc`, so the guest never faults writing the 64-byte structs.
|
||||
{
|
||||
let array_size =
|
||||
(xenia_apu::XMA_CONTEXT_COUNT as u32) * xenia_apu::XMA_CONTEXT_SIZE; // 320 * 64
|
||||
match kernel.heap_alloc(array_size, &mem) {
|
||||
Some(va) => {
|
||||
let phys = va; // flat model: array base == VA (see note above)
|
||||
kernel.xma.lock().unwrap().init(va, phys);
|
||||
mem.add_mmio_region(xenia_apu::build_mmio_region(kernel.xma.clone()));
|
||||
tracing::info!(
|
||||
va = format_args!("{va:#010x}"),
|
||||
phys = format_args!("{phys:#010x}"),
|
||||
size = format_args!("{array_size:#x}"),
|
||||
"xma: context array reserved + 0x7FEA0000 aperture installed"
|
||||
);
|
||||
}
|
||||
None => {
|
||||
tracing::error!("xma: failed to reserve context array (heap exhausted)");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Install the initial guest thread on HW slot 0. The thread handle we
|
||||
// hand the scheduler isn't visible to any guest API yet, but joiners
|
||||
// (XThreadWait-style) will see it via `find_by_tid`.
|
||||
@@ -1474,16 +1548,28 @@ fn cmd_exec_inner(
|
||||
mem.write_u32(addr, block);
|
||||
}
|
||||
("xboxkrnl.exe", 0x00AD) => {
|
||||
// KeTimeStampBundle — 0x18 block with FILETIME at +0 and
|
||||
// interrupt-time u64 at +0x10. Mirrors the clock used by
|
||||
// KeQuerySystemTime so fast-path readers see consistent values.
|
||||
// KeTimeStampBundle — X_TIME_STAMP_BUNDLE (canary layout,
|
||||
// kernel_state.h): +0x00 interrupt_time u64, +0x08
|
||||
// system_time u64 (FILETIME 100ns), +0x10 tick_count u32
|
||||
// (milliseconds since boot), +0x14 padding. The guest's
|
||||
// worker-hub channel-dispatch loop (sub_82450A68 @
|
||||
// 0x82450b10) polls [block+0x10] (tick_count) and gates
|
||||
// dispatch on a `tick_count + 66` (ms) deadline. The block
|
||||
// MUST be ticked over the run or that deadline never
|
||||
// elapses (tid14 0x109c starvation gate). Initialize to a
|
||||
// zero-uptime base; KernelState::update_timestamp_bundle
|
||||
// ticks it every round from the deterministic global_clock.
|
||||
let block = alloc_zero(0x18, &mut mem, &mut kernel);
|
||||
if block != 0 {
|
||||
let fake_time: u64 = 132_500_000_000_000_000; // ~2021 FILETIME
|
||||
mem.write_u32(block, (fake_time >> 32) as u32);
|
||||
mem.write_u32(block + 4, fake_time as u32);
|
||||
mem.write_u32(block + 0x10, (fake_time >> 32) as u32);
|
||||
mem.write_u32(block + 0x14, fake_time as u32);
|
||||
// FILETIME base (~2021) so system_time is plausible.
|
||||
let fake_time: u64 = 132_500_000_000_000_000;
|
||||
mem.write_u32(block, 0); // interrupt_time hi
|
||||
mem.write_u32(block + 4, 0); // interrupt_time lo
|
||||
mem.write_u32(block + 0x08, (fake_time >> 32) as u32); // system_time hi
|
||||
mem.write_u32(block + 0x0C, fake_time as u32); // system_time lo
|
||||
mem.write_u32(block + 0x10, 0); // tick_count (ms) = 0 at boot
|
||||
mem.write_u32(block + 0x14, 0); // padding
|
||||
kernel.timestamp_bundle_addr = block;
|
||||
}
|
||||
mem.write_u32(addr, block);
|
||||
}
|
||||
@@ -1505,8 +1591,19 @@ fn cmd_exec_inner(
|
||||
mem.write_u32(addr, block);
|
||||
}
|
||||
("xboxkrnl.exe", 0x01BE) => {
|
||||
// VdGlobalDevice — passed through to Vd* shims. Write 0.
|
||||
mem.write_u32(addr, 0);
|
||||
// VdGlobalDevice — a *pointer to* a global D3D-device cell.
|
||||
// Mirror xenia-canary RegisterVideoExports (xboxkrnl_video.cc:
|
||||
// 557-564): allocate a 4-byte cell, point the import slot at
|
||||
// it, and zero the cell. The guest's graphics init then stores
|
||||
// its device object INTO the cell (e.g. sub_824C6DC0 @
|
||||
// 0x824C6F18 `stw r31, 0([0x82000750])`), and the swap-complete
|
||||
// callback sub_824CE2B8 reads it back via the two-level
|
||||
// `[[VdGlobalDevice]+0]+15160` to bump the swap counter (clock
|
||||
// B). Writing 0 directly here (the old behaviour) made that
|
||||
// store land at address 0 and the swap counter never advance —
|
||||
// freezing the title-loop's per-frame manager update.
|
||||
let cell = alloc_zero(0x4, &mut mem, &mut kernel);
|
||||
mem.write_u32(addr, cell);
|
||||
}
|
||||
("xboxkrnl.exe", 0x01C0) => {
|
||||
// VdGpuClockInMHz
|
||||
@@ -2105,7 +2202,13 @@ fn coord_pre_round(
|
||||
let fired = if kernel.parallel_active {
|
||||
kernel.interrupts.tick_vsync_wallclock()
|
||||
} else {
|
||||
kernel.interrupts.tick_vsync_instr(stats.instruction_count)
|
||||
// iterate-3AJ: present-anchored — pass the guest's live present
|
||||
// (`VdSwap`) count so vsync tracks the real present rate once the
|
||||
// guest is presenting (≈1 vblank/present), instead of firing a
|
||||
// fixed instruction quantum that over-fires ~66× during one heavy
|
||||
// splash asset-load frame and collapsed the logo fade-in.
|
||||
let presents = kernel.gpu.swaps_seen();
|
||||
kernel.interrupts.tick_vsync_instr(stats.instruction_count, presents)
|
||||
};
|
||||
if fired {
|
||||
use std::sync::atomic::Ordering;
|
||||
@@ -2124,6 +2227,27 @@ fn coord_pre_round(
|
||||
}
|
||||
|
||||
kernel.fire_due_timers();
|
||||
// 2.AF — fire expired wait-deadlines under load. Without this drain,
|
||||
// `advance_to_next_wake_if_due` only runs in `coord_idle_advance` (the
|
||||
// no-Ready-threads path), so a thread whose `KeWait*`/`KeDelay` deadline
|
||||
// expires while other threads keep the scheduler busy sits Blocked
|
||||
// forever (observed: tid=5's 42.95ms deadline unfired 29s+). Drain every
|
||||
// entry whose deadline `<=` the current guest timebase — the same `now`
|
||||
// basis `fire_due_timers` uses, so the two stay in lock-step — and let
|
||||
// `handle_timeout_wake` stamp `STATUS_TIMEOUT` and scrub the waiter from
|
||||
// each handle. `advance_to_next_wake_if_due` pops at most one due wake
|
||||
// per call and returns `None` once the earliest remaining deadline is in
|
||||
// the future, so this loop terminates. Deterministic: `ctx(0).timebase`
|
||||
// is the guest-cycle timebase, not host_ns. This runs in `coord_pre_round`
|
||||
// which both the lockstep and parallel outer loops call every round.
|
||||
loop {
|
||||
let now = kernel.now_basis_at(0);
|
||||
let Some((r, reason)) = kernel.scheduler.advance_to_next_wake_if_due(now)
|
||||
else {
|
||||
break;
|
||||
};
|
||||
kernel.handle_timeout_wake(r, reason);
|
||||
}
|
||||
// Graphics-interrupt delivery is no longer done here — see
|
||||
// `dispatch_graphics_interrupts`, called from the outer loop with
|
||||
// `mem` and `&mut stats` in scope. The audio path still uses the
|
||||
@@ -2253,8 +2377,19 @@ fn coord_post_round(
|
||||
let mut gpu_runs = (executed_this_round
|
||||
/ xenia_cpu::scheduler::HW_THREAD_COUNT as u64)
|
||||
.max(1);
|
||||
if gpu_runs > 64 {
|
||||
gpu_runs = 64;
|
||||
// Fairness cap on GPU commands drained per round. Must scale with the
|
||||
// per-round instruction volume: with the superblock runner a single
|
||||
// round legitimately retires up to ~SUPERBLOCK_INSTR_BUDGET per slot
|
||||
// (vs ~6 for the old one-block path), so the rate `executed/6` is much
|
||||
// higher and a flat cap of 64 throttled GPU command processing ~17×
|
||||
// (packets 50279→1861 @50M) — collapsing the present loop / splash.
|
||||
// Cap at the budget so the GPU keeps pace with the CPU at the same
|
||||
// per-instruction rate the one-block path had. The inner loop already
|
||||
// early-breaks on `!gpu.is_ready`, so this only bounds a pathological
|
||||
// backlog, never busy-spins.
|
||||
let gpu_cap = superblock_budget().max(64);
|
||||
if gpu_runs > gpu_cap {
|
||||
gpu_runs = gpu_cap;
|
||||
}
|
||||
if let Some(gpu) = kernel.gpu.as_inline_mut() {
|
||||
gpu.sync_with_mmio();
|
||||
@@ -2270,11 +2405,31 @@ fn coord_post_round(
|
||||
let _ = gpu_runs;
|
||||
}
|
||||
|
||||
// APU stage 3 — pump the XMA decoder on the CPU thread, same cadence as the
|
||||
// inline GPU. Deterministic (no host thread / clock): for each context with
|
||||
// a pending kick it runs one Work() pass, decoding the guest's XMA packets
|
||||
// into PCM and writing it back into the output ring + context struct.
|
||||
if let Ok(mut xma) = kernel.xma.try_lock() {
|
||||
xma.decode_pending(mem);
|
||||
}
|
||||
|
||||
if kernel.gpu.has_pending_interrupts() {
|
||||
for _pi in kernel.gpu.take_pending_interrupts() {
|
||||
for pi in kernel.gpu.take_pending_interrupts() {
|
||||
// Canary `ExecutePacketType3_INTERRUPT` dispatches the callback
|
||||
// once per set bit of `cpu_mask` with that bit's index as the
|
||||
// target CPU (`DispatchInterruptCallback(1, n)`). The guest's
|
||||
// swap-acknowledge fence stores `cpu_mask`, and the ISR clears
|
||||
// `1 << current_cpu` from it — so the ISR must run impersonating
|
||||
// the masked CPU or the fence never reaches 0. Sylpheed uses a
|
||||
// single-bit mask (`0x4` → CPU 2); take the lowest set bit.
|
||||
let cpu = if pi.cpu_mask == 0 {
|
||||
xenia_kernel::interrupts::VSYNC_TARGET_CPU
|
||||
} else {
|
||||
pi.cpu_mask.trailing_zeros().min(5) as u8
|
||||
};
|
||||
kernel
|
||||
.interrupts
|
||||
.queue_interrupt(xenia_kernel::INTERRUPT_SOURCE_CP);
|
||||
.queue_interrupt(xenia_kernel::INTERRUPT_SOURCE_CP, cpu);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2349,7 +2504,7 @@ fn worker_prologue(
|
||||
stats: &mut ExecStats,
|
||||
) -> PrologueOutcome {
|
||||
use xenia_cpu::interpreter::{step_cached, StepResult};
|
||||
use xenia_cpu::scheduler::{HwState, INITIAL_GUEST_TID};
|
||||
use xenia_cpu::scheduler::{BlockReason, HwState, INITIAL_GUEST_TID};
|
||||
use xenia_cpu::PpcOpcode;
|
||||
const LR_HALT: u32 = xenia_cpu::context::LR_HALT_SENTINEL as u32;
|
||||
|
||||
@@ -2374,10 +2529,19 @@ fn worker_prologue(
|
||||
// and println one record. Read-only; lockstep digest unaffected.
|
||||
// Empty set is the common case → single `is_empty()` test inside
|
||||
// the helper, no overhead on the hot path.
|
||||
kernel.fire_ctor_probe_if_match(hw_id, mem);
|
||||
kernel.fire_branch_probe_if_match(hw_id);
|
||||
kernel.fire_audit_pc_probe_if_match(hw_id, mem);
|
||||
kernel.fire_lr_trace_if_match(hw_id);
|
||||
// Perf (Tier-A #3): all four `fire_*_if_match` helpers early-return
|
||||
// on an empty registry, but paying 4× call overhead per slot-visit
|
||||
// (~3.2M visits boot-to-splash) is itself measurable. Gate the whole
|
||||
// group behind a single `any_probe_active()` predicted branch so the
|
||||
// common (no-probe) headless path never even makes the calls. When a
|
||||
// probe IS configured each helper still re-checks its own set, so
|
||||
// behaviour is identical either way.
|
||||
if kernel.any_probe_active() {
|
||||
kernel.fire_ctor_probe_if_match(hw_id, mem);
|
||||
kernel.fire_branch_probe_if_match(hw_id);
|
||||
kernel.fire_audit_pc_probe_if_match(hw_id, mem);
|
||||
kernel.fire_lr_trace_if_match(hw_id);
|
||||
}
|
||||
|
||||
if mem.has_mem_watch() {
|
||||
let ctx = kernel.scheduler.ctx(hw_id);
|
||||
@@ -2387,12 +2551,26 @@ fn worker_prologue(
|
||||
|
||||
// 1) Halt-sentinel check (per HW thread).
|
||||
if pc == LR_HALT {
|
||||
// iterate-4A: the async audio-callback injection (`try_inject_audio_callback`)
|
||||
// sets `interrupts.saved`/`injected_ref` to the dedicated audio
|
||||
// worker and runs REAL guest code (`sub_824D29F0`, which calls
|
||||
// blocking kernel APIs) across MANY scheduler rounds before
|
||||
// returning to `LR_HALT_SENTINEL`. The restore must fire only when
|
||||
// the thread that *actually* reached the sentinel is the injected
|
||||
// worker itself — i.e. the FULL `ThreadRef` (hw_id AND idx), which
|
||||
// `scheduler.current` holds after `begin_slot_visit`. Matching on
|
||||
// `hw_id` alone let ANY OTHER thread sharing that HW slot reach
|
||||
// `LR_HALT` and consume the audio worker's `saved` slot; when the
|
||||
// worker later truly returned, `saved` was already `None`, the
|
||||
// guard failed, and control fell through to "marking exited" — the
|
||||
// worker was removed and every subsequent audio callback dropped
|
||||
// (`find_by_handle` skips Exited threads). The graphics ISR path is
|
||||
// fully synchronous (`dispatch_graphics_interrupts` restores inline
|
||||
// and never leaves `interrupts.saved` set across rounds), so this
|
||||
// restore lifecycle is exclusive to audio and graphics is
|
||||
// unaffected.
|
||||
let injected_here = kernel.interrupts.saved.is_some()
|
||||
&& kernel
|
||||
.interrupts
|
||||
.injected_ref
|
||||
.map(|r| r.hw_id == hw_id)
|
||||
== Some(true);
|
||||
&& kernel.interrupts.injected_ref == kernel.scheduler.current;
|
||||
if injected_here
|
||||
&& let Some(saved) = kernel.interrupts.saved.take()
|
||||
{
|
||||
@@ -2404,17 +2582,64 @@ fn worker_prologue(
|
||||
kernel.interrupts.delivered += 1;
|
||||
let source = saved.source;
|
||||
let mut restore_outcome = "ready";
|
||||
let current = kernel.scheduler.thread(target_ref).state.clone();
|
||||
if let HwState::ServicingIrq(reason) = current {
|
||||
kernel.scheduler.thread_mut(target_ref).state =
|
||||
HwState::Blocked(reason);
|
||||
restore_outcome = "reblocked";
|
||||
|
||||
// iterate-4A: the dedicated audio worker's canonical resting
|
||||
// state is "parked on its synthetic handle, awaiting the next
|
||||
// callback injection". The callback (`sub_824D29F0`) runs real
|
||||
// guest code that can be flipped `ServicingIrq -> Ready` by an
|
||||
// intervening `wake_ref` (a `KeSetEvent`/timeout targeting the
|
||||
// worker as a waiter mid-callback). The old re-block heuristic
|
||||
// only re-parked when the state was *still* `ServicingIrq`, so
|
||||
// such a wake left the worker `Ready` — it then ran its thread
|
||||
// entry to the `LR_HALT` sentinel, EXITED, and every subsequent
|
||||
// callback dropped (`find_by_handle` skips Exited workers),
|
||||
// wedging the intro-video audio→XMA pipeline. When this restore
|
||||
// is an audio callback (`source == INTERRUPT_SOURCE_AUDIO`),
|
||||
// re-park the worker UNCONDITIONALLY onto its synthetic
|
||||
// park-handle so it survives to receive the next fire. (Graphics
|
||||
// restores keep the `ServicingIrq`-only re-block: a graphics
|
||||
// victim is a borrowed real thread, not a parked worker, and the
|
||||
// old behavior there must stay byte-identical.)
|
||||
if source == xenia_kernel::INTERRUPT_SOURCE_AUDIO {
|
||||
let worker_handle =
|
||||
kernel.scheduler.thread(target_ref).thread_handle;
|
||||
let index = worker_handle.and_then(|h| {
|
||||
kernel
|
||||
.xaudio
|
||||
.worker_handles
|
||||
.iter()
|
||||
.position(|wh| *wh == Some(h))
|
||||
});
|
||||
if let Some(index) = index {
|
||||
let park = xenia_kernel::xaudio::synthetic_park_handle(index);
|
||||
kernel.scheduler.thread_mut(target_ref).state =
|
||||
HwState::Blocked(BlockReason::WaitAny {
|
||||
handles: vec![park],
|
||||
deadline: None,
|
||||
});
|
||||
restore_outcome = "reparked";
|
||||
} else if let HwState::ServicingIrq(reason) =
|
||||
kernel.scheduler.thread(target_ref).state.clone()
|
||||
{
|
||||
// Fallback (handle unresolved): preserve the legacy
|
||||
// ServicingIrq-only re-block rather than leak the worker.
|
||||
kernel.scheduler.thread_mut(target_ref).state =
|
||||
HwState::Blocked(reason);
|
||||
restore_outcome = "reblocked";
|
||||
}
|
||||
} else {
|
||||
let current = kernel.scheduler.thread(target_ref).state.clone();
|
||||
if let HwState::ServicingIrq(reason) = current {
|
||||
kernel.scheduler.thread_mut(target_ref).state =
|
||||
HwState::Blocked(reason);
|
||||
restore_outcome = "reblocked";
|
||||
}
|
||||
}
|
||||
tracing::debug!(
|
||||
source,
|
||||
hw_id,
|
||||
outcome = restore_outcome,
|
||||
"graphics interrupt: callback returned"
|
||||
"interrupt: callback returned"
|
||||
);
|
||||
return PrologueOutcome::Continue;
|
||||
}
|
||||
@@ -2443,8 +2668,15 @@ fn worker_prologue(
|
||||
return PrologueOutcome::Continue;
|
||||
}
|
||||
|
||||
// 2) Import thunk intercept.
|
||||
if let Some((module, ordinal, name)) = thunk_map.get(&pc) {
|
||||
// 2) Import thunk intercept. Perf (Tier-A #4): import thunks occupy a
|
||||
// small contiguous address band; the overwhelming majority of executing
|
||||
// PCs are ordinary guest code outside it. Range-reject against the band
|
||||
// (two integer compares) before paying the `thunk_map` hash. Faithful
|
||||
// no-op — any in-band PC still goes through the exact map lookup, and an
|
||||
// out-of-band PC can never be a registered thunk.
|
||||
if kernel.pc_in_thunk_band(pc)
|
||||
&& let Some((module, ordinal, name)) = thunk_map.get(&pc)
|
||||
{
|
||||
let module = *module;
|
||||
let ordinal_u32 = *ordinal as u32;
|
||||
let thunk_pc = pc;
|
||||
@@ -2575,6 +2807,10 @@ fn worker_prologue(
|
||||
|
||||
match result {
|
||||
StepResult::Continue => {}
|
||||
StepResult::Yield => {
|
||||
// db16cyc spin-wait hint (per-instruction path): yield the slot.
|
||||
kernel.scheduler.yield_current();
|
||||
}
|
||||
StepResult::SystemCall => {
|
||||
tracing::warn!("SYSCALL at {:#010x} (hw={})", pc, hw_id);
|
||||
}
|
||||
@@ -2654,6 +2890,11 @@ fn worker_epilogue(
|
||||
|
||||
match result {
|
||||
StepResult::Continue => {}
|
||||
StepResult::Yield => {
|
||||
// db16cyc spin-wait hint: hand the slot to a Ready peer so the
|
||||
// spinner doesn't starve the co-located thread it is waiting on.
|
||||
kernel.scheduler.yield_current();
|
||||
}
|
||||
StepResult::SystemCall => {
|
||||
let last_pc = block.instrs.last().map(|i| i.addr).unwrap_or(pc_before);
|
||||
tracing::warn!("SYSCALL at {:#010x} (hw={})", last_pc, hw_id);
|
||||
@@ -2702,6 +2943,212 @@ fn worker_epilogue(
|
||||
SlotOutcome::Continue
|
||||
}
|
||||
|
||||
/// Default cap on the number of guest instructions a single superblock
/// runner invocation executes before returning to the round-robin
/// scheduler. Bounds how coarse the lockstep interleaving can get: a
/// larger budget amortizes more per-round/per-slot tax (faster) but
/// runs one HW thread for longer between scheduler returns (coarser
/// cross-thread interleaving). The default of 128 keeps a slot-visit
/// roughly 20× longer than the old single-block (~6 instr) granularity
/// while still returning to the round well inside a single 50k quantum.
/// Purely an instruction count → deterministic, schedule reproduces
/// byte-identically.
///
/// Tuned empirically on the Sylpheed boot-to-splash workload (iterate-3AL):
/// budgets up to 256 keep boot progression byte-for-byte healthy (draws /
/// swaps / packets track the one-block baseline), then a sharp cliff at
/// ~384 collapses the present loop (a producer/consumer boot handoff
/// starves when one slot runs too long without returning to the round).
/// 128 sits 3× below that cliff with ~1.65× boot-to-splash speedup — a
/// deliberately conservative pick (correctness over the last few %). The
/// `XENIA_SUPERBLOCK_BUDGET` env var overrides it for further tuning.
const SUPERBLOCK_INSTR_BUDGET: u64 = 128;

/// Turns the raw text of `XENIA_SUPERBLOCK_BUDGET` into an effective budget.
///
/// Surrounding whitespace is ignored. A missing value, anything that does
/// not parse as a `u64`, and `0` all fall back to
/// [`SUPERBLOCK_INSTR_BUDGET`], so the returned budget is always `>= 1`.
fn parse_superblock_budget(raw: Option<&str>) -> u64 {
    raw.and_then(|text| text.trim().parse::<u64>().ok())
        // A zero budget is meaningless (every visit runs at least one
        // block), so treat it like an unparsable value.
        .filter(|&budget| budget >= 1)
        .unwrap_or(SUPERBLOCK_INSTR_BUDGET)
}

/// Effective superblock budget. Defaults to [`SUPERBLOCK_INSTR_BUDGET`];
/// `XENIA_SUPERBLOCK_BUDGET` overrides it (A/B tuning without a rebuild).
/// A budget of 1 reproduces the old one-block-per-slot-visit behaviour
/// (the chain always stops after the first block).
///
/// The environment is read once, on the first call, and the result is
/// cached for the lifetime of the process: later changes to the variable
/// have no effect.
fn superblock_budget() -> u64 {
    use std::sync::OnceLock;
    static BUDGET: OnceLock<u64> = OnceLock::new();
    *BUDGET.get_or_init(|| {
        // `.ok()` folds "unset" and "not valid Unicode" into `None`; both
        // take the default.
        let raw = std::env::var("XENIA_SUPERBLOCK_BUDGET").ok();
        parse_superblock_budget(raw.as_deref())
    })
}
|
||||
|
||||
/// Superblock runner (iterate-3AL). Executes a *chain* of basic blocks
|
||||
/// for one slot-visit — following each block's terminating branch into
|
||||
/// the next block — instead of a single block, amortizing the per-round
|
||||
/// (timebase / coord / `round_schedule`) and per-slot (`worker_prologue`)
|
||||
/// dispatch tax over up to [`SUPERBLOCK_INSTR_BUDGET`] guest instructions.
|
||||
///
|
||||
/// Determinism + cross-thread correctness: the chain ENDS (returns to the
|
||||
/// round) at exactly the points where lockstep granularity matters, all
|
||||
/// pure functions of guest state (never wall-clock):
|
||||
/// - a non-`Continue` step result (Yield / SystemCall / Trap / Unimpl /
|
||||
/// Halted) — `step_block` already bails on these; `Yield` in
|
||||
/// particular is the db16cyc spin-wait hand-off that prevents a
|
||||
/// spinner from starving its producer.
|
||||
/// - the just-run block was `sync_sensitive` (reserved load/store or a
|
||||
/// memory barrier) — the guest's own ordering points.
|
||||
/// - the block touched MMIO (the `mem.mmio_access_count()` watermark
|
||||
/// advanced) — GPU/register ordering vs other HW threads stays at the
|
||||
/// same fine granularity as the old one-block path.
|
||||
/// - the next PC leaves ordinary guest code: an import thunk, the halt
|
||||
/// sentinel, or unmapped memory — those need the full `worker_prologue`
|
||||
/// dispatch, so we stop and let the next round's prologue handle them.
|
||||
/// - the instruction budget is reached.
|
||||
///
|
||||
/// Instruction-count / clock accounting stays exact: `executed` is summed
|
||||
/// from the per-block `cycle_count` delta across every chained block and
|
||||
/// handed to `worker_epilogue` once, which advances `stats.instruction_count`
|
||||
/// and `decrement_quantum` by precisely the retired count — identical to
|
||||
/// dispatching each block separately.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn run_superblock(
|
||||
wc: &mut WorkerCtx,
|
||||
kernel: &mut xenia_kernel::KernelState,
|
||||
mem: &xenia_memory::GuestMemory,
|
||||
debugger: &mut xenia_debugger::Debugger,
|
||||
thunk_map: &HashMap<u32, (ModuleId, u16, String)>,
|
||||
stats: &mut ExecStats,
|
||||
tid: Option<u32>,
|
||||
thread_ref: xenia_cpu::ThreadRef,
|
||||
first_block_ptr: *const xenia_cpu::block_cache::DecodedBlock,
|
||||
first_pc_before: u32,
|
||||
) -> SlotOutcome {
|
||||
use xenia_cpu::interpreter::{step_block, StepResult};
|
||||
const LR_HALT: u32 = xenia_cpu::context::LR_HALT_SENTINEL as u32;
|
||||
|
||||
let budget = superblock_budget();
|
||||
|
||||
// Heisenbug fix (toolkit audit, 2026-06-21): probes and mem-watch are
|
||||
// OBSERVE-ONLY diagnostics and must NOT change guest scheduling. The
|
||||
// previous implementation disabled superblock chaining whenever any
|
||||
// probe / mem-watch was armed (so the per-block-entry observation in
|
||||
// `worker_prologue` was reached for every block). But chaining is what
|
||||
// determines thread interleaving, so arming a probe perturbed the
|
||||
// schedule — it starved the movie/XMV subsystem so it never reached the
|
||||
// video state, making the probe useless on exactly the code we most
|
||||
// needed to observe (`XENIA_SUPERBLOCK_BUDGET=1` reproduces the same
|
||||
// starvation, confirming chaining is the lever).
|
||||
//
|
||||
// The fix fires the SAME per-block-entry observation INSIDE the chain
|
||||
// loop, at every chained block's entry PC (see `fire_block_entry_probes`
|
||||
// below), so chaining — and therefore scheduling — is byte-identical
|
||||
// whether or not a probe is armed. `chain_allowed` no longer depends on
|
||||
// the probe/mem-watch state.
|
||||
//
|
||||
// `wants_hooks()` (the interactive debugger / breakpoint path) still
|
||||
// forces the per-instruction path in `worker_prologue` and never reaches
|
||||
// `run_superblock`, so the only remaining reason to never chain here is
|
||||
// the explicit budget==1 reproduction request.
|
||||
let chain_allowed = budget > 1;
|
||||
|
||||
// Per-block-entry diagnostic observation, replicating exactly what
|
||||
// `worker_prologue` does at the first block of a slot visit:
|
||||
// 1. the four `fire_*_if_match` probe helpers (read-only; each
|
||||
// re-checks its own armed set against the live ctx PC), and
|
||||
// 2. the mem-watch writer-context publish, so a watched store that
|
||||
// fires mid-block is attributed to the CORRECT chained block's
|
||||
// entry PC / LR (matching the single-block reporting granularity)
|
||||
// instead of the stale superblock-entry PC.
|
||||
// The closure is a pure function of the live scheduler context; the
|
||||
// caller must ensure `ctx.pc` equals the block-entry PC before calling.
|
||||
let probe_hw_id = wc.hw_id;
|
||||
let fire_block_entry_probes =
|
||||
|kernel: &mut xenia_kernel::KernelState, mem: &xenia_memory::GuestMemory| {
|
||||
let hw_id = probe_hw_id;
|
||||
if kernel.any_probe_active() {
|
||||
kernel.fire_ctor_probe_if_match(hw_id, mem);
|
||||
kernel.fire_branch_probe_if_match(hw_id);
|
||||
kernel.fire_audit_pc_probe_if_match(hw_id, mem);
|
||||
kernel.fire_lr_trace_if_match(hw_id);
|
||||
}
|
||||
if mem.has_mem_watch() {
|
||||
let ctx = kernel.scheduler.ctx(hw_id);
|
||||
let tid_w = kernel.scheduler.tid(hw_id).unwrap_or(0);
|
||||
xenia_memory::set_writer_ctx(tid_w, ctx.pc, ctx.lr as u32);
|
||||
}
|
||||
};
|
||||
|
||||
let mut block_ptr = first_block_ptr;
|
||||
let mut pc_before = first_pc_before;
|
||||
let mut total_executed: u64 = 0;
|
||||
|
||||
let (result, last_block_ptr, last_pc_before) = loop {
|
||||
let cycle_before = kernel.scheduler.ctx_mut_ref(thread_ref).cycle_count;
|
||||
let mmio_before = mem.mmio_access_count();
|
||||
let block = unsafe { &*block_ptr };
|
||||
let result = {
|
||||
let ctx = kernel.scheduler.ctx_mut_ref(thread_ref);
|
||||
step_block(ctx, mem, block)
|
||||
};
|
||||
let executed = kernel
|
||||
.scheduler
|
||||
.ctx_mut_ref(thread_ref)
|
||||
.cycle_count
|
||||
.saturating_sub(cycle_before);
|
||||
total_executed = total_executed.saturating_add(executed);
|
||||
|
||||
// STOP conditions (any → end the superblock, hand to epilogue):
|
||||
// non-Continue result (let the epilogue apply it), chaining
|
||||
// disabled, a sync-sensitive block just ran, MMIO was touched,
|
||||
// or the budget is spent.
|
||||
if !chain_allowed
|
||||
|| !matches!(result, StepResult::Continue)
|
||||
|| block.sync_sensitive
|
||||
|| mem.mmio_access_count() != mmio_before
|
||||
|| total_executed >= budget
|
||||
{
|
||||
break (result, block_ptr, pc_before);
|
||||
}
|
||||
|
||||
// Decide whether the NEXT PC is an ordinary guest block we can
|
||||
// chain into. Anything else (thunk / halt sentinel / unmapped)
|
||||
// needs the full prologue dispatch next round.
|
||||
let next_pc = kernel.scheduler.ctx(wc.hw_id).pc;
|
||||
if next_pc == LR_HALT
|
||||
|| (kernel.pc_in_thunk_band(next_pc) && thunk_map.contains_key(&next_pc))
|
||||
|| !mem.is_mapped(next_pc)
|
||||
{
|
||||
break (result, block_ptr, pc_before);
|
||||
}
|
||||
|
||||
// Chain into the next block. `ctx.pc` now equals `next_pc` (the
|
||||
// chained block's entry), so fire the per-block-entry observation
|
||||
// BEFORE stepping it — identical to what `worker_prologue` did at
|
||||
// the first block. This keeps the probe firing at EVERY armed
|
||||
// block-entry while leaving the chaining decision (and thus the
|
||||
// schedule) untouched. The first block was already observed by the
|
||||
// prologue, so we only observe the newly-chained blocks here.
|
||||
pc_before = next_pc;
|
||||
fire_block_entry_probes(kernel, mem);
|
||||
|
||||
// Build/fetch the next block. Re-borrows `wc.block_cache`, which
|
||||
// invalidates the previous `block_ptr` — but we've already finished
|
||||
// using it (only `sync_sensitive`/diagnostics were read, above), so
|
||||
// the raw-pointer aliasing rule is respected.
|
||||
block_ptr = wc.block_cache.lookup_or_build(next_pc, mem) as *const _;
|
||||
};
|
||||
|
||||
worker_epilogue(
|
||||
wc,
|
||||
kernel,
|
||||
debugger,
|
||||
stats,
|
||||
tid,
|
||||
thread_ref,
|
||||
last_block_ptr,
|
||||
last_pc_before,
|
||||
result,
|
||||
total_executed,
|
||||
)
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(max = ?max_instructions, ips = ?ips_limit))]
|
||||
fn run_execution(
|
||||
mem: &xenia_memory::GuestMemory,
|
||||
@@ -2715,11 +3162,18 @@ fn run_execution(
|
||||
halt_on_deadlock: bool,
|
||||
shutdown: Option<std::sync::Arc<std::sync::atomic::AtomicBool>>,
|
||||
) -> ExecStats {
|
||||
use xenia_cpu::interpreter::step_block;
|
||||
|
||||
let mut stats = ExecStats::default();
|
||||
let _ = quiet; // retained for future per-kind suppression
|
||||
|
||||
// APU stage 3 — give the XMA decoder a stable pointer to the guest memory
|
||||
// mapping `run_execution` runs against, so the kick MMIO write can run
|
||||
// Work() synchronously (canary `!use_dedicated_xma_thread` semantics: the
|
||||
// game observes the updated context the instant its kick store retires).
|
||||
// `mem` outlives this call for both the headless and UI paths.
|
||||
if let Ok(mut xma) = kernel.xma.lock() {
|
||||
xma.set_memory(mem);
|
||||
}
|
||||
|
||||
// `--halt-on-deadlock` CLI flag OR `XENIA_HALT_ON_DEADLOCK=1|true` env var:
|
||||
// when the scheduler next hits a hard deadlock (every live HW thread
|
||||
// blocked on a handle wait with no pending timer) we bail out with a
|
||||
@@ -2760,6 +3214,10 @@ fn run_execution(
|
||||
// re-decoding the same handful of pages 60×/s.
|
||||
let mut isr_decode_cache = xenia_cpu::decoder::DecodeCache::new();
|
||||
|
||||
// Tier-A perf #2: reusable buffer for `round_schedule_into` so the round
|
||||
// loop doesn't heap-allocate a `Vec<u8>` every iteration.
|
||||
let mut order_buf = [0u8; xenia_cpu::scheduler::HW_THREAD_COUNT];
|
||||
|
||||
'outer: loop {
|
||||
// Per-round prologue: budget / shutdown / heartbeat / vsync /
|
||||
// timers / audio-interrupt injection. Carved into
|
||||
@@ -2780,6 +3238,32 @@ fn run_execution(
|
||||
RoundCtl::BreakOuter => break,
|
||||
RoundCtl::Continue => {}
|
||||
}
|
||||
// ITERATE-2C Phase D — deposit the current instruction count so
|
||||
// `nt_create_event` can compute absolute auto-signal deadlines,
|
||||
// then drain any pending auto-signals whose deadline has passed.
|
||||
// Both calls are no-ops when `XENIA_SILPH_UI_AUTOSIGNAL_DELAY`
|
||||
// is unset (the pending queue stays empty).
|
||||
kernel.set_now_cycle_hint(stats.instruction_count);
|
||||
// Drive the coherent monotonic "now" the kernel deadline-arithmetic
|
||||
// reads (`KernelState::now_basis_at` -> `Scheduler::global_clock`)
|
||||
// from the deterministic retired-instruction count. Floored up (never
|
||||
// backwards). This is the LOCKSTEP analogue of the parallel writeback's
|
||||
// `advance_global_clock`: a parked/poll thread computing a relative
|
||||
// timeout via `parse_timeout` now reads a real, non-zero, monotone
|
||||
// basis instead of `idle_ctx`'s timebase-0, so its deadline lands in
|
||||
// the future and `coord_idle_advance` stops re-arming the constant
|
||||
// past deadline forever (the timebase-desync livelock / render-gate
|
||||
// root). Pure function of guest instructions -> bit-reproducible.
|
||||
kernel
|
||||
.scheduler
|
||||
.advance_global_clock_to(stats.instruction_count);
|
||||
// ITERATE-2J — tick the KeTimeStampBundle (ordinal 0x00AD) from the
|
||||
// same deterministic clock so the guest's worker-hub tick_count
|
||||
// deadline gate (`[block+0x10] + 66` ms) actually elapses. Without
|
||||
// this the block is frozen at boot and the hub spins forever,
|
||||
// starving tid14 on event 0x109c.
|
||||
kernel.update_timestamp_bundle(mem, kernel.scheduler.global_clock());
|
||||
kernel.fire_due_silph_autosignals(stats.instruction_count);
|
||||
dispatch_graphics_interrupts(
|
||||
kernel,
|
||||
mem,
|
||||
@@ -2788,10 +3272,12 @@ fn run_execution(
|
||||
thunk_map,
|
||||
);
|
||||
|
||||
// Snapshot round schedule. `round_schedule` also advances rng state
|
||||
// when seeded; mutation is intentional.
|
||||
// Snapshot round schedule. `round_schedule_into` also advances rng
|
||||
// state when seeded; mutation is intentional. Perf (Tier-A #2): fill
|
||||
// a reusable stack array instead of allocating a fresh Vec per round.
|
||||
kernel.scheduler.begin_round();
|
||||
let order = kernel.scheduler.round_schedule();
|
||||
let order_n = kernel.scheduler.round_schedule_into(&mut order_buf);
|
||||
let order = &order_buf[..order_n];
|
||||
|
||||
if order.is_empty() {
|
||||
// No Ready threads — advance time to the earliest pending
|
||||
@@ -2813,7 +3299,7 @@ fn run_execution(
|
||||
// GPU when block dispatch engages.
|
||||
let instrs_at_round_start = stats.instruction_count;
|
||||
|
||||
for hw_id in order {
|
||||
for &hw_id in order {
|
||||
let wc = &mut workers[hw_id as usize];
|
||||
match worker_prologue(
|
||||
wc,
|
||||
@@ -2832,34 +3318,25 @@ fn run_execution(
|
||||
block_ptr,
|
||||
pc_before,
|
||||
} => {
|
||||
// Block-cache step. The lockstep path keeps the
|
||||
// kernel state borrowed straight through (single
|
||||
// host thread, no contention). Step 03 of the
|
||||
// M3 real-parallelism plan introduces a
|
||||
// drop-and-reacquire window around `step_block`
|
||||
// for the parallel branch.
|
||||
let cycle_before = kernel.scheduler.ctx_mut_ref(thread_ref).cycle_count;
|
||||
let block = unsafe { &*block_ptr };
|
||||
let result = {
|
||||
let ctx = kernel.scheduler.ctx_mut_ref(thread_ref);
|
||||
step_block(ctx, mem, block)
|
||||
};
|
||||
let executed = kernel
|
||||
.scheduler
|
||||
.ctx_mut_ref(thread_ref)
|
||||
.cycle_count
|
||||
.saturating_sub(cycle_before);
|
||||
match worker_epilogue(
|
||||
// SUPERBLOCK runner (iterate-3AL). Instead of one
|
||||
// basic block per slot-visit, chain straight-line
|
||||
// blocks through their branches up to a deterministic
|
||||
// instruction budget, yielding back to the round only
|
||||
// at cross-thread synchronization points. Amortizes
|
||||
// the per-round (timebase / coord / round_schedule)
|
||||
// and per-slot (prologue) tax over hundreds of
|
||||
// instructions instead of ~6. See `run_superblock`.
|
||||
match run_superblock(
|
||||
wc,
|
||||
kernel,
|
||||
mem,
|
||||
debugger,
|
||||
thunk_map,
|
||||
&mut stats,
|
||||
tid,
|
||||
thread_ref,
|
||||
block_ptr,
|
||||
pc_before,
|
||||
result,
|
||||
executed,
|
||||
) {
|
||||
SlotOutcome::Continue => continue,
|
||||
SlotOutcome::BreakOuter => break 'outer,
|
||||
@@ -3118,6 +3595,16 @@ fn run_execution_parallel(
|
||||
.and_then(|t| guard.scheduler.find_by_tid(t))
|
||||
.unwrap_or(thread_ref);
|
||||
*guard.scheduler.ctx_mut_ref(target_ref) = ctx_taken;
|
||||
// Advance the parallel-mode coherent clock by
|
||||
// the instructions this block retired. This is
|
||||
// the single authoritative "now" the kernel
|
||||
// deadline-arithmetic reads in parallel mode
|
||||
// (per-thread `ctx.timebase` is incoherent here
|
||||
// because peers extract/zero their slots) —
|
||||
// keeping it monotonic breaks the timebase-
|
||||
// desync livelock where a woken thread re-armed
|
||||
// the same constant deadline forever.
|
||||
guard.scheduler.advance_global_clock(executed);
|
||||
// worker_epilogue's exit_current path
|
||||
// expects scheduler.current to be set
|
||||
// to the running thread.
|
||||
@@ -3204,6 +3691,25 @@ fn run_execution_parallel(
|
||||
}
|
||||
let mut guard = pre_outcome.1;
|
||||
|
||||
// ITERATE-2C Phase D — same auto-signal hook as the lockstep
|
||||
// path. Held under the same `kernel_arc` guard the rest of
|
||||
// this prologue runs under, so no extra locking.
|
||||
{
|
||||
let s = stats_mtx.lock().expect("stats mutex poisoned");
|
||||
guard.set_now_cycle_hint(s.instruction_count);
|
||||
guard.fire_due_silph_autosignals(s.instruction_count);
|
||||
}
|
||||
|
||||
// ITERATE-2J — tick the KeTimeStampBundle (ordinal 0x00AD) from
|
||||
// the parallel-mode coherent global_clock (summed per-block
|
||||
// retired instructions). Same fix as the lockstep loop: keeps the
|
||||
// guest's worker-hub tick_count deadline gate advancing so it
|
||||
// dispatches channel-3 and unblocks tid14 on event 0x109c.
|
||||
{
|
||||
let clock = guard.scheduler.global_clock();
|
||||
guard.update_timestamp_bundle(mem, clock);
|
||||
}
|
||||
|
||||
// Iterate-2.BE — host-driven synchronous ISR dispatch.
|
||||
// Runs under the kernel lock while workers are still parked
|
||||
// at the phaser B2 barrier (the coordinator hasn't published
|
||||
@@ -3414,7 +3920,17 @@ fn dispatch_graphics_interrupts(
|
||||
None
|
||||
};
|
||||
|
||||
/// X_KPCR offset of `prcb_data.current_cpu` (canary `xthread.cc`
|
||||
/// `SetActiveCpu` → `pcr.prcb_data.current_cpu`). The guest graphics
|
||||
/// ISR reads it via `lbz r10, 268(r13)` to decide which per-CPU bit of
|
||||
/// the swap-acknowledge fence to clear.
|
||||
const PCR_CURRENT_CPU_OFF: u32 = 268;
|
||||
|
||||
while let Some(source) = kernel.interrupts.peek_next() {
|
||||
let target_cpu = kernel
|
||||
.interrupts
|
||||
.peek_next_cpu()
|
||||
.unwrap_or(xenia_kernel::interrupts::VSYNC_TARGET_CPU);
|
||||
// Victim selection: Ready first, then Blocked (canary's
|
||||
// `XThread::GetCurrentThread()` analog — any live thread will
|
||||
// do for borrowing context). Skip Idle/Exited/ServicingIrq.
|
||||
@@ -3484,6 +4000,19 @@ fn dispatch_graphics_interrupts(
|
||||
saved
|
||||
};
|
||||
|
||||
// Impersonate the interrupt's target CPU on the borrowed thread's
|
||||
// PCR, mirroring canary `EmulateCPInterruptDPC` →
|
||||
// `XThread::SetActiveCpu(cpu)`. The guest swap-complete ISR clears
|
||||
// `1 << [pcr.current_cpu]` from the per-present swap-acknowledge
|
||||
// fence; if it runs on the wrong CPU it clears the wrong bit and
|
||||
// the GPU's trailing `WAIT_REG_MEM` on that fence never releases —
|
||||
// stranding the present/title loop. Save/restore so borrowing a
|
||||
// thread doesn't permanently rewrite its processor number.
|
||||
let pcr_addr = (kernel.scheduler.ctx_mut_ref(target_ref).gpr[13] as u32)
|
||||
.wrapping_add(PCR_CURRENT_CPU_OFF);
|
||||
let saved_cpu = mem.read_u8(pcr_addr);
|
||||
mem.write_u8(pcr_addr, target_cpu);
|
||||
|
||||
// Stash the previous `scheduler.current` (call_export reaches
|
||||
// it; imports the ISR calls must dispatch on the borrowed
|
||||
// thread). Restore on the way out.
|
||||
@@ -3555,6 +4084,9 @@ fn dispatch_graphics_interrupts(
|
||||
isr_instrs += 1;
|
||||
match r {
|
||||
StepResult::Continue => {}
|
||||
// db16cyc inside the synchronous ISR has no slot to yield —
|
||||
// the ISR runs to completion on the borrowed context.
|
||||
StepResult::Yield => {}
|
||||
StepResult::SystemCall => {
|
||||
tracing::warn!("graphics ISR hit `sc` instruction; aborting");
|
||||
break;
|
||||
@@ -3573,6 +4105,7 @@ fn dispatch_graphics_interrupts(
|
||||
|
||||
// Restore the borrowed context.
|
||||
saved.restore(kernel.scheduler.ctx_mut_ref(target_ref));
|
||||
mem.write_u8(pcr_addr, saved_cpu);
|
||||
kernel.scheduler.current = prev_current;
|
||||
kernel.interrupts.delivered += 1;
|
||||
|
||||
@@ -3741,10 +4274,18 @@ fn dump_thread_diagnostic(
|
||||
),
|
||||
}
|
||||
}
|
||||
if quiet {
|
||||
return;
|
||||
}
|
||||
use xenia_kernel::objects::KernelObject;
|
||||
|
||||
// Toolkit-audit fix (2026-06-21): only the ALWAYS-ON thread/waiter table
|
||||
// is suppressed by `--quiet`. The explicitly-armed diagnostics below
|
||||
// (`--trace-handles`, `--trace-handles-focus`, `--dump-addr`) are
|
||||
// requested output — arming the flag IS the user asking for it — and
|
||||
// were previously swallowed by the blanket `if quiet { return; }`, which
|
||||
// made the documented headless `--quiet` invocation silently drop every
|
||||
// handle/focus/dump report. They are each self-gated below (on
|
||||
// `audit.enabled` / `!audit.focus.is_empty()` / `!dump_addrs.is_empty()`)
|
||||
// so they only print when actually armed.
|
||||
if !quiet {
|
||||
println!("\n=== Thread diagnostics ===");
|
||||
for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() {
|
||||
if slot.runqueue.is_empty() {
|
||||
@@ -3841,6 +4382,7 @@ fn dump_thread_diagnostic(
|
||||
println!(" cs={:#010x} waiters(tid)={:?}", cs_ptr, tids);
|
||||
}
|
||||
}
|
||||
} // end `if !quiet` (always-on thread/waiter table)
|
||||
|
||||
// Audit trails (only when --trace-handles flipped the flag). For each
|
||||
// tracked handle, emit a compact block: kind, creator, and the bounded
|
||||
@@ -4253,6 +4795,12 @@ fn run_with_ui(
|
||||
.map_err(|e| anyhow::anyhow!("winit event loop build failed: {e}"))?;
|
||||
let (ui_handles, kernel_bridge) = xenia_ui::build(event_loop.create_proxy());
|
||||
kernel.ui = Some(kernel_bridge);
|
||||
// iterate-3O: enable per-draw geometry capture so the UI can replay real
|
||||
// guest draws. Only on the `--ui` path; headless `check` never gets here,
|
||||
// so the deterministic core/golden stays untouched.
|
||||
if let Some(gpu) = kernel.gpu.as_inline_mut() {
|
||||
gpu.enable_frame_capture();
|
||||
}
|
||||
|
||||
let shutdown = std::sync::Arc::clone(&ui_handles.shutdown);
|
||||
let title_owned = std::path::Path::new(title)
|
||||
@@ -4510,8 +5058,23 @@ fn cmd_dis(
|
||||
// pointer-validity oracle; runs over .rdata + .data.
|
||||
let function_starts: std::collections::BTreeSet<u32> =
|
||||
func_analysis.functions.keys().copied().collect();
|
||||
let vtables = xenia_analysis::vtables::analyze(
|
||||
&pe_image, base, §ions, &function_starts,
|
||||
// Anchor discovery: recover vtable bases from constructor vptr-write
|
||||
// stores so a vtable with non-function head words (null / pure-virtual /
|
||||
// unrecognised thunk slots) isn't fragmented away by the contiguity
|
||||
// heuristic. (Fixes e.g. the XMV engine vtable 0x8200a908.)
|
||||
let vptr_anchor_funcs: std::collections::BTreeMap<u32, (u32, bool)> = func_analysis
|
||||
.functions
|
||||
.iter()
|
||||
.map(|(&s, fi)| (s, (fi.end, fi.is_saverestore)))
|
||||
.collect();
|
||||
let vptr_block_boundaries: std::collections::HashSet<u32> =
|
||||
xref_result.labels.keys().copied().collect();
|
||||
let vtable_anchors = xenia_analysis::vtables::scan_vptr_write_constants(
|
||||
&pe_image, base, &vptr_anchor_funcs, §ions, &vptr_block_boundaries,
|
||||
);
|
||||
info!(vtable_anchors = vtable_anchors.len(), "vptr-write anchor scan complete");
|
||||
let vtables = xenia_analysis::vtables::analyze_with_anchors(
|
||||
&pe_image, base, §ions, &function_starts, &vtable_anchors,
|
||||
);
|
||||
let rtti_count = vtables.iter().filter(|v| v.rtti_present).count();
|
||||
info!(
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"instructions": 2000005,
|
||||
"instructions": 2000073,
|
||||
"imports": 5635,
|
||||
"unimpl": 0,
|
||||
"draws": 0,
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
{
|
||||
"instructions": 50000001,
|
||||
"imports": 40454,
|
||||
"instructions": 50000200,
|
||||
"imports": 189264,
|
||||
"unimpl": 0,
|
||||
"draws": 0,
|
||||
"swaps": 1,
|
||||
"unique_render_targets": 0,
|
||||
"shader_blobs_live": 0,
|
||||
"texture_cache_entries": 0
|
||||
"draws": 768,
|
||||
"swaps": 157,
|
||||
"unique_render_targets": 2,
|
||||
"shader_blobs_live": 6,
|
||||
"texture_cache_entries": 1
|
||||
}
|
||||
|
||||
@@ -57,6 +57,16 @@ fn run_oracle(label: &str, max_instr: u64, golden_rel: &str) {
|
||||
&iso,
|
||||
"-n",
|
||||
&max_instr_str,
|
||||
// Pin the inline (single-threaded) GPU backend. The default
|
||||
// threaded backend drains the ring on a separate host thread,
|
||||
// so the exact instruction at which a CP interrupt is queued —
|
||||
// and therefore when the guest's swap-complete ISR callback runs
|
||||
// (iterate-2S armed it via SCRATCH_REG writeback) — varies run to
|
||||
// run. Inline draining is instruction-count-deterministic, which
|
||||
// is what a regression golden needs. (The threaded path is the
|
||||
// documented "GPU thread race" the stable-digest already warns
|
||||
// about.)
|
||||
"--gpu-inline",
|
||||
"--stable-digest",
|
||||
"--expect",
|
||||
&golden_str,
|
||||
|
||||
@@ -6,5 +6,12 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
xenia-types = { workspace = true }
|
||||
xenia-memory = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
|
||||
# Raw FFmpeg FFI for the XMA2 audio decoder (stage 3). The system libs are
|
||||
# FFmpeg 6.1 (libavcodec 60), so we pin the matching `6.1` series. The `build`
|
||||
# feature regenerates bindings via bindgen against the installed headers, so
|
||||
# the FFI matches the distro FFmpeg exactly. We only need avcodec + avutil.
|
||||
ffmpeg-sys-next = { version = "6.1", default-features = false, features = ["avcodec"] }
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
pub mod xma;
|
||||
pub mod xma2_codec;
|
||||
pub mod xma_decode;
|
||||
|
||||
pub use xma::{build_mmio_region, XmaDecoder, XMA_CONTEXT_COUNT, XMA_CONTEXT_SIZE};
|
||||
|
||||
/// Audio processing unit stub. Logging only for now.
|
||||
pub struct AudioSystem {
|
||||
pub enabled: bool,
|
||||
|
||||
932
crates/xenia-apu/src/xma.rs
Normal file
932
crates/xenia-apu/src/xma.rs
Normal file
@@ -0,0 +1,932 @@
|
||||
//! Register-mapped XMA context system — a faithful port of xenia-canary's
|
||||
//! `apu/xma_decoder.cc` context-array + MMIO machinery, MINUS the audio
|
||||
//! decoder itself (stage 3).
|
||||
//!
|
||||
//! The guest allocates XMA contexts via `XMACreateContext` (which hands back a
|
||||
//! pointer into our 320-entry context array in physical guest memory), writes
|
||||
//! the 64-byte `XMA_CONTEXT_DATA` struct, then *kicks* decode by writing the
|
||||
//! per-context bit into the `0x7FEA0000` register aperture. This module
|
||||
//! satisfies all of that without faulting and records which contexts the guest
|
||||
//! kicked; stage 3 will consume the recorded `pending` flags to actually
|
||||
//! produce PCM.
|
||||
//!
|
||||
//! ## Byte order
|
||||
//! The guest accesses the aperture byte-reversed (`stwbrx`/`lwbrx`), so the raw
|
||||
//! `u32` our MMIO boundary delivers is byte-swapped relative to the logical
|
||||
//! register value — exactly the situation canary handles with `xe::byte_swap`.
|
||||
//! So `write_register` swaps the incoming value before decoding and the
|
||||
//! register file holds host-order values; `read_register` swaps on the way out.
|
||||
//! This was proven empirically: the guest's Clear writes arrive as
|
||||
//! `0x01000000`/`0x02000000`/`0x04000000`, i.e. byte-reversed `1`/`2`/`4`,
|
||||
//! targeting contexts 0/1/2 (which it had just allocated) — NOT 24/25/26. The
|
||||
//! register-index math (`(addr & 0xFFFF) / 4`) is the same as canary's.
|
||||
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use xenia_memory::access::MemoryAccess;
|
||||
use xenia_memory::{GuestMemory, MmioRegion};
|
||||
|
||||
use crate::xma_decode::{self, ContextDecodeState, XmaContextData};
|
||||
|
||||
/// Byte stride of one `XMA_CONTEXT_DATA` record in the guest context array
/// (canary `xma_context.h`).
pub const XMA_CONTEXT_SIZE: u32 = 64;
/// How many XMA contexts the hardware exposes (canary `kContextCount`).
pub const XMA_CONTEXT_COUNT: usize = 320;

/// Guest-physical base of the register aperture. Canary maps the XMA decoder
/// at `0x7FEA0000` in `XmaDecoder::Setup`.
pub const APERTURE_BASE: u32 = 0x7FEA_0000;
/// Aperture length in bytes: the low 16-bit register window (64 KB).
pub const APERTURE_SIZE: u32 = 1 << 16;
/// Address mask handed to `MmioRegion::contains`; keeps the high 16 bits so
/// every `0x7FEAxxxx` address matches.
pub const APERTURE_MASK: u32 = !(APERTURE_SIZE - 1);

// ----- Register indices (canary `XmaRegister` enum / xma_register_table.inc).
// A register index is a dword index, so each one is written below as
// `byte offset / 4`.

/// `ContextArrayAddress`: physical base of the context array (index 0x600).
const REG_CONTEXT_ARRAY_ADDRESS: u32 = 0x1800 / 4;
/// `CurrentContextIndex`: the context the HW is currently servicing (index
/// 0x606). The guest polls it; reads are rotated so a poll never sticks.
const REG_CURRENT_CONTEXT_INDEX: u32 = 0x1818 / 4;

/// First `ContextNKick` register, `Context0Kick` (index 0x650). Bit N of the
/// k-th register in the group kicks context `k * 32 + N`.
const REG_CONTEXT_KICK_BASE: u32 = 0x1940 / 4;
/// First `ContextNLock` register (index 0x690).
const REG_CONTEXT_LOCK_BASE: u32 = 0x1A40 / 4;
/// First `ContextNClear` register (index 0x6A0).
const REG_CONTEXT_CLEAR_BASE: u32 = 0x1A80 / 4;
/// Registers per Kick/Lock/Clear group: one bit per context, 32 per register.
const CONTEXT_GROUP_LEN: u32 = (XMA_CONTEXT_COUNT / 32) as u32;

/// Length of the backing register file in 32-bit words. Sized to the whole
/// aperture (64 KB / 4) so every in-aperture index is in range, even though the
/// highest index actually decoded is `0x6A9`.
const REGISTER_FILE_WORDS: usize = (APERTURE_SIZE / 4) as usize;
|
||||
|
||||
/// Register-mapped XMA context array. Owns the allocation bitmap, the register
|
||||
/// file, and the per-context kick/enable bookkeeping that stage 3 consumes.
|
||||
pub struct XmaDecoder {
|
||||
/// Guest virtual address of the context array (handed back by
|
||||
/// `allocate_context`).
|
||||
context_array_guest_va: u32,
|
||||
/// Physical address stored into `ContextArrayAddress` (reg 0x600).
|
||||
context_array_phys: u32,
|
||||
/// 320-slot allocation bitmap, one bit per context (`bitmap[i>>6]` bit
|
||||
/// `i & 63`). A set bit means *allocated*.
|
||||
bitmap: [u64; (XMA_CONTEXT_COUNT + 63) / 64],
|
||||
/// Flat register file, host-native values. Indexed by dword register index.
|
||||
registers: Vec<u32>,
|
||||
/// Per-context "decode requested" flag, set on Kick, cleared on Clear.
|
||||
/// Stage 3 drains this to produce PCM.
|
||||
pending: [bool; XMA_CONTEXT_COUNT],
|
||||
/// Per-context enable flag. A Lock disables; a Kick (re-)enables. Mirrors
|
||||
/// canary's "is_enabled" notion loosely — exact decode semantics are
|
||||
/// stage 3.
|
||||
enabled: [bool; XMA_CONTEXT_COUNT],
|
||||
/// Total kicks observed (diagnostic; lets headless logs show progress).
|
||||
kick_count: u64,
|
||||
/// Rotating value served for `CurrentContextIndex` reads so a guest poll
|
||||
/// can't spin forever on a fixed value. Atomic so the read path can stay
|
||||
/// `&self`.
|
||||
current_context_index: AtomicU32,
|
||||
/// Per-context stage-3 decode state (FFmpeg codec, staged PCM frame, ring
|
||||
/// bookkeeping). Lazily populated as contexts are decoded.
|
||||
decode_state: Vec<ContextDecodeState>,
|
||||
/// Total PCM bytes written to guest output buffers (diagnostic).
|
||||
pcm_bytes_total: u64,
|
||||
/// Stable pointer to the guest memory mapping, captured at init. Used to run
|
||||
/// `Work()` SYNCHRONOUSLY inside the kick MMIO write — exactly as canary's
|
||||
/// default `!use_dedicated_xma_thread` path does (`context.Work()` right in
|
||||
/// `WriteRegister`), so the game sees the updated context the instant its
|
||||
/// kick store retires. The mapping lives for the whole run; decode is
|
||||
/// deterministic and happens on the CPU thread, so this is determinism-safe.
|
||||
mem_ptr: *const GuestMemory,
|
||||
}
|
||||
|
||||
// The decoder is owned behind an `Arc<Mutex<..>>` and only ever touched from the
|
||||
// CPU scheduler thread (kick MMIO writes + the per-round pump). The raw `mem_ptr`
|
||||
// is a stable whole-run mapping; access is single-threaded.
|
||||
unsafe impl Send for XmaDecoder {}
|
||||
|
||||
impl XmaDecoder {
|
||||
/// Construct an un-initialized decoder. Call [`Self::init`] once the
|
||||
/// context-array memory has been reserved.
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
context_array_guest_va: 0,
|
||||
context_array_phys: 0,
|
||||
bitmap: [0; (XMA_CONTEXT_COUNT + 63) / 64],
|
||||
registers: vec![0; REGISTER_FILE_WORDS],
|
||||
pending: [false; XMA_CONTEXT_COUNT],
|
||||
enabled: [false; XMA_CONTEXT_COUNT],
|
||||
kick_count: 0,
|
||||
current_context_index: AtomicU32::new(0),
|
||||
decode_state: (0..XMA_CONTEXT_COUNT).map(|_| ContextDecodeState::new()).collect(),
|
||||
pcm_bytes_total: 0,
|
||||
mem_ptr: std::ptr::null(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Capture the stable guest-memory mapping so the kick MMIO path can run
|
||||
/// `Work()` synchronously (canary semantics). Call once at boot, after the
|
||||
/// final `mem` is in its long-lived location.
|
||||
pub fn set_memory(&mut self, mem: &GuestMemory) {
|
||||
self.mem_ptr = mem as *const GuestMemory;
|
||||
}
|
||||
|
||||
/// Wire in the context-array addresses (after the app reserves the buffer)
|
||||
/// and publish the physical base into `ContextArrayAddress` (reg 0x600),
|
||||
/// exactly as canary's `XmaDecoder::Setup` does.
|
||||
pub fn init(&mut self, context_array_guest_va: u32, context_array_phys: u32) {
|
||||
self.context_array_guest_va = context_array_guest_va;
|
||||
self.context_array_phys = context_array_phys;
|
||||
self.registers[REG_CONTEXT_ARRAY_ADDRESS as usize] = context_array_phys;
|
||||
tracing::info!(
|
||||
va = format_args!("{context_array_guest_va:#010x}"),
|
||||
phys = format_args!("{context_array_phys:#010x}"),
|
||||
"xma: context array initialized"
|
||||
);
|
||||
}
|
||||
|
||||
/// Acquire a free context slot and return its guest pointer
|
||||
/// (`context_array_guest_va + i*64`), or 0 if all 320 slots are in use.
|
||||
/// Mirrors canary's `XmaDecoder::AllocateContext`.
|
||||
pub fn allocate_context(&mut self) -> u32 {
|
||||
for i in 0..XMA_CONTEXT_COUNT {
|
||||
let word = i >> 6;
|
||||
let bit = 1u64 << (i & 63);
|
||||
if self.bitmap[word] & bit == 0 {
|
||||
self.bitmap[word] |= bit;
|
||||
let ptr = self.context_array_guest_va + (i as u32) * XMA_CONTEXT_SIZE;
|
||||
tracing::info!(
|
||||
index = i,
|
||||
ptr = format_args!("{ptr:#010x}"),
|
||||
"xma: allocate_context"
|
||||
);
|
||||
return ptr;
|
||||
}
|
||||
}
|
||||
tracing::warn!("xma: allocate_context — all {} slots in use", XMA_CONTEXT_COUNT);
|
||||
0
|
||||
}
|
||||
|
||||
/// Free the slot backing `guest_ptr`. Mirrors canary's
|
||||
/// `XmaDecoder::ReleaseContext`. Out-of-range / unaligned pointers are
|
||||
/// ignored (the guest never faults).
|
||||
pub fn release_context(&mut self, guest_ptr: u32) {
|
||||
if guest_ptr < self.context_array_guest_va {
|
||||
return;
|
||||
}
|
||||
let offset = guest_ptr - self.context_array_guest_va;
|
||||
let i = (offset / XMA_CONTEXT_SIZE) as usize;
|
||||
if i >= XMA_CONTEXT_COUNT {
|
||||
return;
|
||||
}
|
||||
let word = i >> 6;
|
||||
let bit = 1u64 << (i & 63);
|
||||
self.bitmap[word] &= !bit;
|
||||
self.pending[i] = false;
|
||||
self.enabled[i] = false;
|
||||
tracing::info!(index = i, ptr = format_args!("{guest_ptr:#010x}"), "xma: release_context");
|
||||
}
|
||||
|
||||
/// Read a register. Returns the stored value, except `CurrentContextIndex`
|
||||
/// (0x606) which rotates `0..XMA_CONTEXT_COUNT` per read so a polling guest
|
||||
/// always sees forward progress. Out-of-range indices read 0.
|
||||
pub fn read_register(&self, reg_index: u32) -> u32 {
|
||||
// The guest accesses the aperture byte-reversed (`lwbrx`), so the
|
||||
// register file holds host-order values and we swap on the way out —
|
||||
// exactly as canary's `ReadRegister` returns `xe::byte_swap(reg)`.
|
||||
let host = if reg_index == REG_CURRENT_CONTEXT_INDEX {
|
||||
// Rotate mod context count on each read so a poll never sticks.
|
||||
let prev = self.current_context_index.fetch_add(1, Ordering::Relaxed);
|
||||
prev % XMA_CONTEXT_COUNT as u32
|
||||
} else {
|
||||
self.registers.get(reg_index as usize).copied().unwrap_or(0)
|
||||
};
|
||||
host.swap_bytes()
|
||||
}
|
||||
|
||||
/// Write a register, then apply the side-effect of the Kick / Lock / Clear
|
||||
/// register groups. Each register in a group covers 32 contexts; bit N maps
|
||||
/// to `context_id = (reg_index - group_base) * 32 + N`. We iterate set bits
|
||||
/// with `trailing_zeros` + clear-lowest-bit, mirroring canary's
|
||||
/// `std::countr_zero` loop. The incoming value is byte-swapped first (see
|
||||
/// below).
|
||||
pub fn write_register(&mut self, reg_index: u32, value: u32) {
|
||||
// The guest writes the aperture byte-reversed (`stwbrx`); undo it so the
|
||||
// register file holds host-order values, mirroring canary's
|
||||
// `WriteRegister` which does `value = xe::byte_swap(value)` first. Proven
|
||||
// by the guest's Clear writes (`0x01000000` == context 0, not 24).
|
||||
let value = value.swap_bytes();
|
||||
if let Some(slot) = self.registers.get_mut(reg_index as usize) {
|
||||
*slot = value;
|
||||
}
|
||||
|
||||
if (REG_CONTEXT_KICK_BASE..REG_CONTEXT_KICK_BASE + CONTEXT_GROUP_LEN).contains(®_index) {
|
||||
let base = (reg_index - REG_CONTEXT_KICK_BASE) * 32;
|
||||
let mut bits = value;
|
||||
while bits != 0 {
|
||||
let b = bits.trailing_zeros();
|
||||
bits &= bits - 1;
|
||||
let context_id = (base + b) as usize;
|
||||
if context_id < XMA_CONTEXT_COUNT {
|
||||
self.pending[context_id] = true;
|
||||
self.enabled[context_id] = true;
|
||||
self.kick_count += 1;
|
||||
tracing::debug!(
|
||||
context_id,
|
||||
kick_count = self.kick_count,
|
||||
"xma: kick (decode requested)"
|
||||
);
|
||||
// Canary `!use_dedicated_xma_thread`: run Work() right here so
|
||||
// the game observes the updated context when its kick store
|
||||
// retires. Safe — `mem_ptr` is a stable whole-run mapping and
|
||||
// we're on the CPU thread.
|
||||
if !self.mem_ptr.is_null() {
|
||||
let mem: &GuestMemory = unsafe { &*self.mem_ptr };
|
||||
self.enabled[context_id] = false;
|
||||
self.work_one(mem, context_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (REG_CONTEXT_LOCK_BASE..REG_CONTEXT_LOCK_BASE + CONTEXT_GROUP_LEN)
|
||||
.contains(®_index)
|
||||
{
|
||||
let base = (reg_index - REG_CONTEXT_LOCK_BASE) * 32;
|
||||
let mut bits = value;
|
||||
while bits != 0 {
|
||||
let b = bits.trailing_zeros();
|
||||
bits &= bits - 1;
|
||||
let context_id = (base + b) as usize;
|
||||
if context_id < XMA_CONTEXT_COUNT {
|
||||
self.enabled[context_id] = false;
|
||||
tracing::debug!(context_id, "xma: lock (context disabled)");
|
||||
}
|
||||
}
|
||||
} else if (REG_CONTEXT_CLEAR_BASE..REG_CONTEXT_CLEAR_BASE + CONTEXT_GROUP_LEN)
|
||||
.contains(®_index)
|
||||
{
|
||||
let base = (reg_index - REG_CONTEXT_CLEAR_BASE) * 32;
|
||||
let mut bits = value;
|
||||
while bits != 0 {
|
||||
let b = bits.trailing_zeros();
|
||||
bits &= bits - 1;
|
||||
let context_id = (base + b) as usize;
|
||||
if context_id < XMA_CONTEXT_COUNT {
|
||||
self.pending[context_id] = false;
|
||||
self.enabled[context_id] = false;
|
||||
tracing::debug!(context_id, "xma: clear (context state reset)");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Total kicks observed so far (diagnostic; stage 3 will consume `pending`).
|
||||
pub fn kick_count(&self) -> u64 {
|
||||
self.kick_count
|
||||
}
|
||||
|
||||
/// Whether context `i` has a pending (un-serviced) kick. Stage-3 hook.
|
||||
pub fn is_pending(&self, i: usize) -> bool {
|
||||
self.pending.get(i).copied().unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Total PCM bytes the decoder has written to guest output buffers.
|
||||
pub fn pcm_bytes_total(&self) -> u64 {
|
||||
self.pcm_bytes_total
|
||||
}
|
||||
|
||||
/// Stage-3 entry point. Called once per scheduler round from the CPU
|
||||
/// thread's per-round coordinator. For each context with a pending kick,
|
||||
/// run one `Work()` pass (canary `XmaContextNew::Work`): read the context,
|
||||
/// decode available input into PCM, drain into the output ring, and write
|
||||
/// the decoder-owned fields back. Deterministic — no host thread, no clock.
|
||||
pub fn decode_pending(&mut self, mem: &GuestMemory) {
|
||||
if self.context_array_guest_va == 0 {
|
||||
return;
|
||||
}
|
||||
for i in 0..XMA_CONTEXT_COUNT {
|
||||
if !self.pending[i] || !self.enabled[i] {
|
||||
continue;
|
||||
}
|
||||
// Canary `Work` clears is_enabled at entry; a fresh kick re-enables.
|
||||
self.enabled[i] = false;
|
||||
self.work_one(mem, i);
|
||||
}
|
||||
}
|
||||
|
||||
/// One `Work()` pass for context `i`. Faithful to canary's orchestration but
|
||||
/// uses the mainline xma2 decoder (whole-packet driven) for the actual
|
||||
/// frame decode in place of canary's per-frame `Decode()`.
|
||||
fn work_one(&mut self, mem: &GuestMemory, i: usize) {
|
||||
let ctx_va = self.context_array_guest_va + (i as u32) * XMA_CONTEXT_SIZE;
|
||||
let data = XmaContextData::read(mem, ctx_va);
|
||||
let initial = data;
|
||||
|
||||
if data.output_buffer_valid == 0 {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut data = data;
|
||||
self.decode_into_output(mem, i, ctx_va, &mut data, &initial);
|
||||
}
|
||||
|
||||
/// Decode available input packets into PCM and drain into the output ring.
|
||||
fn decode_into_output(
|
||||
&mut self,
|
||||
mem: &GuestMemory,
|
||||
i: usize,
|
||||
ctx_va: u32,
|
||||
data: &mut XmaContextData,
|
||||
initial: &XmaContextData,
|
||||
) {
|
||||
use xma_decode::*;
|
||||
|
||||
let output_capacity = data.output_buffer_block_count * OUTPUT_BYTES_PER_BLOCK;
|
||||
if output_capacity == 0 {
|
||||
return;
|
||||
}
|
||||
let out_backing = xma_phys_to_backing(data.output_buffer_ptr);
|
||||
let mut write_off = data.output_buffer_write_offset * OUTPUT_BYTES_PER_BLOCK;
|
||||
let read_off = data.output_buffer_read_offset * OUTPUT_BYTES_PER_BLOCK;
|
||||
|
||||
// write_count: free space in the ring from write to read.
|
||||
let free_bytes = ring_write_count(read_off, write_off, output_capacity);
|
||||
self.decode_state[i].remaining_subframe_blocks_in_output =
|
||||
(free_bytes / OUTPUT_BYTES_PER_BLOCK) as i32;
|
||||
|
||||
let effective_sdc = data.subframe_decode_count.max(1);
|
||||
let min_blocks = effective_sdc as i32 + data.output_buffer_padding as i32;
|
||||
|
||||
if min_blocks > self.decode_state[i].remaining_subframe_blocks_in_output {
|
||||
// No room — write back unchanged and wait for the game to drain.
|
||||
store_merged_pub(mem, ctx_va, data, initial);
|
||||
return;
|
||||
}
|
||||
|
||||
let mut produced_any = false;
|
||||
|
||||
// Ensure codec configured for current rate/channels.
|
||||
let rate = sample_rate_hz(data.sample_rate);
|
||||
let channels = if data.is_stereo != 0 { 2 } else { 1 };
|
||||
self.ensure_codec(i, rate, channels);
|
||||
|
||||
// Main decode loop: while there's output ring room and valid input.
|
||||
loop {
|
||||
if self.decode_state[i].remaining_subframe_blocks_in_output < min_blocks {
|
||||
break;
|
||||
}
|
||||
|
||||
// If we still have undrained subframes from a prior decode, consume
|
||||
// them first (canary Consume before next Decode).
|
||||
if self.decode_state[i].current_frame_remaining_subframes == 0 {
|
||||
// Need a fresh decoded frame. Pull from the codec, feeding input
|
||||
// packets as required.
|
||||
if !self.produce_frame(mem, i, data) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Consume: write up to `effective_sdc` subframes (256B blocks) of
|
||||
// the staged raw_frame into the output ring.
|
||||
let total_subframes =
|
||||
((BYTES_PER_FRAME_CHANNEL / OUTPUT_BYTES_PER_BLOCK) << data.is_stereo) as u8;
|
||||
let remaining = self.decode_state[i].current_frame_remaining_subframes;
|
||||
let to_write = remaining.min(effective_sdc as u8);
|
||||
let frame_read_off = (total_subframes - remaining) as usize * OUTPUT_BYTES_PER_BLOCK as usize;
|
||||
let nbytes = to_write as u32 * OUTPUT_BYTES_PER_BLOCK;
|
||||
|
||||
// Write into the output ring (handle wrap).
|
||||
let raw = &self.decode_state[i].raw_frame;
|
||||
write_off = ring_write(
|
||||
mem,
|
||||
out_backing,
|
||||
output_capacity,
|
||||
write_off,
|
||||
&raw[frame_read_off..frame_read_off + nbytes as usize],
|
||||
);
|
||||
self.pcm_bytes_total += nbytes as u64;
|
||||
produced_any = true;
|
||||
|
||||
let headroom = if remaining - to_write == 0 {
|
||||
data.output_buffer_padding as i32
|
||||
} else {
|
||||
0
|
||||
};
|
||||
self.decode_state[i].remaining_subframe_blocks_in_output -=
|
||||
to_write as i32 + headroom;
|
||||
self.decode_state[i].current_frame_remaining_subframes -= to_write;
|
||||
}
|
||||
|
||||
// Writeback offsets.
|
||||
data.output_buffer_write_offset = write_off / OUTPUT_BYTES_PER_BLOCK;
|
||||
|
||||
if self.decode_state[i].remaining_subframe_blocks_in_output == 0
|
||||
&& write_off == read_off
|
||||
{
|
||||
data.output_buffer_valid = 0;
|
||||
}
|
||||
if !produced_any && !data.is_any_input_buffer_valid() {
|
||||
data.output_buffer_valid = 0;
|
||||
}
|
||||
|
||||
store_merged_pub(mem, ctx_va, data, initial);
|
||||
}
|
||||
|
||||
/// Configure (or reconfigure) the FFmpeg xma2 codec for this context.
|
||||
fn ensure_codec(&mut self, i: usize, rate: u32, channels: u32) {
|
||||
let st = &mut self.decode_state[i];
|
||||
if st.codec.is_some() && st.codec_rate == rate && st.codec_channels == channels {
|
||||
return;
|
||||
}
|
||||
match crate::xma2_codec::Xma2Codec::new(rate, channels) {
|
||||
Ok(c) => {
|
||||
st.codec = Some(c);
|
||||
st.codec_rate = rate;
|
||||
st.codec_channels = channels;
|
||||
tracing::info!(ctx = i, rate, channels, "xma: xma2 codec configured");
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!(ctx = i, rate, channels, error = %e, "xma: xma2 codec init failed");
|
||||
st.codec = None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Produce one decoded 512-sample frame into `raw_frame` (interleaved S16BE).
|
||||
///
|
||||
/// Input-consumption model (faithful to canary's packet/buffer contract).
|
||||
///
|
||||
/// The mainline xma2 decoder consumes whole 2 KB packets via `send_packet`
|
||||
/// and emits frames in bursts (internal FIFO + lookahead), so its intake
|
||||
/// position can't be read per-frame. We therefore keep TWO cursors:
|
||||
///
|
||||
/// 1. A private FFmpeg *feed* cursor (`feed_buffer`/`feed_packet_index`)
|
||||
/// that hands raw packets to FFmpeg only far enough ahead to keep the
|
||||
/// PCM queue stocked. This follows the same buffer ping-pong as the
|
||||
/// guest but is NOT what the guest observes.
|
||||
/// 2. The guest-visible `input_buffer_read_offset`, advanced by exactly
|
||||
/// ONE compressed frame each time we emit a 512-sample frame to the
|
||||
/// guest — via `advance_read_offset_one_frame`, a faithful port of the
|
||||
/// offset arithmetic in canary's `Decode()`. This crosses packet and
|
||||
/// buffer boundaries (and fires SwapInputBuffer, clearing the drained
|
||||
/// buffer's valid bit) at canary's true per-frame cadence, which is
|
||||
/// what the WMV demuxer polls to refill ADV.wmv.
|
||||
///
|
||||
/// Decoupling the two means FFmpeg's whole-packet burst framing no longer
|
||||
/// freezes the guest-visible offset: the offset now tracks emitted output,
|
||||
/// so the input buffer is consumed and swapped as the movie actually plays.
|
||||
fn produce_frame(&mut self, mem: &GuestMemory, i: usize, data: &mut XmaContextData) -> bool {
|
||||
use xma_decode::*;
|
||||
let channels = if data.is_stereo != 0 { 2u32 } else { 1u32 };
|
||||
let frame_bytes = (BYTES_PER_FRAME_CHANNEL * channels) as usize;
|
||||
|
||||
// Top up FFmpeg's internal FIFO (and our queue) just enough to satisfy
|
||||
// one frame, feeding raw packets via the private feed cursor.
|
||||
if self.decode_state[i].pcm_queue.len() < frame_bytes {
|
||||
self.feed_codec(mem, i, data);
|
||||
}
|
||||
|
||||
// Pop exactly one 512-sample frame from the queue into raw_frame.
|
||||
if self.decode_state[i].pcm_queue.len() < frame_bytes {
|
||||
return false;
|
||||
}
|
||||
{
|
||||
let st = &mut self.decode_state[i];
|
||||
st.raw_frame.iter_mut().for_each(|b| *b = 0);
|
||||
for b in st.raw_frame[..frame_bytes].iter_mut() {
|
||||
*b = st.pcm_queue.pop_front().unwrap();
|
||||
}
|
||||
st.current_frame_remaining_subframes = (4u8) << data.is_stereo;
|
||||
}
|
||||
|
||||
// We just emitted one frame to the guest — advance its visible read
|
||||
// offset by one compressed frame at canary's cadence (may swap buffer).
|
||||
self.advance_read_offset_one_frame(mem, data);
|
||||
true
|
||||
}
|
||||
|
||||
/// Feed raw 2 KB packets to FFmpeg from the private feed cursor until the
|
||||
/// PCM queue holds at least one frame or the codec stops accepting input.
|
||||
/// The feed cursor follows the guest's `current_buffer` ping-pong but keeps
|
||||
/// its own packet index (`feed_packet_index`), so feeding ahead of the
|
||||
/// guest-visible read offset is fine — the offset advances separately per
|
||||
/// emitted frame.
|
||||
fn feed_codec(&mut self, mem: &GuestMemory, i: usize, data: &XmaContextData) {
|
||||
use xma_decode::*;
|
||||
let channels = if data.is_stereo != 0 { 2u32 } else { 1u32 };
|
||||
let frame_bytes = (BYTES_PER_FRAME_CHANNEL * channels) as usize;
|
||||
|
||||
// Re-sync the feed buffer to the guest's current buffer if the guest has
|
||||
// swapped past us (the buffer we were feeding was consumed).
|
||||
if self.decode_state[i].feed_buffer != data.current_buffer
|
||||
&& !data.is_input_buffer_valid(self.decode_state[i].feed_buffer)
|
||||
{
|
||||
self.decode_state[i].feed_buffer = data.current_buffer;
|
||||
self.decode_state[i].feed_packet_index = 0;
|
||||
}
|
||||
|
||||
const MAX_FEED: u32 = 8;
|
||||
let mut fed = 0u32;
|
||||
while self.decode_state[i].pcm_queue.len() < frame_bytes && fed < MAX_FEED {
|
||||
let fb = self.decode_state[i].feed_buffer;
|
||||
if !data.is_input_buffer_valid(fb) {
|
||||
// Nothing to feed from this buffer; try the other if valid.
|
||||
let other = fb ^ 1;
|
||||
if data.is_input_buffer_valid(other) {
|
||||
self.decode_state[i].feed_buffer = other;
|
||||
self.decode_state[i].feed_packet_index = 0;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
let pkt_count = data.input_buffer_packet_count(fb);
|
||||
let pidx = self.decode_state[i].feed_packet_index;
|
||||
if pidx >= pkt_count {
|
||||
// Exhausted this buffer's packets at the feed cursor; advance to
|
||||
// the other buffer if it's valid (it was refilled), else wait.
|
||||
let other = fb ^ 1;
|
||||
if data.is_input_buffer_valid(other) {
|
||||
self.decode_state[i].feed_buffer = other;
|
||||
self.decode_state[i].feed_packet_index = 0;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
let backing = xma_phys_to_backing(data.input_buffer_address(fb));
|
||||
let pkt_va = backing + pidx * BYTES_PER_PACKET;
|
||||
let mut packet = vec![0u8; BYTES_PER_PACKET as usize];
|
||||
mem.read_bytes(pkt_va, &mut packet);
|
||||
let send_res = match self.decode_state[i].codec.as_mut() {
|
||||
Some(codec) => codec.send_packet(&packet),
|
||||
None => break,
|
||||
};
|
||||
match send_res {
|
||||
Ok(()) => {
|
||||
self.decode_state[i].feed_packet_index += 1;
|
||||
fed += 1;
|
||||
self.drain_codec_frames(i);
|
||||
}
|
||||
// Decoder full — drain what it has and stop; re-offer this same
|
||||
// packet next time (don't advance the feed cursor).
|
||||
Err(ref e) if e == "EAGAIN" => {
|
||||
self.drain_codec_frames(i);
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(ctx = i, error = %e, "xma: send_packet failed");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Pull all currently-available decoded frames from the codec and append
|
||||
/// their interleaved S16BE PCM to the context's queue.
|
||||
fn drain_codec_frames(&mut self, i: usize) {
|
||||
loop {
|
||||
let out = match self.decode_state[i].codec.as_mut() {
|
||||
Some(c) => c.receive_frame(),
|
||||
None => None,
|
||||
};
|
||||
let Some((nb, bytes)) = out else { break };
|
||||
let st = &mut self.decode_state[i];
|
||||
st.frames_decoded += 1;
|
||||
if !st.first_frame_logged {
|
||||
st.first_frame_logged = true;
|
||||
tracing::info!(
|
||||
ctx = i,
|
||||
samples = nb,
|
||||
pcm_bytes = bytes.len(),
|
||||
"xma: first PCM frame decoded"
|
||||
);
|
||||
}
|
||||
st.pcm_queue.extend(bytes);
|
||||
}
|
||||
}
|
||||
|
||||
/// Advance `input_buffer_read_offset` by exactly ONE compressed frame,
|
||||
/// faithfully mirroring the offset arithmetic in canary's
|
||||
/// `XmaContextNew::Decode` (frame-size parse + packet-boundary handling +
|
||||
/// SwapInputBuffer when the buffer's packets are exhausted). Called once per
|
||||
/// 512-sample frame we emit to the guest, so the guest-visible read offset
|
||||
/// crosses packet/buffer boundaries at canary's true cadence — independent
|
||||
/// of the mainline xma2 decoder's whole-packet burst framing. This is what
|
||||
/// lets `input_buffer_0_valid` toggle and the WMV demuxer refill ADV.wmv.
|
||||
fn advance_read_offset_one_frame(&mut self, mem: &GuestMemory, data: &mut XmaContextData) {
|
||||
use xma_decode::*;
|
||||
|
||||
if !data.is_any_input_buffer_valid() {
|
||||
return;
|
||||
}
|
||||
if !data.is_current_input_buffer_valid() {
|
||||
self.swap_input_buffer(data);
|
||||
if !data.is_current_input_buffer_valid() {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Clamp a header-region offset (canary's Dirt-2 guard).
|
||||
if data.input_buffer_read_offset < BITS_PER_PACKET_HEADER {
|
||||
data.input_buffer_read_offset = BITS_PER_PACKET_HEADER;
|
||||
}
|
||||
|
||||
let pkt_count = data.current_input_buffer_packet_count();
|
||||
let input_size = pkt_count * BYTES_PER_PACKET;
|
||||
let Some(packet_index) = packet_number(input_size, data.input_buffer_read_offset) else {
|
||||
return;
|
||||
};
|
||||
let buf_backing = xma_phys_to_backing(data.current_input_buffer_address());
|
||||
let pkt_va = buf_backing + packet_index * BYTES_PER_PACKET;
|
||||
let mut packet = vec![0u8; BYTES_PER_PACKET as usize];
|
||||
mem.read_bytes(pkt_va, &mut packet);
|
||||
|
||||
let first_frame_offset = packet_frame_offset(&packet);
|
||||
let mut relative_offset = data.input_buffer_read_offset % BITS_PER_PACKET;
|
||||
if relative_offset < first_frame_offset {
|
||||
// Tail of a split frame — skip to this packet's first frame.
|
||||
data.input_buffer_read_offset =
|
||||
packet_index * BITS_PER_PACKET + first_frame_offset;
|
||||
relative_offset = first_frame_offset;
|
||||
}
|
||||
|
||||
let skip_count = packet_skip_count(&packet);
|
||||
// Full-packet skip (0xFF): no frames begin here — advance to the next
|
||||
// packet that does, swapping the buffer if exhausted.
|
||||
if skip_count == 0xFF {
|
||||
let next_packet_index = packet_index + 1;
|
||||
let next_off =
|
||||
self.next_packet_read_offset(mem, data, next_packet_index, pkt_count);
|
||||
if next_packet_index >= pkt_count || next_off == BITS_PER_PACKET_HEADER {
|
||||
self.swap_input_buffer(data);
|
||||
}
|
||||
data.input_buffer_read_offset = next_off;
|
||||
return;
|
||||
}
|
||||
|
||||
let info = get_packet_info(&packet, relative_offset);
|
||||
let packet_to_skip = (skip_count as u32) + 1;
|
||||
let next_packet_index = packet_index + packet_to_skip;
|
||||
|
||||
// Frame size: clamp to the bits remaining in the packet stream (canary
|
||||
// GetAmountOfBitsToRead over the (packet_index+1)*kBitsPerPacket stream).
|
||||
let stream_remaining =
|
||||
((packet_index + 1) * BITS_PER_PACKET).saturating_sub(data.input_buffer_read_offset);
|
||||
let frame_size = if info.current_frame_size == 0 {
|
||||
// Split header we can't resolve from this packet alone; fall back to
|
||||
// advancing past the rest of this packet so we don't stall.
|
||||
stream_remaining
|
||||
} else {
|
||||
info.current_frame_size
|
||||
};
|
||||
let bits_to_copy = amount_of_bits_to_read(stream_remaining, frame_size);
|
||||
|
||||
if !info.is_last_frame_in_packet() {
|
||||
let next_frame_offset =
|
||||
(data.input_buffer_read_offset + bits_to_copy) % BITS_PER_PACKET;
|
||||
data.input_buffer_read_offset =
|
||||
packet_index * BITS_PER_PACKET + next_frame_offset;
|
||||
return;
|
||||
}
|
||||
|
||||
// Last frame in this packet: move to the next packet's first frame, or
|
||||
// swap the input buffer if the packets are exhausted (canary's
|
||||
// `next_packet_index >= current_input_packet_count`).
|
||||
let mut next_off =
|
||||
self.next_packet_read_offset(mem, data, next_packet_index, pkt_count);
|
||||
if next_packet_index >= pkt_count || next_off == BITS_PER_PACKET_HEADER {
|
||||
self.swap_input_buffer(data);
|
||||
}
|
||||
if next_off == BITS_PER_PACKET_HEADER && data.is_any_input_buffer_valid() {
|
||||
// At the start of the next buffer: jump to its first frame offset.
|
||||
let nb_backing = xma_phys_to_backing(data.current_input_buffer_address());
|
||||
let mut hdr = [0u8; 4];
|
||||
mem.read_bytes(nb_backing, &mut hdr);
|
||||
let fo = packet_frame_offset(&hdr);
|
||||
if fo <= MAX_FRAME_SIZE_IN_BITS {
|
||||
next_off = fo;
|
||||
}
|
||||
}
|
||||
data.input_buffer_read_offset = next_off;
|
||||
}
|
||||
|
||||
/// Scan forward from `next_packet_index` (possibly into the *next* buffer)
|
||||
/// for the next packet that begins a frame and return its bit offset, or
|
||||
/// `BITS_PER_PACKET_HEADER` if none (canary `GetNextPacketReadOffset`).
|
||||
fn next_packet_read_offset(
|
||||
&self,
|
||||
mem: &GuestMemory,
|
||||
data: &XmaContextData,
|
||||
next_packet_index: u32,
|
||||
current_input_packet_count: u32,
|
||||
) -> u32 {
|
||||
use xma_decode::*;
|
||||
// Resolve which buffer the packet lives in (current or the other).
|
||||
let (buffer_index, mut pidx) = if next_packet_index >= current_input_packet_count {
|
||||
(data.current_buffer ^ 1, next_packet_index - current_input_packet_count)
|
||||
} else {
|
||||
(data.current_buffer, next_packet_index)
|
||||
};
|
||||
if !data.is_input_buffer_valid(buffer_index) {
|
||||
return BITS_PER_PACKET_HEADER;
|
||||
}
|
||||
let addr = data.input_buffer_address(buffer_index);
|
||||
if addr == 0 {
|
||||
return BITS_PER_PACKET_HEADER;
|
||||
}
|
||||
let pkt_count = data.input_buffer_packet_count(buffer_index);
|
||||
let backing = xma_phys_to_backing(addr);
|
||||
while pidx < pkt_count {
|
||||
let mut hdr = [0u8; 4];
|
||||
mem.read_bytes(backing + pidx * BYTES_PER_PACKET, &mut hdr);
|
||||
let fo = packet_frame_offset(&hdr);
|
||||
if fo <= MAX_FRAME_SIZE_IN_BITS {
|
||||
return pidx * BITS_PER_PACKET + fo;
|
||||
}
|
||||
pidx += 1;
|
||||
}
|
||||
BITS_PER_PACKET_HEADER
|
||||
}
|
||||
|
||||
fn swap_input_buffer(&mut self, data: &mut XmaContextData) {
|
||||
use xma_decode::*;
|
||||
tracing::debug!(
|
||||
from = data.current_buffer,
|
||||
to = data.current_buffer ^ 1,
|
||||
"xma: SwapInputBuffer (input buffer consumed)"
|
||||
);
|
||||
if data.current_buffer == 0 {
|
||||
data.input_buffer_0_valid = 0;
|
||||
} else {
|
||||
data.input_buffer_1_valid = 0;
|
||||
}
|
||||
data.current_buffer ^= 1;
|
||||
data.input_buffer_read_offset = BITS_PER_PACKET_HEADER;
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for XmaDecoder {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Build the [`MmioRegion`] for the XMA register aperture at `0x7FEA0000`.
|
||||
/// Mirrors the GPU's `build_region`: the closures lock the shared decoder,
|
||||
/// compute the dword register index, and dispatch to `read`/`write_register`.
|
||||
pub fn build_mmio_region(dec: Arc<Mutex<XmaDecoder>>) -> MmioRegion {
|
||||
let read_dec = dec.clone();
|
||||
let write_dec = dec;
|
||||
|
||||
MmioRegion {
|
||||
base_address: APERTURE_BASE,
|
||||
mask: APERTURE_MASK,
|
||||
size: APERTURE_SIZE,
|
||||
read_callback: Box::new(move |addr: u32| {
|
||||
let reg_index = (addr & 0xFFFF) / 4;
|
||||
read_dec.lock().unwrap().read_register(reg_index)
|
||||
}),
|
||||
write_callback: Box::new(move |addr: u32, value: u32| {
|
||||
let reg_index = (addr & 0xFFFF) / 4;
|
||||
write_dec.lock().unwrap().write_register(reg_index, value);
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Fresh decoder initialised with a fixed VA/phys pair that the
    /// assertions below refer to (`0xA010_0000` / `0x0010_0000`).
    fn inited() -> XmaDecoder {
        let mut d = XmaDecoder::new();
        // Pick a plausible physical-window VA/phys pair.
        d.init(0xA010_0000, 0x0010_0000);
        d
    }

    /// The guest writes/reads the aperture byte-reversed; `wire(v)` is the raw
    /// bus value the guest sends to mean host-order `v` (and what a read of a
    /// host-order `v` returns). Equivalent to `lwbrx`/`stwbrx` semantics.
    fn wire(v: u32) -> u32 {
        v.swap_bytes()
    }

    /// (a) `allocate_context` hands back distinct, increasing pointers spaced by
    /// the 64-byte stride, exhausts at 320, and `release_context` frees the slot.
    #[test]
    fn allocate_distinct_then_exhaust_then_release() {
        let mut d = inited();
        let first = d.allocate_context();
        let second = d.allocate_context();
        // The first slot sits at the VA passed to `init`; the next one is one
        // context stride further on.
        assert_eq!(first, 0xA010_0000);
        assert_eq!(second, 0xA010_0000 + XMA_CONTEXT_SIZE);
        assert!(second > first);

        // Drain the remaining slots (2 already taken).
        for _ in 0..(XMA_CONTEXT_COUNT - 2) {
            assert_ne!(d.allocate_context(), 0);
        }
        // 321st allocation fails.
        assert_eq!(d.allocate_context(), 0);

        // Free the first slot and re-acquire it.
        d.release_context(first);
        assert_eq!(d.allocate_context(), first);
    }

    /// (b) A Kick to `Context0Kick` with host value `0b101` marks contexts 0
    /// and 2. The guest sends it byte-reversed (`wire`).
    #[test]
    fn kick_context0_marks_correct_contexts() {
        let mut d = inited();
        d.write_register(REG_CONTEXT_KICK_BASE, wire(0b101));
        assert!(d.is_pending(0));
        assert!(!d.is_pending(1));
        assert!(d.is_pending(2));
        // One kick is counted per set bit.
        assert_eq!(d.kick_count(), 2);
    }

    /// (c) A Kick to `Context1Kick` (0x651) bit 0 maps to context_id 32.
    #[test]
    fn kick_context1_bit0_is_context_32() {
        let mut d = inited();
        d.write_register(REG_CONTEXT_KICK_BASE + 1, wire(0b1));
        assert!(d.is_pending(32));
        assert!(!d.is_pending(0));
        assert_eq!(d.kick_count(), 1);
    }

    /// Regression for the byte-order fix: the guest's real Clear writes were
    /// `0x01000000`/`0x02000000`/`0x04000000` (bytes-reversed `1`/`2`/`4`),
    /// meaning contexts 0/1/2 — NOT 24/25/26. Verify the raw bus values decode
    /// to the low contexts.
    #[test]
    fn byte_reversed_clear_targets_low_contexts() {
        let mut d = inited();
        // Kick contexts 0..=2 one bit at a time so there is something to clear.
        for i in 0..3 {
            d.write_register(REG_CONTEXT_KICK_BASE, wire(1 << i));
        }
        assert!(d.is_pending(0) && d.is_pending(1) && d.is_pending(2));
        // The exact bus values observed from the guest.
        d.write_register(REG_CONTEXT_CLEAR_BASE, 0x0100_0000);
        d.write_register(REG_CONTEXT_CLEAR_BASE, 0x0200_0000);
        d.write_register(REG_CONTEXT_CLEAR_BASE, 0x0400_0000);
        assert!(!d.is_pending(0) && !d.is_pending(1) && !d.is_pending(2));
    }

    /// (d) `read_register(0x600)` returns the base byte-reversed (the guest
    /// `lwbrx`-reverses it back to the host-order base on its side).
    #[test]
    fn context_array_address_reads_phys() {
        let d = inited();
        assert_eq!(
            d.read_register(REG_CONTEXT_ARRAY_ADDRESS),
            wire(0x0010_0000)
        );
    }

    /// (e) `CurrentContextIndex` rotates on each read and wraps at the count
    /// (values returned byte-reversed).
    #[test]
    fn current_context_index_rotates() {
        let d = inited();
        assert_eq!(d.read_register(REG_CURRENT_CONTEXT_INDEX), wire(0));
        assert_eq!(d.read_register(REG_CURRENT_CONTEXT_INDEX), wire(1));
        assert_eq!(d.read_register(REG_CURRENT_CONTEXT_INDEX), wire(2));
        // Advance to the wrap boundary.
        for _ in 3..XMA_CONTEXT_COUNT as u32 {
            d.read_register(REG_CURRENT_CONTEXT_INDEX);
        }
        // Next read wraps back to 0.
        assert_eq!(d.read_register(REG_CURRENT_CONTEXT_INDEX), wire(0));
    }

    /// Clear must drop a previously-kicked pending flag.
    #[test]
    fn clear_resets_pending() {
        let mut d = inited();
        d.write_register(REG_CONTEXT_KICK_BASE, wire(0b1));
        assert!(d.is_pending(0));
        d.write_register(REG_CONTEXT_CLEAR_BASE, wire(0b1));
        assert!(!d.is_pending(0));
    }

    /// The MMIO region routes a guest write at `BASE + 0x600*4` to reg 0x600
    /// and a read back through the same byte address, applying the byte swap.
    #[test]
    fn mmio_region_round_trips_register() {
        let dec = Arc::new(Mutex::new(inited()));
        let region = build_mmio_region(dec.clone());
        // Register indices are dword indices, so the byte address is index * 4.
        let kick_byte = APERTURE_BASE + REG_CONTEXT_KICK_BASE * 4;
        (region.write_callback)(kick_byte, wire(0b1));
        assert!(dec.lock().unwrap().is_pending(0));
        // ContextArrayAddress read-back via the bus (byte-reversed).
        let addr_byte = APERTURE_BASE + REG_CONTEXT_ARRAY_ADDRESS * 4;
        assert_eq!((region.read_callback)(addr_byte), wire(0x0010_0000));
    }
}
|
||||
217
crates/xenia-apu/src/xma2_codec.rs
Normal file
217
crates/xenia-apu/src/xma2_codec.rs
Normal file
@@ -0,0 +1,217 @@
|
||||
//! Thin unsafe wrapper around the mainline FFmpeg `AV_CODEC_ID_XMA2` decoder.
|
||||
//!
|
||||
//! Unlike canary's vendored `XMAFRAMES` (one frame per packet, custom padding
|
||||
//! header), the distro xma2 decoder consumes whole 2 KB XMA2 packets
|
||||
//! (`block_align == 2048`), needs `extradata` declaring the channel/stream
|
||||
//! layout, and buffers samples internally across packets. We drive it with the
|
||||
//! guest's raw 2 KB packets and pull whatever 512-sample float-planar frames it
|
||||
//! emits, returning them as interleaved S16 big-endian PCM (canary `ConvertFrame`).
|
||||
|
||||
use std::os::raw::c_int;
|
||||
use std::ptr;
|
||||
|
||||
use ffmpeg_sys_next as ff;
|
||||
|
||||
/// One xma2 decoder instance, configured for a fixed (sample_rate, channels).
///
/// Owns the raw FFmpeg objects it points at; they are released in `Drop`.
pub struct Xma2Codec {
    /// Decoder descriptor returned by `avcodec_find_decoder`.
    codec: *const ff::AVCodec,
    /// Opened codec context (from `avcodec_alloc_context3` + `avcodec_open2`).
    ctx: *mut ff::AVCodecContext,
    /// Reusable output frame filled by `avcodec_receive_frame`.
    frame: *mut ff::AVFrame,
    /// Reusable input packet wrapper used by `send_packet`.
    packet: *mut ff::AVPacket,
    /// Rust-side copy of the extradata bytes that were handed to the context.
    extradata: Vec<u8>,
    /// Channel count requested at construction time.
    channels: u32,
}
|
||||
|
||||
// FFmpeg objects are not Send/Sync by default; the decoder is only ever touched
// on the CPU scheduler thread (decode_pending), so this is sound for our use.
// SAFETY: relies on the single-thread usage described above; nothing in this
// type enforces it. `Sync` is deliberately not implemented.
unsafe impl Send for Xma2Codec {}
|
||||
|
||||
impl Xma2Codec {
|
||||
/// Build XMA2WAVEFORMATEX extradata (34 bytes) for a single XMA2 stream.
|
||||
/// Layout (little-endian, per FFmpeg `xma_decode_init` / xma2defs.h):
|
||||
/// [0..2] NumStreams (u16) = 1
|
||||
/// [2..6] ChannelMask (u32) = mono/stereo mask
|
||||
/// [6..34] remaining XMA2WAVEFORMATEX fields (unused by the decoder)
|
||||
fn build_extradata(channels: u32) -> Vec<u8> {
|
||||
let mut e = vec![0u8; 34];
|
||||
// NumStreams = 1
|
||||
e[0..2].copy_from_slice(&1u16.to_le_bytes());
|
||||
// ChannelMask: 0x3 (FL|FR) for stereo, 0x4 (FC) for mono.
|
||||
let mask: u32 = if channels >= 2 { 0x3 } else { 0x4 };
|
||||
e[2..6].copy_from_slice(&mask.to_le_bytes());
|
||||
e
|
||||
}
|
||||
|
||||
pub fn new(sample_rate: u32, channels: u32) -> Result<Self, String> {
|
||||
unsafe {
|
||||
let codec = ff::avcodec_find_decoder(ff::AVCodecID::AV_CODEC_ID_XMA2);
|
||||
if codec.is_null() {
|
||||
return Err("xma2 decoder not found in libavcodec".into());
|
||||
}
|
||||
let ctx = ff::avcodec_alloc_context3(codec);
|
||||
if ctx.is_null() {
|
||||
return Err("avcodec_alloc_context3 failed".into());
|
||||
}
|
||||
|
||||
let mut extradata = Self::build_extradata(channels);
|
||||
// FFmpeg requires extradata to be allocated with av_malloc and
|
||||
// padded; copy our bytes into an av_malloc'd buffer.
|
||||
let pad = ff::AV_INPUT_BUFFER_PADDING_SIZE as usize;
|
||||
let raw = ff::av_mallocz(extradata.len() + pad) as *mut u8;
|
||||
if raw.is_null() {
|
||||
ff::avcodec_free_context(&mut (ctx as *mut _));
|
||||
return Err("av_mallocz extradata failed".into());
|
||||
}
|
||||
ptr::copy_nonoverlapping(extradata.as_ptr(), raw, extradata.len());
|
||||
(*ctx).extradata = raw;
|
||||
(*ctx).extradata_size = extradata.len() as c_int;
|
||||
|
||||
(*ctx).sample_rate = sample_rate as c_int;
|
||||
(*ctx).block_align = 2048;
|
||||
ff::av_channel_layout_default(&mut (*ctx).ch_layout, channels as c_int);
|
||||
|
||||
let ret = ff::avcodec_open2(ctx, codec, ptr::null_mut());
|
||||
if ret < 0 {
|
||||
let mut ctxm = ctx;
|
||||
ff::avcodec_free_context(&mut ctxm);
|
||||
return Err(format!("avcodec_open2 failed: {}", av_err(ret)));
|
||||
}
|
||||
|
||||
let frame = ff::av_frame_alloc();
|
||||
let packet = ff::av_packet_alloc();
|
||||
if frame.is_null() || packet.is_null() {
|
||||
let mut ctxm = ctx;
|
||||
ff::avcodec_free_context(&mut ctxm);
|
||||
return Err("av_frame_alloc/av_packet_alloc failed".into());
|
||||
}
|
||||
|
||||
// keep our Vec alive as the source of truth for length
|
||||
extradata.shrink_to_fit();
|
||||
|
||||
Ok(Self {
|
||||
codec,
|
||||
ctx,
|
||||
frame,
|
||||
packet,
|
||||
extradata,
|
||||
channels,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn channels(&self) -> u32 {
|
||||
self.channels
|
||||
}
|
||||
|
||||
/// Feed one raw 2 KB XMA2 packet (header + data) to the decoder. Returns the
|
||||
/// number of bytes the decoder accepted (0 = buffered, needs no new packet
|
||||
/// yet / EAGAIN). Decoded frames are pulled via [`receive_frame`].
|
||||
pub fn send_packet(&mut self, packet: &[u8]) -> Result<(), String> {
|
||||
unsafe {
|
||||
// av_packet_from_data takes ownership of an av_malloc buffer; simpler
|
||||
// to point at our own bytes via a stack packet with a padded copy.
|
||||
let pad = ff::AV_INPUT_BUFFER_PADDING_SIZE as usize;
|
||||
let buf = ff::av_malloc(packet.len() + pad) as *mut u8;
|
||||
if buf.is_null() {
|
||||
return Err("av_malloc packet failed".into());
|
||||
}
|
||||
ptr::copy_nonoverlapping(packet.as_ptr(), buf, packet.len());
|
||||
ptr::write_bytes(buf.add(packet.len()), 0, pad);
|
||||
ff::av_packet_unref(self.packet);
|
||||
// Wrap buf so FFmpeg frees it.
|
||||
let ret = ff::av_packet_from_data(self.packet, buf, packet.len() as c_int);
|
||||
if ret < 0 {
|
||||
ff::av_free(buf as *mut _);
|
||||
return Err(format!("av_packet_from_data failed: {}", av_err(ret)));
|
||||
}
|
||||
let ret = ff::avcodec_send_packet(self.ctx, self.packet);
|
||||
if ret == ff::AVERROR(ff::EAGAIN) {
|
||||
// Decoder full — caller should drain frames first then retry.
|
||||
return Err("EAGAIN".into());
|
||||
}
|
||||
if ret < 0 {
|
||||
return Err(format!("avcodec_send_packet failed: {}", av_err(ret)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Signal end-of-stream so the decoder flushes its internal FIFO.
|
||||
pub fn send_eof(&mut self) {
|
||||
unsafe {
|
||||
let _ = ff::avcodec_send_packet(self.ctx, ptr::null());
|
||||
}
|
||||
}
|
||||
|
||||
/// Pull one decoded frame as interleaved S16 big-endian PCM, or None if the
|
||||
/// decoder needs more input (EAGAIN) or is drained (EOF). Returns
|
||||
/// (samples_per_channel, interleaved_s16be_bytes).
|
||||
pub fn receive_frame(&mut self) -> Option<(u32, Vec<u8>)> {
|
||||
unsafe {
|
||||
let ret = ff::avcodec_receive_frame(self.ctx, self.frame);
|
||||
if ret < 0 {
|
||||
return None;
|
||||
}
|
||||
let nb = (*self.frame).nb_samples as u32;
|
||||
if nb == 0 {
|
||||
return None;
|
||||
}
|
||||
let ch = (*self.frame).ch_layout.nb_channels.max(1) as u32;
|
||||
let out = convert_frame_planar_to_s16be(self.frame, ch, nb);
|
||||
Some((nb, out))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Xma2Codec {
|
||||
fn drop(&mut self) {
|
||||
unsafe {
|
||||
if !self.frame.is_null() {
|
||||
ff::av_frame_free(&mut self.frame);
|
||||
}
|
||||
if !self.packet.is_null() {
|
||||
ff::av_packet_free(&mut self.packet);
|
||||
}
|
||||
if !self.ctx.is_null() {
|
||||
ff::avcodec_free_context(&mut self.ctx);
|
||||
}
|
||||
let _ = &self.codec;
|
||||
let _ = &self.extradata;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert FFmpeg planar-float output to interleaved S16 big-endian PCM
|
||||
/// (faithful to canary `XmaContext::ConvertFrame`: saturate to [-1,1], scale by
|
||||
/// 2^15-1, byte-swap each sample). `channels` planes of `nb_samples` floats.
|
||||
unsafe fn convert_frame_planar_to_s16be(
|
||||
frame: *mut ff::AVFrame,
|
||||
channels: u32,
|
||||
nb_samples: u32,
|
||||
) -> Vec<u8> {
|
||||
const SCALE: f32 = ((1i32 << 15) - 1) as f32;
|
||||
let mut out = Vec::with_capacity((nb_samples * channels * 2) as usize);
|
||||
unsafe {
|
||||
// extended_data[ch] points to a plane of f32 (AV_SAMPLE_FMT_FLTP).
|
||||
let ext = (*frame).extended_data;
|
||||
for i in 0..nb_samples as isize {
|
||||
for ch in 0..channels as isize {
|
||||
let plane = *ext.offset(ch) as *const f32;
|
||||
let s = if plane.is_null() { 0.0 } else { *plane.offset(i) };
|
||||
let clamped = s.clamp(-1.0, 1.0) * SCALE;
|
||||
let v = clamped as i16;
|
||||
out.extend_from_slice(&v.to_be_bytes());
|
||||
}
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
fn av_err(code: c_int) -> String {
|
||||
unsafe {
|
||||
let mut buf = [0i8; ff::AV_ERROR_MAX_STRING_SIZE as usize];
|
||||
ff::av_strerror(code, buf.as_mut_ptr(), buf.len());
|
||||
let cstr = std::ffi::CStr::from_ptr(buf.as_ptr());
|
||||
cstr.to_string_lossy().into_owned()
|
||||
}
|
||||
}
|
||||
690
crates/xenia-apu/src/xma_decode.rs
Normal file
690
crates/xenia-apu/src/xma_decode.rs
Normal file
@@ -0,0 +1,690 @@
|
||||
//! Stage 3 — the real XMA2→PCM decoder.
|
||||
//!
|
||||
//! A faithful port of xenia-canary's `apu/xma_context_new.cc` decode pipeline
|
||||
//! (`Work`/`Decode`/`Consume`/`StoreContextMerged`), adapted to the *mainline*
|
||||
//! distro FFmpeg `AV_CODEC_ID_XMA2` decoder rather than canary's vendored
|
||||
//! `AV_CODEC_ID_XMAFRAMES`.
|
||||
//!
|
||||
//! ## Determinism
|
||||
//! There is no host decoder thread. [`super::xma::XmaDecoder::decode_pending`]
|
||||
//! is invoked from the CPU scheduler's per-round coordinator
|
||||
//! (`coord_post_round` in xenia-app). FFmpeg decode is itself deterministic
|
||||
//! (same input bytes → same PCM), so the lockstep golden stays reproducible.
|
||||
//!
|
||||
//! ## FFmpeg framing — why this differs from canary
|
||||
//! Canary feeds FFmpeg one *frame* at a time (it bit-extracts a single 512-
|
||||
//! sample frame from the guest packet stream and hands it to the vendored
|
||||
//! `XMAFRAMES` codec with a custom 1-byte padding header). The mainline
|
||||
//! `xma2` decoder does NOT have `XMAFRAMES`; instead it consumes whole 2 KB
|
||||
//! XMA2 *packets* (`block_align == 2048`), needs `extradata` declaring the
|
||||
//! stream/channel layout, and manages frame splitting + a per-stream sample
|
||||
//! FIFO internally. So this module keeps canary's *guest-facing* contract
|
||||
//! (the `XMA_CONTEXT_DATA` packet/frame bookkeeping, the 256-byte-block output
|
||||
//! ring buffer, the field writeback) but replaces canary's per-frame
|
||||
//! `Decode()` body with: feed the current 2 KB packet to the xma2 decoder,
|
||||
//! pull any 512-sample PCM frames it emits, convert them to interleaved S16BE,
|
||||
//! and stage them as the "raw frame" that `Consume()` drains into the output
|
||||
//! ring.
|
||||
//!
|
||||
//! See `xma2_codec.rs` for the unsafe FFmpeg wrapper.
|
||||
|
||||
use std::collections::VecDeque;
|
||||
|
||||
use xenia_memory::access::MemoryAccess;
|
||||
use xenia_memory::GuestMemory;
|
||||
|
||||
use crate::xma2_codec::Xma2Codec;
|
||||
|
||||
// ---- Constants (canary `XmaContext` / `XmaContextNew`).
|
||||
|
||||
/// Size of one XMA packet in bytes (header included).
pub const BYTES_PER_PACKET: u32 = 2048;
/// Size of the packet header in bytes.
pub const BYTES_PER_PACKET_HEADER: u32 = 4;
/// Payload bytes per packet (packet minus header).
pub const BYTES_PER_PACKET_DATA: u32 = BYTES_PER_PACKET - BYTES_PER_PACKET_HEADER;
/// Size of one XMA packet in bits.
pub const BITS_PER_PACKET: u32 = BYTES_PER_PACKET * 8;
/// Canary `kBitsPerPacketHeader` (in the *new* context) is 32.
pub const BITS_PER_PACKET_HEADER: u32 = 32;
// NOTE(review): presumably the width of the per-frame length field — confirm
// against the frame parser.
pub const BITS_PER_FRAME_HEADER: u32 = 15;

/// PCM samples per channel in one decoded frame.
pub const SAMPLES_PER_FRAME: u32 = 512;
/// Bytes per output PCM sample (16-bit).
pub const BYTES_PER_SAMPLE: u32 = 2;
/// Bytes of PCM one frame produces for a single channel.
pub const BYTES_PER_FRAME_CHANNEL: u32 = SAMPLES_PER_FRAME * BYTES_PER_SAMPLE; // 1024
/// Granularity of the output ring buffer, in bytes.
pub const OUTPUT_BYTES_PER_BLOCK: u32 = 256;
/// 31 output blocks, in bytes.
pub const OUTPUT_MAX_SIZE_BYTES: u32 = 31 * OUTPUT_BYTES_PER_BLOCK;

// NOTE(review): 0x7FFF is the largest 15-bit value; presumably tied to
// `BITS_PER_FRAME_HEADER` — confirm.
pub const MAX_FRAME_LENGTH: u32 = 0x7FFF;
/// Upper bound used to accept a packet's first-frame offset: 0x4000 minus the
/// packet header bits.
pub const MAX_FRAME_SIZE_IN_BITS: u32 = 0x4000 - BITS_PER_PACKET_HEADER;

/// Sample rates in Hz indexed by a 2-bit id.
const ID_TO_SAMPLE_RATE: [u32; 4] = [24000, 32000, 44100, 48000];
|
||||
|
||||
/// Map a bare-physical XMA buffer pointer (`0x0xxxxxxx`) onto the host-backed
/// guest VA used by the rest of the emulator: keep the low 29 bits of `p` and
/// place them in the window based at `0x4000_0000`. Identical formula to
/// `xenia_gpu::physical_to_backing` for the physical window; the input/output
/// buffer pointers in the context are always in the low physical window.
#[inline]
pub fn xma_phys_to_backing(p: u32) -> u32 {
    const WINDOW_BASE: u32 = 0x4000_0000;
    const OFFSET_MASK: u32 = 0x1FFF_FFFF;
    let offset = p & OFFSET_MASK;
    WINDOW_BASE | offset
}
|
||||
|
||||
// ---- XMA_CONTEXT_DATA (canary `xma_context.h`, 64 bytes, 16 dwords).
|
||||
//
|
||||
// Stored big-endian in guest memory. We load all 16 dwords (BE) and unpack the
|
||||
// bitfields exactly per the canary layout (bitfields pack LSB-first within each
|
||||
// host-order dword). All fields below are kept as plain integers.
|
||||
|
||||
/// Unpacked view of the guest's 64-byte XMA context. Every bitfield is held as
/// a plain `u32`; the trailing `:N` comments give each field's width in bits,
/// listed in the order the fields are packed within their dword (LSB first).
#[derive(Clone, Copy, Debug, Default)]
pub struct XmaContextData {
    // DWORD 0
    pub input_buffer_0_packet_count: u32, // :12
    pub loop_count: u32,                  // :8
    pub input_buffer_0_valid: u32,        // :1
    pub input_buffer_1_valid: u32,        // :1
    pub output_buffer_block_count: u32,   // :5
    pub output_buffer_write_offset: u32,  // :5
    // DWORD 1
    pub input_buffer_1_packet_count: u32, // :12
    pub loop_subframe_start: u32,         // :2
    pub loop_subframe_end: u32,           // :3
    pub loop_subframe_skip: u32,          // :3
    pub subframe_decode_count: u32,       // :4
    pub output_buffer_padding: u32,       // :3
    pub sample_rate: u32,                 // :2
    pub is_stereo: u32,                   // :1
    pub unk_dword_1_c: u32,               // :1
    pub output_buffer_valid: u32,         // :1
    // DWORD 2
    pub input_buffer_read_offset: u32, // :26
    pub error_status: u32,             // :5
    pub error_set: u32,                // :1
    // DWORD 3
    pub loop_start: u32,          // :26
    pub parser_error_status: u32, // :5
    pub parser_error_set: u32,    // :1
    // DWORD 4
    pub loop_end: u32,        // :26
    pub packet_metadata: u32, // :5
    pub current_buffer: u32,  // :1
    // DWORD 5..8 (full 32-bit pointers)
    pub input_buffer_0_ptr: u32,
    pub input_buffer_1_ptr: u32,
    pub output_buffer_ptr: u32,
    pub work_buffer_ptr: u32,
    // DWORD 9 (bits 5..=29 are not modelled)
    pub output_buffer_read_offset: u32, // :5
    pub stop_when_done: u32,            // :1 (bit 30)
    pub interrupt_when_done: u32,       // :1 (bit 31)
}
|
||||
|
||||
/// Extract the `width`-bit field of `v` that starts at bit `shift` (LSB = 0).
///
/// Total over all inputs: a `width` of 32 or more selects every remaining bit
/// and a `shift` of 32 or more yields 0, instead of overflowing the shift.
#[inline]
fn bits(v: u32, shift: u32, width: u32) -> u32 {
    let mask = if width >= u32::BITS {
        u32::MAX
    } else {
        (1u32 << width) - 1
    };
    v.checked_shr(shift).unwrap_or(0) & mask
}
|
||||
|
||||
impl XmaContextData {
|
||||
/// Read the 64-byte context struct from guest VA `ctx_va` (already a VA,
|
||||
/// not a physical ptr). Each dword is read big-endian via `read_u32`.
|
||||
pub fn read(mem: &GuestMemory, ctx_va: u32) -> Self {
|
||||
let mut d = [0u32; 16];
|
||||
for (i, w) in d.iter_mut().enumerate() {
|
||||
*w = mem.read_u32(ctx_va + (i as u32) * 4);
|
||||
}
|
||||
let mut c = Self::default();
|
||||
// DWORD 0
|
||||
c.input_buffer_0_packet_count = bits(d[0], 0, 12);
|
||||
c.loop_count = bits(d[0], 12, 8);
|
||||
c.input_buffer_0_valid = bits(d[0], 20, 1);
|
||||
c.input_buffer_1_valid = bits(d[0], 21, 1);
|
||||
c.output_buffer_block_count = bits(d[0], 22, 5);
|
||||
c.output_buffer_write_offset = bits(d[0], 27, 5);
|
||||
// DWORD 1
|
||||
c.input_buffer_1_packet_count = bits(d[1], 0, 12);
|
||||
c.loop_subframe_start = bits(d[1], 12, 2);
|
||||
c.loop_subframe_end = bits(d[1], 14, 3);
|
||||
c.loop_subframe_skip = bits(d[1], 17, 3);
|
||||
c.subframe_decode_count = bits(d[1], 20, 4);
|
||||
c.output_buffer_padding = bits(d[1], 24, 3);
|
||||
c.sample_rate = bits(d[1], 27, 2);
|
||||
c.is_stereo = bits(d[1], 29, 1);
|
||||
c.unk_dword_1_c = bits(d[1], 30, 1);
|
||||
c.output_buffer_valid = bits(d[1], 31, 1);
|
||||
// DWORD 2
|
||||
c.input_buffer_read_offset = bits(d[2], 0, 26);
|
||||
c.error_status = bits(d[2], 26, 5);
|
||||
c.error_set = bits(d[2], 31, 1);
|
||||
// DWORD 3
|
||||
c.loop_start = bits(d[3], 0, 26);
|
||||
c.parser_error_status = bits(d[3], 26, 5);
|
||||
c.parser_error_set = bits(d[3], 31, 1);
|
||||
// DWORD 4
|
||||
c.loop_end = bits(d[4], 0, 26);
|
||||
c.packet_metadata = bits(d[4], 26, 5);
|
||||
c.current_buffer = bits(d[4], 31, 1);
|
||||
// DWORD 5..8
|
||||
c.input_buffer_0_ptr = d[5];
|
||||
c.input_buffer_1_ptr = d[6];
|
||||
c.output_buffer_ptr = d[7];
|
||||
c.work_buffer_ptr = d[8];
|
||||
// DWORD 9
|
||||
c.output_buffer_read_offset = bits(d[9], 0, 5);
|
||||
c.stop_when_done = bits(d[9], 30, 1);
|
||||
c.interrupt_when_done = bits(d[9], 31, 1);
|
||||
c
|
||||
}
|
||||
|
||||
/// Repack the bitfields back into the 16 dwords (host order). Only the
|
||||
/// decoder-owned fields differ from what was read; callers use
|
||||
/// [`store_merged`] to write back without clobbering game-owned fields.
|
||||
fn pack(&self) -> [u32; 16] {
|
||||
let mut d = [0u32; 16];
|
||||
d[0] = (self.input_buffer_0_packet_count & 0xFFF)
|
||||
| ((self.loop_count & 0xFF) << 12)
|
||||
| ((self.input_buffer_0_valid & 1) << 20)
|
||||
| ((self.input_buffer_1_valid & 1) << 21)
|
||||
| ((self.output_buffer_block_count & 0x1F) << 22)
|
||||
| ((self.output_buffer_write_offset & 0x1F) << 27);
|
||||
d[1] = (self.input_buffer_1_packet_count & 0xFFF)
|
||||
| ((self.loop_subframe_start & 0x3) << 12)
|
||||
| ((self.loop_subframe_end & 0x7) << 14)
|
||||
| ((self.loop_subframe_skip & 0x7) << 17)
|
||||
| ((self.subframe_decode_count & 0xF) << 20)
|
||||
| ((self.output_buffer_padding & 0x7) << 24)
|
||||
| ((self.sample_rate & 0x3) << 27)
|
||||
| ((self.is_stereo & 1) << 29)
|
||||
| ((self.unk_dword_1_c & 1) << 30)
|
||||
| ((self.output_buffer_valid & 1) << 31);
|
||||
d[2] = (self.input_buffer_read_offset & 0x3FF_FFFF)
|
||||
| ((self.error_status & 0x1F) << 26)
|
||||
| ((self.error_set & 1) << 31);
|
||||
d[3] = (self.loop_start & 0x3FF_FFFF)
|
||||
| ((self.parser_error_status & 0x1F) << 26)
|
||||
| ((self.parser_error_set & 1) << 31);
|
||||
d[4] = (self.loop_end & 0x3FF_FFFF)
|
||||
| ((self.packet_metadata & 0x1F) << 26)
|
||||
| ((self.current_buffer & 1) << 31);
|
||||
d[5] = self.input_buffer_0_ptr;
|
||||
d[6] = self.input_buffer_1_ptr;
|
||||
d[7] = self.output_buffer_ptr;
|
||||
d[8] = self.work_buffer_ptr;
|
||||
d[9] = (self.output_buffer_read_offset & 0x1F)
|
||||
| ((self.stop_when_done & 1) << 30)
|
||||
| ((self.interrupt_when_done & 1) << 31);
|
||||
d
|
||||
}
|
||||
|
||||
pub fn is_input_buffer_valid(&self, idx: u32) -> bool {
|
||||
if idx == 0 {
|
||||
self.input_buffer_0_valid != 0
|
||||
} else {
|
||||
self.input_buffer_1_valid != 0
|
||||
}
|
||||
}
|
||||
pub fn is_current_input_buffer_valid(&self) -> bool {
|
||||
self.is_input_buffer_valid(self.current_buffer)
|
||||
}
|
||||
pub fn is_any_input_buffer_valid(&self) -> bool {
|
||||
self.input_buffer_0_valid != 0 || self.input_buffer_1_valid != 0
|
||||
}
|
||||
pub fn input_buffer_address(&self, idx: u32) -> u32 {
|
||||
if idx == 0 {
|
||||
self.input_buffer_0_ptr
|
||||
} else {
|
||||
self.input_buffer_1_ptr
|
||||
}
|
||||
}
|
||||
pub fn current_input_buffer_address(&self) -> u32 {
|
||||
self.input_buffer_address(self.current_buffer)
|
||||
}
|
||||
pub fn input_buffer_packet_count(&self, idx: u32) -> u32 {
|
||||
if idx == 0 {
|
||||
self.input_buffer_0_packet_count
|
||||
} else {
|
||||
self.input_buffer_1_packet_count
|
||||
}
|
||||
}
|
||||
pub fn current_input_buffer_packet_count(&self) -> u32 {
|
||||
self.input_buffer_packet_count(self.current_buffer)
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge decoder-owned fields back into guest memory (canary `StoreContextMerged`).
|
||||
/// Re-reads the current context (game may have raced an update), overwrites only
|
||||
/// the fields the decoder owns, and writes all 16 dwords back BE.
|
||||
fn store_merged(
|
||||
mem: &GuestMemory,
|
||||
ctx_va: u32,
|
||||
data: &XmaContextData,
|
||||
initial: &XmaContextData,
|
||||
) {
|
||||
let mut fresh = XmaContextData::read(mem, ctx_va);
|
||||
// DWORD 0
|
||||
fresh.loop_count = data.loop_count;
|
||||
fresh.output_buffer_write_offset = data.output_buffer_write_offset;
|
||||
if initial.input_buffer_0_valid != 0 && data.input_buffer_0_valid == 0 {
|
||||
fresh.input_buffer_0_valid = 0;
|
||||
}
|
||||
if initial.input_buffer_1_valid != 0 && data.input_buffer_1_valid == 0 {
|
||||
fresh.input_buffer_1_valid = 0;
|
||||
}
|
||||
// DWORD 1
|
||||
if initial.output_buffer_valid != 0 && data.output_buffer_valid == 0 {
|
||||
fresh.output_buffer_valid = 0;
|
||||
}
|
||||
// DWORD 2
|
||||
fresh.input_buffer_read_offset = data.input_buffer_read_offset;
|
||||
fresh.error_status = data.error_status;
|
||||
// DWORD 4
|
||||
fresh.current_buffer = data.current_buffer;
|
||||
// DWORD 9
|
||||
fresh.output_buffer_read_offset = data.output_buffer_read_offset;
|
||||
|
||||
let d = fresh.pack();
|
||||
for (i, w) in d.iter().enumerate() {
|
||||
mem.write_u32(ctx_va + (i as u32) * 4, *w);
|
||||
}
|
||||
}
|
||||
|
||||
/// Public wrapper for [`store_merged`] (called from the orchestrator in xma.rs).
|
||||
pub fn store_merged_pub(
|
||||
mem: &GuestMemory,
|
||||
ctx_va: u32,
|
||||
data: &XmaContextData,
|
||||
initial: &XmaContextData,
|
||||
) {
|
||||
store_merged(mem, ctx_va, data, initial);
|
||||
}
|
||||
|
||||
/// Number of bytes that can be written into a ring buffer before the write
/// cursor catches up with the read cursor (canary `RingBuffer::write_count`).
/// Equal cursors mean the whole `capacity` is free.
pub fn ring_write_count(read_off: u32, write_off: u32, capacity: u32) -> u32 {
    match write_off.cmp(&read_off) {
        std::cmp::Ordering::Equal => capacity,
        // Writer is behind the reader: free space is the gap between them.
        std::cmp::Ordering::Less => read_off - write_off,
        // Writer is ahead: free space runs to the end, then wraps to the reader.
        std::cmp::Ordering::Greater => (capacity - write_off) + read_off,
    }
}
|
||||
|
||||
/// Write `bytes` into the guest ring buffer at `backing + write_off`, wrapping
|
||||
/// at `capacity`. Returns the new write offset (canary `RingBuffer::Write`).
|
||||
pub fn ring_write(
|
||||
mem: &GuestMemory,
|
||||
backing: u32,
|
||||
capacity: u32,
|
||||
write_off: u32,
|
||||
bytes: &[u8],
|
||||
) -> u32 {
|
||||
let count = (bytes.len() as u32).min(capacity);
|
||||
if count == 0 {
|
||||
return write_off;
|
||||
}
|
||||
if write_off + count < capacity {
|
||||
mem.write_bytes(backing + write_off, &bytes[..count as usize]);
|
||||
write_off + count
|
||||
} else {
|
||||
let left = capacity - write_off;
|
||||
mem.write_bytes(backing + write_off, &bytes[..left as usize]);
|
||||
let right = count - left;
|
||||
mem.write_bytes(backing, &bytes[left as usize..(left + right) as usize]);
|
||||
right
|
||||
}
|
||||
}
|
||||
|
||||
// ---- BitStream (port of canary `base/bit_stream.cc`). Big-endian source.
|
||||
|
||||
/// MSB-first bit reader over a borrowed byte buffer (port of canary
/// `base/bit_stream.cc`).
pub struct BitStream<'a> {
    /// Backing bytes.
    buf: &'a [u8],
    /// Current read position in bits; never exceeds `size_bits`.
    offset_bits: usize,
    /// Logical stream length in bits, as given to `new`.
    size_bits: usize,
}

impl<'a> BitStream<'a> {
    /// Wrap `buf` as a stream of `size_bits` bits positioned at bit 0.
    pub fn new(buf: &'a [u8], size_bits: usize) -> Self {
        Self { buf, offset_bits: 0, size_bits }
    }

    /// Current read position in bits.
    pub fn offset_bits(&self) -> usize {
        self.offset_bits
    }

    /// Move to absolute bit position `off`, clamped to the stream size.
    pub fn set_offset(&mut self, off: usize) {
        self.offset_bits = off.min(self.size_bits);
    }

    /// Skip `n` bits, stopping at the end of the stream.
    pub fn advance(&mut self, n: usize) {
        self.set_offset(self.offset_bits.saturating_add(n));
    }

    /// Bits left between the current position and the end of the stream.
    pub fn bits_remaining(&self) -> usize {
        self.size_bits - self.offset_bits
    }

    /// Peek up to 57 bits (canary contract) without moving the cursor.
    ///
    /// Loads an 8-byte big-endian window and shifts the requested bits down.
    /// `peek(0)` is 0. Bits that lie beyond the loaded window or beyond the
    /// backing buffer read as zero rather than overflowing a shift or
    /// indexing out of range.
    pub fn peek(&self, num_bits: usize) -> u64 {
        debug_assert!(num_bits <= 57);
        if num_bits == 0 {
            return 0;
        }
        // offset_bytes = min(offset>>3, (size-64)>>3), matching canary so an
        // 8-byte load near the buffer end stays in range.
        let max_byte = self.size_bits.saturating_sub(64) >> 3;
        let offset_bytes = (self.offset_bits >> 3).min(max_byte);
        let rel = self.offset_bits - (offset_bytes << 3);

        let mut window = [0u8; 8];
        let src = self.buf.get(offset_bytes..).unwrap_or(&[]);
        let avail = src.len().min(8);
        window[..avail].copy_from_slice(&src[..avail]);
        let value = u64::from_be_bytes(window);

        let end = rel + num_bits;
        let aligned = if end <= 64 {
            // `end >= 1` here, so the shift amount is at most 63.
            value >> (64 - end)
        } else {
            // Request runs past the window: the missing low bits are zero.
            value.checked_shl((end - 64) as u32).unwrap_or(0)
        };
        let mask = if num_bits >= 64 {
            u64::MAX
        } else {
            (1u64 << num_bits) - 1
        };
        aligned & mask
    }

    /// Read `num_bits` (see [`peek`](Self::peek)) and advance past them.
    pub fn read(&mut self, num_bits: usize) -> u64 {
        let v = self.peek(num_bits);
        self.advance(num_bits);
        v
    }

    /// Copy `num_bits` from the stream into `dest` (bit-packed, MSB-first within
    /// each byte), keeping the source's intra-byte alignment: the first copied
    /// bit lands at the same bit position in `dest[0]` that it had in its
    /// source byte. Bits of `dest` outside the copied range are preserved.
    /// Returns the starting bit offset within the first byte (canary returns
    /// `rel_offset_bits` — the frame's intra-byte alignment).
    ///
    /// Panics if `dest` is too short for the bytes touched.
    pub fn copy(&mut self, dest: &mut [u8], num_bits: usize) -> usize {
        let rel = self.offset_bits & 7;
        let mut bits_left = num_bits;
        let mut out = 0usize;

        // First: the bits up to the next byte boundary (or fewer, if the whole
        // request ends inside this byte).
        if rel != 0 && bits_left != 0 {
            let room = 8 - rel;
            let take = room.min(bits_left);
            let pad = room - take;
            let chunk = (self.peek(take) as u8) << pad;
            let field = (((1u16 << take) - 1) as u8) << pad;
            dest[out] = (dest[out] & !field) | chunk;
            bits_left -= take;
            self.advance(take);
            out += 1;
        }

        // Second: whole bytes straight from the source buffer.
        if bits_left >= 8 {
            let nbytes = bits_left / 8;
            let src_off = (self.offset_bits >> 3).min(self.buf.len());
            let copy = nbytes.min(self.buf.len().saturating_sub(src_off));
            dest[out..out + copy].copy_from_slice(&self.buf[src_off..src_off + copy]);
            out += nbytes;
            self.advance(nbytes * 8);
            bits_left -= nbytes * 8;
        }

        // Third: the trailing bits, left-aligned in the final byte.
        if bits_left != 0 {
            let pad = 8 - bits_left;
            let chunk = (self.peek(bits_left) as u8) << pad;
            let field = (((1u16 << bits_left) - 1) as u8) << pad;
            dest[out] = (dest[out] & !field) | chunk;
            self.advance(bits_left);
        }

        rel
    }
}
|
||||
|
||||
// ---- XMA packet header helpers (canary `xma_helpers.h`).
|
||||
|
||||
/// Frame-count field of a packet header: byte 0 with its two low bits
/// shifted out.
///
/// Panics if `packet` is empty.
#[inline]
pub fn packet_frame_count(packet: &[u8]) -> u8 {
    let first_byte = packet[0];
    first_byte >> 2
}
|
||||
/// Metadata field of a packet header: the three low bits of byte 2.
///
/// Panics if `packet` has fewer than three bytes.
#[inline]
pub fn packet_metadata(packet: &[u8]) -> u8 {
    let third_byte = packet[2];
    third_byte & 0b111
}
|
||||
#[inline]
|
||||
pub fn is_packet_xma2(packet: &[u8]) -> bool {
|
||||
packet_metadata(packet) == 1
|
||||
}
|
||||
/// Skip-count field of a packet header: byte 3, unmodified.
///
/// Panics if `packet` has fewer than four bytes.
#[inline]
pub fn packet_skip_count(packet: &[u8]) -> u8 {
    let fourth_byte = packet[3];
    fourth_byte
}
|
||||
/// Bit offset of the first frame in a packet (canary `GetPacketFrameOffset`).
///
/// The header stores a 15-bit value: the two low bits of byte 0, all of
/// byte 1, and the five high bits of byte 2. The 32-bit packet header is
/// added on top, so the result is relative to the start of the packet.
///
/// Panics if `packet` has fewer than three bytes.
#[inline]
pub fn packet_frame_offset(packet: &[u8]) -> u32 {
    let high = (u32::from(packet[0]) & 0x3) << 13;
    let middle = u32::from(packet[1]) << 5;
    let low = u32::from(packet[2]) >> 3;
    let field = (high | middle | low) & 0xFFFF;
    field + 32
}
|
||||
|
||||
/// Sample-rate id → Hz.
|
||||
pub fn sample_rate_hz(id: u32) -> u32 {
|
||||
ID_TO_SAMPLE_RATE[id.min(3) as usize]
|
||||
}
|
||||
|
||||
// ---- Packet-walk for faithful input-offset advance (canary `GetPacketInfo`,
|
||||
// `GetNextPacketReadOffset`, and the offset arithmetic at the tail of
|
||||
// `XmaContextNew::Decode`). These let us advance `input_buffer_read_offset` one
|
||||
// *frame* at a time at canary's exact cadence — independent of the mainline
|
||||
// xma2 decoder's whole-packet/burst framing — so the offset crosses packet and
|
||||
// buffer boundaries (and triggers SwapInputBuffer) at the true input-drain
|
||||
// rate the guest's WMV demuxer polls.
|
||||
|
||||
/// Description of the frame found at a given bit offset inside a packet
/// (canary `kPacketInfo` / `GetPacketInfo`).
#[derive(Default, Clone, Copy)]
pub struct PacketInfo {
    /// Number of frames that begin in the packet (canary `frame_count_`).
    pub frame_count: u32,
    /// Index, within the packet, of the frame at the queried offset.
    pub current_frame: u32,
    /// Compressed size in bits of the frame at the queried offset (canary
    /// `current_frame_size_`); 0 when it cannot be resolved within this
    /// packet — a split header.
    pub current_frame_size: u32,
}

impl PacketInfo {
    /// True when `current_frame` is the final frame index of the packet,
    /// i.e. `current_frame + 1 == frame_count`.
    pub fn is_last_frame_in_packet(&self) -> bool {
        let next_frame = self.current_frame + 1;
        next_frame == self.frame_count
    }
}
|
||||
|
||||
/// Faithful port of canary `XmaContextNew::GetPacketInfo`.
|
||||
pub fn get_packet_info(packet: &[u8], frame_offset: u32) -> PacketInfo {
|
||||
let mut info = PacketInfo::default();
|
||||
let first_frame_offset = packet_frame_offset(packet);
|
||||
let mut stream = BitStream::new(packet, BITS_PER_PACKET as usize);
|
||||
stream.set_offset(first_frame_offset as usize);
|
||||
|
||||
// Split frame from previous packet.
|
||||
if frame_offset < first_frame_offset {
|
||||
info.current_frame = 0;
|
||||
info.current_frame_size = first_frame_offset - frame_offset;
|
||||
}
|
||||
|
||||
loop {
|
||||
if stream.bits_remaining() < BITS_PER_FRAME_HEADER as usize {
|
||||
break;
|
||||
}
|
||||
let frame_size = stream.peek(BITS_PER_FRAME_HEADER as usize) as u32;
|
||||
if frame_size == 0 || frame_size == MAX_FRAME_LENGTH {
|
||||
break;
|
||||
}
|
||||
if stream.offset_bits() == frame_offset as usize {
|
||||
info.current_frame = info.frame_count;
|
||||
info.current_frame_size = frame_size;
|
||||
}
|
||||
info.frame_count += 1;
|
||||
if frame_size as usize > stream.bits_remaining() {
|
||||
// Last frame.
|
||||
break;
|
||||
}
|
||||
stream.advance((frame_size - 1) as usize);
|
||||
// Trailing continuation bit.
|
||||
if stream.read(1) == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if is_packet_xma2(packet) {
|
||||
let xma2_frame_count = packet_frame_count(packet) as u32;
|
||||
if xma2_frame_count > info.frame_count {
|
||||
if info.current_frame_size == 0 {
|
||||
info.current_frame = info.frame_count;
|
||||
}
|
||||
info.frame_count = xma2_frame_count;
|
||||
}
|
||||
}
|
||||
info
|
||||
}
|
||||
|
||||
/// Packet number for a bit offset (canary `GetPacketNumber`). Returns None when
|
||||
/// the offset is in the header or past the buffer.
|
||||
pub fn packet_number(size_bytes: u32, bit_offset: u32) -> Option<u32> {
|
||||
if bit_offset < BITS_PER_PACKET_HEADER {
|
||||
return None;
|
||||
}
|
||||
if bit_offset >= size_bytes * 8 {
|
||||
return None;
|
||||
}
|
||||
Some((bit_offset >> 3) / BYTES_PER_PACKET)
|
||||
}
|
||||
|
||||
/// Number of bits to read for a frame: the smaller of the bits left in the
/// stream and the frame's size (canary `GetAmountOfBitsToRead`).
pub fn amount_of_bits_to_read(remaining_stream_bits: u32, frame_size: u32) -> u32 {
    std::cmp::min(remaining_stream_bits, frame_size)
}
|
||||
|
||||
// ---- Per-context decode state (lives in the XmaDecoder, one per ctx).
|
||||
|
||||
/// Decoder-side state kept for one XMA context.
///
/// Note that the derived `Default` leaves `raw_frame` empty.
#[derive(Default)]
pub struct ContextDecodeState {
    /// FFmpeg xma2 codec for this context (lazily created / reconfigured).
    pub codec: Option<Xma2Codec>,
    /// Presumably the sample rate `codec` was last configured with — confirm
    /// against the code that creates the codec.
    pub codec_rate: u32,
    /// Presumably the channel count `codec` was last configured with —
    /// confirm against the code that creates the codec.
    pub codec_channels: u32,
    /// Staged interleaved S16BE PCM for the current decoded frame
    /// (`raw_frame_`), drained by Consume in 256-byte blocks.
    pub raw_frame: Vec<u8>,
    /// Decoded interleaved S16BE PCM not yet split into per-frame `raw_frame`s.
    /// The mainline xma2 decoder emits bursts of many 512-sample frames at once
    /// (internal FIFO + 4096-sample lookahead); we queue the bytes here and
    /// hand the guest exactly one 512-sample frame per `produce_frame`.
    pub pcm_queue: VecDeque<u8>,
    /// NOTE(review): by name, the subframes of the current frame not yet
    /// handed out — confirm against the consumer.
    pub current_frame_remaining_subframes: u8,
    /// NOTE(review): by name, the subframe-sized blocks still free in the
    /// output buffer — confirm against the consumer.
    pub remaining_subframe_blocks_in_output: i32,
    /// Total 512-sample frames decoded for this context (diagnostic).
    pub frames_decoded: u64,
    /// Whether a "first frame" diagnostic has been emitted.
    pub first_frame_logged: bool,
    /// FFmpeg feed cursor: the next packet index (within the *current* input
    /// buffer at feed time) we will hand to FFmpeg. This is the decoder's
    /// internal intake position and is intentionally decoupled from the
    /// guest-visible `input_buffer_read_offset` (which advances per *emitted*
    /// frame via the faithful packet-walk). We feed ahead so FFmpeg always has
    /// enough buffered input to satisfy the guest's drain, while the guest sees
    /// the read offset move at canary's true per-frame cadence.
    pub feed_packet_index: u32,
    /// `current_buffer` the feed cursor is reading from; reset on swap so the
    /// feed follows the same ping-pong as the guest-visible buffer.
    pub feed_buffer: u32,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Packs a context with known field values, then decodes each
    /// decoder-relevant field from the packed words at the exact canary bit
    /// offsets. Guards against a field drifting by a bit.
    #[test]
    fn context_bitfields_round_trip() {
        let mut original = XmaContextData::default();
        original.input_buffer_0_packet_count = 632;
        original.loop_count = 0;
        original.input_buffer_0_valid = 1;
        original.input_buffer_1_valid = 0;
        original.output_buffer_block_count = 30;
        original.output_buffer_write_offset = 5;
        original.subframe_decode_count = 8;
        original.output_buffer_padding = 1;
        original.sample_rate = 3;
        original.is_stereo = 1;
        original.output_buffer_valid = 1;
        original.input_buffer_read_offset = 16416;
        original.error_status = 4;
        original.current_buffer = 1;
        original.input_buffer_0_ptr = 0x0b9f_d000;
        original.output_buffer_ptr = 0x01f6_6e00;
        original.output_buffer_read_offset = 7;
        original.interrupt_when_done = 1;

        let words = original.pack();

        // Re-read the fields from the packed words, as read() would.
        let mut decoded = XmaContextData::default();
        // word 0
        decoded.input_buffer_0_packet_count = bits(words[0], 0, 12);
        decoded.input_buffer_0_valid = bits(words[0], 20, 1);
        decoded.output_buffer_block_count = bits(words[0], 22, 5);
        decoded.output_buffer_write_offset = bits(words[0], 27, 5);
        // word 1
        decoded.subframe_decode_count = bits(words[1], 20, 4);
        decoded.output_buffer_padding = bits(words[1], 24, 3);
        decoded.sample_rate = bits(words[1], 27, 2);
        decoded.is_stereo = bits(words[1], 29, 1);
        decoded.output_buffer_valid = bits(words[1], 31, 1);
        // word 2
        decoded.input_buffer_read_offset = bits(words[2], 0, 26);
        decoded.error_status = bits(words[2], 26, 5);
        // word 4
        decoded.current_buffer = bits(words[4], 31, 1);
        // word 9
        decoded.output_buffer_read_offset = bits(words[9], 0, 5);
        decoded.interrupt_when_done = bits(words[9], 31, 1);

        assert_eq!(decoded.input_buffer_0_packet_count, 632);
        assert_eq!(decoded.input_buffer_0_valid, 1);
        assert_eq!(decoded.output_buffer_block_count, 30);
        assert_eq!(decoded.output_buffer_write_offset, 5);
        assert_eq!(decoded.subframe_decode_count, 8);
        assert_eq!(decoded.output_buffer_padding, 1);
        assert_eq!(decoded.sample_rate, 3);
        assert_eq!(decoded.is_stereo, 1);
        assert_eq!(decoded.output_buffer_valid, 1);
        assert_eq!(decoded.input_buffer_read_offset, 16416);
        assert_eq!(decoded.error_status, 4);
        assert_eq!(decoded.current_buffer, 1);
        assert_eq!(decoded.output_buffer_read_offset, 7);
        assert_eq!(decoded.interrupt_when_done, 1);
    }

    /// Two sample physical addresses and their expected backing addresses.
    #[test]
    fn phys_to_backing_projects_physical_window() {
        let input_ptr = xma_phys_to_backing(0x0b9f_d000);
        assert_eq!(input_ptr, 0x4b9f_d000);

        let output_ptr = xma_phys_to_backing(0x01f6_6e00);
        assert_eq!(output_ptr, 0x41f6_6e00);
    }

    #[test]
    fn ring_write_count_matches_canary() {
        // read == write: the ring is empty, so the full capacity is writable.
        let empty = ring_write_count(0, 0, 7680);
        assert_eq!(empty, 7680);

        // Write offset is ahead of the read offset.
        let write_ahead = ring_write_count(0, 256, 7680);
        assert_eq!(write_ahead, 7680 - 256);

        // Write offset has wrapped and sits behind the read offset.
        let wrapped = ring_write_count(512, 256, 7680);
        assert_eq!(wrapped, 256);
    }

    #[test]
    fn packet_header_helpers() {
        // Header bytes of the observed first packet word 0x08000000.
        let first_packet = [0x08u8, 0x00, 0x00, 0x00];
        // 0x08 >> 2 = 2 frames.
        assert_eq!(packet_frame_count(&first_packet), 2);
        // ((0x08 & 3) << 13 | 0 << 5 | 0x00 >> 3) + 32 = 32.
        assert_eq!(packet_frame_offset(&first_packet), 32);

        // Byte 2 contributes its top five bits: 0x08 >> 3 = 1, so +1.
        let shifted_packet = [0x08u8, 0x00, 0x08, 0x00];
        assert_eq!(packet_frame_offset(&shifted_packet), 33);
    }
}
|
||||
|
||||
impl ContextDecodeState {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
codec: None,
|
||||
codec_rate: 0,
|
||||
codec_channels: 0,
|
||||
raw_frame: vec![0u8; (BYTES_PER_FRAME_CHANNEL * 2) as usize],
|
||||
pcm_queue: VecDeque::new(),
|
||||
current_frame_remaining_subframes: 0,
|
||||
remaining_subframe_blocks_in_output: 0,
|
||||
frames_decoded: 0,
|
||||
first_frame_logged: false,
|
||||
feed_packet_index: 0,
|
||||
feed_buffer: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -79,6 +79,14 @@ pub struct DecodedBlock {
|
||||
/// a successful build (`MAX_BLOCK_INSTRS >= 1` and the build walk
|
||||
/// pushes the first decoded word unconditionally).
|
||||
pub instrs: Vec<DecodedInstr>,
|
||||
/// True if this block contains a cross-thread synchronization point
|
||||
/// (`PpcOpcode::is_sync_sensitive`: reserved load/store or a memory
|
||||
/// barrier). Computed once at build time. The superblock runner ends
|
||||
/// the run after executing a sync-sensitive block so the lockstep
|
||||
/// interleaving stays fine-grained at exactly those points (preserving
|
||||
/// the cross-thread ordering the 2E/2F/2J boot work depends on),
|
||||
/// while chaining freely through ordinary straight-line blocks.
|
||||
pub sync_sensitive: bool,
|
||||
}
|
||||
|
||||
/// Per-slot status from a `lookup_or_build` probe. Internal only.
|
||||
@@ -187,11 +195,13 @@ fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> Deco
|
||||
let mut instrs: Vec<DecodedInstr> = Vec::with_capacity(8);
|
||||
let page_base = start_pc & GUEST_PAGE_MASK;
|
||||
let mut cur = start_pc;
|
||||
let mut sync_sensitive = false;
|
||||
|
||||
loop {
|
||||
let raw = mem.read_u32(cur);
|
||||
let decoded = decode(raw, cur);
|
||||
let terminates = decoded.opcode.terminates_block();
|
||||
sync_sensitive |= decoded.opcode.is_sync_sensitive();
|
||||
instrs.push(decoded);
|
||||
|
||||
if terminates {
|
||||
@@ -215,6 +225,7 @@ fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> Deco
|
||||
end_pc,
|
||||
page_version,
|
||||
instrs,
|
||||
sync_sensitive,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -335,6 +346,40 @@ mod tests {
|
||||
assert_eq!(b.end_pc, 0x110);
|
||||
}
|
||||
|
||||
#[test]
fn sync_sensitive_flag_set_for_barrier_block() {
    // `sync` (0x7C0004AC) does not terminate a block, so it is placed
    // mid-block between ordinary instructions, ahead of the terminating
    // branch. Its presence alone must flag the block as sync_sensitive, which
    // is what tells the superblock runner to end the chain at this
    // cross-thread ordering point.
    let sync_word = 0x7C00_04AC;
    let mem = BlockTestMem::new();
    mem.put(0x100, enc_addi(3, 3, 1));
    mem.put(0x104, sync_word);
    mem.put(0x108, enc_addi(3, 3, 1));
    mem.put(0x10C, enc_b_self()); // terminator

    let mut cache = BlockCache::new();
    let block = cache.lookup_or_build(0x100, &mem);

    assert!(
        block.sync_sensitive,
        "block containing `sync` must flag sync_sensitive; decoded last={:?}",
        block.instrs.iter().map(|instr| instr.opcode).collect::<Vec<_>>()
    );
}
|
||||
|
||||
#[test]
fn sync_sensitive_flag_clear_for_plain_block() {
    // Two plain ALU instructions followed by the terminating branch: no
    // reserved load/store and no barrier, so the flag must stay clear and
    // the superblock runner remains free to chain through the block.
    let mem = BlockTestMem::new();
    mem.put(0x100, enc_addi(3, 3, 1));
    mem.put(0x104, enc_addi(3, 3, 1));
    mem.put(0x108, enc_b_self());

    let mut cache = BlockCache::new();
    let block = cache.lookup_or_build(0x100, &mem);

    assert!(!block.sync_sensitive, "plain ALU block must not flag sync_sensitive");
}
|
||||
|
||||
#[test]
|
||||
fn block_stops_at_page_boundary() {
|
||||
// Build from 0x1FFC. The next PC (0x2000) is in a different
|
||||
|
||||
217
crates/xenia-cpu/src/dispatch_rec.rs
Normal file
217
crates/xenia-cpu/src/dispatch_rec.rs
Normal file
@@ -0,0 +1,217 @@
|
||||
//! Runtime indirect-dispatch recorder.
|
||||
//!
|
||||
//! A reusable, env-gated facility that captures every indirect call performed
|
||||
//! through CTR (`bcctr`/`bcctrl`/`bctr`) as a unique `(call_site_pc ->
|
||||
//! target_pc)` pair, together with the object register `r3` seen at the call
|
||||
//! and a hit count. It exists to provide GROUND-TRUTH indirect-dispatch
|
||||
//! resolution for reverse-engineering vtable dispatch that the static
|
||||
//! analyzer fails to resolve (e.g. the Sylpheed movie engine vtable
|
||||
//! `0x8200a908`).
|
||||
//!
|
||||
//! ## Gating & overhead
|
||||
//! Recording is OFF by default. It is enabled only when the environment
|
||||
//! variable `XENIA_DISPATCH_REC` is set to a non-empty, non-`0` value at
|
||||
//! process start. When OFF, [`record`] is a single relaxed atomic-bool load
|
||||
//! followed by an early return — no allocation, no locking, no behavior
|
||||
//! change. The recorder is pure: it never reads the clock, never touches
|
||||
//! scheduling, and never mutates guest/CPU state, so enabling it does not
|
||||
//! perturb deterministic runs (only adds a HashMap insert behind a mutex).
|
||||
//!
|
||||
//! ## Focus filters (optional)
|
||||
//! Two env vars narrow what is recorded (both default to "record everything"):
|
||||
//! - `XENIA_DISPATCH_REC_TARGETS=0x82505c08,...` — only edges whose resolved
|
||||
//! target is in the list. Answers "who calls `<target>`": every recorded
|
||||
//! edge then carries the caller `site` and `lr`.
|
||||
//! - `XENIA_DISPATCH_REC_SITES=0x825078d8,...` — only edges from the listed
|
||||
//! call-site PCs.
|
||||
//! When both are set, an edge must satisfy BOTH. These keep a long focused
|
||||
//! run (e.g. the intro-movie trace) producing a small, relevant table instead
|
||||
//! of the whole program-wide dispatch set. Pure observe-only — filtering only
|
||||
//! affects which edges are stored, never guest/CPU state.
|
||||
//!
|
||||
//! ## Output
|
||||
//! On [`dump`] (call at end-of-run) the table is written to the path in
|
||||
//! `XENIA_DISPATCH_REC_OUT` (default `/tmp/dispatch_rec.txt`), sorted by
|
||||
//! descending hit count, one record per line:
|
||||
//! `callsite_pc target_pc count r3=<obj> lr=<return_addr>` (addresses in hex, count in decimal), after a single `#` header line.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Mutex;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
/// Enabled flag, resolved once from the environment: `install` initialises it
/// through `init_enabled`, and later calls reuse the stored value.
static ENABLED: OnceLock<bool> = OnceLock::new();
/// Fast-path mirror of `ENABLED` so the hot path is a single relaxed load
/// (avoids the `OnceLock` get + deref on every indirect branch when OFF).
/// Starts `false` and is written by `init_enabled`.
static ENABLED_FAST: AtomicBool = AtomicBool::new(false);
|
||||
|
||||
/// One observed indirect-dispatch edge: the value stored per
/// `(call_site_pc, target_pc)` key in `TABLE`.
#[derive(Default, Clone, Copy)]
struct Edge {
    /// Number of times this edge has been recorded.
    count: u64,
    /// Last-seen object register (`r3`) at this (site,target) edge. Stable for
    /// a vtable dispatch where the same call site always dispatches on the
    /// same kind of object.
    last_r3: u64,
    /// Last-seen link register (return address) for the call.
    last_lr: u64,
}
|
||||
|
||||
/// (call_site_pc, target_pc) -> Edge
///
/// Created lazily by the first `record` call that passes the filters, so it
/// stays unset when nothing was ever recorded.
static TABLE: OnceLock<Mutex<HashMap<(u32, u32), Edge>>> = OnceLock::new();
|
||||
|
||||
/// Optional focus filters, resolved once from the environment. When either is
/// non-empty, an edge is recorded only if its `target` is in `TARGET_FILTER`
/// (when that set is non-empty) AND its `site` is in `SITE_FILTER` (when that
/// set is non-empty). Empty sets mean "no constraint on that axis". This lets
/// a long focused run (e.g. the intro-movie trace) record ONLY the dispatch
/// edges relevant to a target-set under investigation — for example "every
/// indirect call whose target is the XMV submit `sub_82505C08`", which answers
/// the milestone-2 "who calls submit on the engine" question with the caller
/// `lr` — instead of the whole (large) program-wide dispatch table.
///
/// Both lists are stored sorted and deduplicated (see `parse_pc_list_str`),
/// which is what lets `record` look values up with `binary_search`.
static TARGET_FILTER: OnceLock<Vec<u32>> = OnceLock::new();
/// Call-site counterpart of `TARGET_FILTER`.
static SITE_FILTER: OnceLock<Vec<u32>> = OnceLock::new();
|
||||
|
||||
/// Parses a comma-separated list of hexadecimal PCs into a sorted,
/// duplicate-free `Vec`.
///
/// Each token is trimmed; a leading `0x`/`0X` is optional. Empty tokens and
/// tokens that do not parse as a 32-bit hex number are silently dropped.
fn parse_pc_list_str(s: &str) -> Vec<u32> {
    let mut pcs: Vec<u32> = Vec::new();
    for raw_token in s.split(',') {
        let token = raw_token.trim();
        if token.is_empty() {
            continue;
        }
        let digits = token
            .strip_prefix("0x")
            .or_else(|| token.strip_prefix("0X"))
            .unwrap_or(token);
        if let Ok(pc) = u32::from_str_radix(digits, 16) {
            pcs.push(pc);
        }
    }
    pcs.sort_unstable();
    pcs.dedup();
    pcs
}
|
||||
|
||||
/// Parse a PC list from an env var. Missing var → empty Vec (no constraint).
|
||||
fn parse_pc_list(var: &str) -> Vec<u32> {
|
||||
match std::env::var(var) {
|
||||
Ok(s) => parse_pc_list_str(&s),
|
||||
Err(_) => Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolve the enabled flag (and focus filters) from the environment exactly
|
||||
/// once.
|
||||
fn init_enabled() -> bool {
|
||||
let on = match std::env::var("XENIA_DISPATCH_REC") {
|
||||
Ok(v) => !v.is_empty() && v != "0",
|
||||
Err(_) => false,
|
||||
};
|
||||
ENABLED_FAST.store(on, Ordering::Relaxed);
|
||||
let _ = TARGET_FILTER.set(parse_pc_list("XENIA_DISPATCH_REC_TARGETS"));
|
||||
let _ = SITE_FILTER.set(parse_pc_list("XENIA_DISPATCH_REC_SITES"));
|
||||
on
|
||||
}
|
||||
|
||||
/// Whether recording is enabled.
///
/// This only reads the `ENABLED_FAST` mirror; it never consults the
/// environment itself, so it reports `false` until `install` has run.
#[inline(always)]
pub fn enabled() -> bool {
    // Hot path: relaxed atomic load. ENABLED_FAST is written by
    // `init_enabled`, which runs the first time `install` is called; until
    // then it is `false`, which is also the correct default. Startup code is
    // expected to call `install` eagerly.
    ENABLED_FAST.load(Ordering::Relaxed)
}
|
||||
|
||||
/// Force the env resolution (call once early in startup). Idempotent.
///
/// The first call runs `init_enabled`, which sets `ENABLED_FAST` and the
/// focus filters; later calls find `ENABLED` already initialised and do
/// nothing.
pub fn install() {
    // The resolved flag itself is not needed here, only the side effects of
    // the one-time initialisation.
    let _ = ENABLED.get_or_init(init_enabled);
}
|
||||
|
||||
/// Record one indirect (CTR) call edge. No-op when disabled.
|
||||
///
|
||||
/// `site` = PC of the `bcctr`/`bctr` instruction, `target` = resolved CTR
|
||||
/// target, `r3` = object register at the call, `lr` = link register.
|
||||
#[inline(always)]
|
||||
pub fn record(site: u32, target: u32, r3: u64, lr: u64) {
|
||||
// Single predictable branch when OFF.
|
||||
if !ENABLED_FAST.load(Ordering::Relaxed) {
|
||||
return;
|
||||
}
|
||||
// Focus filters (only consulted when recording is ON, i.e. rare). An empty
|
||||
// filter set imposes no constraint on its axis.
|
||||
if let Some(targets) = TARGET_FILTER.get()
|
||||
&& !targets.is_empty()
|
||||
&& targets.binary_search(&target).is_err()
|
||||
{
|
||||
return;
|
||||
}
|
||||
if let Some(sites) = SITE_FILTER.get()
|
||||
&& !sites.is_empty()
|
||||
&& sites.binary_search(&site).is_err()
|
||||
{
|
||||
return;
|
||||
}
|
||||
let table = TABLE.get_or_init(|| Mutex::new(HashMap::new()));
|
||||
if let Ok(mut t) = table.lock() {
|
||||
let e = t.entry((site, target)).or_default();
|
||||
e.count += 1;
|
||||
e.last_r3 = r3;
|
||||
e.last_lr = lr;
|
||||
}
|
||||
}
|
||||
|
||||
/// Dump the recorded table to the output file. No-op when disabled or empty.
|
||||
pub fn dump() {
|
||||
if !enabled() {
|
||||
return;
|
||||
}
|
||||
let path = std::env::var("XENIA_DISPATCH_REC_OUT")
|
||||
.unwrap_or_else(|_| "/tmp/dispatch_rec.txt".to_string());
|
||||
let table = match TABLE.get() {
|
||||
Some(t) => t,
|
||||
None => return,
|
||||
};
|
||||
let guard = match table.lock() {
|
||||
Ok(g) => g,
|
||||
Err(_) => return,
|
||||
};
|
||||
let mut rows: Vec<((u32, u32), Edge)> =
|
||||
guard.iter().map(|(k, v)| (*k, *v)).collect();
|
||||
// Deterministic order: count desc, then site, then target.
|
||||
rows.sort_by(|a, b| {
|
||||
b.1.count
|
||||
.cmp(&a.1.count)
|
||||
.then(a.0 .0.cmp(&b.0 .0))
|
||||
.then(a.0 .1.cmp(&b.0 .1))
|
||||
});
|
||||
let mut out = String::with_capacity(rows.len() * 48);
|
||||
out.push_str("# callsite_pc target_pc count r3 lr\n");
|
||||
for ((site, target), e) in rows {
|
||||
out.push_str(&format!(
|
||||
"{:#010x} {:#010x} {} r3={:#018x} lr={:#018x}\n",
|
||||
site, target, e.count, e.last_r3, e.last_lr
|
||||
));
|
||||
}
|
||||
if let Err(err) = std::fs::write(&path, out) {
|
||||
eprintln!("dispatch_rec: failed to write {}: {}", path, err);
|
||||
} else {
|
||||
eprintln!("dispatch_rec: wrote {} edges to {}", guard.len(), path);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::parse_pc_list_str;

    #[test]
    fn parse_pc_list_handles_prefixes_whitespace_and_dedup() {
        // The input mixes `0x`-prefixed and bare hex, padding around tokens,
        // an empty token, a repeated PC and a non-hex token. Only the two
        // distinct PCs survive, in ascending order.
        let input = " 0x82505c08 , 825078d8,, 82505c08 , zzz ";
        let parsed = parse_pc_list_str(input);
        assert_eq!(parsed, vec![0x82505c08, 0x825078d8]);
    }

    #[test]
    fn parse_pc_list_empty_is_no_constraint() {
        // Neither an empty string nor a string of bare separators yields
        // any PC.
        for input in ["", " , , "] {
            assert!(parse_pc_list_str(input).is_empty());
        }
    }
}
|
||||
@@ -28,6 +28,15 @@ pub enum StepResult {
|
||||
Trap,
|
||||
/// Execution halted (by debugger or error).
|
||||
Halted,
|
||||
/// Executed the `db16cyc` spin-wait hint (`or r31,r31,r31`, encoding
|
||||
/// `0x7FFFFB78`). The PC has already advanced past the hint; this is a
|
||||
/// cooperative-yield signal so the scheduler hands the slot to a Ready
|
||||
/// peer. On real hardware all six HW threads run concurrently and the
|
||||
/// spin resolves naturally; under our round-robin lockstep a spinning
|
||||
/// barrier/spinlock participant would otherwise monopolize its slot and
|
||||
/// starve the co-located thread it is waiting on. Matches canary's
|
||||
/// `InstrEmit_orx` db16cyc → `DelayExecution()` handling.
|
||||
Yield,
|
||||
}
|
||||
|
||||
/// Execute a single PPC instruction.
|
||||
@@ -95,6 +104,9 @@ pub fn step_block(
|
||||
ctx.cycle_count += 1;
|
||||
ctx.timebase += 1;
|
||||
if !matches!(result, StepResult::Continue) {
|
||||
// `Yield` (db16cyc spin hint) terminates the block here so the
|
||||
// scheduler regains control and can rotate the slot; the PC has
|
||||
// already advanced past the hint inside `execute`.
|
||||
return result;
|
||||
}
|
||||
// PC discontinuity within a block. By construction only the
|
||||
@@ -117,65 +129,65 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::addis => {
|
||||
// Xbox 360 user mode is 32-bit ABI (MSR.SF=0), so addis must
|
||||
// produce a value whose upper 32 bits don't pollute downstream
|
||||
// 64-bit arithmetic. The PPC ISA in 64-bit mode sign-extends
|
||||
// simm16 before the shift, producing 0xFFFFFFFF_xxxx0000 for
|
||||
// negative simm16 (high bit set). When this value flows into
|
||||
// a 64-bit subfc against a zero-extended lwz value, the unsigned
|
||||
// 64-bit comparison yields wrong CA. Truncate to 32 bits to
|
||||
// simulate 32-bit ABI behavior.
|
||||
// PPCBUG-020 fix: Xenon is a 64-bit core; `addis` produces the full
|
||||
// 64-bit `RA + (EXTS(SI) << 16)`. Matches canary
|
||||
// (`Add(RA, Int64(EXTS(imm) << 16))`, stores full 64-bit).
|
||||
let ra_val = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] };
|
||||
let result = ra_val.wrapping_add((instr.simm16() as i64 as u64) << 16);
|
||||
ctx.gpr[instr.rd()] = result as u32 as u64;
|
||||
ctx.gpr[instr.rd()] = result;
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::addic => {
|
||||
// PPCBUG-002: 32-bit ABI. CA must be from a 32-bit unsigned compare;
|
||||
// canary's `AddDidCarry` truncates both operands to int32 first.
|
||||
// PPCBUG-020 fix: full 64-bit `RA + EXTS(SI)` (canary `Add(RA,
|
||||
// Int64(EXTS(imm)))`). CA stays a 32-bit unsigned compare to match
|
||||
// canary's `AddDidCarry` (truncates operands to int32 first).
|
||||
let ra32 = ctx.gpr[instr.ra()] as u32;
|
||||
let imm32 = instr.simm16() as i32 as u32;
|
||||
let result32 = ra32.wrapping_add(imm32);
|
||||
ctx.xer_ca = if result32 < ra32 { 1 } else { 0 };
|
||||
ctx.gpr[instr.rd()] = result32 as u64;
|
||||
ctx.gpr[instr.rd()] = ctx.gpr[instr.ra()].wrapping_add(instr.simm16() as i64 as u64);
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::addicx => {
|
||||
// PPCBUG-003: same fix as addic plus CR0 i32 view.
|
||||
// PPCBUG-020 fix: full 64-bit result; CA 32-bit; CR0 32-bit i32 view
|
||||
// (= low 32 of the result; unchanged from the pre-fix behaviour).
|
||||
let ra32 = ctx.gpr[instr.ra()] as u32;
|
||||
let imm32 = instr.simm16() as i32 as u32;
|
||||
let result32 = ra32.wrapping_add(imm32);
|
||||
ctx.xer_ca = if result32 < ra32 { 1 } else { 0 };
|
||||
ctx.gpr[instr.rd()] = result32 as u64;
|
||||
ctx.gpr[instr.rd()] = ctx.gpr[instr.ra()].wrapping_add(instr.simm16() as i64 as u64);
|
||||
ctx.update_cr_signed(0, result32 as i32 as i64);
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::subficx => {
|
||||
// PPCBUG-005: 32-bit ABI. Sign-extended imm has bits 32-63 set for
|
||||
// negative SIMM, poisoning the writeback. Canary uses 32-bit form.
|
||||
// PPCBUG-020 fix: full 64-bit `EXTS(SI) - RA` (canary `Sub(Int64(
|
||||
// EXTS(imm)), RA)`). CA stays a 32-bit compare.
|
||||
let ra32 = ctx.gpr[instr.ra()] as u32;
|
||||
let imm32 = instr.simm16() as i32 as u32;
|
||||
let result32 = imm32.wrapping_sub(ra32);
|
||||
ctx.xer_ca = if imm32 >= ra32 { 1 } else { 0 };
|
||||
ctx.gpr[instr.rd()] = result32 as u64;
|
||||
ctx.gpr[instr.rd()] = (instr.simm16() as i64 as u64).wrapping_sub(ctx.gpr[instr.ra()]);
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::mulli => {
|
||||
// PPCBUG-004: 32-bit ABI. Read RA as i32 (low 32, sign-extended for
|
||||
// multiply), product fits in 32 bits per ISA (overflow wraps).
|
||||
let ra = ctx.gpr[instr.ra()] as i32 as i64;
|
||||
// PPCBUG-020 fix: full 64-bit low product of (full 64-bit RA) ×
|
||||
// EXTS(SI). Matches canary InstrEmit_mulli
|
||||
// (`StoreGPR(Mul(LoadGPR(RA), Int64(EXTS(imm))))`).
|
||||
let ra = ctx.gpr[instr.ra()] as i64;
|
||||
let imm = instr.simm16() as i64;
|
||||
ctx.gpr[instr.rd()] = (ra.wrapping_mul(imm) as u32) as u64;
|
||||
ctx.gpr[instr.rd()] = ra.wrapping_mul(imm) as u64;
|
||||
ctx.pc += 4;
|
||||
}
|
||||
|
||||
// ===== ALU: Register =====
|
||||
PpcOpcode::addx => {
|
||||
// PPCBUG-012+020: 32-bit ABI writeback truncation + CR0 i32 view.
|
||||
// PPCBUG-020 fix: full 64-bit `RA + RB` (canary `Add(RA, RB)`).
|
||||
// OV/CR0 keep their 32-bit computation (low 32 of the result is
|
||||
// unchanged), so only the previously-zeroed upper 32 bits change.
|
||||
let ra32 = ctx.gpr[instr.ra()] as u32;
|
||||
let rb32 = ctx.gpr[instr.rb()] as u32;
|
||||
let result32 = ra32.wrapping_add(rb32);
|
||||
ctx.gpr[instr.rd()] = result32 as u64;
|
||||
ctx.gpr[instr.rd()] = ctx.gpr[instr.ra()].wrapping_add(ctx.gpr[instr.rb()]);
|
||||
if instr.oe() {
|
||||
let true_sum = (ra32 as i32 as i128) + (rb32 as i32 as i128);
|
||||
overflow::apply(ctx, true_sum != (result32 as i32) as i128);
|
||||
@@ -186,12 +198,13 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::addcx => {
|
||||
// PPCBUG-013+020: 32-bit truncation; CA from u32 unsigned compare.
|
||||
// PPCBUG-020 fix: full 64-bit `RA + RB`; CA stays 32-bit (canary
|
||||
// `AddDidCarry` truncates to int32). Low 32 of result unchanged.
|
||||
let ra32 = ctx.gpr[instr.ra()] as u32;
|
||||
let rb32 = ctx.gpr[instr.rb()] as u32;
|
||||
let result32 = ra32.wrapping_add(rb32);
|
||||
ctx.xer_ca = if result32 < ra32 { 1 } else { 0 };
|
||||
ctx.gpr[instr.rd()] = result32 as u64;
|
||||
ctx.gpr[instr.rd()] = ctx.gpr[instr.ra()].wrapping_add(ctx.gpr[instr.rb()]);
|
||||
if instr.oe() {
|
||||
let true_sum = (ra32 as i32 as i128) + (rb32 as i32 as i128);
|
||||
overflow::apply(ctx, true_sum != (result32 as i32) as i128);
|
||||
@@ -202,13 +215,15 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::addex => {
|
||||
// PPCBUG-014+020: 32-bit truncation; CA from u32 unsigned compare.
|
||||
// PPCBUG-020 fix: full 64-bit `RA + RB + CA`; CA stays 32-bit.
|
||||
let ra32 = ctx.gpr[instr.ra()] as u32;
|
||||
let rb32 = ctx.gpr[instr.rb()] as u32;
|
||||
let ca = ctx.xer_ca as u32;
|
||||
let result32 = ra32.wrapping_add(rb32).wrapping_add(ca);
|
||||
ctx.xer_ca = if result32 < ra32 || (ca != 0 && result32 == ra32) { 1 } else { 0 };
|
||||
ctx.gpr[instr.rd()] = result32 as u64;
|
||||
ctx.gpr[instr.rd()] = ctx.gpr[instr.ra()]
|
||||
.wrapping_add(ctx.gpr[instr.rb()])
|
||||
.wrapping_add(ca as u64);
|
||||
if instr.oe() {
|
||||
let true_sum = (ra32 as i32 as i128) + (rb32 as i32 as i128) + (ca as i128);
|
||||
overflow::apply(ctx, true_sum != (result32 as i32) as i128);
|
||||
@@ -219,12 +234,12 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::addzex => {
|
||||
// PPCBUG-015+020: 32-bit truncation.
|
||||
// PPCBUG-020 fix: full 64-bit `RA + CA`; CA stays 32-bit.
|
||||
let ra32 = ctx.gpr[instr.ra()] as u32;
|
||||
let ca = ctx.xer_ca as u32;
|
||||
let result32 = ra32.wrapping_add(ca);
|
||||
ctx.xer_ca = if result32 < ra32 { 1 } else { 0 };
|
||||
ctx.gpr[instr.rd()] = result32 as u64;
|
||||
ctx.gpr[instr.rd()] = ctx.gpr[instr.ra()].wrapping_add(ca as u64);
|
||||
if instr.oe() {
|
||||
let true_sum = (ra32 as i32 as i128) + (ca as i128);
|
||||
overflow::apply(ctx, true_sum != (result32 as i32) as i128);
|
||||
@@ -235,12 +250,12 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::addmex => {
|
||||
// PPCBUG-016+020: 32-bit truncation. RT = RA + CA - 1.
|
||||
// PPCBUG-020 fix: full 64-bit `RA + CA - 1`; CA stays 32-bit.
|
||||
let ra32 = ctx.gpr[instr.ra()] as u32;
|
||||
let ca = ctx.xer_ca as u32;
|
||||
let result32 = ra32.wrapping_add(ca).wrapping_sub(1);
|
||||
ctx.xer_ca = if ra32 != 0 || ca != 0 { 1 } else { 0 };
|
||||
ctx.gpr[instr.rd()] = result32 as u64;
|
||||
ctx.gpr[instr.rd()] = ctx.gpr[instr.ra()].wrapping_add(ca as u64).wrapping_sub(1);
|
||||
if instr.oe() {
|
||||
let true_sum = (ra32 as i32 as i128) + (ca as i128) - 1;
|
||||
overflow::apply(ctx, true_sum != (result32 as i32) as i128);
|
||||
@@ -251,11 +266,12 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::subfx => {
|
||||
// PPCBUG-017+020: 32-bit truncation.
|
||||
// PPCBUG-020 fix: full 64-bit `RB - RA` (canary `Sub(RB, RA)`).
|
||||
// OV/CR0 keep their 32-bit view (low 32 of result unchanged).
|
||||
let ra32 = ctx.gpr[instr.ra()] as u32;
|
||||
let rb32 = ctx.gpr[instr.rb()] as u32;
|
||||
let result32 = rb32.wrapping_sub(ra32);
|
||||
ctx.gpr[instr.rd()] = result32 as u64;
|
||||
ctx.gpr[instr.rd()] = ctx.gpr[instr.rb()].wrapping_sub(ctx.gpr[instr.ra()]);
|
||||
if instr.oe() {
|
||||
let true_diff = (rb32 as i32 as i128) - (ra32 as i32 as i128);
|
||||
overflow::apply(ctx, true_diff != (result32 as i32) as i128);
|
||||
@@ -266,14 +282,13 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::subfcx => {
|
||||
// PPCBUG-007: 32-bit ABI. The `rb >= ra` u64 unsigned compare is
|
||||
// exactly the shape that broke addis. Defensive 32-bit truncation
|
||||
// is required for correct CA even after upstream cleanup.
|
||||
// PPCBUG-020 fix: full 64-bit `RB - RA`; CA stays a 32-bit `rb >= ra`
|
||||
// compare (canary `SubDidCarry` truncates to int32).
|
||||
let ra32 = ctx.gpr[instr.ra()] as u32;
|
||||
let rb32 = ctx.gpr[instr.rb()] as u32;
|
||||
let result32 = rb32.wrapping_sub(ra32);
|
||||
ctx.xer_ca = if rb32 >= ra32 { 1 } else { 0 };
|
||||
ctx.gpr[instr.rd()] = result32 as u64;
|
||||
ctx.gpr[instr.rd()] = ctx.gpr[instr.rb()].wrapping_sub(ctx.gpr[instr.ra()]);
|
||||
if instr.oe() {
|
||||
let true_diff = (rb32 as i32 as i128) - (ra32 as i32 as i128);
|
||||
overflow::apply(ctx, true_diff != (result32 as i32) as i128);
|
||||
@@ -284,14 +299,16 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::subfex => {
|
||||
// PPCBUG-008: 32-bit ABI. Compute in u32 space — `!ra` on u64 always
|
||||
// pollutes the upper 32 bits, making this an active poisoner.
|
||||
// PPCBUG-020 fix: full 64-bit `~RA + RB + CA` (canary semantics).
|
||||
// CA keeps its 32-bit compare. Low 32 of the result is unchanged.
|
||||
let ra32 = ctx.gpr[instr.ra()] as u32;
|
||||
let rb32 = ctx.gpr[instr.rb()] as u32;
|
||||
let ca = ctx.xer_ca as u32;
|
||||
let result32 = (!ra32).wrapping_add(rb32).wrapping_add(ca);
|
||||
ctx.xer_ca = if rb32 > ra32 || (rb32 == ra32 && ca != 0) { 1 } else { 0 };
|
||||
ctx.gpr[instr.rd()] = result32 as u64;
|
||||
ctx.gpr[instr.rd()] = (!ctx.gpr[instr.ra()])
|
||||
.wrapping_add(ctx.gpr[instr.rb()])
|
||||
.wrapping_add(ca as u64);
|
||||
if instr.oe() {
|
||||
// RT <- !RA + RB + CA == RB - RA - 1 + CA (32-bit semantics).
|
||||
let true_sum = (rb32 as i32 as i128) - (ra32 as i32 as i128) - 1 + (ca as i128);
|
||||
@@ -303,14 +320,13 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::subfzex => {
|
||||
// PPCBUG-018: same active-poisoning shape as subfex; operate in u32.
|
||||
// PPCBUG-020 fix: full 64-bit `~RA + CA` (canary semantics).
|
||||
let ra32 = ctx.gpr[instr.ra()] as u32;
|
||||
let ca = ctx.xer_ca as u32;
|
||||
let result32 = (!ra32).wrapping_add(ca);
|
||||
// RT <- !RA + CA (no -1 term). 32-bit carry-out only when
|
||||
// !ra32 = u32::MAX (i.e. ra32 = 0) AND ca = 1.
|
||||
// CA: 32-bit carry-out only when !ra32 = u32::MAX (ra32 = 0) AND ca = 1.
|
||||
ctx.xer_ca = if ra32 == 0 && ca != 0 { 1 } else { 0 };
|
||||
ctx.gpr[instr.rd()] = result32 as u64;
|
||||
ctx.gpr[instr.rd()] = (!ctx.gpr[instr.ra()]).wrapping_add(ca as u64);
|
||||
if instr.oe() {
|
||||
let true_sum = -(ra32 as i32 as i128) - 1 + (ca as i128);
|
||||
overflow::apply(ctx, true_sum != (result32 as i32) as i128);
|
||||
@@ -321,13 +337,13 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::subfmex => {
|
||||
// PPCBUG-019: also fixes the always-true CA edge — `!ra` on u64
|
||||
// is non-zero when ra32==0xFFFFFFFF and ca==0, so CA was stuck at 1.
|
||||
// PPCBUG-020 fix: full 64-bit `~RA + CA - 1` (canary semantics). CA
|
||||
// uses the 32-bit `!ra32` so it isn't stuck at 1 from u64 inversion.
|
||||
let ra32 = ctx.gpr[instr.ra()] as u32;
|
||||
let ca = ctx.xer_ca as u32;
|
||||
let result32 = (!ra32).wrapping_add(ca).wrapping_sub(1);
|
||||
ctx.xer_ca = if (!ra32) != 0 || ca != 0 { 1 } else { 0 };
|
||||
ctx.gpr[instr.rd()] = result32 as u64;
|
||||
ctx.gpr[instr.rd()] = (!ctx.gpr[instr.ra()]).wrapping_add(ca as u64).wrapping_sub(1);
|
||||
if instr.oe() {
|
||||
let true_sum = -(ra32 as i32 as i128) - 2 + (ca as i128);
|
||||
overflow::apply(ctx, true_sum != (result32 as i32) as i128);
|
||||
@@ -338,12 +354,11 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::negx => {
|
||||
// PPCBUG-006: 32-bit ABI. `(!ra).wrapping_add(1)` on u64 always
|
||||
// sets upper 32 bits — every neg poisoned the GPR. neg_ov also
|
||||
// checks at 64-bit INT_MIN; should be 32-bit INT_MIN.
|
||||
// PPCBUG-020 fix: full 64-bit `-RA` (canary `Sub(0, RA)`). OV keeps
|
||||
// the 32-bit INT_MIN check (low 32 of the result is unchanged).
|
||||
let ra32 = ctx.gpr[instr.ra()] as u32;
|
||||
let result32 = (!ra32).wrapping_add(1);
|
||||
ctx.gpr[instr.rd()] = result32 as u64;
|
||||
ctx.gpr[instr.rd()] = 0u64.wrapping_sub(ctx.gpr[instr.ra()]);
|
||||
if instr.oe() {
|
||||
overflow::apply(ctx, ra32 == 0x8000_0000);
|
||||
}
|
||||
@@ -353,12 +368,15 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::mullwx => {
|
||||
// PPCBUG-009: 32-bit ABI. Truncate product to u32 — overflow detection
|
||||
// (mullw_ov) still uses the full i64 product to catch the overflow.
|
||||
// PPCBUG-020 fix: full 64-bit low product of EXTS(RA[32:63]) ×
|
||||
// EXTS(RB[32:63]) (canary InstrEmit_mullwx stores the full i64
|
||||
// product). A 32×32 product can occupy the upper 32 bits (e.g.
|
||||
// 0x10000 × 0x10000 = 0x1_0000_0000); the old `as u32` dropped them.
|
||||
// OV uses the full product; CR0 keeps its 32-bit (low-word) view.
|
||||
let ra = ctx.gpr[instr.ra()] as i32 as i64;
|
||||
let rb = ctx.gpr[instr.rb()] as i32 as i64;
|
||||
let product = ra.wrapping_mul(rb);
|
||||
ctx.gpr[instr.rd()] = product as u32 as u64;
|
||||
ctx.gpr[instr.rd()] = product as u64;
|
||||
if instr.oe() {
|
||||
overflow::apply(ctx, overflow::mullw_ov(product));
|
||||
}
|
||||
@@ -542,6 +560,18 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] | ctx.gpr[instr.rb()];
|
||||
if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as u32 as i32 as i64); }
|
||||
ctx.pc += 4;
|
||||
// `or r31,r31,r31` with encoding 0x7FFFFB78 is the Xenon `db16cyc`
|
||||
// spin-wait hint (a no-op write of r31 onto itself). Canary's
|
||||
// `InstrEmit_orx` special-cases exactly this code → `DelayExecution()`.
|
||||
// Under our round-robin lockstep, a guest spinlock/barrier loop that
|
||||
// executes db16cyc would otherwise consume its whole block every round
|
||||
// and starve the co-located thread it is waiting on (the lock holder /
|
||||
// barrier peer). Surface it as a cooperative yield so the scheduler can
|
||||
// hand the slot to a Ready peer. The semantic result of the op is
|
||||
// already applied (r31 |= r31 is a no-op), so yielding is value-neutral.
|
||||
if instr.raw == 0x7FFF_FB78 {
|
||||
return StepResult::Yield;
|
||||
}
|
||||
}
|
||||
PpcOpcode::orcx => {
|
||||
// PPCBUG-028: same shape as andcx — operate in u32.
|
||||
@@ -620,7 +650,12 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
PpcOpcode::slwx => {
|
||||
// PPCBUG-044: 32-bit ABI CR0 view. A result with bit 31 set
|
||||
// (e.g. 0x80000000) is negative in i32 view but positive in i64.
|
||||
let sh = ctx.gpr[instr.rb()] as u32;
|
||||
// Shift amount is RB[58:63] (6 bits): if >=32 the result is zeroed,
|
||||
// else shift by the low bits. Matches canary InstrEmit_slwx, which
|
||||
// masks `rb & 0x3F` then tests bit 5 — NOT a full-u32 `< 32` test
|
||||
// (a count like 0x40 has low-6-bits 0 and must pass the value
|
||||
// through, not zero it).
|
||||
let sh = ctx.gpr[instr.rb()] as u32 & 0x3F;
|
||||
ctx.gpr[instr.ra()] = if sh < 32 {
|
||||
((ctx.gpr[instr.rs()] as u32) << sh) as u64
|
||||
} else { 0 };
|
||||
@@ -630,7 +665,9 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
PpcOpcode::srwx => {
|
||||
// PPCBUG-044: 32-bit ABI CR0 view (zero-extended right shift can never
|
||||
// have bit 31 set, but use the canonical form for consistency).
|
||||
let sh = ctx.gpr[instr.rb()] as u32;
|
||||
// Shift amount masked to RB[58:63] (6 bits) to match canary
|
||||
// InstrEmit_srwx (`rb & 0x3F`, test bit 5).
|
||||
let sh = ctx.gpr[instr.rb()] as u32 & 0x3F;
|
||||
ctx.gpr[instr.ra()] = if sh < 32 {
|
||||
((ctx.gpr[instr.rs()] as u32) >> sh) as u64
|
||||
} else { 0 };
|
||||
@@ -638,37 +675,46 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::srawx => {
|
||||
// PPCBUG-041+043 coupled: 32-bit ABI writeback truncation + CR0 i32.
|
||||
// CA logic is independently correct (uses u32 shifted-out test).
|
||||
// sraw: 32-bit arithmetic shift right. Per PowerISA the 32-bit result
|
||||
// is SIGN-extended into the full 64-bit RA (`RA <- r&m | (i64.s)&¬m`),
|
||||
// matching canary InstrEmit_srawx (`v = f.SignExtend(v, INT64_TYPE)`).
|
||||
// Our earlier code zero-extended (`result as u32 as u64`) — the PPCBUG-041
|
||||
// "writeback truncation" band-aid — which corrupts any negative shift
|
||||
// result consumed as a 64-bit value. CA logic is independently correct
|
||||
// (uses the u32 shifted-out test) and the CR0 view is unchanged (the
|
||||
// sign-extended i64 has the same i32 view).
|
||||
let rs = ctx.gpr[instr.rs()] as i32;
|
||||
let sh = ctx.gpr[instr.rb()] as u32 & 0x3F;
|
||||
if sh == 0 {
|
||||
ctx.gpr[instr.ra()] = rs as u32 as u64;
|
||||
let result: i32 = if sh == 0 {
|
||||
ctx.xer_ca = 0;
|
||||
rs
|
||||
} else if sh < 32 {
|
||||
let result = rs >> sh;
|
||||
ctx.xer_ca = if rs < 0 && (rs as u32) << (32 - sh) != 0 { 1 } else { 0 };
|
||||
ctx.gpr[instr.ra()] = result as u32 as u64;
|
||||
rs >> sh
|
||||
} else {
|
||||
ctx.gpr[instr.ra()] = if rs < 0 { 0xFFFF_FFFFu64 } else { 0 };
|
||||
// sh >= 32: result is all sign bits of rs.
|
||||
ctx.xer_ca = if rs < 0 { 1 } else { 0 };
|
||||
}
|
||||
if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as u32 as i32 as i64); }
|
||||
rs >> 31
|
||||
};
|
||||
ctx.gpr[instr.ra()] = result as i64 as u64;
|
||||
if instr.rc_bit() { ctx.update_cr_signed(0, result as i64); }
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::srawix => {
|
||||
// PPCBUG-042+043 coupled: same shape as srawx for the sh-immediate form.
|
||||
// srawi: same as srawx for the sh-immediate form (sh in 0..31).
|
||||
// Sign-extend the 32-bit result into the full 64-bit RA per PowerISA /
|
||||
// canary InstrEmit_srawix.
|
||||
let rs = ctx.gpr[instr.rs()] as i32;
|
||||
let sh = instr.sh();
|
||||
if sh == 0 {
|
||||
ctx.gpr[instr.ra()] = rs as u32 as u64;
|
||||
let result: i32 = if sh == 0 {
|
||||
ctx.xer_ca = 0;
|
||||
rs
|
||||
} else {
|
||||
let result = rs >> sh;
|
||||
ctx.xer_ca = if rs < 0 && (rs as u32) << (32 - sh) != 0 { 1 } else { 0 };
|
||||
ctx.gpr[instr.ra()] = result as u32 as u64;
|
||||
}
|
||||
if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as u32 as i32 as i64); }
|
||||
rs >> sh
|
||||
};
|
||||
ctx.gpr[instr.ra()] = result as i64 as u64;
|
||||
if instr.rc_bit() { ctx.update_cr_signed(0, result as i64); }
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::sldx => {
|
||||
@@ -966,7 +1012,13 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
|
||||
if cond_ok {
|
||||
let next_pc = ctx.pc + 4;
|
||||
ctx.pc = (ctx.ctr as u32) & !3;
|
||||
let target = (ctx.ctr as u32) & !3;
|
||||
// Ground-truth indirect-dispatch recording (env-gated, off by
|
||||
// default; pure record-only, no scheduling/state change).
|
||||
if crate::dispatch_rec::enabled() {
|
||||
crate::dispatch_rec::record(ctx.pc, target, ctx.gpr[3], ctx.lr);
|
||||
}
|
||||
ctx.pc = target;
|
||||
if instr.lk() {
|
||||
ctx.lr = next_pc as u64;
|
||||
}
|
||||
@@ -1605,7 +1657,12 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
match spr {
|
||||
crate::context::spr::XER => ctx.set_xer(val as u32),
|
||||
crate::context::spr::LR => ctx.lr = val,
|
||||
crate::context::spr::CTR => ctx.ctr = val as u32 as u64,
|
||||
// CTR is a 64-bit SPR — store the full GPR, matching canary
|
||||
// InstrEmit_mtspr (`f.StoreCTR(rt)`, no truncation). The PPCBUG-054
|
||||
// `val as u32 as u64` band-aid dropped the upper 32 bits, which a
|
||||
// later `mfspr rX, CTR` would read back wrong. (bdnz/bcctr only
|
||||
// ever consume CTR's low 32 bits, so branching is unaffected.)
|
||||
crate::context::spr::CTR => ctx.ctr = val,
|
||||
crate::context::spr::DEC => ctx.dec = val as u32,
|
||||
crate::context::spr::TBL_WRITE => {
|
||||
ctx.timebase = (ctx.timebase & 0xFFFF_FFFF_0000_0000) | (val & 0xFFFF_FFFF);
|
||||
@@ -5015,6 +5072,106 @@ mod tests {
|
||||
assert_eq!(ctx.pc, 4);
|
||||
}
|
||||
|
||||
#[test]
fn test_db16cyc_yields() {
    // The Xenon `db16cyc` spin-wait hint is the exact instruction word
    // 0x7FFFFB78 (`or r31,r31,r31`). Stepping over it has to:
    //   1. leave r31 holding whatever it held before,
    //   2. move PC on to the following instruction, and
    //   3. come back as StepResult::Yield so the scheduler may give the
    //      slot to another thread.
    let r31_before: u64 = 0x1234_5678_9ABC_DEF0;

    let mut mem = TestMem::new();
    write_instr(&mut mem, 0, 0x7FFF_FB78);

    let mut ctx = PpcContext::new();
    ctx.gpr[31] = r31_before;
    ctx.pc = 0;

    let outcome = step(&mut ctx, &mut mem);

    assert_eq!(ctx.gpr[31], r31_before, "db16cyc is value-neutral");
    assert_eq!(ctx.pc, 4, "PC advances past the hint");
    assert_eq!(outcome, StepResult::Yield, "db16cyc surfaces as a cooperative yield");
}
|
||||
|
||||
#[test]
fn test_plain_or_self_is_not_yield() {
    // Counterpart to the db16cyc test: a self-`or` on any register other than
    // the spin-hint encoding (r3 is used here) is just a no-op move. It must
    // keep running (Continue) so that only the exact spin-hint word yields.
    //
    // `or r3,r3,r3` with Rc=0 encodes as
    //   31<<26 | 3<<21 | 3<<16 | 3<<11 | 444<<1
    let reg = 3u32;
    let raw = (31u32 << 26) | (reg << 21) | (reg << 16) | (reg << 11) | (444 << 1);

    let mut mem = TestMem::new();
    write_instr(&mut mem, 0, raw);

    let mut ctx = PpcContext::new();
    ctx.gpr[3] = 0xCAFE;
    ctx.pc = 0;

    let outcome = step(&mut ctx, &mut mem);

    assert_eq!(ctx.gpr[3], 0xCAFE);
    assert_eq!(ctx.pc, 4);
    assert_eq!(outcome, StepResult::Continue, "non-db16cyc or-self stays Continue");
}
|
||||
|
||||
#[test]
fn test_smt_priority_hints_are_nops_not_yields() {
    // Hint-class audit (iterate-2H). The self-`or` encodings on r1, r2, r3,
    // r6 (SMT thread-priority hints) and r26..r30 (the db8cyc family) are
    // reserved no-ops. Per the audit, canary's `InstrEmit_orx` emits a Nop
    // for every `or rX,rX,rX` with Rc=0 and reserves `DelayExecution()` for
    // the single db16cyc word 0x7FFFFB78. The interpreter therefore must NOT
    // yield on any of the registers below: an extra yield would diverge from
    // canary and disturb the deterministic schedule. The audit found none of
    // these encodings in Sylpheed's image, but the invariant is pinned here
    // regardless.
    const PATTERN: u64 = 0xDEAD_BEEF_F00D_BA11;
    const HINT_REGS: [u32; 9] = [1, 2, 3, 6, 26, 27, 28, 29, 30];

    for r in HINT_REGS {
        let slot = r as usize;
        // or rN,rN,rN with Rc=0: 31<<26 | r<<21 | r<<16 | r<<11 | 444<<1
        let raw = (31u32 << 26) | (r << 21) | (r << 16) | (r << 11) | (444 << 1);

        let mut mem = TestMem::new();
        write_instr(&mut mem, 0, raw);

        let mut ctx = PpcContext::new();
        ctx.gpr[slot] = PATTERN;
        ctx.pc = 0;

        let outcome = step(&mut ctx, &mut mem);

        assert_eq!(
            ctx.gpr[slot], PATTERN,
            "or {r},{r},{r} is value-neutral"
        );
        assert_eq!(ctx.pc, 4, "or {r},{r},{r} advances PC");
        assert_eq!(
            outcome,
            StepResult::Continue,
            "priority hint or {r},{r},{r} is a plain no-op (canary Nop), NOT a yield"
        );
    }
}
|
||||
|
||||
#[test]
fn test_lwsync_ptesync_eieio_isync_decode_as_benign_noops() {
    // Barrier / sync class. According to the audit notes, canary dispatches
    // `sync` purely on XO=598, so the L=0 (sync), L=1 (lwsync) and L=2
    // (ptesync) forms all reach the same `InstrEmit_sync` -> `MemoryBarrier`;
    // `eieio` is also a `MemoryBarrier` and `isync` is a `Nop`. For this
    // interpreter each one has to decode successfully (not trap as unknown),
    // advance PC by one instruction, and leave XER, FPSCR and every GPR alone.
    let barriers: [(u32, &str); 5] = [
        (0x7C00_04AC, "sync"),    // L=0
        (0x7C20_04AC, "lwsync"),  // L=1
        (0x7C40_04AC, "ptesync"), // L=2
        (0x7C00_06AC, "eieio"),
        (0x4C00_012C, "isync"),
    ];

    for (raw, name) in barriers {
        let mut mem = TestMem::new();
        write_instr(&mut mem, 0x200, raw);

        let mut ctx = PpcContext::new();
        // Snapshot the architectural state the barrier must not disturb.
        let (xer_before, fpscr_before, gpr_before) = (ctx.xer(), ctx.fpscr, ctx.gpr);
        ctx.pc = 0x200;

        let outcome = step(&mut ctx, &mut mem);

        assert_eq!(outcome, StepResult::Continue, "{name} continues");
        assert_eq!(ctx.pc, 0x204, "{name} advances PC (decoded, did not trap)");
        assert_eq!(ctx.xer(), xer_before, "{name} leaves XER");
        assert_eq!(ctx.fpscr, fpscr_before, "{name} leaves FPSCR");
        assert_eq!(ctx.gpr, gpr_before, "{name} leaves GPRs");
    }
}
|
||||
|
||||
#[test]
|
||||
fn test_fadd() {
|
||||
let mut ctx = PpcContext::new();
|
||||
@@ -5332,15 +5489,17 @@ mod tests {
|
||||
write_instr(&mut mem, 0, raw);
|
||||
ctx.pc = 0;
|
||||
step(&mut ctx, &mut mem);
|
||||
assert_eq!(ctx.xer_ov, 1);
|
||||
// -INT_MIN wraps to INT_MIN (low 32 bits) with upper 32 bits zero.
|
||||
assert_eq!(ctx.gpr[5], 0x0000_0000_8000_0000);
|
||||
assert_eq!(ctx.xer_ov, 1, "32-bit INT_MIN check (preserved) sets OV");
|
||||
// PPCBUG-020 fix: neg is full 64-bit `0 - RA` (canary `Sub(0, RA)`).
|
||||
// RA = 0x0000_0000_8000_0000 → 0xFFFF_FFFF_8000_0000. (OV remains the
|
||||
// preserved 32-bit INT_MIN flag.)
|
||||
assert_eq!(ctx.gpr[5], 0xFFFF_FFFF_8000_0000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn neg_clean_input_no_upper_bits() {
|
||||
// PPCBUG-006 regression: neg r3=5 must produce 0x00000000_FFFFFFFB,
|
||||
// not 0xFFFFFFFF_FFFFFFFB (the 64-bit !ra-then-add-1 result).
|
||||
// PPCBUG-020 fix: neg r3=5 = `0 - 5` = -5 = 0xFFFFFFFF_FFFFFFFB on a
|
||||
// 64-bit core (canary `Sub(0, RA)`), not the truncated 0x00000000_FFFFFFFB.
|
||||
let mut ctx = PpcContext::new();
|
||||
let mut mem = TestMem::new();
|
||||
ctx.gpr[3] = 5;
|
||||
@@ -5348,7 +5507,7 @@ mod tests {
|
||||
write_instr(&mut mem, 0, raw);
|
||||
ctx.pc = 0;
|
||||
step(&mut ctx, &mut mem);
|
||||
assert_eq!(ctx.gpr[5], 0x0000_0000_FFFF_FFFB);
|
||||
assert_eq!(ctx.gpr[5], 0xFFFF_FFFF_FFFF_FFFB);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -5502,9 +5661,10 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn mullwx_overflow_truncates_to_32() {
|
||||
// PPCBUG-009: mullwo r5, r3, r4 with ra=0x10000, rb=0x10000 → product
|
||||
// 0x100000000 (overflow). Low 32 = 0; OE must fire.
|
||||
fn mullwx_overflow_keeps_full_64bit_product() {
|
||||
// PPCBUG-020 fix: mullwo r5, r3, r4 with ra=0x10000, rb=0x10000 → full
|
||||
// 64-bit product 0x1_0000_0000 (canary stores the full i64 product, not
|
||||
// the truncated low 32). OE still fires (the product overflows int32).
|
||||
let mut ctx = PpcContext::new();
|
||||
let mut mem = TestMem::new();
|
||||
ctx.gpr[3] = 0x10000;
|
||||
@@ -5514,7 +5674,7 @@ mod tests {
|
||||
write_instr(&mut mem, 0, raw);
|
||||
ctx.pc = 0;
|
||||
step(&mut ctx, &mut mem);
|
||||
assert_eq!(ctx.gpr[5], 0, "low 32 bits = 0");
|
||||
assert_eq!(ctx.gpr[5], 0x0000_0001_0000_0000, "full 64-bit product");
|
||||
assert_eq!(ctx.xer_ov, 1, "overflow detected");
|
||||
}
|
||||
|
||||
@@ -5536,9 +5696,74 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn srawx_negative_value_zero_extends_upper() {
|
||||
// PPCBUG-041+043: srawx of negative i32 by 1 produces a negative i32;
|
||||
// writeback must zero-extend to u64 (not sign-extend).
|
||||
fn slwx_shift_count_masks_to_6_bits() {
|
||||
// slw masks the shift count to RB[58:63] (6 bits): a count of 0x40 has
|
||||
// low-6-bits 0, so the value passes through unchanged — it must NOT be
|
||||
// zeroed by a naive full-u32 `>= 32` test. Matches canary InstrEmit_slwx.
|
||||
let mut ctx = PpcContext::new();
|
||||
let mut mem = TestMem::new();
|
||||
ctx.gpr[3] = 0x0000_1234u64;
|
||||
ctx.gpr[4] = 0x40; // count & 0x3F == 0 → shift by 0
|
||||
// slwx r5, r3, r4 (XO=24)
|
||||
let raw = (31u32 << 26) | (3 << 21) | (5 << 16) | (4 << 11) | (24 << 1);
|
||||
write_instr(&mut mem, 0, raw);
|
||||
ctx.pc = 0;
|
||||
step(&mut ctx, &mut mem);
|
||||
assert_eq!(ctx.gpr[5], 0x0000_1234u64, "0x40 masks to 0 → passthrough");
|
||||
}
|
||||
|
||||
#[test]
fn slwx_count_32_to_63_zeroes() {
    // Once the count is masked to six bits, any value in 32..=63 (bit 5 set)
    // must clear the destination. 0x60 masks to 0x20, i.e. exactly 32.
    let raw = (31u32 << 26) | (3 << 21) | (5 << 16) | (4 << 11) | (24 << 1);

    let mut mem = TestMem::new();
    write_instr(&mut mem, 0, raw);

    let mut ctx = PpcContext::new();
    ctx.gpr[3] = 0xFFFF_FFFFu64;
    ctx.gpr[4] = 0x60; // 0x60 & 0x3F = 0x20 (32) → result is zero
    ctx.pc = 0;

    step(&mut ctx, &mut mem);

    assert_eq!(ctx.gpr[5], 0);
}
|
||||
|
||||
#[test]
fn srwx_shift_count_masks_to_6_bits() {
    // srw applies the same six-bit mask to its count as slw. A count of 0x48
    // reduces to 8, so the expected result is a logical right shift by 8.
    // srwx r5, r3, r4 (XO=536)
    let raw = (31u32 << 26) | (3 << 21) | (5 << 16) | (4 << 11) | (536 << 1);

    let mut mem = TestMem::new();
    write_instr(&mut mem, 0, raw);

    let mut ctx = PpcContext::new();
    ctx.gpr[3] = 0x0000_FF00u64;
    ctx.gpr[4] = 0x48; // 0x48 & 0x3F = 8
    ctx.pc = 0;

    step(&mut ctx, &mut mem);

    assert_eq!(ctx.gpr[5], 0x0000_00FFu64, "0x48 masks to 8 → >>8");
}
|
||||
|
||||
#[test]
fn rlwinm_mb_greater_than_me_wraparound_mask() {
    // When MB > ME, the PowerISA MASK(mb, me) wraps around and selects the
    // two end ranges [0..ME] and [MB..31] (a "split" mask). With no rotation,
    // MB=28 and ME=3 applied to an all-ones word, the result is 0xF000000F.
    //
    // rlwinm r5, r3, SH=0, MB=28, ME=3 (primary opcode 21)
    let (rs, ra, sh, mb, me): (u32, u32, u32, u32, u32) = (3, 5, 0, 28, 3);
    let raw = (21u32 << 26) | (rs << 21) | (ra << 16) | (sh << 11) | (mb << 6) | (me << 1);

    let mut mem = TestMem::new();
    write_instr(&mut mem, 0, raw);

    let mut ctx = PpcContext::new();
    ctx.gpr[3] = 0xFFFF_FFFFu64;
    ctx.pc = 0;

    step(&mut ctx, &mut mem);

    assert_eq!(
        ctx.gpr[5],
        0x0000_0000_F000_000Fu64,
        "MB>ME wraparound mask = bits [0..3] | [28..31]"
    );
}
|
||||
|
||||
#[test]
|
||||
fn srawx_negative_value_sign_extends_upper() {
|
||||
// sraw of negative i32 by 1 produces a negative i32 result that PowerISA
|
||||
// SIGN-extends into the full 64-bit RA (canary InstrEmit_srawx uses
|
||||
// `f.SignExtend`). 0x80000000 >> 1 = 0xC0000000 (i32) → 0xFFFFFFFF_C0000000.
|
||||
// (Was 0x00000000_C0000000 under the PPCBUG-041 zero-extend band-aid.)
|
||||
let mut ctx = PpcContext::new();
|
||||
let mut mem = TestMem::new();
|
||||
ctx.gpr[3] = 0x8000_0000u64; // i32::MIN
|
||||
@@ -5548,14 +5773,15 @@ mod tests {
|
||||
write_instr(&mut mem, 0, raw);
|
||||
ctx.pc = 0;
|
||||
step(&mut ctx, &mut mem);
|
||||
assert_eq!(ctx.gpr[5], 0x0000_0000_C000_0000u64);
|
||||
assert_eq!(ctx.gpr[5], 0xFFFF_FFFF_C000_0000u64);
|
||||
assert!(ctx.cr[0].lt);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn srawix_high_count_negative_input_yields_low32_all_ones() {
|
||||
// PPCBUG-042+043: srawi with count=31 on negative input → low 32 bits
|
||||
// all ones (0xFFFFFFFF), upper 32 zero (was u64::MAX before fix).
|
||||
fn srawix_high_count_negative_input_sign_extends_all_ones() {
|
||||
// srawi count=31 on negative input → result is -1 (0xFFFFFFFF as i32),
|
||||
// sign-extended to the full 64-bit RA: 0xFFFFFFFF_FFFFFFFF (canary
|
||||
// InstrEmit_srawix). Was 0x00000000_FFFFFFFF under the zero-extend band-aid.
|
||||
let mut ctx = PpcContext::new();
|
||||
let mut mem = TestMem::new();
|
||||
ctx.gpr[3] = 0x8000_0000u64;
|
||||
@@ -5564,7 +5790,7 @@ mod tests {
|
||||
write_instr(&mut mem, 0, raw);
|
||||
ctx.pc = 0;
|
||||
step(&mut ctx, &mut mem);
|
||||
assert_eq!(ctx.gpr[5], 0x0000_0000_FFFF_FFFFu64);
|
||||
assert_eq!(ctx.gpr[5], 0xFFFF_FFFF_FFFF_FFFFu64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -5598,17 +5824,18 @@ mod tests {
|
||||
write_instr(&mut mem, 0, raw);
|
||||
ctx.pc = 0;
|
||||
step(&mut ctx, &mut mem);
|
||||
// Result low 32: 0x00000001 + 0xFFFFFFFF = 0x00000000 with carry.
|
||||
assert_eq!(ctx.gpr[4], 0);
|
||||
// PPCBUG-020 fix: full 64-bit `RA + EXTS(-1)` = 0xFFFFFFFF_00000001 +
|
||||
// 0xFFFFFFFF_FFFFFFFF = 0xFFFFFFFF_00000000 (canary). CA still comes
|
||||
// from the 32-bit compare (low 32: 0x00000001 + 0xFFFFFFFF = 0, carry).
|
||||
assert_eq!(ctx.gpr[4], 0xFFFFFFFF_00000000u64);
|
||||
assert_eq!(ctx.xer_ca, 1, "32-bit compare must see CA=1");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn mulli_overflow_wraps_to_32() {
|
||||
// PPCBUG-004: mulli must truncate to 32 bits even when the upper 32 bits
|
||||
// of RA are polluted (e.g. by upstream bugs). Pre-fix: ra = u64::MAX as
|
||||
// i64 = -1, * 2 = -2, written to GPR as `0xFFFFFFFF_FFFFFFFE`. Post-fix:
|
||||
// truncated to `0xFFFFFFFE`. Discriminating regression test.
|
||||
fn mulli_full_64bit_product() {
|
||||
// PPCBUG-020 fix: mulli uses the full 64-bit RA (canary
|
||||
// `Mul(LoadGPR(RA), Int64(EXTS(imm)))`). RA = u64::MAX = -1, × 2 = -2
|
||||
// = 0xFFFFFFFF_FFFFFFFE (full 64-bit), not the truncated 0xFFFFFFFE.
|
||||
let mut ctx = PpcContext::new();
|
||||
let mut mem = TestMem::new();
|
||||
ctx.gpr[3] = u64::MAX;
|
||||
@@ -5617,13 +5844,14 @@ mod tests {
|
||||
write_instr(&mut mem, 0, raw);
|
||||
ctx.pc = 0;
|
||||
step(&mut ctx, &mut mem);
|
||||
assert_eq!(ctx.gpr[4], 0xFFFF_FFFEu64, "low 32 bits = -2 in i32; upper 32 zero");
|
||||
assert_eq!(ctx.gpr[4], 0xFFFF_FFFF_FFFF_FFFEu64, "full 64-bit -2");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn subficx_neg_simm_zero_extends() {
|
||||
// PPCBUG-005: subfic r4, r3, -1 with r3=5: imm-ra = 0xFFFFFFFF - 5 = 0xFFFFFFFA.
|
||||
// Buggy form: imm sign-extended to u64 0xFFFFFFFFFFFFFFFF - 5 = poisoned.
|
||||
fn subficx_full_64bit_result() {
|
||||
// PPCBUG-020 fix: subfic r4, r3, -1 with r3=5 = `EXTS(-1) - RA` =
|
||||
// 0xFFFFFFFF_FFFFFFFF - 5 = 0xFFFFFFFF_FFFFFFFA (canary `Sub(Int64(
|
||||
// EXTS(imm)), RA)`). CA stays a 32-bit compare (0xFFFFFFFF >= 5 → 1).
|
||||
let mut ctx = PpcContext::new();
|
||||
let mut mem = TestMem::new();
|
||||
ctx.gpr[3] = 5;
|
||||
@@ -5632,7 +5860,7 @@ mod tests {
|
||||
write_instr(&mut mem, 0, raw);
|
||||
ctx.pc = 0;
|
||||
step(&mut ctx, &mut mem);
|
||||
assert_eq!(ctx.gpr[4], 0x0000_0000_FFFF_FFFAu64);
|
||||
assert_eq!(ctx.gpr[4], 0xFFFF_FFFF_FFFF_FFFAu64);
|
||||
assert_eq!(ctx.xer_ca, 1, "0xFFFFFFFF >= 5 → CA=1");
|
||||
}
|
||||
|
||||
@@ -6538,12 +6766,13 @@ mod tests {
|
||||
assert_eq!(ctx.pc, 4);
|
||||
}
|
||||
|
||||
// PPCBUG-054: mtspr CTR must truncate the source GPR to 32 bits, matching
|
||||
// canary's `f.Truncate(ctr, INT32_TYPE)`. Prevents upstream 64-bit GPR
|
||||
// pollution from poisoning the 32-bit CTR counter independently of the
|
||||
// bcx zero-test fix.
|
||||
// CTR is a 64-bit SPR. mtspr CTR stores the full GPR (canary
|
||||
// InstrEmit_mtspr: `f.StoreCTR(rt)`, no truncation). The bdnz/bclr zero-TEST
|
||||
// still truncates to 32 bits (separate, canary-faithful — see the bcx tests
|
||||
// above); the earlier PPCBUG-054 store-side truncation was a band-aid that a
|
||||
// later `mfspr rX, CTR` would read back wrong.
|
||||
#[test]
|
||||
fn mtspr_ctr_truncates_to_32_bits() {
|
||||
fn mtspr_ctr_keeps_full_64_bits() {
|
||||
let mut ctx = PpcContext::new();
|
||||
let mut mem = TestMem::new();
|
||||
ctx.gpr[3] = 0xFFFF_FFFF_8000_0001;
|
||||
@@ -6553,7 +6782,26 @@ mod tests {
|
||||
write_instr(&mut mem, 0, raw);
|
||||
ctx.pc = 0;
|
||||
step(&mut ctx, &mut mem);
|
||||
assert_eq!(ctx.ctr, 0x8000_0001);
|
||||
assert_eq!(ctx.ctr, 0xFFFF_FFFF_8000_0001);
|
||||
}
|
||||
|
||||
// Round-trip check: a value written with `mtspr CTR` must come back intact,
// upper 32 bits included, when read with `mfspr rX, CTR`. This is the
// guest-visible effect of mtspr storing the whole GPR into CTR.
#[test]
fn mfspr_ctr_reads_full_64_bits() {
    let wide: u64 = 0xFFFF_FFFF_8000_0001;

    // CTR is SPR 9. The instruction's SPR field carries the number with its
    // two 5-bit halves swapped.
    let ctr_field = ((9u32 & 0x1F) << 5) | ((9u32 >> 5) & 0x1F);
    let program = [
        (31u32 << 26) | (3 << 21) | (ctr_field << 11) | (467 << 1), // mtspr CTR, r3
        (31u32 << 26) | (5 << 21) | (ctr_field << 11) | (339 << 1), // mfspr r5, CTR
    ];

    let mut mem = TestMem::new();
    write_instr(&mut mem, 0, program[0]);
    write_instr(&mut mem, 4, program[1]);

    let mut ctx = PpcContext::new();
    ctx.gpr[3] = wide;
    ctx.pc = 0;

    // Execute both instructions back to back.
    for _ in 0..2 {
        step(&mut ctx, &mut mem);
    }

    assert_eq!(ctx.gpr[5], wide);
}
|
||||
|
||||
// ───────────────────────────────────────────────────────────────────────
|
||||
@@ -7640,8 +7888,8 @@ mod tests {
|
||||
ctx.xer_ca = 0;
|
||||
step(&mut ctx, &mem);
|
||||
assert_eq!(ctx.xer_ca, 0, "ra=0, ca=0 should produce CA=0");
|
||||
// PPCBUG-018: 32-bit ABI. !0u32 + 0 = u32::MAX, with upper 32 bits zero.
|
||||
assert_eq!(ctx.gpr[3], 0xFFFF_FFFFu64, "result = !0u32 + 0 = u32::MAX");
|
||||
// PPCBUG-020 fix: full 64-bit `!RA + CA` = !0u64 + 0 = u64::MAX.
|
||||
assert_eq!(ctx.gpr[3], 0xFFFF_FFFF_FFFF_FFFFu64, "result = !0u64 + 0");
|
||||
}
|
||||
// Case 3: ra=1, ca=0 → CA=0 (old buggy code reported CA=1)
|
||||
{
|
||||
@@ -7653,8 +7901,8 @@ mod tests {
|
||||
ctx.xer_ca = 0;
|
||||
step(&mut ctx, &mem);
|
||||
assert_eq!(ctx.xer_ca, 0, "ra=1, ca=0 should produce CA=0");
|
||||
// PPCBUG-018: 32-bit ABI. !1u32 + 0 = u32::MAX - 1, with upper 32 bits zero.
|
||||
assert_eq!(ctx.gpr[3], 0xFFFF_FFFEu64, "result = !1u32 + 0 = u32::MAX - 1");
|
||||
// PPCBUG-020 fix: full 64-bit `!1u64 + 0` = u64::MAX - 1.
|
||||
assert_eq!(ctx.gpr[3], 0xFFFF_FFFF_FFFF_FFFEu64, "result = !1u64 + 0");
|
||||
}
|
||||
// Case 4: ra=u32::MAX, ca=1 → CA=0; result = !RA + 1 = 0xFFFF_FFFF_0000_0001 (full 64-bit).
|
||||
{
|
||||
@@ -7666,7 +7914,9 @@ mod tests {
|
||||
ctx.xer_ca = 1;
|
||||
step(&mut ctx, &mem);
|
||||
assert_eq!(ctx.xer_ca, 0, "ra=u32::MAX, ca=1 should produce CA=0");
|
||||
assert_eq!(ctx.gpr[3], 1, "result = !u32::MAX + 1 = 1");
|
||||
// PPCBUG-020 fix: full 64-bit `!RA + CA`. RA = 0x0000_0000_FFFF_FFFF
|
||||
// → !RA = 0xFFFF_FFFF_0000_0000, + 1 = 0xFFFF_FFFF_0000_0001.
|
||||
assert_eq!(ctx.gpr[3], 0xFFFF_FFFF_0000_0001u64, "result = !RA + 1");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
pub mod block_cache;
|
||||
pub mod context;
|
||||
pub mod decoder;
|
||||
pub mod dispatch_rec;
|
||||
pub mod disasm;
|
||||
pub mod fpscr;
|
||||
pub mod interpreter;
|
||||
|
||||
@@ -204,6 +204,34 @@ impl PpcOpcode {
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns true if this opcode is a cross-thread synchronization
|
||||
/// point at which the superblock runner MUST yield back to the
|
||||
/// round-robin scheduler so the lockstep interleaving stays
|
||||
/// fine-grained enough to preserve correct cross-thread ordering:
|
||||
///
|
||||
/// - reserved load/store (`lwarx`/`ldarx`/`stwcx.`/`stdcx.`): the
|
||||
/// atomic primitive other threads race on. Running past one
|
||||
/// without returning to the scheduler would let a single slot
|
||||
/// win/lose a reservation across many blocks before any peer
|
||||
/// observes it.
|
||||
/// - memory barriers (`sync`/`eieio`/`isync`): the guest explicitly
|
||||
/// demands a global ordering point here; honour it by ending the
|
||||
/// superblock so the scheduler re-interleaves.
|
||||
///
|
||||
/// Purely a function of the opcode (no guest data), so the yield
|
||||
/// decision is deterministic and the schedule reproduces byte-identically.
|
||||
/// Note: `sc` (syscall) and traps already `terminates_block`, and
|
||||
/// import-thunk / halt-sentinel PCs are handled by the per-block
|
||||
/// prologue re-check in the superblock loop — they are not listed here.
|
||||
#[inline]
|
||||
pub fn is_sync_sensitive(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
Self::lwarx | Self::ldarx | Self::stwcx | Self::stdcx
|
||||
| Self::sync | Self::eieio | Self::isync
|
||||
)
|
||||
}
|
||||
|
||||
pub fn name(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Invalid => "invalid",
|
||||
|
||||
@@ -35,6 +35,20 @@ pub const INITIAL_GUEST_TID: u32 = 1;
|
||||
/// Axis 1 carries the field on every thread but doesn't decrement yet.
|
||||
pub const QUANTUM_DEFAULT: u32 = 50_000;
|
||||
|
||||
/// Anti-starvation floor. On a cooperative single-host slot, strict-priority
|
||||
/// `pick_runnable` lets a high-priority CPU-bound spinner (e.g. a pri-15
|
||||
/// time-critical poll loop pinned by affinity) win every round forever,
|
||||
/// permanently starving a co-located lower-priority peer that the spinner is
|
||||
/// actually *waiting on* — a deadlock that never occurs on real hardware,
|
||||
/// where SMT contexts run those threads concurrently.
|
||||
///
|
||||
/// Once a Ready thread has been passed over this many consecutive slot
|
||||
/// visits, `pick_runnable` grants it ONE pick (then its counter resets). The
|
||||
/// limit is large enough that the genuinely-higher-priority thread still wins
|
||||
/// the overwhelming majority of visits (here: ~4095/4096); the boost only
|
||||
/// guarantees *bounded* forward progress, it does not invert priority.
|
||||
pub const STARVE_LIMIT: u32 = 4096;
|
||||
|
||||
/// Above this depth, `spawn` prunes `Exited` entries from a slot's runqueue
|
||||
/// before pushing the new thread. Keeps peer `ThreadRef`s stable on the
|
||||
/// common (low-depth) path — a game that spawns a handful of long-lived
|
||||
@@ -117,6 +131,20 @@ pub struct GuestThread {
|
||||
/// Axis 3 instruction budget. Decremented per retired step on this
|
||||
/// thread; on zero, slot rotates within same-priority tier.
|
||||
pub quantum_remaining: u32,
|
||||
/// Anti-starvation counter. Incremented each slot visit this thread is
|
||||
/// Ready but NOT picked; reset to 0 when picked. When it reaches
|
||||
/// `STARVE_LIMIT`, `pick_runnable` grants this thread one boosted pick so
|
||||
/// a monopolizing higher-priority peer on the same slot cannot starve it
|
||||
/// indefinitely. Deterministic: a pure function of pick history.
|
||||
pub steps_starved: u32,
|
||||
/// SpawnParams.entry — the BL target the trampoline jumped to.
|
||||
/// Persisted so kernel exports can filter syscalls by spawning
|
||||
/// chain (e.g. the silph UI auto-signal POC). 0 for the initial
|
||||
/// thread (uses `install_initial_thread`, not `spawn`).
|
||||
pub start_entry: u32,
|
||||
/// SpawnParams.start_context — initial r3 at spawn. Persisted for
|
||||
/// the same filtering reason as `start_entry`.
|
||||
pub start_context: u32,
|
||||
}
|
||||
|
||||
impl GuestThread {
|
||||
@@ -136,6 +164,9 @@ impl GuestThread {
|
||||
affinity_mask: 0xFF,
|
||||
ideal_processor: None,
|
||||
quantum_remaining: QUANTUM_DEFAULT,
|
||||
steps_starved: 0,
|
||||
start_entry: 0,
|
||||
start_context: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -174,6 +205,21 @@ pub enum BlockReason {
|
||||
CriticalSection(u32),
|
||||
}
|
||||
|
||||
/// Floor of the **synthetic park-handle** range. Handles at or above this
|
||||
/// value are deliberately OUTSIDE the kernel object allocator (which starts
|
||||
/// at `0x1000`); they are used to park threads that must NEVER be woken by
|
||||
/// the normal signal/wait machinery — currently the dedicated audio-worker
|
||||
/// threads (`xenia_kernel::xaudio::XAUDIO_SYNTHETIC_HANDLE_BASE = 0xF000_0000`),
|
||||
/// which are only ever un-parked by audio-callback injection. The deadlock
|
||||
/// force-wake ([`Scheduler::unblock_on_deadlock`]) must skip waiters parked
|
||||
/// solely on such handles: they are not deadlock participants (the guest
|
||||
/// genuinely blocked on its own objects), and waking one runs its thread
|
||||
/// entry to the `LR_HALT` sentinel → premature exit, which then drops every
|
||||
/// subsequent injection. Kept in `xenia-cpu` (not imported from
|
||||
/// `xenia-kernel`, which depends on this crate); the kernel const must stay
|
||||
/// within `[SYNTHETIC_PARK_HANDLE_FLOOR, u32::MAX]`.
|
||||
pub const SYNTHETIC_PARK_HANDLE_FLOOR: u32 = 0xF000_0000;
|
||||
|
||||
/// Sink for PCR+0x2C writes — the scheduler writes the guest-visible
|
||||
/// current-processor-id here at spawn and Axis 4 rewrites on affinity
|
||||
/// migration. Implemented by `xenia-kernel` for `GuestMemory`; keeping it
|
||||
@@ -208,15 +254,35 @@ impl Default for HwSlot {
|
||||
impl HwSlot {
|
||||
/// Index of the highest-priority Ready/ServicingIrq thread in this
|
||||
/// slot's runqueue. Tiebreak: prefer lower index (deterministic).
|
||||
///
|
||||
/// Selection is by *effective* priority: a Ready thread that has been
|
||||
/// passed over for `STARVE_LIMIT` consecutive visits is boosted so it
|
||||
/// wins exactly one pick, then [`Scheduler::begin_slot_visit`] resets its
|
||||
/// counter. This restores the guest-visible invariant that every Ready
|
||||
/// thread makes forward progress, without inverting the intended priority
|
||||
/// order (a starved thread only beats its monopolizer once per
|
||||
/// `STARVE_LIMIT` visits). The boost is a pure function of the per-thread
|
||||
/// counters/priority/index, so picks stay deterministic.
|
||||
pub fn pick_runnable(&self) -> Option<usize> {
|
||||
self.runqueue
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, t)| matches!(t.state, HwState::Ready | HwState::ServicingIrq(_)))
|
||||
.max_by_key(|(i, t)| (t.priority, -(*i as i64)))
|
||||
.max_by_key(|(i, t)| (Self::effective_priority(t), -(*i as i64)))
|
||||
.map(|(i, _)| i)
|
||||
}
|
||||
|
||||
/// Priority used for selection. A thread starved for `STARVE_LIMIT`
|
||||
/// visits is lifted to `i32::MAX` so it wins the next pick regardless of
|
||||
/// peer priority; otherwise its nominal priority is used unchanged.
|
||||
fn effective_priority(t: &GuestThread) -> i32 {
|
||||
if t.steps_starved >= STARVE_LIMIT {
|
||||
i32::MAX
|
||||
} else {
|
||||
t.priority
|
||||
}
|
||||
}
|
||||
|
||||
/// How many non-Exited threads currently live on this slot (used by
|
||||
/// placement policies).
|
||||
pub fn live_depth(&self) -> usize {
|
||||
@@ -341,6 +407,28 @@ pub struct Scheduler {
|
||||
/// Sorted by deadline ascending. Scheduler wakes the first entry via
|
||||
/// `advance_to_next_wake` when a round finds nothing runnable.
|
||||
timed_waits: Vec<(u64, ThreadRef)>,
|
||||
/// Coherent monotonic "now" clock — the single authoritative basis the
|
||||
/// kernel deadline-arithmetic (`KernelState::now_basis_at`) reads in
|
||||
/// BOTH execution modes. Per-thread `ctx(hw_id).timebase` is NOT a
|
||||
/// coherent "now":
|
||||
/// * In `--parallel`, workers extract their `PpcContext` (leaving a
|
||||
/// zeroed timebase in the slot) and step unlocked.
|
||||
/// * In **lockstep**, a parked/poll thread has `running_idx == None`,
|
||||
/// so `ctx()` returns `idle_ctx` (timebase 0); a `parse_timeout`
|
||||
/// reading that basis registers `deadline = 0 + relative`, a value
|
||||
/// permanently in the past, and `coord_idle_advance` re-arms that
|
||||
/// same constant deadline forever (timebase-desync livelock — the
|
||||
/// render-gate root: the submitter's 16ms re-wait never fires).
|
||||
/// So a coordinator/parked thread reading per-thread timebase can see a
|
||||
/// stale/zero basis decoupled from the deadline it just advanced to.
|
||||
/// This field is that coherent basis instead. It is DETERMINISTIC: a
|
||||
/// pure function of retired guest instructions (never wall-clock).
|
||||
/// Advanced by `advance_global_clock` (per-block retired count on each
|
||||
/// parallel writeback), `advance_global_clock_to` (floored up to the
|
||||
/// deterministic per-round `stats.instruction_count` in lockstep), and
|
||||
/// floored up by `advance_all_timebases_to`. Two cold lockstep runs
|
||||
/// read identical values, so the lockstep trace stays bit-reproducible.
|
||||
global_clock: u64,
|
||||
/// Global count of TLS slots allocated — `spawn` pre-sizes new threads'
|
||||
/// `tls_values` to this.
|
||||
tls_slot_count: usize,
|
||||
@@ -379,6 +467,7 @@ impl Scheduler {
|
||||
order,
|
||||
rng_state,
|
||||
timed_waits: Vec::new(),
|
||||
global_clock: 0,
|
||||
tls_slot_count: 0,
|
||||
non_empty_runnable: 0,
|
||||
rotation_cursor: 0,
|
||||
@@ -500,6 +589,17 @@ impl Scheduler {
|
||||
self.current.expect("no current thread")
|
||||
}
|
||||
|
||||
/// `(start_entry, start_context)` of the currently-running thread.
|
||||
/// Returns None if there is no current thread or its ref is stale.
|
||||
/// Used by `KernelState::maybe_register_silph_autosignal` to filter
|
||||
/// `NtCreateEvent` calls by spawning chain.
|
||||
pub fn current_thread_entry_and_ctx(&self) -> Option<(u32, u32)> {
|
||||
let r = self.current?;
|
||||
let slot = self.slots.get(r.hw_id as usize)?;
|
||||
let t = slot.runqueue.get(r.idx as usize)?;
|
||||
Some((t.start_entry, t.start_context))
|
||||
}
|
||||
|
||||
// ----- Guest-thread lookup -----
|
||||
|
||||
/// Find the `ThreadRef` of the (non-Exited) thread with `tid`.
|
||||
@@ -614,6 +714,8 @@ impl Scheduler {
|
||||
t.priority = params.priority;
|
||||
t.affinity_mask = mask;
|
||||
t.ideal_processor = params.ideal_processor;
|
||||
t.start_entry = params.entry;
|
||||
t.start_context = params.start_context;
|
||||
// M3.7 — populate the inter-thread reservation handle + slot id
|
||||
// so the interpreter can route lwarx/stwcx through the table.
|
||||
t.ctx.hw_id = slot_id;
|
||||
@@ -708,31 +810,46 @@ impl Scheduler {
|
||||
/// the fast path — zero bits mean no slot has work and the caller
|
||||
/// falls through to `advance_to_next_wake`.
|
||||
pub fn round_schedule(&mut self) -> Vec<u8> {
|
||||
let mut buf = [0u8; HW_THREAD_COUNT];
|
||||
let n = self.round_schedule_into(&mut buf);
|
||||
buf[..n].to_vec()
|
||||
}
|
||||
|
||||
/// Allocation-free variant of [`Self::round_schedule`] (Tier-A perf #2).
|
||||
/// Fills `buf` with the runnable slot ids and returns the count `n`; the
|
||||
/// valid range is `buf[..n]`. The hot scheduler loop (lockstep +
|
||||
/// parallel) calls this with a reusable stack array so it does not
|
||||
/// `__rust_alloc`/`__rust_dealloc` a fresh `Vec` every round (~7 instr
|
||||
/// apart at boot-to-splash → millions of churned allocations). Identical
|
||||
/// ordering / RNG-advance semantics to `round_schedule`, so the schedule
|
||||
/// — and thus the lockstep digest — is byte-for-byte unchanged.
|
||||
pub fn round_schedule_into(&mut self, buf: &mut [u8; HW_THREAD_COUNT]) -> usize {
|
||||
if self.non_empty_runnable == 0 {
|
||||
return Vec::new();
|
||||
return 0;
|
||||
}
|
||||
let start = self.rotation_cursor as usize;
|
||||
let mut out: Vec<u8> = Vec::with_capacity(HW_THREAD_COUNT);
|
||||
let mut n = 0usize;
|
||||
for off in 0..HW_THREAD_COUNT {
|
||||
let i = (start + off) % HW_THREAD_COUNT;
|
||||
if self.non_empty_runnable & (1 << i) != 0 {
|
||||
out.push(i as u8);
|
||||
buf[n] = i as u8;
|
||||
n += 1;
|
||||
}
|
||||
}
|
||||
// Seeded mode layers a deterministic shuffle on top of the
|
||||
// already-filtered list. Same spawn/wake sequence + same seed ⇒
|
||||
// same schedule (invariant preserved from pre-Axis-1).
|
||||
if let OrderMode::Seeded { .. } = self.order {
|
||||
for i in (1..out.len()).rev() {
|
||||
for i in (1..n).rev() {
|
||||
self.rng_state ^= self.rng_state << 13;
|
||||
self.rng_state ^= self.rng_state >> 7;
|
||||
self.rng_state ^= self.rng_state << 17;
|
||||
let j = (self.rng_state as usize) % (i + 1);
|
||||
out.swap(i, j);
|
||||
buf.swap(i, j);
|
||||
}
|
||||
}
|
||||
self.rotation_cursor = ((start + 1) % HW_THREAD_COUNT) as u8;
|
||||
out
|
||||
n
|
||||
}
|
||||
|
||||
pub fn begin_round(&mut self) {
|
||||
@@ -744,10 +861,22 @@ impl Scheduler {
|
||||
/// stashes `self.current` so exports can reach it.
|
||||
pub fn begin_slot_visit(&mut self, hw_id: u8) {
|
||||
let slot = &mut self.slots[hw_id as usize];
|
||||
slot.running_idx = slot.pick_runnable();
|
||||
self.current = slot
|
||||
.running_idx
|
||||
.map(|idx| ThreadRef::new(hw_id, idx as u16));
|
||||
let picked = slot.pick_runnable();
|
||||
slot.running_idx = picked;
|
||||
// Anti-starvation bookkeeping: reset the picked thread's counter,
|
||||
// increment every other Ready peer that was passed over this visit.
|
||||
// Once a passed-over thread reaches STARVE_LIMIT it wins the next
|
||||
// pick_runnable (effective_priority -> i32::MAX), then lands here as
|
||||
// `picked` and resets — bounding any thread's starvation. Pure
|
||||
// function of pick history, so it stays deterministic.
|
||||
for (i, t) in slot.runqueue.iter_mut().enumerate() {
|
||||
if Some(i) == picked {
|
||||
t.steps_starved = 0;
|
||||
} else if matches!(t.state, HwState::Ready | HwState::ServicingIrq(_)) {
|
||||
t.steps_starved = t.steps_starved.saturating_add(1);
|
||||
}
|
||||
}
|
||||
self.current = picked.map(|idx| ThreadRef::new(hw_id, idx as u16));
|
||||
}
|
||||
|
||||
/// Clear `current` at the end of each per-slot visit.
|
||||
@@ -803,6 +932,41 @@ impl Scheduler {
|
||||
false
|
||||
}
|
||||
|
||||
/// Cooperative yield: the currently-running thread executed a `db16cyc`
|
||||
/// spin-wait hint (see `StepResult::Yield`). It is busy-spinning on a
|
||||
/// guest spinlock/barrier whose release depends on a *co-located* peer
|
||||
/// that cannot make progress while this thread keeps winning the slot.
|
||||
///
|
||||
/// Promote every Ready peer on this slot past `STARVE_LIMIT` so the next
|
||||
/// `begin_slot_visit` picks one of them (their `effective_priority` →
|
||||
/// `i32::MAX`), and reset the yielder's own counter. Each promoted peer
|
||||
/// runs once and resets to 0 in `begin_slot_visit`; once all peers have
|
||||
/// had their turn the spinner is picked again, spins, and re-yields —
|
||||
/// producing a fair round-robin between the spinner and the threads it is
|
||||
/// waiting on. This mirrors real hardware, where all six HW threads run
|
||||
/// concurrently and the spin resolves as soon as the peer releases.
|
||||
///
|
||||
/// Pure function of the slot's current state (no RNG, no wall-clock), so
|
||||
/// it preserves lockstep determinism. No-op if there is no Ready peer
|
||||
/// (the spinner is alone on its slot — nothing to hand off to).
|
||||
///
|
||||
/// Returns `true` if at least one peer was promoted.
|
||||
pub fn yield_current(&mut self) -> bool {
|
||||
let Some(r) = self.current else { return false; };
|
||||
let slot = &mut self.slots[r.hw_id as usize];
|
||||
let me = r.idx as usize;
|
||||
let mut promoted = false;
|
||||
for (i, t) in slot.runqueue.iter_mut().enumerate() {
|
||||
if i == me {
|
||||
t.steps_starved = 0;
|
||||
} else if matches!(t.state, HwState::Ready | HwState::ServicingIrq(_)) {
|
||||
t.steps_starved = STARVE_LIMIT;
|
||||
promoted = true;
|
||||
}
|
||||
}
|
||||
promoted
|
||||
}
|
||||
|
||||
// ----- Park / wake / exit -----
|
||||
|
||||
pub fn park_current(&mut self, reason: BlockReason) {
|
||||
@@ -1091,6 +1255,42 @@ impl Scheduler {
|
||||
}
|
||||
}
|
||||
}
|
||||
// Keep the coherent clock at least as far forward as any deadline we
|
||||
// fast-forward to (idle/timer/wake advances). `global_clock` is read in
|
||||
// both execution modes; this floor-up is a pure function of the current
|
||||
// clock and `deadline`, so the lockstep trace stays reproducible.
|
||||
self.global_clock = self.global_clock.max(deadline);
|
||||
}
|
||||
|
||||
/// Coherent monotonic "now" (see [`Self::global_clock`] field doc).
|
||||
/// Read by the kernel deadline-arithmetic
|
||||
/// (`KernelState::now_basis_at`) in both execution modes, parallel and
|
||||
/// lockstep, instead of the per-thread `ctx(hw_id).timebase`.
|
||||
#[inline]
|
||||
pub fn global_clock(&self) -> u64 {
|
||||
self.global_clock
|
||||
}
|
||||
|
||||
/// Advance the parallel-mode coherent clock by `n` retired instructions.
|
||||
/// Called from the parallel worker writeback with the block's executed
|
||||
/// count so "now" tracks aggregate guest progress.
|
||||
#[inline]
|
||||
pub fn advance_global_clock(&mut self, n: u64) {
|
||||
self.global_clock = self.global_clock.saturating_add(n);
|
||||
}
|
||||
|
||||
/// Floor the coherent clock up to `now` (monotonic; never goes
|
||||
/// backwards). Used by the **lockstep** outer loop once per round to
|
||||
/// track the deterministic retired-instruction count
|
||||
/// (`stats.instruction_count`) as the single coherent "now". A plain
|
||||
/// floor-up rather than `saturating_add` because the lockstep caller
|
||||
/// passes an absolute monotonic counter (not a per-block delta), and
|
||||
/// because `advance_all_timebases_to` may already have pushed
|
||||
/// `global_clock` past the instruction count when fast-forwarding to a
|
||||
/// future deadline — clamping with `max` keeps both sources monotone.
|
||||
#[inline]
|
||||
pub fn advance_global_clock_to(&mut self, now: u64) {
|
||||
self.global_clock = self.global_clock.max(now);
|
||||
}
|
||||
|
||||
/// Fast-forward the timebase to the earliest pending timed wait and
|
||||
@@ -1123,7 +1323,15 @@ impl Scheduler {
|
||||
};
|
||||
t.quantum_remaining = QUANTUM_DEFAULT;
|
||||
self.recompute_slot_runnable(r.hw_id);
|
||||
tracing::info!(
|
||||
// DEBUG, not INFO: this fires once per timed-wait deadline-wake, which
|
||||
// during the boot idle-spin happens hundreds of thousands of times. At
|
||||
// INFO it floods the console/log file and throttles the interactive
|
||||
// `exec --ui` path so hard (≈286K lines flushed to disk) that the guest
|
||||
// crawls and never reaches the ~30–150M-instruction splash window —
|
||||
// which masqueraded as a "--ui early termination" (iterate-3R). The
|
||||
// headless `check` path runs `--quiet` (WARN) so it was never throttled.
|
||||
// No execution-semantics change; deterministic golden is unaffected.
|
||||
tracing::debug!(
|
||||
"scheduler: advanced to deadline {} waking hw={} idx={}",
|
||||
deadline,
|
||||
r.hw_id,
|
||||
@@ -1161,6 +1369,28 @@ impl Scheduler {
|
||||
})
|
||||
}
|
||||
|
||||
/// True if any thread is currently `Blocked` on a `WaitAny`/`WaitAll`
|
||||
/// whose handle set contains `handle`. Used by the handle-slab recycler
|
||||
/// (AUDIT-059 R34) to avoid an ABA hazard: if a closed handle's slot is
|
||||
/// returned to the free list while a thread is still parked on it, a
|
||||
/// later `alloc_handle` could hand the same slot to a NEW object, and a
|
||||
/// signal on that new object would wake the stale waiter that was
|
||||
/// waiting on the OLD (closed) object. Canary sidesteps this by keeping
|
||||
/// the object alive via an object_ref while waiters hold references; we
|
||||
/// instead simply decline to recycle a still-waited slot (leaking it,
|
||||
/// matching the pre-R34 bump-only behaviour for that rare case).
|
||||
pub fn any_thread_waiting_on(&self, handle: u32) -> bool {
|
||||
self.slots.iter().any(|slot| {
|
||||
slot.runqueue.iter().any(|t| match &t.state {
|
||||
HwState::Blocked(BlockReason::WaitAny { handles, .. })
|
||||
| HwState::Blocked(BlockReason::WaitAll { handles, .. }) => {
|
||||
handles.contains(&handle)
|
||||
}
|
||||
_ => false,
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
/// Snapshot thread states for diagnostic logging. One entry per live
|
||||
/// guest thread (Exited are included so post-mortem can see exit codes).
|
||||
pub fn diagnostic_snapshot(&self) -> Vec<(ThreadRef, Option<u32>, HwState)> {
|
||||
@@ -1184,6 +1414,27 @@ impl Scheduler {
|
||||
let mut woken = Vec::new();
|
||||
for (hw_id, slot) in self.slots.iter_mut().enumerate() {
|
||||
for (idx, t) in slot.runqueue.iter_mut().enumerate() {
|
||||
// Skip threads parked SOLELY on synthetic park-handles
|
||||
// (audio workers). They are not deadlock participants — the
|
||||
// guest blocked on its own objects — and waking one runs its
|
||||
// thread entry to the LR_HALT sentinel, exiting it and
|
||||
// dropping every subsequent audio-callback injection. Only
|
||||
// audio-callback injection may un-park them. A wait whose
|
||||
// handle set mixes synthetic and real handles is still
|
||||
// eligible (the real handle makes it a genuine waiter).
|
||||
let synthetic_park = match &t.state {
|
||||
HwState::Blocked(BlockReason::WaitAny { handles, .. })
|
||||
| HwState::Blocked(BlockReason::WaitAll { handles, .. }) => {
|
||||
!handles.is_empty()
|
||||
&& handles
|
||||
.iter()
|
||||
.all(|&h| h >= SYNTHETIC_PARK_HANDLE_FLOOR)
|
||||
}
|
||||
_ => false,
|
||||
};
|
||||
if synthetic_park {
|
||||
continue;
|
||||
}
|
||||
if matches!(
|
||||
t.state,
|
||||
HwState::Blocked(BlockReason::WaitAny { .. })
|
||||
@@ -1270,6 +1521,41 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unblock_on_deadlock_skips_synthetic_park_waiters() {
|
||||
// The audio worker parks on a synthetic handle (>= FLOOR) and must
|
||||
// survive the deadlock force-wake; a peer parked on a real handle
|
||||
// must be woken. Regression for the milestone-2 stall where the
|
||||
// force-wake destroyed the audio worker → all callbacks dropped.
|
||||
let mut s = mk_scheduler_with_initial();
|
||||
s.spawn(worker_spawn_params(2, 0x2000), &mut NullPcr).unwrap();
|
||||
s.spawn(worker_spawn_params(3, 0x2010), &mut NullPcr).unwrap();
|
||||
let audio = ThreadRef { hw_id: 1, idx: 0, generation: 0 };
|
||||
let real = ThreadRef { hw_id: 2, idx: 0, generation: 0 };
|
||||
s.thread_mut(audio).state = HwState::Blocked(BlockReason::WaitAny {
|
||||
handles: vec![SYNTHETIC_PARK_HANDLE_FLOOR],
|
||||
deadline: None,
|
||||
});
|
||||
s.thread_mut(real).state = HwState::Blocked(BlockReason::WaitAny {
|
||||
handles: vec![0x1234],
|
||||
deadline: None,
|
||||
});
|
||||
let woken = s.unblock_on_deadlock();
|
||||
assert!(
|
||||
woken.contains(&real),
|
||||
"real-handle waiter must be force-woken"
|
||||
);
|
||||
assert!(
|
||||
!woken.contains(&audio),
|
||||
"synthetic-park audio worker must NOT be force-woken"
|
||||
);
|
||||
assert!(matches!(
|
||||
s.thread(audio).state,
|
||||
HwState::Blocked(BlockReason::WaitAny { .. })
|
||||
));
|
||||
assert_eq!(s.thread(real).state, HwState::Ready);
|
||||
}
|
||||
|
||||
// ---- preserved from pre-Axis-1 (updated names and params) ----
|
||||
|
||||
#[test]
|
||||
@@ -1858,6 +2144,118 @@ mod tests {
|
||||
assert_eq!(t.quantum_remaining, QUANTUM_DEFAULT, "quantum reloaded");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_anti_starvation_bounded_progress() {
|
||||
// Reproduces the Sylpheed render-gate deadlock: a high-priority
|
||||
// CPU-bound spinner (the pri-15 poll loop) co-located on one slot
|
||||
// with a pri-0 worker (the submitter) the spinner is waiting on.
|
||||
// Strict priority would starve the worker forever; the anti-starve
|
||||
// floor must hand it a pick within STARVE_LIMIT+1 visits, then the
|
||||
// spinner reclaims the slot (priority is NOT inverted).
|
||||
let mut s = mk_empty_scheduler();
|
||||
let mut spinner = SpawnParams::default();
|
||||
spinner.guest_tid = 1;
|
||||
spinner.thread_handle = 0x1000;
|
||||
spinner.affinity_mask = 0b0001;
|
||||
spinner.pcr_base = 0x4000_0000;
|
||||
spinner.priority = 15;
|
||||
s.spawn(spinner, &mut NullPcr).unwrap();
|
||||
let mut worker = SpawnParams::default();
|
||||
worker.guest_tid = 2;
|
||||
worker.thread_handle = 0x1004;
|
||||
worker.affinity_mask = 0b0001;
|
||||
worker.pcr_base = 0x4000_1000;
|
||||
worker.priority = 0;
|
||||
s.spawn(worker, &mut NullPcr).unwrap();
|
||||
|
||||
let mut worker_picks = 0u32;
|
||||
let mut spinner_picks = 0u32;
|
||||
// Both stay Ready (the spinner never blocks — that's the bug shape).
|
||||
for _ in 0..(STARVE_LIMIT + 2) {
|
||||
s.begin_slot_visit(0);
|
||||
match s.thread(s.current.unwrap()).tid {
|
||||
1 => spinner_picks += 1,
|
||||
2 => worker_picks += 1,
|
||||
other => panic!("unexpected tid {other}"),
|
||||
}
|
||||
s.end_slot_visit();
|
||||
}
|
||||
assert_eq!(
|
||||
worker_picks, 1,
|
||||
"starved worker gets exactly one bounded pick within STARVE_LIMIT+2 visits"
|
||||
);
|
||||
assert_eq!(
|
||||
spinner_picks,
|
||||
STARVE_LIMIT + 1,
|
||||
"high-priority spinner still dominates — priority is not inverted"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_db16cyc_yield_hands_slot_to_peer() {
|
||||
// Reproduces the Sylpheed title-screen gate: a guest spinlock/barrier
|
||||
// participant (tid=1) executes the `db16cyc` spin hint each round and
|
||||
// would otherwise win `pick_runnable` forever (equal priority, lower
|
||||
// index), starving the co-located peer (tid=2) it is waiting on.
|
||||
// `yield_current` must promote the Ready peer so the very next
|
||||
// `begin_slot_visit` picks it — without waiting STARVE_LIMIT rounds.
|
||||
let mut s = mk_empty_scheduler();
|
||||
for tid in [1u32, 2] {
|
||||
let mut p = SpawnParams::default();
|
||||
p.guest_tid = tid;
|
||||
p.thread_handle = 0x1000 + tid * 4;
|
||||
p.affinity_mask = 0b0001;
|
||||
p.pcr_base = 0x4000_0000 + tid * 0x1000;
|
||||
p.priority = 0; // equal priority — index would otherwise decide
|
||||
s.spawn(p, &mut NullPcr).unwrap();
|
||||
}
|
||||
|
||||
// Round 1: the spinner (lower index) wins.
|
||||
s.begin_slot_visit(0);
|
||||
let spinner = s.thread(s.current.unwrap()).tid;
|
||||
assert_eq!(spinner, 1, "lower-index equal-priority thread wins first pick");
|
||||
// It spins (db16cyc) → cooperative yield.
|
||||
assert!(s.yield_current(), "yield promotes the Ready peer");
|
||||
s.end_slot_visit();
|
||||
|
||||
// Round 2: the promoted peer must now be picked, not the spinner.
|
||||
s.begin_slot_visit(0);
|
||||
let after_yield = s.thread(s.current.unwrap()).tid;
|
||||
assert_eq!(
|
||||
after_yield, 2,
|
||||
"after db16cyc yield the co-located peer runs (no STARVE_LIMIT wait)"
|
||||
);
|
||||
s.end_slot_visit();
|
||||
|
||||
// Round 3: peer's boost was consumed (reset to 0 when picked), so the
|
||||
// spinner reclaims the slot — fair alternation, no priority inversion.
|
||||
s.begin_slot_visit(0);
|
||||
assert_eq!(
|
||||
s.thread(s.current.unwrap()).tid,
|
||||
1,
|
||||
"spinner reclaims the slot after the peer has had its turn"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_yield_current_noop_when_alone() {
|
||||
// A spinner with no Ready peer on its slot has nothing to hand off to;
|
||||
// yield_current must be a no-op (returns false) and not panic.
|
||||
let mut s = mk_empty_scheduler();
|
||||
let mut p = SpawnParams::default();
|
||||
p.guest_tid = 1;
|
||||
p.thread_handle = 0x1004;
|
||||
p.affinity_mask = 0b0001;
|
||||
p.pcr_base = 0x4000_0000;
|
||||
s.spawn(p, &mut NullPcr).unwrap();
|
||||
s.begin_slot_visit(0);
|
||||
assert!(!s.yield_current(), "no peer to promote → no-op");
|
||||
// Still the same thread next round.
|
||||
s.end_slot_visit();
|
||||
s.begin_slot_visit(0);
|
||||
assert_eq!(s.thread(s.current.unwrap()).tid, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cooperative_yield_does_not_need_quantum() {
|
||||
let mut s = mk_empty_scheduler();
|
||||
|
||||
@@ -293,28 +293,23 @@ pub fn store_vector_right(mem: &dyn MemoryAccess, ea: u32, v: Vec128) {
|
||||
}
|
||||
}
|
||||
|
||||
// ─── 5-6-5 pixel pack (vpkpx / vupkhpx / vupklpx) ─────────────────────────
|
||||
// PPC vpkpx takes a 32-bit RGB lane and packs it into a 16-bit 1-5-5-5 pixel.
|
||||
// vupkhpx / vupklpx reverse the operation.
|
||||
//
|
||||
// Format: input 32-bit word holds
|
||||
// bits 0-6: unused (0)
|
||||
// bit 7: alpha-select (→ bit 15 of output)
|
||||
// bits 8-15: R (top 5 bits kept)
|
||||
// bits 16-23: G (top 5 bits kept)
|
||||
// bits 24-31: B (top 5 bits kept)
|
||||
// Output 16-bit word:
|
||||
// bit 15: A (from input bit 7)
|
||||
// bits 10-14: R
|
||||
// bits 5-9: G
|
||||
// bits 0-4: B
|
||||
// ─── pixel pack (vpkpx / vupkhpx / vupklpx) ───────────────────────────────
|
||||
// PPC vpkpx packs each 32-bit lane into a 16-bit 1-5-5-5 pixel.
|
||||
// Mapping transcribed EXACTLY from xenia-canary
|
||||
// `ppc_emit_altivec.cc::vkpkx_in_low` (lines 1795-1808):
|
||||
// tmp1 = (input >> 9) & 0xFC00 // out bits 15:10 = in bits 24:19
|
||||
// tmp2 = (input >> 6) & 0x3E0 // out bits 9:5 = in bits 15:11
|
||||
// tmp3 = (input >> 3) & 0x1F // out bits 4:0 = in bits 7:3
|
||||
// result = tmp1 | tmp2 | tmp3
|
||||
// This is a pure shift/mask: there is NO standalone alpha select. Output
|
||||
// bit 15 is simply input bit 24 (the top of the 6-bit field masked by
|
||||
// 0xFC00) — NOT input bit 7. The red field is 6 bits wide here.
|
||||
|
||||
#[inline] pub fn pack_pixel_555(input: u32) -> u16 {
|
||||
let a = (input >> 7) & 0x1;
|
||||
let r = (input >> 8) & 0xFF;
|
||||
let g = (input >> 16) & 0xFF;
|
||||
let b = (input >> 24) & 0xFF;
|
||||
((a << 15) | ((r & 0xF8) << 7) | ((g & 0xF8) << 2) | ((b & 0xF8) >> 3)) as u16
|
||||
let tmp1 = (input >> 9) & 0xFC00;
|
||||
let tmp2 = (input >> 6) & 0x3E0;
|
||||
let tmp3 = (input >> 3) & 0x1F;
|
||||
(tmp1 | tmp2 | tmp3) as u16
|
||||
}
|
||||
|
||||
#[inline] pub fn unpack_pixel_555(input: u16) -> u32 {
|
||||
@@ -801,9 +796,38 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
fn pack_pixel_555_matches_canary() {
    // Mapping (canary ppc_emit_altivec.cc::vkpkx_in_low):
    //   out[15:10] = in[24:19], out[9:5] = in[14:10], out[4:0] = in[7:3]
    // Pure shift/mask, NO standalone alpha bit.

    // All three colour fields exercised. Expected (hand-computed):
    //   (0x018844C0 >> 9) & 0xFC00 = 0xC400
    //   (0x018844C0 >> 6) & 0x3E0  = 0x100
    //   (0x018844C0 >> 3) & 0x1F   = 0x18
    //   => 0xC518
    assert_eq!(pack_pixel_555(0x01_88_44_C0), 0xC518);

    // Boundary the audit flagged: low byte 0xF8 has bit 7 set. Canary does
    // NOT turn that into output bit 15 (alpha). Output bit 15 = in bit 24,
    // which is 0 here => high bit clear. (The old implementation produced
    // 0x8000 in that position.)
    assert_eq!(pack_pixel_555(0x80_F8_F8_F8), 0x7FFF);
    assert_eq!(pack_pixel_555(0x80_F8_F8_F8) & 0x8000, 0);

    // Lone source bit 7 (0x80) lands in the low field, not in bit 15.
    assert_eq!(pack_pixel_555(0x00_00_00_80), 0x0010);

    // Output bit 15 is sourced from input bit 24, not bit 7.
    assert_eq!(pack_pixel_555(0x01_00_00_00), 0x8000);

    // Saturated input -> all field bits set.
    assert_eq!(pack_pixel_555(0xFF_FF_FF_FF), 0xFFFF);
}
|
||||
|
||||
#[test]
fn unpack_pixel_555_roundtrip() {
    // canary leaves vupkhpx/vupklpx NOTIMPLEMENTED, so `unpack_pixel_555` was
    // not touched; this only confirms that a pixel with bit 15 set still
    // unpacks with a fully-set top byte.
    let all_fields_set = 0x8000 | (0x1F << 10) | (0x1F << 5) | 0x1F;
    let unpacked = unpack_pixel_555(all_fields_set);
    let top_byte = unpacked & 0xFF000000;
    assert_eq!(top_byte, 0xFF000000);
}
|
||||
|
||||
372
crates/xenia-gpu/src/draw_capture.rs
Normal file
372
crates/xenia-gpu/src/draw_capture.rs
Normal file
@@ -0,0 +1,372 @@
|
||||
//! Per-draw geometry capture for the host UI's faithful-render path.
|
||||
//!
|
||||
//! The deterministic headless core (`check --gpu-inline`) never touches this
|
||||
//! module — it is populated only when a UI bridge is installed and consumed
|
||||
//! only by `crates/xenia-ui`. The goal is to hand the UI the *real* guest
|
||||
//! geometry behind each `PM4_DRAW_INDX*` packet so it can rasterize the
|
||||
//! actual splash vertices instead of synthetic placeholder shapes.
|
||||
//!
|
||||
//! What the WGSL pipeline needs to reconstruct one draw (see
|
||||
//! `shaders/xenos_interp.wgsl` `vs_main` / `interpret_vertex_fetch`):
|
||||
//! * the active VS/PS blob keys (already published as assets),
|
||||
//! * the primitive type + the host vertex count to issue,
|
||||
//! * the raw guest vertex-buffer bytes for the fetched window, and
|
||||
//! * the *dword base* of that window so the shader can rebase the absolute
|
||||
//! fetch-constant address into the uploaded buffer.
|
||||
//!
|
||||
//! The hard part is sourcing the vertex window: the VS reads a vertex-fetch
|
||||
//! constant (`xe_gpu_vertex_fetch_t`) whose dword-0 carries the absolute
|
||||
//! guest dword address. We parse the active VS, find its first vertex fetch,
|
||||
//! read that fetch constant out of the register file, then copy a bounded
|
||||
//! window of guest memory starting at the fetch base.
|
||||
|
||||
use xenia_memory::access::MemoryAccess;
|
||||
|
||||
use crate::draw_state::{IndexSize, IndexSource, PrimitiveType};
|
||||
use crate::register_file::RegisterFile;
|
||||
|
||||
/// Texture-fetch / vertex-fetch constant region base, in register indices.
|
||||
/// Each fetch constant is 6 dwords (`xe_gpu_*_fetch_t`).
|
||||
const CONST_BASE_FETCH: u32 = 0x4800;
|
||||
|
||||
/// Upper bound (in dwords) on the vertex window we copy per draw. The splash
|
||||
/// UI draws are tiny (3–4 verts × ≤4 dwords); 16 Ki dwords (64 KiB) is generous
|
||||
/// slack while bounding the per-frame copy cost and the 16 MiB host buffer.
|
||||
const MAX_WINDOW_DWORDS: u32 = 16 * 1024;
|
||||
|
||||
/// One captured draw, with enough real state for the UI to replay it through
|
||||
/// the existing wgpu Xenos pipeline.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DrawCapture {
|
||||
/// Monotonic global draw index (matches `GpuStats::draws_seen` at capture).
|
||||
pub draw_index: u32,
|
||||
/// Xenos primitive-type code (see `SwapInfo::last_draw_prim` encoding).
|
||||
pub prim_code: u32,
|
||||
/// Host vertex count to issue (post primitive-processor rewrite).
|
||||
pub host_vertex_count: u32,
|
||||
/// Active VS blob key at draw time (0 = none).
|
||||
pub vs_key: u32,
|
||||
/// Active PS blob key at draw time (0 = none).
|
||||
pub ps_key: u32,
|
||||
/// Raw guest dwords of the fetched vertex window (host-endian as stored in
|
||||
/// guest memory — the WGSL applies the per-format endian swap). `addr 0`
|
||||
/// of this buffer corresponds to guest dword `window_base_dwords`.
|
||||
pub vertex_dwords: Vec<u32>,
|
||||
/// Guest dword address that maps to index 0 of `vertex_dwords`. The shader
|
||||
/// subtracts this from the fetch-constant base to index `vertex_dwords`.
|
||||
pub window_base_dwords: u32,
|
||||
/// `true` when we successfully resolved a real vertex window. When `false`
|
||||
/// the UI falls back to its procedural geometry for this draw (honest:
|
||||
/// nothing faked, just "couldn't source real vertices").
|
||||
pub has_real_vertices: bool,
|
||||
/// iterate-3S: per-draw NDC transform derived from the guest viewport /
|
||||
/// clip / VTE registers (mirrors canary `GetHostViewportInfo`). The host VS
|
||||
/// converts the guest-VS position to wgpu clip space via
|
||||
/// `clip.xy = pos.xy * ndc_scale + ndc_offset * pos.w`. The Y component
|
||||
/// already carries the render-target → wgpu Y-flip (negated).
|
||||
pub ndc_scale: [f32; 2],
|
||||
pub ndc_offset: [f32; 2],
|
||||
/// iterate-3T: the decoded texture(s) this draw's active pixel shader
|
||||
/// samples, keyed off its real `tfetch` fetch-constant slots (the 3M
|
||||
/// decoder makes these decode). The UI uploads + binds the FIRST entry
|
||||
/// per-draw so the textured logo samples the real artwork instead of the
|
||||
/// magenta stub. Empty for flat (no-tfetch) draws. Populated by
|
||||
/// `gpu_system` after decode (left empty by `build`).
|
||||
///
|
||||
/// Each entry is `(key, content_version, bytes)`. iterate-3AD: the
|
||||
/// `content_version` (from `span_max_version` over the texel span) lets the
|
||||
/// UI host texture cache RE-UPLOAD when the guest fills more of an evolving
|
||||
/// atlas. The publisher and the 2nd splash logo share one K8888 surface
|
||||
/// (base `0x4dbee000`); the 2nd logo's texels are CPU-written *after* the
|
||||
/// publisher's first upload. Without the real version the host cache (which
|
||||
/// previously pinned `version_when_uploaded = 1`) kept the first partial
|
||||
/// upload, so the 2nd logo sampled its still-zero atlas region as black.
|
||||
pub textures: Vec<(crate::texture_cache::TextureKey, u64, Vec<u8>)>,
|
||||
/// iterate-3Y: per-draw color/blend render state captured from the
|
||||
/// register file so the host pipeline composites the way the guest
|
||||
/// intends (instead of one fixed alpha-blend state). Mirrors the fields
|
||||
/// canary feeds into `GetCurrentStateDescription` (D3D12
|
||||
/// `pipeline_cache.cc`):
|
||||
/// * `blend_control` = `RB_BLENDCONTROL0` (RT0 src/dst factors + op,
|
||||
/// color and alpha). The Xbox 360 has no separate "blend enable" bit;
|
||||
/// `One,Zero,Add` *is* the opaque case.
|
||||
/// * `color_mask` = RT0 nibble of `RB_COLOR_MASK` (per-channel write
|
||||
/// enable). When 0, canary forces `One,Zero` (no blend).
|
||||
/// * `color_control` = `RB_COLORCONTROL` (alpha-test enable/func).
|
||||
/// * `depth_control` = `RB_DEPTHCONTROL` (z-test enable/func/write).
|
||||
pub blend_control: u32,
|
||||
pub color_mask: u8,
|
||||
pub color_control: u32,
|
||||
pub depth_control: u32,
|
||||
}
|
||||
|
||||
/// iterate-3S: compute the guest→host NDC XY transform for a draw, mirroring
|
||||
/// canary's `draw_util.cc::GetHostViewportInfo` (the XY half). The Xbox 360 VS
|
||||
/// emits a clip-space position which the HW then scales/offsets by the viewport
|
||||
/// (`PA_CL_VPORT_*`, gated by `PA_CL_VTE_CNTL`) into render-target pixels, OR,
|
||||
/// when clipping is disabled (`PA_CL_CLIP_CNTL.clip_disable`), the VS emits
|
||||
/// render-target-pixel coordinates directly (the screen-space UI / clear case —
|
||||
/// this is what Sylpheed's splash quads do). Either way we must rescale into the
|
||||
/// host's [-1,1] clip space and flip Y (render-target Y-down → wgpu Y-up).
|
||||
///
|
||||
/// Returns `(ndc_scale[2], ndc_offset[2])` such that
|
||||
/// `host_clip.xy = guest_pos.xy * ndc_scale + ndc_offset * guest_pos.w`.
|
||||
/// The Y entries are pre-negated to flip into wgpu's Y-up clip space.
|
||||
pub fn compute_ndc_xy(rf: &RegisterFile) -> ([f32; 2], [f32; 2]) {
|
||||
const PA_CL_CLIP_CNTL: u32 = 0x2204;
|
||||
const PA_SU_SC_MODE_CNTL: u32 = 0x2205;
|
||||
const PA_CL_VTE_CNTL: u32 = 0x2206;
|
||||
const PA_SU_VTX_CNTL: u32 = 0x2302;
|
||||
const PA_CL_VPORT_XSCALE: u32 = 0x210F;
|
||||
const PA_CL_VPORT_XOFFSET: u32 = 0x2110;
|
||||
const PA_CL_VPORT_YSCALE: u32 = 0x2111;
|
||||
const PA_CL_VPORT_YOFFSET: u32 = 0x2112;
|
||||
const PA_SC_WINDOW_OFFSET: u32 = 0x2080;
|
||||
const PA_SC_WINDOW_SCISSOR_BR: u32 = 0x2082;
|
||||
const RB_SURFACE_INFO: u32 = 0x2000;
|
||||
|
||||
let clip_cntl = rf.read(PA_CL_CLIP_CNTL);
|
||||
let vte = rf.read(PA_CL_VTE_CNTL);
|
||||
let su_sc_mode = rf.read(PA_SU_SC_MODE_CNTL);
|
||||
let su_vtx = rf.read(PA_SU_VTX_CNTL);
|
||||
let fbits = |r: u32| f32::from_bits(rf.read(r));
|
||||
|
||||
// VTE enable bits (xenos.h PA_CL_VTE_CNTL): bit0 vport_x_scale_ena,
|
||||
// bit1 vport_x_offset_ena, bit2 vport_y_scale_ena, bit3 vport_y_offset_ena.
|
||||
let scale_x = if vte & (1 << 0) != 0 { fbits(PA_CL_VPORT_XSCALE) } else { 1.0 };
|
||||
let off_x = if vte & (1 << 1) != 0 { fbits(PA_CL_VPORT_XOFFSET) } else { 0.0 };
|
||||
let scale_y = if vte & (1 << 2) != 0 { fbits(PA_CL_VPORT_YSCALE) } else { 1.0 };
|
||||
let off_y = if vte & (1 << 3) != 0 { fbits(PA_CL_VPORT_YOFFSET) } else { 0.0 };
|
||||
|
||||
// Render-target extent in guest pixels: clamp to the texture max (2048),
|
||||
// sourced from the window scissor BR (matches canary `x_max`/`y_max`).
|
||||
let br = rf.read(PA_SC_WINDOW_SCISSOR_BR);
|
||||
let x_max = ((br & 0x7FFF).max(1)).min(2048) as f32;
|
||||
let y_max = (((br >> 16) & 0x7FFF).max(1)).min(2048) as f32;
|
||||
let _ = RB_SURFACE_INFO;
|
||||
|
||||
// Half-pixel + window offsets added in render-target pixels.
|
||||
let mut add_x = 0.0f32;
|
||||
let mut add_y = 0.0f32;
|
||||
if su_sc_mode & (1 << 16) != 0 {
|
||||
let wo = rf.read(PA_SC_WINDOW_OFFSET);
|
||||
// 15-bit signed each (x: [14:0], y: [30:16]).
|
||||
let sx = (((wo & 0x7FFF) << 1) as i32) >> 1;
|
||||
let sy = ((((wo >> 16) & 0x7FFF) << 1) as i32) >> 1;
|
||||
add_x += sx as f32;
|
||||
add_y += sy as f32;
|
||||
}
|
||||
if su_vtx & 1 == 0 {
|
||||
// pix_center == kD3DZero → +0.5 half-pixel offset.
|
||||
add_x += 0.5;
|
||||
add_y += 0.5;
|
||||
}
|
||||
|
||||
let (s, o);
|
||||
if clip_cntl & (1 << 16) != 0 {
|
||||
// clip_disable: VS outputs render-target-*pixel* coords (Y-DOWN: pixel
|
||||
// y=0 is the top row of the render target). Rescale the whole RT extent
|
||||
// to [-1,1] and FLIP Y so pixel-top → wgpu clip-top (canary's
|
||||
// huge-host-viewport path; the framebuffer→clip flip is real here).
|
||||
let px2ndc_x = 2.0 / x_max;
|
||||
let px2ndc_y = 2.0 / y_max;
|
||||
let sx = scale_x * px2ndc_x;
|
||||
let ox = (off_x - x_max * 0.5 + add_x) * px2ndc_x;
|
||||
let sy = scale_y * px2ndc_y;
|
||||
let oy = (off_y - y_max * 0.5 + add_y) * px2ndc_y;
|
||||
// Flip Y: pixel-Y-down → wgpu clip-Y-up.
|
||||
s = [sx, -sy];
|
||||
o = [ox, -oy];
|
||||
} else {
|
||||
// iterate-3AA (DEFECT 1 ROOT): clipping enabled → the VS already emits
|
||||
// *clip-space* coordinates (Y-UP: +Y is the top of the screen), exactly
|
||||
// the convention the Xbox 360's D3D9 and wgpu BOTH use for clip space
|
||||
// (NDC +Y → framebuffer top in each API; the framebuffer Y-direction is
|
||||
// an internal viewport detail handled identically by both). A clip-space
|
||||
// position is therefore portable to wgpu with NO Y-flip. The previous
|
||||
// code unconditionally negated Y (the same flip the screen-space pixel
|
||||
// path needs), which mirrored the publisher logo vertically: its quad is
|
||||
// centered (±0.085 around 0) so the *position* stayed centered, but the
|
||||
// negation swapped top↔bottom vertices while the texture V was unchanged
|
||||
// → the sampled sub-rect (UV v 0.001→0.090) read bottom-up → "SQUARE
|
||||
// ENIX" rendered upside down in place. Measured (readback): the red dots
|
||||
// sit at 43% from the texture top but rendered at 58% from the top
|
||||
// (= a clean vertical mirror); removing the flip restores them to 43%.
|
||||
// Identity XY (no flip) maps guest clip-Y-up straight to wgpu clip-Y-up.
|
||||
s = [1.0, 1.0];
|
||||
o = [0.0, 0.0];
|
||||
return (s, o);
|
||||
}
|
||||
(s, o)
|
||||
}
|
||||
|
||||
/// Encode a [`PrimitiveType`] as the raw Xenos code used across the bridge.
|
||||
pub fn prim_code(p: PrimitiveType) -> u32 {
|
||||
match p {
|
||||
PrimitiveType::None => 0,
|
||||
PrimitiveType::PointList => 1,
|
||||
PrimitiveType::LineList => 2,
|
||||
PrimitiveType::LineStrip => 3,
|
||||
PrimitiveType::TriangleList => 4,
|
||||
PrimitiveType::TriangleFan => 5,
|
||||
PrimitiveType::TriangleStrip => 6,
|
||||
PrimitiveType::RectangleList => 8,
|
||||
PrimitiveType::QuadList => 13,
|
||||
PrimitiveType::Unknown(x) => x as u32,
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolve the first vertex-fetch window referenced by the parsed VS.
|
||||
///
|
||||
/// Walks the VS instruction stream for the first `vfetch` (mini) instruction,
|
||||
/// reads its fetch constant from `rf`, and copies a bounded window of guest
|
||||
/// memory starting at the fetch base. Returns `(dwords, window_base_dwords)`
|
||||
/// or `None` if the VS has no vertex fetch or the constant is malformed.
|
||||
fn resolve_vertex_window(
|
||||
parsed_vs: &crate::ucode::ParsedShader,
|
||||
rf: &RegisterFile,
|
||||
mem: &dyn MemoryAccess,
|
||||
) -> Option<(Vec<u32>, u32)> {
|
||||
// iterate-3W (GPUBUG-109): the instruction block packs ALU and fetch
|
||||
// instructions identically (96 bits / 3 dwords each); ONLY the owning
|
||||
// `Exec` control-flow clause's `sequence` bitmap (2 bits per instruction,
|
||||
// bit[2*i]=fetch/ALU) tells them apart. The previous blind triple-walk
|
||||
// decoded ALU triples as fetches → garbage fetch-constant indices and a
|
||||
// bogus `type==3` guard, never reaching the real vertex fetch. Walk the CF
|
||||
// exec clauses exactly as the translator does (`translator.rs::emit_exec`)
|
||||
// and take the FIRST sequence-flagged *vertex* fetch.
|
||||
let instrs = &parsed_vs.instructions;
|
||||
let mut const_off: Option<u32> = None;
|
||||
'clauses: for clause in &parsed_vs.cf {
|
||||
let crate::ucode::control_flow::ControlFlowInstruction::Exec {
|
||||
address,
|
||||
count,
|
||||
sequence,
|
||||
..
|
||||
} = *clause
|
||||
else {
|
||||
continue;
|
||||
};
|
||||
for i in 0..(count as usize) {
|
||||
// bit[2*i] of the sequence bitmap: 1 = fetch, 0 = ALU.
|
||||
if (sequence >> (i * 2)) & 1 == 0 {
|
||||
continue;
|
||||
}
|
||||
let base = (address as usize + i) * 3;
|
||||
if base + 2 >= instrs.len() {
|
||||
break;
|
||||
}
|
||||
if let crate::ucode::fetch::FetchInstruction::Vertex(vf) =
|
||||
crate::ucode::fetch::decode_fetch([instrs[base], instrs[base + 1], instrs[base + 2]])
|
||||
{
|
||||
const_off = Some(vf.const_reg_offset());
|
||||
break 'clauses;
|
||||
}
|
||||
}
|
||||
}
|
||||
// iterate-3X (GPUBUG-110): vertex fetch constants are addressed by
|
||||
// `const_index * 3 + const_index_sel` (canary `ucode.h:700` —
|
||||
// `VertexFetchInstruction::fetch_constant_index`), NOT by `const_index`
|
||||
// alone. The register region packs 3 two-dword vertex-fetch constants per
|
||||
// 6-dword group, so the constant lives at
|
||||
// `0x4800 + const_index*6 + const_index_sel*2`. The previous decode dropped
|
||||
// `const_index_sel` and read sub-slot 0 (`fc*6`), which for the publisher
|
||||
// logo (`const_index=31, sel=2`) held `0x00000001` (an unused slot) instead
|
||||
// of the real vertex-buffer base at sub-slot 2 (`0x48BE`). That made
|
||||
// `has_real_vertices=false` → the logo fell to the procedural fullscreen
|
||||
// magenta fallback. (Refutes iterate-3W's "geometry is auto-generated from
|
||||
// vertex_id" — measured: the real fetch constant is a 4-vertex QuadList
|
||||
// buffer at `0x0adf60f0`.)
|
||||
let const_reg = CONST_BASE_FETCH + const_off?;
|
||||
let dword0 = rf.read(const_reg);
|
||||
let dword1 = rf.read(const_reg + 1);
|
||||
// address:30 at bits[31:2] of dword0 (in bytes once masked). The fetch
|
||||
// constant carries a guest *physical* dword address — canary reads the
|
||||
// vertex buffer via `Memory::TranslatePhysical(fetch.address * 4)`
|
||||
// (`draw_util.cc:961`). On the Xbox 360 the physical range is mirrored at
|
||||
// several virtual windows; ours only maps the cached-physical window at
|
||||
// `0x4000_0000` (`gpu_system::physical_to_backing`). Reading the bare low
|
||||
// address (`0x0adf_xxxx`) hits an unmapped VA and returns zeros, so rebase
|
||||
// a low physical base onto the mapped `0x4000_0000` alias when the raw VA
|
||||
// is not itself mapped. `window_base_dwords` keeps the *original* base so
|
||||
// the shader's rebase against the (unmodified) fetch-constant address still
|
||||
// indexes the uploaded window correctly.
|
||||
let base_bytes = dword0 & 0xFFFF_FFFC;
|
||||
if base_bytes == 0 {
|
||||
return None;
|
||||
}
|
||||
let read_base = if mem.translate(base_bytes).is_some() {
|
||||
base_bytes
|
||||
} else if base_bytes < 0x2000_0000 && mem.translate(base_bytes | 0x4000_0000).is_some() {
|
||||
base_bytes | 0x4000_0000
|
||||
} else {
|
||||
base_bytes
|
||||
};
|
||||
// size:24 at bits[25:2] of dword1, in dwords. Clamp to our window cap.
|
||||
let size_dwords = ((dword1 >> 2) & 0x00FF_FFFF).clamp(1, MAX_WINDOW_DWORDS);
|
||||
let window_base_dwords = base_bytes >> 2;
|
||||
let mut dwords = Vec::with_capacity(size_dwords as usize);
|
||||
for i in 0..size_dwords {
|
||||
let addr = read_base.wrapping_add(i * 4);
|
||||
if addr < read_base {
|
||||
break; // wrap guard
|
||||
}
|
||||
// `read_u32` composes big-endian bytes into the u32 value; the WGSL's
|
||||
// `gpu_swap` expects the *raw little-endian dword* as it sits in guest
|
||||
// memory, so undo the BE composition with `swap_bytes`.
|
||||
dwords.push(mem.read_u32(addr).swap_bytes());
|
||||
}
|
||||
if dwords.is_empty() {
|
||||
return None;
|
||||
}
|
||||
Some((dwords, window_base_dwords))
|
||||
}
|
||||
|
||||
/// Build a [`DrawCapture`] for one draw. Best-effort: when the vertex window
|
||||
/// can't be resolved, `has_real_vertices` is `false` and the UI falls back to
|
||||
/// procedural geometry (never fabricated pixels).
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn build(
|
||||
draw_index: u32,
|
||||
primitive: PrimitiveType,
|
||||
host_vertex_count: u32,
|
||||
_index_source: IndexSource,
|
||||
_index_size: IndexSize,
|
||||
vs_key: u32,
|
||||
ps_key: u32,
|
||||
parsed_vs: Option<&crate::ucode::ParsedShader>,
|
||||
rf: &RegisterFile,
|
||||
mem: &dyn MemoryAccess,
|
||||
) -> DrawCapture {
|
||||
let (vertex_dwords, window_base_dwords, has_real) = match parsed_vs
|
||||
.and_then(|vs| resolve_vertex_window(vs, rf, mem))
|
||||
{
|
||||
Some((d, base)) => (d, base, true),
|
||||
None => (Vec::new(), 0, false),
|
||||
};
|
||||
let (ndc_scale, ndc_offset) = compute_ndc_xy(rf);
|
||||
// iterate-3Y: capture RT0 color/blend/depth render state. Registers per
|
||||
// canary `registers.h`: RB_BLENDCONTROL0=0x2201, RB_COLOR_MASK=0x2104
|
||||
// (RT0 = bits[3:0]), RB_COLORCONTROL=0x2202, RB_DEPTHCONTROL=0x2200.
|
||||
const RB_BLENDCONTROL_0: u32 = 0x2201;
|
||||
const RB_COLOR_MASK: u32 = 0x2104;
|
||||
const RB_COLORCONTROL: u32 = 0x2202;
|
||||
const RB_DEPTHCONTROL: u32 = 0x2200;
|
||||
DrawCapture {
|
||||
draw_index,
|
||||
prim_code: prim_code(primitive),
|
||||
host_vertex_count,
|
||||
vs_key,
|
||||
ps_key,
|
||||
vertex_dwords,
|
||||
window_base_dwords,
|
||||
has_real_vertices: has_real,
|
||||
ndc_scale,
|
||||
ndc_offset,
|
||||
textures: Vec::new(),
|
||||
blend_control: rf.read(RB_BLENDCONTROL_0),
|
||||
color_mask: (rf.read(RB_COLOR_MASK) & 0xF) as u8,
|
||||
color_control: rf.read(RB_COLORCONTROL),
|
||||
depth_control: rf.read(RB_DEPTHCONTROL),
|
||||
}
|
||||
}
|
||||
@@ -28,6 +28,80 @@ use crate::primitive::{self, ProcessedPrimitive};
|
||||
use crate::register_file::RegisterFile;
|
||||
use crate::ring_view::RingBufferView;
|
||||
|
||||
/// Base of the guest-virtual window into which physical allocations are
/// committed. `xenia-kernel`'s `heap_alloc` bumps its cursor through
/// `0x4000_0000..=0x6FFF_FFFF` and commits the host backing for
/// `MmAllocatePhysicalMemoryEx` there, making this write-combine mirror the
/// canonical home of physical DRAM. Must stay in sync with the initial value
/// of `KernelState::heap_cursor`.
pub const PHYSICAL_BACKING_BASE: u32 = 0x4000_0000;

/// Map a guest *physical* address — the form handed to the Vd/GPU ABI and
/// embedded in PM4 pointers (`INDIRECT_BUFFER`, `WAIT_REG_MEM`-memory,
/// `MEM_WRITE`, `EVENT_WRITE*`, `IM_LOAD`, …) — onto the guest-virtual window
/// where its host backing is actually committed.
///
/// The Xbox 360 exposes its 512 MB of physical DRAM through several virtual
/// mirrors that differ only in cache policy: bare physical (`0x0xxxxxxx`),
/// write-combine (`0x4xxxxxxx`), and the cached `0xA/0xC/0xExxxxxxx` mirrors,
/// all aliasing `addr & 0x1FFF_FFFF`. On hardware (and in xenia-canary, via
/// overlapping `mmap`s) those are literally the same bytes.
///
/// This emulator has one flat `membase`, and `MmAllocatePhysicalMemoryEx`
/// commits backing only in the `0x4xxxxxxx` window. The guest masks its
/// allocation base down to bare physical before calling
/// `VdInitializeRingBuffer` / `VdEnableRingBufferRPtrWriteBack`, and PM4
/// pointers are bare-physical too, so a flat `membase + phys` access would
/// hit a never-committed zero page — the GPU then decodes zero PM4 headers
/// and never runs the real command stream.
///
/// Addresses in any recognised mirror are therefore re-based onto
/// `PHYSICAL_BACKING_BASE` (a no-op for `0x4xxxxxxx` itself); everything else
/// is returned untouched. The projection follows `heap_alloc`'s placement: if
/// that window moves, `PHYSICAL_BACKING_BASE` must move with it.
///
/// Apply this only at the GPU/Vd boundary, NOT on the CPU's flat load/store
/// path: the CPU already uses the `0x4xxxxxxx` base, and non-physical
/// guest-virtual addresses (image `0x82xxxxxx`, stacks `0x7xxxxxxx`) must stay
/// flat.
#[inline]
pub fn physical_to_backing(addr: u32) -> u32 {
    /// Offset bits shared by every physical mirror (512 MB).
    const MIRROR_OFFSET_MASK: u32 = 0x1FFF_FFFF;

    let in_bare_physical = addr <= 0x1FFF_FFFF;
    let in_write_combine = (0x4000_0000..=0x4FFF_FFFF).contains(&addr);
    // The 0xA…, 0xC… and 0xE… mirrors are contiguous up to the top of the
    // address space.
    let in_cached_mirror = addr >= 0xA000_0000;

    if in_bare_physical || in_write_combine || in_cached_mirror {
        PHYSICAL_BACKING_BASE | (addr & MIRROR_OFFSET_MASK)
    } else {
        addr
    }
}
|
||||
|
||||
/// Max guest page-version over the `[base, base+len)` span, walking 4 KiB
|
||||
/// pages via the `MemoryAccess` trait's `page_version`.
|
||||
///
|
||||
/// The concrete heap exposes an inherent `max_page_version(base, len)`, but
|
||||
/// the draw handler only holds `&dyn MemoryAccess` (which carries the coarser
|
||||
/// `page_version(addr)` accessor). This is byte-equivalent to
|
||||
/// `heap::max_page_version` and stays a pure function of the per-page write
|
||||
/// counters (no wall-clock), so texture-decode timing remains deterministic.
|
||||
fn span_max_version(mem: &dyn MemoryAccess, base: u32, len: u32) -> u64 {
|
||||
const PAGE: u32 = 0x1000;
|
||||
let last = base.saturating_add(len.saturating_sub(1));
|
||||
let mut page = base & !(PAGE - 1);
|
||||
let last_page = last & !(PAGE - 1);
|
||||
let mut max = 0u64;
|
||||
loop {
|
||||
max = max.max(mem.page_version(page));
|
||||
if page >= last_page {
|
||||
break;
|
||||
}
|
||||
page = page.wrapping_add(PAGE);
|
||||
}
|
||||
max
|
||||
}
|
||||
|
||||
/// Cached Xenos microcode blob, produced by `PM4_IM_LOAD*` packets.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ShaderBlob {
|
||||
@@ -58,21 +132,37 @@ pub enum WaitCmp {
|
||||
GreaterEq,
|
||||
/// value > ref
|
||||
Greater,
|
||||
/// Always — caller wants to sleep regardless.
|
||||
/// Always — caller wants to sleep regardless (selector bit 7).
|
||||
Always,
|
||||
/// Never matches — `wait_info & 7 == 0` selects bit 0 of canary's
|
||||
/// selector word, which is always zero.
|
||||
Never,
|
||||
}
|
||||
|
||||
impl WaitCmp {
|
||||
/// Interpret the lower 3 bits of `wait_info` per canary's `MatchValueAndRef`.
|
||||
/// Interpret the lower 3 bits of `wait_info` per canary's `MatchValueAndRef`
|
||||
/// (`pm4_command_processor_implement.h:685-696`). Canary forms a selector
|
||||
/// `((value<ref)<<1) | ((value<=ref)<<2) | ((value==ref)<<3) |
|
||||
/// ((value!=ref)<<4) | ((value>=ref)<<5) | ((value>ref)<<6) | (1<<7)` and
|
||||
/// evaluates `(selector >> (wait_info & 7)) & 1`. So the index is the bit
|
||||
/// position: 1=Less, 2=LessEq, 3=Equal, 4=NotEqual, 5=GreaterEq,
|
||||
/// 6=Greater, 7=always-true, 0=never (bit 0 is always clear).
|
||||
///
|
||||
/// GPUBUG: the prior mapping was off by one (it started at `0 => Less`),
|
||||
/// so `wait_info & 7 == 3` decoded as `NotEqual` instead of `Equal`. That
|
||||
/// inverted the standard CP coherency wait
|
||||
/// (`WAIT_REG_MEM COHER_STATUS_HOST, Equal 0`): the GPU parked forever on
|
||||
/// the first INDIRECT_BUFFER and never reached any draw.
|
||||
pub fn from_wait_info(wait_info: u32) -> Self {
|
||||
match wait_info & 0x7 {
|
||||
0 => WaitCmp::Less,
|
||||
1 => WaitCmp::LessEq,
|
||||
2 => WaitCmp::Equal,
|
||||
3 => WaitCmp::NotEqual,
|
||||
4 => WaitCmp::GreaterEq,
|
||||
5 => WaitCmp::Greater,
|
||||
_ => WaitCmp::Always,
|
||||
1 => WaitCmp::Less,
|
||||
2 => WaitCmp::LessEq,
|
||||
3 => WaitCmp::Equal,
|
||||
4 => WaitCmp::NotEqual,
|
||||
5 => WaitCmp::GreaterEq,
|
||||
6 => WaitCmp::Greater,
|
||||
7 => WaitCmp::Always,
|
||||
_ => WaitCmp::Never,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -85,6 +175,7 @@ impl WaitCmp {
|
||||
WaitCmp::GreaterEq => value >= reference,
|
||||
WaitCmp::Greater => value > reference,
|
||||
WaitCmp::Always => true,
|
||||
WaitCmp::Never => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -333,12 +424,24 @@ pub struct GpuSystem {
|
||||
/// on every texture-fetch resolution; the UI thread sees the decoded
|
||||
/// bytes via `UiBridge::publish_texture`.
|
||||
pub texture_cache: crate::texture_cache::TextureCache,
|
||||
/// P5b: textures decoded at the most recent `PM4_DRAW_INDX*`, keyed off
|
||||
/// the *active* pixel shader's real `tfetch` fetch-constant slots (not a
|
||||
/// hardcoded slot). `vd_swap` publishes the first of these to the UI so
|
||||
/// the replay binds the texture the draw actually samples. Cleared and
|
||||
/// repopulated each draw; empty when the active PS issues no `tfetch`.
|
||||
pub last_draw_textures: Vec<(crate::texture_cache::TextureKey, u64, Vec<u8>)>,
|
||||
/// 10 MiB shadow of the Xenos EDRAM. Written by clear-resolves and
|
||||
/// (future) host-render-target readback; read by the resolve byte-copy
|
||||
/// path that writes tiled pixels into guest memory. Allocated once at
|
||||
/// `GpuSystem::new` and lives for the whole GPU lifetime — no
|
||||
/// per-frame churn.
|
||||
pub edram: crate::edram::ShadowEdram,
|
||||
/// UI-only: when `Some`, every `PM4_DRAW_INDX*` appends a
|
||||
/// [`crate::draw_capture::DrawCapture`] here so the host UI can replay the
|
||||
/// real guest geometry. `None` in headless/deterministic mode — the
|
||||
/// `--gpu-inline` golden never enables this, so capture is entirely inert
|
||||
/// for `check`. Drained (taken) by `vd_swap` at each present.
|
||||
pub frame_captures: Option<Vec<crate::draw_capture::DrawCapture>>,
|
||||
}
|
||||
|
||||
impl GpuSystem {
|
||||
@@ -364,7 +467,17 @@ impl GpuSystem {
|
||||
rt_cache: crate::render_target_cache::RenderTargetCache::new(),
|
||||
last_resolve: None,
|
||||
texture_cache: crate::texture_cache::TextureCache::new(),
|
||||
last_draw_textures: Vec::new(),
|
||||
edram: crate::edram::ShadowEdram::new(),
|
||||
frame_captures: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Enable per-draw geometry capture for the host UI. Inert (and never
|
||||
/// called) in headless/deterministic mode. Idempotent.
|
||||
pub fn enable_frame_capture(&mut self) {
|
||||
if self.frame_captures.is_none() {
|
||||
self.frame_captures = Some(Vec::new());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -536,14 +649,21 @@ impl GpuSystem {
|
||||
/// Release.
|
||||
pub fn sync_with_mmio(&mut self) {
|
||||
let wptr_dwords = self.mmio.cp_rb_wptr.load(Ordering::Acquire);
|
||||
if wptr_dwords != self.ring.write_offset_dwords && self.ring.size_dwords != 0 {
|
||||
self.ring.write_offset_dwords = wptr_dwords % self.ring.size_dwords;
|
||||
// CP_RB_WPTR governs ONLY the primary ring. While an indirect buffer
|
||||
// is executing, the active `self.ring` is a fixed linear sub-stream
|
||||
// and the primary ring is saved at the bottom of the IB stack —
|
||||
// applying the (primary) write pointer to the IB would corrupt its
|
||||
// extent (e.g. `wptr % ib_size`) and strand the GPU mid-buffer.
|
||||
let primary = self.ib_stack.first_mut().unwrap_or(&mut self.ring);
|
||||
if wptr_dwords != primary.write_offset_dwords && primary.size_dwords != 0 {
|
||||
primary.write_offset_dwords = wptr_dwords % primary.size_dwords;
|
||||
}
|
||||
// Mirror our read pointer (Release pairs with any guest-side
|
||||
let primary_rptr = primary.read_offset_dwords;
|
||||
// Mirror the *primary* read pointer (Release pairs with any guest-side
|
||||
// Acquire-load of CP_RB_RPTR for ring writeback bookkeeping).
|
||||
self.mmio
|
||||
.cp_rb_rptr
|
||||
.store(self.ring.read_offset_dwords, Ordering::Release);
|
||||
.store(primary_rptr, Ordering::Release);
|
||||
}
|
||||
|
||||
/// True iff `execute_one` is expected to make progress without blocking.
|
||||
@@ -551,7 +671,11 @@ impl GpuSystem {
|
||||
if let Some(block) = &self.pending_block {
|
||||
return block.is_satisfied(mem, &self.register_file);
|
||||
}
|
||||
self.ring.has_pending()
|
||||
// Pending work may be in the active ring OR in a saved caller ring
|
||||
// further down the IB stack (an exhausted IB still needs `execute_one`
|
||||
// to pop back and resume the primary ring, whose WPTR may have since
|
||||
// advanced).
|
||||
self.ring.has_pending() || self.ib_stack.iter().any(|r| r.has_pending())
|
||||
}
|
||||
|
||||
/// Execute exactly one PM4 packet. Returns [`ExecOutcome::Idle`] when
|
||||
@@ -561,6 +685,12 @@ impl GpuSystem {
|
||||
pub fn execute_one(&mut self, mem: &dyn MemoryAccess) -> ExecOutcome {
|
||||
// 0) If currently parked, probe the condition and either wake up or stay blocked.
|
||||
if let Some(block) = self.pending_block.clone() {
|
||||
// Re-service the CP coherency handshake on each probe so a
|
||||
// COHER_STATUS_HOST wait can clear (canary does this in its WAIT
|
||||
// loop body, not just at entry).
|
||||
if let GpuBlock::WaitRegMem { poll_addr, is_memory: false, .. } = &block {
|
||||
self.make_coherent(*poll_addr);
|
||||
}
|
||||
if block.is_satisfied(mem, &self.register_file) {
|
||||
tracing::debug!(?block, "gpu: wait satisfied — resuming");
|
||||
self.pending_block = None;
|
||||
@@ -642,10 +772,13 @@ impl GpuSystem {
|
||||
width,
|
||||
height,
|
||||
});
|
||||
self.pending_interrupts.push(PendingInterrupt {
|
||||
source: InterruptSource::Swap,
|
||||
cpu_mask: 0x1,
|
||||
});
|
||||
// iterate-2T: do NOT raise a CP swap-complete interrupt here. Canary's
|
||||
// `VdSwap`/PM4_XE_SWAP path raises no interrupt; swap-complete CP
|
||||
// interrupts come ONLY from in-stream `PM4_INTERRUPT` packets, which
|
||||
// are naturally ordered after D3D has armed the swap-callback slot.
|
||||
// Synthesizing one out of band (as we did pre-2T) delivered a CP
|
||||
// interrupt while the slot still held the `0xBADF00D` placeholder,
|
||||
// tripping the graphics ISR's "Unanticipated CPU_INTERRUPT" assert.
|
||||
tracing::info!(
|
||||
frame = self.swap_counter,
|
||||
fb = format_args!("{frontbuffer_phys:#010x}"),
|
||||
@@ -657,9 +790,21 @@ impl GpuSystem {
|
||||
|
||||
/// Called by `VdInitializeRingBuffer` to give us the primary ring.
|
||||
pub fn initialize_ring_buffer(&mut self, base: u32, size_log2: u32) {
|
||||
let size_bytes = 1u32 << size_log2.min(31);
|
||||
// Canary `CommandProcessor::InitializeRingBuffer` (command_processor.cc:
|
||||
// 436): `primary_buffer_size_ = 1 << (size_log2 + 3)` *bytes*. The
|
||||
// `VdInitializeRingBuffer` `r4` argument is log2(size-in-quadwords),
|
||||
// so the byte size is `1 << (size_log2 + 3)` (× 8 bytes/quadword), i.e.
|
||||
// `1 << (size_log2 + 1)` dwords. (Sylpheed passes size_log2=12 →
|
||||
// 32768 bytes / 8192 dwords; the previous `1 << size_log2` undersized
|
||||
// the ring 8× and desynced WPTR wrap math from the guest.)
|
||||
let size_bytes = 1u32 << size_log2.saturating_add(3).min(31);
|
||||
// The guest hands us a bare *physical* ring base; project it onto the
|
||||
// committed backing window so ring reads hit real PM4 packets (see
|
||||
// `physical_to_backing`).
|
||||
let base = physical_to_backing(base);
|
||||
self.ring.base = base;
|
||||
self.ring.size_dwords = size_bytes / 4;
|
||||
self.ring.indirect = false;
|
||||
self.ring.read_offset_dwords = 0;
|
||||
// `write_offset` is driven by the guest — start at 0 so the ring
|
||||
// appears empty until MMIO writes advance it.
|
||||
@@ -675,6 +820,10 @@ impl GpuSystem {
|
||||
/// Called by `VdEnableRingBufferRPtrWriteBack` to record where the guest
|
||||
/// expects us to mirror `read_offset_dwords`.
|
||||
pub fn enable_rptr_writeback(&mut self, addr: u32, block_log2: u32) {
|
||||
// The guest registers a bare *physical* writeback address and polls
|
||||
// the same allocation through its `0x4xxxxxxx` base; project so our
|
||||
// RPtr store lands on the page the guest actually reads.
|
||||
let addr = physical_to_backing(addr);
|
||||
self.ring.rptr_writeback_addr = addr;
|
||||
self.ring.rptr_writeback_block_dwords = 1u32 << block_log2.min(31);
|
||||
tracing::info!(
|
||||
@@ -724,6 +873,58 @@ impl GpuSystem {
|
||||
/// upstream packet effects (memory writes, register file updates
|
||||
/// the guest reads via subsequent MMIO) happen-before the
|
||||
/// CPU-visible RPTR bump.
|
||||
/// Service a CP coherency request, mirroring canary's
|
||||
/// `CommandProcessor::MakeCoherent` (`command_processor.cc:801-838`).
|
||||
///
|
||||
/// The guest requests a vertex/texture-cache flush by writing
|
||||
/// `COHER_STATUS_HOST` with its status bit (bit 31) set, then spins on a
|
||||
/// `WAIT_REG_MEM COHER_STATUS_HOST, Equal 0`. We have no host cache to
|
||||
/// flush (memory is shared, coherency is implicit), so completing the
|
||||
/// request is simply clearing the register — which lets the wait satisfy.
|
||||
/// No-op unless `poll_addr` is `COHER_STATUS_HOST` and its status bit is
|
||||
/// set, so it is safe to call on every coherency-register WAIT probe.
|
||||
fn make_coherent(&mut self, poll_addr: u32) {
|
||||
if poll_addr != reg::COHER_STATUS_HOST {
|
||||
return;
|
||||
}
|
||||
let status = self.register_file.read(reg::COHER_STATUS_HOST);
|
||||
if status & 0x8000_0000 != 0 {
|
||||
self.register_file.write(reg::COHER_STATUS_HOST, 0);
|
||||
}
|
||||
}
|
||||
|
||||
/// CP scratch-register memory writeback, mirroring canary's
|
||||
/// `CommandProcessor::HandleSpecialRegisterWrite`
|
||||
/// (`command_processor.cc:545-552`). Every register write runs through
|
||||
/// here; when the target is one of the eight `SCRATCH_REG{n}`
|
||||
/// (`0x0578..=0x057F`) **and** the matching bit in `SCRATCH_UMSK` is set,
|
||||
/// the value is also written (big-endian, as `mem.write_u32` already
|
||||
/// stores) to `SCRATCH_ADDR + n*4` in guest physical memory.
|
||||
///
|
||||
/// Sylpheed arms its CP swap-complete interrupt callback through this
|
||||
/// path: it programs `SCRATCH_ADDR` to the GPU command-block descriptor
|
||||
/// (`[gfx+10772]`, runtime `0x0b1d5000`), `SCRATCH_UMSK` bit 4, then a
|
||||
/// Type-0 write of the callback PC `0x824ce2b8` into `SCRATCH_REG4`
|
||||
/// (`0x057C`). The writeback lands it at descriptor+16 (`0x4b1d5010`),
|
||||
/// which the graphics ISR (`sub_824BE9A0`) reads via `[[gfx+10772]+16]`
|
||||
/// and `bcctrl`s to fire the swap-complete callback. Without this
|
||||
/// writeback the slot stayed NULL, the ISR skipped the callback, the
|
||||
/// swap counter never advanced, and the title's per-frame manager
|
||||
/// re-fired once then plateaued.
|
||||
fn scratch_register_writeback(&self, mem: &dyn MemoryAccess, index: u32, value: u32) {
|
||||
if !(reg::SCRATCH_REG0..=reg::SCRATCH_REG7).contains(&index) {
|
||||
return;
|
||||
}
|
||||
let scratch_reg = index - reg::SCRATCH_REG0;
|
||||
let umsk = self.register_file.read(reg::SCRATCH_UMSK);
|
||||
if (1u32 << scratch_reg) & umsk == 0 {
|
||||
return;
|
||||
}
|
||||
let scratch_addr = self.register_file.read(reg::SCRATCH_ADDR);
|
||||
let mem_addr = physical_to_backing(scratch_addr.wrapping_add(scratch_reg * 4));
|
||||
mem.write_u32(mem_addr, value);
|
||||
}
|
||||
|
||||
fn writeback_read_ptr(&mut self, mem: &dyn MemoryAccess) {
|
||||
if self.ring.rptr_writeback_addr != 0 && self.ring.is_initialized() {
|
||||
mem.write_u32_fence(
|
||||
@@ -748,6 +949,7 @@ impl GpuSystem {
|
||||
let value = mem.read_u32(dword_addr);
|
||||
let target = if write_one { base_index } else { base_index + i };
|
||||
self.register_file.write(target, value);
|
||||
self.scratch_register_writeback(mem, target, value);
|
||||
}
|
||||
tracing::trace!(
|
||||
base = format_args!("{base_index:#x}"),
|
||||
@@ -770,6 +972,8 @@ impl GpuSystem {
|
||||
let b = mem.read_u32(b_addr);
|
||||
self.register_file.write(reg_index_1, a);
|
||||
self.register_file.write(reg_index_2, b);
|
||||
self.scratch_register_writeback(mem, reg_index_1, a);
|
||||
self.scratch_register_writeback(mem, reg_index_2, b);
|
||||
tracing::trace!(
|
||||
r1 = format_args!("{reg_index_1:#x}"),
|
||||
r2 = format_args!("{reg_index_2:#x}"),
|
||||
@@ -816,7 +1020,9 @@ impl GpuSystem {
|
||||
}
|
||||
pm4::PM4_INDIRECT_BUFFER | pm4::PM4_INDIRECT_BUFFER_PFD => {
|
||||
self.stats.indirect_buffer_jumps += 1;
|
||||
let ib_ptr = self.read_payload(mem, 1);
|
||||
// The IB pointer is a guest *physical* address — project it
|
||||
// onto the committed backing window (see `physical_to_backing`).
|
||||
let ib_ptr = physical_to_backing(self.read_payload(mem, 1));
|
||||
let ib_size = self.read_payload(mem, 2);
|
||||
// Advance past the IB header + payload before recursing so
|
||||
// the return location is correct.
|
||||
@@ -832,6 +1038,10 @@ impl GpuSystem {
|
||||
write_offset_dwords: ib_size, // IB is fully-written at jump time
|
||||
rptr_writeback_addr: 0,
|
||||
rptr_writeback_block_dwords: 0,
|
||||
// Linear sub-stream: drain [0, ib_size) then pop. Never
|
||||
// wraps, and `sync_with_mmio`'s CP_RB_WPTR must not touch
|
||||
// it (canary executes IBs through a separate reader).
|
||||
indirect: true,
|
||||
};
|
||||
tracing::debug!(
|
||||
ib_ptr = format_args!("{ib_ptr:#010x}"),
|
||||
@@ -854,7 +1064,8 @@ impl GpuSystem {
|
||||
let is_memory = (wait_info & 0x10) != 0;
|
||||
let cmp = WaitCmp::from_wait_info(wait_info);
|
||||
let poll_addr = if is_memory {
|
||||
poll_addr_raw & !3
|
||||
// Physical memory poll address → committed backing.
|
||||
physical_to_backing(poll_addr_raw & !3)
|
||||
} else {
|
||||
poll_addr_raw
|
||||
};
|
||||
@@ -865,6 +1076,12 @@ impl GpuSystem {
|
||||
mask,
|
||||
cmp,
|
||||
};
|
||||
// A WAIT polling COHER_STATUS_HOST is the CP coherency
|
||||
// handshake: service it now so the status bit clears (see
|
||||
// `make_coherent`), exactly as canary does in its WAIT loop.
|
||||
if !is_memory {
|
||||
self.make_coherent(poll_addr);
|
||||
}
|
||||
if block.is_satisfied(mem, &self.register_file) {
|
||||
// Condition already true; proceed past this packet.
|
||||
tracing::trace!(?block, "gpu: WAIT_REG_MEM immediately satisfied");
|
||||
@@ -908,7 +1125,7 @@ impl GpuSystem {
|
||||
pm4::PM4_REG_TO_MEM => {
|
||||
// payload[0] = reg_index, payload[1] = mem addr
|
||||
let reg_index = self.read_payload(mem, 1) & 0x1FFF;
|
||||
let dst = self.read_payload(mem, 2) & !3;
|
||||
let dst = physical_to_backing(self.read_payload(mem, 2) & !3);
|
||||
let value = self.register_file.read(reg_index);
|
||||
mem.write_u32(dst, value);
|
||||
tracing::trace!(
|
||||
@@ -920,7 +1137,7 @@ impl GpuSystem {
|
||||
}
|
||||
pm4::PM4_MEM_WRITE => {
|
||||
// payload[0] = dst, payload[1..=count-1] = values
|
||||
let mut dst = self.read_payload(mem, 1) & !3;
|
||||
let mut dst = physical_to_backing(self.read_payload(mem, 1) & !3);
|
||||
for i in 2..=count {
|
||||
let val = self.read_payload(mem, i);
|
||||
mem.write_u32(dst, val);
|
||||
@@ -936,7 +1153,7 @@ impl GpuSystem {
|
||||
let mask = self.read_payload(mem, 4);
|
||||
let is_memory = (wait_info & 0x10) != 0;
|
||||
let cmp = WaitCmp::from_wait_info(wait_info);
|
||||
let poll_addr = if is_memory { poll_raw & !3 } else { poll_raw };
|
||||
let poll_addr = if is_memory { physical_to_backing(poll_raw & !3) } else { poll_raw };
|
||||
let cur_raw = if is_memory {
|
||||
mem.read_u32(poll_addr)
|
||||
} else {
|
||||
@@ -946,7 +1163,7 @@ impl GpuSystem {
|
||||
let write_addr = self.read_payload(mem, 5);
|
||||
let write_data = self.read_payload(mem, 6);
|
||||
if (wait_info & 0x100) != 0 {
|
||||
mem.write_u32(write_addr & !3, write_data);
|
||||
mem.write_u32(physical_to_backing(write_addr & !3), write_data);
|
||||
} else {
|
||||
self.register_file
|
||||
.write(write_addr & 0x1FFF, write_data);
|
||||
@@ -965,7 +1182,7 @@ impl GpuSystem {
|
||||
// payload[0] = initiator (bit 31: write counter, else write `value`)
|
||||
// payload[1] = address, payload[2] = value
|
||||
let initiator = self.read_payload(mem, 1);
|
||||
let address = self.read_payload(mem, 2);
|
||||
let address = physical_to_backing(self.read_payload(mem, 2));
|
||||
let value = self.read_payload(mem, 3);
|
||||
self.register_file
|
||||
.write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
|
||||
@@ -993,7 +1210,7 @@ impl GpuSystem {
|
||||
// payload[0] = initiator, [1] = address. Writes 6 u16 extents
|
||||
// (min/max x/y/z) — we're not tracking scissors yet, so write zeros.
|
||||
let initiator = self.read_payload(mem, 1);
|
||||
let address = self.read_payload(mem, 2) & !3;
|
||||
let address = physical_to_backing(self.read_payload(mem, 2) & !3);
|
||||
self.register_file
|
||||
.write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
|
||||
self.handle_event_initiator(initiator & 0x3F, mem);
|
||||
@@ -1093,7 +1310,146 @@ impl GpuSystem {
|
||||
"gpu: DRAW_INDX captured"
|
||||
);
|
||||
self.last_draw = Some(ds);
|
||||
let host_vertex_count = processed.host_vertex_count;
|
||||
self.last_primitive = Some(processed);
|
||||
|
||||
// iterate-3O: UI-only per-draw geometry capture. Resolves the
|
||||
// real guest vertex window behind this draw (from the active
|
||||
// VS's vertex-fetch constant) so the host UI can replay the
|
||||
// actual splash geometry instead of synthetic shapes. Entirely
|
||||
// inert in headless/deterministic mode (`frame_captures` is
|
||||
// `None`), so the `--gpu-inline` golden is unaffected.
|
||||
if self.frame_captures.is_some() {
|
||||
let vs_key = self.active_vs_key.unwrap_or(0);
|
||||
let ps_key = self.active_ps_key.unwrap_or(0);
|
||||
let parsed_vs = self
|
||||
.active_vs_key
|
||||
.and_then(|k| self.shader_blobs.get(&k))
|
||||
.map(|b| crate::ucode::parse_shader(&b.dwords));
|
||||
let (idx_src, idx_size) = match ds.index_source {
|
||||
crate::draw_state::IndexSource::Dma { index_size, .. } => {
|
||||
(ds.index_source, index_size)
|
||||
}
|
||||
crate::draw_state::IndexSource::Immediate { index_size } => {
|
||||
(ds.index_source, index_size)
|
||||
}
|
||||
crate::draw_state::IndexSource::AutoIndex => {
|
||||
(ds.index_source, crate::draw_state::IndexSize::Sixteen)
|
||||
}
|
||||
};
|
||||
let cap = crate::draw_capture::build(
|
||||
self.stats.draws_seen as u32,
|
||||
ds.primitive,
|
||||
host_vertex_count,
|
||||
idx_src,
|
||||
idx_size,
|
||||
vs_key,
|
||||
ps_key,
|
||||
parsed_vs.as_ref(),
|
||||
&self.register_file,
|
||||
mem,
|
||||
);
|
||||
if let Some(caps) = self.frame_captures.as_mut() {
|
||||
// Bound the per-frame list so a runaway frame can't grow
|
||||
// host memory without limit; keep the most recent.
|
||||
const MAX_CAPS: usize = 4096;
|
||||
if caps.len() >= MAX_CAPS {
|
||||
caps.remove(0);
|
||||
}
|
||||
caps.push(cap);
|
||||
}
|
||||
}
|
||||
|
||||
// P5b: decode the textures the *active pixel shader* actually
|
||||
// samples. Parse the bound PS, collect its `tfetch`
|
||||
// fetch-constant slots, read each 6-dword fetch constant from
|
||||
// the register file, and decode+cache it. `vd_swap` publishes
|
||||
// the result. Empty for flat (no-tfetch) shaders — the
|
||||
// dominant case on Sylpheed's current splash, where this stays
|
||||
// inert until the textured logo draw is reached.
|
||||
self.last_draw_textures.clear();
|
||||
if let Some(ps_key) = self.active_ps_key {
|
||||
// Collect slots under an immutable borrow of `shader_blobs`,
|
||||
// then drop it before mutating `texture_cache`.
|
||||
let slots: Vec<u8> = match self.shader_blobs.get(&ps_key) {
|
||||
Some(blob) => {
|
||||
let parsed = crate::ucode::parse_shader(&blob.dwords);
|
||||
crate::shader_metrics::tfetch_slots(&parsed)
|
||||
}
|
||||
None => Vec::new(),
|
||||
};
|
||||
for slot in slots {
|
||||
let mut fetch6 = [0u32; 6];
|
||||
for (k, w) in fetch6.iter_mut().enumerate() {
|
||||
*w = self
|
||||
.register_file
|
||||
.read(CONST_BASE_FETCH + slot as u32 * 6 + k as u32);
|
||||
}
|
||||
let Some(mut key) = crate::texture_cache::decode_fetch_constant(fetch6) else {
|
||||
continue;
|
||||
};
|
||||
// The Xenos texture fetch constant carries a guest
|
||||
// *physical* base address (`base >> 12`). On the Xbox
|
||||
// 360 the GPU reads the unified physical memory; the
|
||||
// CPU writes the (decompressed) texels through its
|
||||
// cached-physical aperture, which ours backs at the
|
||||
// committed `0x4000_0000` window. Map the physical
|
||||
// base onto that backing window so the GPU samples the
|
||||
// bytes the guest actually wrote — exactly as the
|
||||
// vertex-fetch path does (`draw_capture.rs`) and as
|
||||
// canary reads textures through its GPU shared memory
|
||||
// (= physical). Without this the decode reads the
|
||||
// low VA `0x0dbee000` (always zero) instead of the
|
||||
// filled `0x4dbee000`, flattening every disk-asset
|
||||
// texture (e.g. the publisher logo `E59B2B3D`).
|
||||
key.base_address = physical_to_backing(key.base_address);
|
||||
let bi = key.format.block_info();
|
||||
let span_bytes = (key.pitch_texels as u32)
|
||||
* (key.height as u32)
|
||||
* (bi.bytes_per_block as u32)
|
||||
/ (bi.block_w as u32);
|
||||
let version = span_max_version(mem, key.base_address, span_bytes.max(4));
|
||||
match self.texture_cache.ensure_cached(key, version, mem) {
|
||||
Ok(entry) => {
|
||||
// iterate-3AD: carry the real content `version`
|
||||
// (from `span_max_version`) so the UI host
|
||||
// texture cache re-uploads when the guest fills
|
||||
// more of an evolving atlas (e.g. the 2nd splash
|
||||
// logo's texels land after the publisher's, in
|
||||
// the SAME K8888 surface). Previously the UI
|
||||
// pinned `version_when_uploaded = 1`, so the
|
||||
// first (partial) upload stuck and later draws
|
||||
// sampled the not-yet-filled region as black.
|
||||
self.last_draw_textures
|
||||
.push((entry.key, version, entry.bytes.clone()));
|
||||
metrics::counter!(
|
||||
"gpu.texture.decode",
|
||||
"fmt" => format!("{:?}", key.format),
|
||||
)
|
||||
.increment(1);
|
||||
}
|
||||
Err(e) => {
|
||||
metrics::counter!(
|
||||
"gpu.texture.reject",
|
||||
"reason" => format!("{e:?}"),
|
||||
)
|
||||
.increment(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// iterate-3T: attach this draw's decoded textures to the just-
|
||||
// captured draw so the UI can bind the real artwork per-draw
|
||||
// (keyed off the active PS's real tfetch slots) instead of a
|
||||
// single last-draw `primary_texture`. UI-only (`frame_captures`
|
||||
// is `None` headless); does not touch the deterministic core.
|
||||
if !self.last_draw_textures.is_empty()
|
||||
&& let Some(caps) = self.frame_captures.as_mut()
|
||||
&& let Some(last) = caps.last_mut()
|
||||
{
|
||||
last.textures = self.last_draw_textures.clone();
|
||||
}
|
||||
}
|
||||
pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => {
|
||||
// payload[0] = offset_type — bits[10:0] index, bits[23:16] type
|
||||
@@ -1123,7 +1479,7 @@ impl GpuSystem {
|
||||
}
|
||||
pm4::PM4_LOAD_ALU_CONSTANT => {
|
||||
// payload[0] = source mem addr, [1] = offset_type, [2] = size_dwords
|
||||
let src = self.read_payload(mem, 1) & !3;
|
||||
let src = physical_to_backing(self.read_payload(mem, 1) & !3);
|
||||
let offset_type = self.read_payload(mem, 2);
|
||||
let size_dwords = self.read_payload(mem, 3);
|
||||
let index = offset_type & 0x7FF;
|
||||
@@ -1155,7 +1511,7 @@ impl GpuSystem {
|
||||
}
|
||||
v
|
||||
} else {
|
||||
let addr = self.read_payload(mem, 1) & !3;
|
||||
let addr = physical_to_backing(self.read_payload(mem, 1) & !3);
|
||||
let mut v = Vec::with_capacity(size_dwords as usize);
|
||||
for i in 0..size_dwords {
|
||||
v.push(mem.read_u32(addr + i * 4));
|
||||
@@ -1373,11 +1729,31 @@ pub mod reg {
|
||||
/// `XE_GPU_REG_D1MODE_VBLANK_VLINE_STATUS` (Canary register_table.inc:1126).
|
||||
/// Bit 0 = VBLANK_INT_OCCURRED.
|
||||
pub const D1MODE_VBLANK_VLINE_STATUS: u32 = 0x1951;
|
||||
/// `XE_GPU_REG_D1MODE_VIEWPORT_SIZE` / `AVIVO_D1MODE_VIEWPORT_SIZE`
|
||||
/// (Canary `register_table.inc:1134`). Packs the active display resolution
|
||||
/// as `(width << 16) | height` with 12-bit fields. The guest's
|
||||
/// swap-complete interrupt callback (`sub_824CE2B8`) divides by the low
|
||||
/// 12 bits (`height`) as a refresh-pacing term, so a 0 read makes its
|
||||
/// `twi` divide-by-zero guard trap and abort the ISR before it clears the
|
||||
/// swap-acknowledge fence. Canary returns the constant below from
|
||||
/// `GraphicsSystem::ReadRegister` (graphics_system.cc:311).
|
||||
pub const D1MODE_VIEWPORT_SIZE: u32 = 0x1961;
|
||||
/// `XE_GPU_REG_VGT_EVENT_INITIATOR` — set by EVENT_WRITE.
|
||||
pub const VGT_EVENT_INITIATOR: u32 = 0x21F9;
|
||||
/// `XE_GPU_REG_COHER_STATUS_HOST` — coherency bits
|
||||
/// (Canary `register_table.inc:530`).
|
||||
pub const COHER_STATUS_HOST: u32 = 0x0A31;
|
||||
/// `XE_GPU_REG_SCRATCH_UMSK` — bitmask of which `SCRATCH_REG{n}` writes are
|
||||
/// mirrored to memory (Canary `register_table.inc:139`).
|
||||
pub const SCRATCH_UMSK: u32 = 0x01DC;
|
||||
/// `XE_GPU_REG_SCRATCH_ADDR` — base physical address of the scratch
|
||||
/// writeback block (Canary `register_table.inc:141`).
|
||||
pub const SCRATCH_ADDR: u32 = 0x01DD;
|
||||
/// `XE_GPU_REG_SCRATCH_REG0` — first of 8 CP scratch registers
|
||||
/// (`0x0578..=0x057F`, Canary `register_table.inc:331-338`).
|
||||
pub const SCRATCH_REG0: u32 = 0x0578;
|
||||
/// `XE_GPU_REG_SCRATCH_REG7` — last CP scratch register.
|
||||
pub const SCRATCH_REG7: u32 = 0x057F;
|
||||
}
|
||||
|
||||
/// 32-bit FNV-1a over a u32 seed + a slice of u32s. Used to derive a
|
||||
@@ -1468,6 +1844,38 @@ mod tests {
|
||||
assert_eq!(gpu.register_file.read(0x101), 0xCAFE_BABE);
|
||||
}
|
||||
|
||||
#[test]
fn scratch_reg_write_mirrors_to_memory_when_umsk_enabled() {
    // Reproduces how Sylpheed arms its CP swap callback: SCRATCH_ADDR names a
    // descriptor, SCRATCH_UMSK has only bit 4 set, and a Type-0 write of the
    // callback PC to SCRATCH_REG4 (0x57C) has to show up at SCRATCH_ADDR + 16.
    const RING_BASE: u32 = 0x4000_0000;
    // Physical-mirror identity address used as the scratch writeback block.
    const DESCRIPTOR: u32 = 0x4000_1000;
    const SCRATCH_REG3: u32 = 0x057B;
    const CALLBACK_PC: u32 = 0x824C_E2B8;

    let mut gpu = GpuSystem::new();
    let mut mem = build_mem();
    gpu.initialize_ring_buffer(RING_BASE, 10);

    // Only bit 4 is enabled, so SCRATCH_REG4 mirrors and SCRATCH_REG3 does not.
    gpu.register_file.write(reg::SCRATCH_ADDR, DESCRIPTOR);
    gpu.register_file.write(reg::SCRATCH_UMSK, 1 << 4);

    // Type-0 run starting at SCRATCH_REG3 with two payload dwords:
    //   0x11111111 -> SCRATCH_REG3 (UMSK bit 3 clear, stays register-only)
    //   CALLBACK_PC -> SCRATCH_REG4 (UMSK bit 4 set, mirrored to ADDR + 4*4)
    let packet = [(1u32 << 16) | SCRATCH_REG3, 0x1111_1111, CALLBACK_PC];
    for (slot, dword) in packet.iter().enumerate() {
        mem.write_u32(RING_BASE + 4 * slot as u32, *dword);
    }
    gpu.extend_write_ptr(3);
    assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));

    // The REG3 slot must stay untouched; the REG4 slot must hold the PC.
    assert_eq!(mem.read_u32(DESCRIPTOR + 12), 0, "reg3 must not mirror");
    assert_eq!(
        mem.read_u32(DESCRIPTOR + 16),
        CALLBACK_PC,
        "reg4 must mirror to SCRATCH_ADDR+16"
    );
}
|
||||
|
||||
#[test]
|
||||
fn wait_reg_mem_blocks_then_unblocks_when_mem_changes() {
|
||||
let mut gpu = GpuSystem::new();
|
||||
@@ -1477,8 +1885,9 @@ mod tests {
|
||||
// header
|
||||
let hdr = (3u32 << 30) | ((5u32 - 1) << 16) | ((pm4::PM4_WAIT_REG_MEM as u32) << 8);
|
||||
mem.write_u32(0x4000_0000, hdr);
|
||||
// wait_info: is_memory=1 (bit 4), cmp=equal (bits 2:0 = 2)
|
||||
mem.write_u32(0x4000_0004, 0x12);
|
||||
// wait_info: is_memory=1 (bit 4), cmp=equal (bits 2:0 = 3, per canary's
|
||||
// MatchValueAndRef selector: 1=Less, 2=LessEq, 3=Equal, …).
|
||||
mem.write_u32(0x4000_0004, 0x13);
|
||||
mem.write_u32(0x4000_0008, 0x4000_1000);
|
||||
mem.write_u32(0x4000_000C, 0x42);
|
||||
mem.write_u32(0x4000_0010, 0xFFFF_FFFF);
|
||||
|
||||
@@ -444,6 +444,23 @@ impl GpuBackend {
|
||||
}
|
||||
}
|
||||
|
||||
/// Current guest present (`VdSwap`) count. Cheap single-field read used
|
||||
/// by the present-anchored vsync ticker (iterate-3AJ) every scheduler
|
||||
/// round. Inline mode reads the live counter directly; threaded mode
|
||||
/// reads the last-published digest mirror under a brief lock (the
|
||||
/// `--parallel` path uses the wall-clock vsync ticker anyway, so the
|
||||
/// exact freshness here is not load-bearing).
|
||||
pub fn swaps_seen(&self) -> u64 {
|
||||
match self {
|
||||
GpuBackend::Inline(s) => s.stats.swaps_seen,
|
||||
GpuBackend::Threaded(h) => h
|
||||
.digest
|
||||
.lock()
|
||||
.map(|d| d.stats.swaps_seen)
|
||||
.unwrap_or(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Forward [`GpuSystem::has_pending_interrupts`] under inline mode;
|
||||
/// under threaded mode peek the `int_rx` channel.
|
||||
pub fn has_pending_interrupts(&self) -> bool {
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
//! [`gpu_system::GpuSystem`].
|
||||
|
||||
pub mod command_processor;
|
||||
pub mod draw_capture;
|
||||
pub mod draw_state;
|
||||
pub mod edram;
|
||||
pub mod gpu_system;
|
||||
@@ -34,7 +35,7 @@ pub mod xenos_constants;
|
||||
|
||||
pub use gpu_system::{
|
||||
ExecOutcome, GpuBlock, GpuMmio, GpuStats, GpuSystem, InterruptSource, PendingInterrupt,
|
||||
ShaderBlob, SwapNotification, WaitCmp,
|
||||
PHYSICAL_BACKING_BASE, ShaderBlob, SwapNotification, WaitCmp, physical_to_backing,
|
||||
};
|
||||
pub use handle::{
|
||||
DrainReply, GpuBackend, GpuCommand, GpuDigestSnapshot, GpuHandle, GpuWorker,
|
||||
|
||||
@@ -58,6 +58,15 @@ pub fn build_region(mmio: &GpuMmio) -> MmioRegion {
|
||||
reg::D1MODE_VBLANK_VLINE_STATUS => {
|
||||
read_vblank_status.load(Ordering::Relaxed)
|
||||
}
|
||||
// AVIVO_D1MODE_VIEWPORT_SIZE: the active display resolution
|
||||
// (1280x720) packed as `(width << 16) | height`. Canary
|
||||
// serves this constant from `GraphicsSystem::ReadRegister`
|
||||
// (graphics_system.cc:311). The guest swap-complete interrupt
|
||||
// callback divides by the low 12 bits (`height = 0x2D0`); a 0
|
||||
// read trips its `twi` divide-guard and aborts the ISR before
|
||||
// it acknowledges the per-present swap fence — which strands
|
||||
// the present/title loop. Mirror canary exactly.
|
||||
reg::D1MODE_VIEWPORT_SIZE => 0x0500_02D0,
|
||||
_ => {
|
||||
tracing::trace!(
|
||||
reg = format_args!("{reg_index:#x}"),
|
||||
|
||||
@@ -5,9 +5,8 @@
|
||||
//! rectangles) we rewrite indices on the CPU side so the host just sees a
|
||||
//! triangle list. Ground truth: `xenia-canary/src/xenia/gpu/primitive_processor.h/cc`.
|
||||
//!
|
||||
//! P3 scope: only the shapes Sylpheed's UI + early gameplay paths need
|
||||
//! (list, strip, fan). Rectangle + quad expansions are stubs logged via
|
||||
//! `tracing::warn!` for later.
|
||||
//! Scope: list, strip, fan, quad, and rectangle expansions are all handled
|
||||
//! (rectangles via CPU triangle-list rewrite — see `expand_rectangles`).
|
||||
|
||||
use crate::draw_state::{IndexSize, PrimitiveType};
|
||||
|
||||
@@ -138,18 +137,43 @@ fn expand_quads(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitiv
|
||||
}
|
||||
|
||||
/// Rectangle lists: a Xenos-specific primitive where each group of 3
|
||||
/// vertices defines a right-angle rectangle by its three non-repeated
|
||||
/// corners (the 4th is derived). The uber-shader doesn't support this yet;
|
||||
/// the ucode translator will emulate it as a geometry-stage fake. For P3
|
||||
/// we emit an empty draw.
|
||||
fn expand_rectangles(_indices: Option<&[u32]>, _vertex_count: u32) -> ProcessedPrimitive {
|
||||
tracing::warn!("gpu: rectangle list primitive not yet implemented (P3 stub)");
|
||||
metrics::counter!("gpu.primitive.rejected", "reason" => "rectangle_list").increment(1);
|
||||
/// vertices defines a rectangle; the 4th corner is extrapolated as
|
||||
/// `v3 = v0 + v2 - v1` (parallelogram completion). Canary expands this in a
|
||||
/// host vertex-shader variant (`kRectangleListAsTriangleStrip`,
|
||||
/// `primitive_processor.cc:389-456`): a 4-vertex triangle strip per rect with
|
||||
/// the 4th corner synthesized *in the VS* from the host-vertex index.
|
||||
///
|
||||
/// Our replay pipeline has no host-VS corner synthesis (and the procedural
|
||||
/// `vs_main` does not consume `rewritten_indices` yet), so we mirror the
|
||||
/// `expand_quads`/`expand_fan` CPU idiom and emit the 3 real vertices of each
|
||||
/// rect as one triangle list `(v0,v1,v2)` — the visible lower half of the
|
||||
/// rect. This un-rejects the draw and gives a faithful `host_vertex_count`.
|
||||
///
|
||||
/// TODO: once `vs_main` does real vertex fetch + interpolation, upgrade to the
|
||||
/// full quad — 6 indices `[v0,v1,v2, v2,v1,v3]` with a synthesized `v3` corner
|
||||
/// — mirroring canary's `kRectangleListAsTriangleStrip`.
|
||||
fn expand_rectangles(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive {
|
||||
let rect_count = vertex_count / 3;
|
||||
let mut out = Vec::with_capacity(3 * rect_count as usize);
|
||||
let get = |i: u32| -> u32 {
|
||||
match indices {
|
||||
Some(buf) => buf[i as usize],
|
||||
None => i,
|
||||
}
|
||||
};
|
||||
for r in 0..rect_count {
|
||||
let base = r * 3;
|
||||
out.push(get(base));
|
||||
out.push(get(base + 1));
|
||||
out.push(get(base + 2));
|
||||
}
|
||||
let host_vertex_count = out.len() as u32;
|
||||
metrics::counter!("gpu.primitive.expanded", "shape" => "rectangle_list").increment(1);
|
||||
ProcessedPrimitive {
|
||||
topology: HostTopology::TriangleList,
|
||||
rewritten_indices: Some(Vec::new()),
|
||||
host_vertex_count: 0,
|
||||
rejected: true,
|
||||
rewritten_indices: Some(out),
|
||||
host_vertex_count,
|
||||
rejected: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -213,6 +237,17 @@ mod tests {
|
||||
assert_eq!(idx, vec![0, 1, 2, 0, 2, 3, 4, 5, 6, 4, 6, 7]);
|
||||
}
|
||||
|
||||
#[test]
fn rectangle_list_expansion() {
    // Six auto-indexed vertices form two rects; each rect becomes a single
    // (v0, v1, v2) triangle and the draw is accepted.
    let processed = process(PrimitiveType::RectangleList, 6, None);
    assert!(!processed.rejected);
    assert_eq!(processed.topology, HostTopology::TriangleList);
    assert_eq!(processed.host_vertex_count, 6);
    assert_eq!(processed.rewritten_indices.unwrap(), vec![0, 1, 2, 3, 4, 5]);
}
|
||||
|
||||
#[test]
|
||||
fn widen_u16_indices_big_endian() {
|
||||
// 3 indices [1, 2, 0x1234] in BE u16.
|
||||
|
||||
@@ -364,7 +364,11 @@ pub fn copy_to_memory(
|
||||
// Destination coordinates are 0-based against `dest_base` — the
|
||||
// base already points at the top-left of the copy rectangle.
|
||||
let dst_off = tiled_2d_offset(dx, dy, pitch_aligned, bpp_log2);
|
||||
let dst_addr = info.dest_base.wrapping_add(dst_off);
|
||||
// `dest_base` is a bare guest *physical* address; project onto the
|
||||
// committed backing window so resolved pixels land where the guest
|
||||
// (and `vd_swap`'s frontbuffer read) actually see them.
|
||||
let dst_addr =
|
||||
crate::gpu_system::physical_to_backing(info.dest_base.wrapping_add(dst_off));
|
||||
|
||||
if info.source_is_64bpp {
|
||||
let (lo, hi) = match single_sample_idx {
|
||||
|
||||
@@ -32,6 +32,16 @@ pub struct RingBufferView {
|
||||
/// `VdEnableRingBufferRPtrWriteBack`). We always write back eagerly, so
|
||||
/// we don't actually use this for scheduling — kept for observability.
|
||||
pub rptr_writeback_block_dwords: u32,
|
||||
/// True for an indirect-buffer (`INDIRECT_BUFFER`) view. An IB is a fixed
|
||||
/// *linear* sub-stream, not a circular ring: it is fully written when the
|
||||
/// GPU jumps to it, so the read pointer advances monotonically from `0` to
|
||||
/// `size_dwords` and then the buffer is exhausted (the caller ring is
|
||||
/// popped). It must NOT wrap, and the primary `CP_RB_WPTR` must not be
|
||||
/// applied to it. Mirrors canary `ExecuteIndirectBuffer`, which executes
|
||||
/// the IB through a separate `RingBuffer reader_` and restores the primary
|
||||
/// reader afterward (command_processor.cc). Circular (primary-ring)
|
||||
/// semantics are used when this is `false`.
|
||||
pub indirect: bool,
|
||||
}
|
||||
|
||||
impl RingBufferView {
|
||||
@@ -46,7 +56,16 @@ impl RingBufferView {
|
||||
|
||||
/// True if there is pending unread data to consume.
|
||||
pub fn has_pending(&self) -> bool {
|
||||
self.is_initialized() && self.read_offset_dwords != self.write_offset_dwords
|
||||
if !self.is_initialized() {
|
||||
return false;
|
||||
}
|
||||
if self.indirect {
|
||||
// Linear sub-stream: exhausted once the read pointer reaches the
|
||||
// (fixed) write pointer. Never wraps.
|
||||
self.read_offset_dwords < self.write_offset_dwords
|
||||
} else {
|
||||
self.read_offset_dwords != self.write_offset_dwords
|
||||
}
|
||||
}
|
||||
|
||||
/// Number of dwords we can consume without wrapping past the write ptr.
|
||||
@@ -54,7 +73,10 @@ impl RingBufferView {
|
||||
if !self.is_initialized() {
|
||||
return 0;
|
||||
}
|
||||
if self.write_offset_dwords >= self.read_offset_dwords {
|
||||
if self.indirect {
|
||||
self.write_offset_dwords
|
||||
.saturating_sub(self.read_offset_dwords)
|
||||
} else if self.write_offset_dwords >= self.read_offset_dwords {
|
||||
self.write_offset_dwords - self.read_offset_dwords
|
||||
} else {
|
||||
// write has wrapped — we can read up to the end of the ring.
|
||||
@@ -62,13 +84,19 @@ impl RingBufferView {
|
||||
}
|
||||
}
|
||||
|
||||
/// Advance the read pointer by `dwords`, wrapping at `size_dwords`.
|
||||
/// Advance the read pointer by `dwords`. Circular rings wrap at
|
||||
/// `size_dwords`; an indirect buffer advances linearly (no wrap) so it
|
||||
/// terminates exactly at its fixed write pointer.
|
||||
pub fn advance_read(&mut self, dwords: u32) {
|
||||
if self.size_dwords == 0 {
|
||||
return;
|
||||
}
|
||||
self.read_offset_dwords =
|
||||
(self.read_offset_dwords + dwords) % self.size_dwords;
|
||||
if self.indirect {
|
||||
self.read_offset_dwords = self.read_offset_dwords.saturating_add(dwords);
|
||||
} else {
|
||||
self.read_offset_dwords =
|
||||
(self.read_offset_dwords + dwords) % self.size_dwords;
|
||||
}
|
||||
}
|
||||
|
||||
/// Guest address for the dword at relative offset `i` from the current
|
||||
@@ -77,7 +105,11 @@ impl RingBufferView {
|
||||
if !self.is_initialized() {
|
||||
return None;
|
||||
}
|
||||
let off = (self.read_offset_dwords + offset_dwords) % self.size_dwords;
|
||||
let off = if self.indirect {
|
||||
self.read_offset_dwords.saturating_add(offset_dwords)
|
||||
} else {
|
||||
(self.read_offset_dwords + offset_dwords) % self.size_dwords
|
||||
};
|
||||
Some(self.base.wrapping_add(off.wrapping_mul(4)))
|
||||
}
|
||||
}
|
||||
@@ -120,4 +152,52 @@ mod tests {
|
||||
assert_eq!(v.addr_at_offset(1), Some(0x4000_0000));
|
||||
assert_eq!(v.addr_at_offset(2), Some(0x4000_0004));
|
||||
}
|
||||
|
||||
#[test]
fn indirect_buffer_drains_linearly_and_terminates() {
    // Regression guard (iterate-2O): an indirect buffer is a fixed, linear
    // sub-stream. Its read pointer walks from 0 up to `size_dwords` and the
    // view is then exhausted; wrapping back to 0 previously caused a system
    // command buffer to be re-read forever. `write_offset_dwords` equals the
    // size, exactly as the INDIRECT_BUFFER handler sets it.
    let base = 0x4adf_5080;
    let mut ib = RingBufferView {
        base,
        size_dwords: 11,
        read_offset_dwords: 0,
        write_offset_dwords: 11,
        rptr_writeback_addr: 0,
        rptr_writeback_block_dwords: 0,
        indirect: true,
    };
    assert!(ib.has_pending());

    // Packet layout observed for Sylpheed's init IB: 2 + 3 + 6 dwords = 11.
    // After each of the first two packets there is still data left.
    for &packet_dwords in &[2, 3] {
        ib.advance_read(packet_dwords);
        assert!(ib.has_pending());
    }

    // The final packet lands the read pointer exactly on the write pointer.
    ib.advance_read(6);
    assert_eq!(ib.read_offset_dwords, 11);
    assert!(
        !ib.has_pending(),
        "indirect buffer must terminate at write ptr, not wrap to 0"
    );

    // Address computation must be linear too (no modulo by `size_dwords`).
    ib.read_offset_dwords = 9;
    assert_eq!(ib.addr_at_offset(1), Some(base + 10 * 4));
}
|
||||
|
||||
#[test]
fn indirect_flag_does_not_affect_circular_ring() {
    // Sanity check: a view obtained from `new()` and used as the circular
    // (primary) ring still wraps at `size_dwords` exactly as before.
    let mut ring = RingBufferView {
        base: 0x4adc_c000,
        size_dwords: 8192,
        read_offset_dwords: 8190,
        write_offset_dwords: 2,
        ..RingBufferView::new()
    };
    assert!(ring.has_pending());

    // (8190 + 4) % 8192 == 2, which is the write pointer.
    ring.advance_read(4);
    assert_eq!(ring.read_offset_dwords, 2);
    assert!(!ring.has_pending());
}
|
||||
}
|
||||
|
||||
@@ -45,8 +45,9 @@ pub fn emit_for(parsed: &ParsedShader, stage: &'static str) {
|
||||
parsed.instructions[base + 1],
|
||||
parsed.instructions[base + 2],
|
||||
];
|
||||
// sequence bit layout: 2 bits per triple, hi bit = is-fetch.
|
||||
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
|
||||
// sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0),
|
||||
// bit[1]=serialize (Xenos `ucode.h:226`).
|
||||
let is_fetch = ((sequence >> (i * 2)) & 1) != 0;
|
||||
if is_fetch {
|
||||
match decode_fetch(words) {
|
||||
FetchInstruction::Vertex(_) => vfetch_count += 1,
|
||||
@@ -174,6 +175,50 @@ pub fn emit_for(parsed: &ParsedShader, stage: &'static str) {
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect the unique texture-fetch-constant slot indices a shader samples.
|
||||
///
|
||||
/// Walks the same exec-clause / sequence-bitmap path as [`emit_for`] but only
|
||||
/// extracts `TextureFetch.fetch_const` slots, deduplicated and in first-seen
|
||||
/// order. The GPU draw handler uses this to decide which fetch constants to
|
||||
/// decode + cache at draw time (keyed off the *active* pixel shader's real
|
||||
/// `tfetch` instructions rather than a hardcoded slot).
|
||||
pub fn tfetch_slots(parsed: &ParsedShader) -> Vec<u8> {
|
||||
let mut slots: Vec<u8> = Vec::new();
|
||||
for clause in &parsed.cf {
|
||||
if let ControlFlowInstruction::Exec {
|
||||
address,
|
||||
count,
|
||||
sequence,
|
||||
..
|
||||
} = clause
|
||||
{
|
||||
for i in 0..(*count as usize) {
|
||||
let base = (*address as usize + i) * 3;
|
||||
if base + 2 >= parsed.instructions.len() {
|
||||
break;
|
||||
}
|
||||
// sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0),
|
||||
// bit[1]=serialize (Xenos `ucode.h:226`).
|
||||
let is_fetch = ((sequence >> (i * 2)) & 1) != 0;
|
||||
if !is_fetch {
|
||||
continue;
|
||||
}
|
||||
let words = [
|
||||
parsed.instructions[base],
|
||||
parsed.instructions[base + 1],
|
||||
parsed.instructions[base + 2],
|
||||
];
|
||||
if let FetchInstruction::Texture(tf) = decode_fetch(words) {
|
||||
if !slots.contains(&tf.fetch_const) {
|
||||
slots.push(tf.fetch_const);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
slots
|
||||
}
|
||||
|
||||
fn mark_feature(buf: &mut Vec<&'static str>, name: &'static str) {
|
||||
if !buf.contains(&name) {
|
||||
buf.push(name);
|
||||
@@ -298,6 +343,46 @@ mod tests {
|
||||
emit_for(&shader, "vs");
|
||||
}
|
||||
|
||||
/// `tfetch_slots` must report the fetch-constant slot of a texture fetch
/// and yield nothing for a flat, ALU-only shader.
#[test]
fn tfetch_slots_extracts_texture_fetch_constants() {
    // Non-terminal, unpredicated Exec clause starting at instruction 0.
    let exec = |count, sequence| ControlFlowInstruction::Exec {
        address: 0,
        count,
        sequence,
        is_end: false,
        predicated: false,
        predicate_condition: false,
    };

    // word0: const_index=3 in bits[24:20] (Xenos `ucode.h:844`) combined with
    // opcode TEXTURE_FETCH (0x01) in the low 5 bits.
    let tfetch_w0: u32 = (3u32 << 20) | 0x01;

    // Two instructions: #0 is a fetch (bit[0] of its 2-bit sequence field is
    // set), #1 is ALU. is_fetch = (sequence >> (i*2)) & 1.
    let textured = ParsedShader {
        cf: vec![exec(2, 0b00_01), ControlFlowInstruction::Exit],
        instructions: vec![tfetch_w0, 0, 0, /* ALU triple */ 0, 0, 0],
    };
    assert_eq!(tfetch_slots(&textured), vec![3]);

    // Flat shader: no fetch bits set in the sequence → no slots reported.
    let flat = ParsedShader {
        cf: vec![exec(1, 0)],
        instructions: vec![0, 0, 0],
    };
    assert!(tfetch_slots(&flat).is_empty());
}
|
||||
|
||||
/// P8: a shader containing `LoopStart` should mark `cf_loop` as used
|
||||
/// so the HUD can surface which deferred feature a game triggers.
|
||||
#[test]
|
||||
|
||||
@@ -20,7 +20,15 @@ struct XenosDrawConstants {
|
||||
draw_index: u32,
|
||||
vertex_count: u32,
|
||||
prim_kind: u32,
|
||||
_pad: u32,
|
||||
// iterate-3O: guest dword address that maps to index 0 of `vertex_buffer`.
|
||||
// The CPU uploads a bounded guest-memory window starting at the active
|
||||
// vertex-fetch base; the shader subtracts this base from the absolute
|
||||
// fetch-constant address so it indexes the uploaded window. 0 means "no
|
||||
// real vertex window" (procedural fallback path).
|
||||
vertex_base_dwords: u32,
|
||||
// iterate-3S: guest viewport → host NDC XY transform (Y pre-flipped).
|
||||
ndc_scale: vec2<f32>,
|
||||
ndc_offset: vec2<f32>,
|
||||
};
|
||||
|
||||
struct XenosConstants {
|
||||
@@ -56,6 +64,7 @@ const CF_KIND_LOOP_END: u32 = 5u;
|
||||
const CF_KIND_COND_JMP: u32 = 6u;
|
||||
const CF_KIND_COND_CALL: u32 = 7u;
|
||||
const CF_KIND_RETURN: u32 = 8u;
|
||||
const CF_KIND_NOP: u32 = 9u;
|
||||
const CF_KIND_UNKNOWN: u32 = 15u;
|
||||
|
||||
// ── Alloc-kind codes (mirrors `xenia_gpu::ucode::cf_alloc_kind`). ──────
|
||||
@@ -628,8 +637,8 @@ const VFMT_32_32_32_FLOAT: u32 = 57u;
|
||||
// layout in `ucode.h:690`):
|
||||
// w0 [4:0] opcode
|
||||
// w0 [10:5] src_reg[5:0]
|
||||
// w0 [17:11] dst_reg[6:0] + must-be-one
|
||||
// w0 [21:17] const_index[4:0], [23:22] const_index_sel[1:0]
|
||||
// w0 [17:12] dst_reg[5:0]
|
||||
// w0 [24:20] const_index[4:0], [26:25] const_index_sel[1:0]
|
||||
// w1 [21:16] format[5:0]
|
||||
// w2 [7:0] stride (in dwords)
|
||||
// w2 [30:8] offset (signed, in dwords)
|
||||
@@ -641,9 +650,9 @@ fn interpret_vertex_fetch(t: u32) {
|
||||
let w0 = vs_instr_dword(t, 0u);
|
||||
let w1 = vs_instr_dword(t, 1u);
|
||||
let w2 = vs_instr_dword(t, 2u);
|
||||
let fetch_const = (w0 >> 5u) & 0x1Fu;
|
||||
let dst_reg = (w0 >> 10u) & 0x7Fu;
|
||||
let src_reg = (w0 >> 17u) & 0x7Fu;
|
||||
let fetch_const = (w0 >> 20u) & 0x1Fu;
|
||||
let dst_reg = (w0 >> 12u) & 0x3Fu;
|
||||
let src_reg = (w0 >> 5u) & 0x3Fu;
|
||||
let format = (w1 >> 16u) & 0x3Fu;
|
||||
let stride = w2 & 0xFFu;
|
||||
|
||||
@@ -651,7 +660,20 @@ fn interpret_vertex_fetch(t: u32) {
|
||||
// dword 1 carries (endian[1:0], size[25:2]).
|
||||
let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
|
||||
let fc1 = xenos_consts.fetch[fetch_const * 2u + 1u];
|
||||
let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;
|
||||
// iterate-3O: the fetch constant holds an *absolute* guest dword address.
|
||||
// The CPU uploaded a window of guest memory starting at
|
||||
// `draw_ctx.vertex_base_dwords`, so rebase the absolute address into that
|
||||
// window. When no real window was published (`vertex_base_dwords == 0`)
|
||||
// keep the absolute value (the `addr < n` guards below then skip the read
|
||||
// and the procedural fallback position is used).
|
||||
// GPUBUG-108 (iterate-3S): the captured window begins exactly at the fetch
|
||||
// base, so index from 0 (vertex i at i*stride). The uniform `fetch[]` holds
|
||||
// the last-published per-frame constant, not this draw's — recomputing
|
||||
// `abs_base` from it produced a stale out-of-window address (the splash
|
||||
// collapsed to one pixel). Only consult the uniform for the no-window
|
||||
// synthetic fallback.
|
||||
let abs_base = (fc0 & 0xFFFFFFFCu) >> 2u;
|
||||
let base_dwords = select(abs_base, 0u, draw_ctx.vertex_base_dwords != 0u);
|
||||
// GPUBUG-102: per-format endian byte-swap. Xbox 360 vertex data is
|
||||
// big-endian; the host is little-endian. Pre-fix every dword was
|
||||
// bitcast as-is — vertex positions were byte-reversed garbage.
|
||||
@@ -773,20 +795,20 @@ fn interpret_texture_fetch(t: u32, is_vertex: bool) {
|
||||
} else {
|
||||
w0 = ps_instr_dword(t, 0u);
|
||||
}
|
||||
let dst_reg = (w0 >> 10u) & 0x7Fu;
|
||||
let src_reg = (w0 >> 17u) & 0x7Fu;
|
||||
let uv = registers[src_reg & 0x7Fu].xy;
|
||||
let dst_reg = (w0 >> 12u) & 0x3Fu;
|
||||
let src_reg = (w0 >> 5u) & 0x3Fu;
|
||||
let uv = registers[src_reg & 0x3Fu].xy;
|
||||
let sample = textureSampleLevel(xenos_tex, xenos_samp, uv, 0.0);
|
||||
registers[dst_reg & 0x7Fu] = sample;
|
||||
registers[dst_reg & 0x3Fu] = sample;
|
||||
}
|
||||
|
||||
// Walk an Exec clause's instruction triples.
|
||||
// sequence: 2-bit-per-triple bitmap. Bit 0 of a pair = serialize flag
|
||||
// (we ignore in MVP); bit 1 = is-fetch.
|
||||
// sequence: 2-bit-per-instruction bitmap. Bit 0 of a pair = fetch(1)/ALU(0);
|
||||
// bit 1 = serialize (ignored). (Xenos `ucode.h:226`.)
|
||||
fn exec_vs(address: u32, count: u32, sequence: u32) {
|
||||
for (var i: u32 = 0u; i < count; i = i + 1u) {
|
||||
let t = address + i;
|
||||
let is_fetch = ((sequence >> (i * 2u + 1u)) & 1u) != 0u;
|
||||
let is_fetch = ((sequence >> (i * 2u)) & 1u) != 0u;
|
||||
if is_fetch {
|
||||
let opcode = vs_instr_dword(t, 0u) & 0x1Fu;
|
||||
// 0x00 = vertex fetch, 0x01 = texture fetch.
|
||||
@@ -803,7 +825,7 @@ fn exec_vs(address: u32, count: u32, sequence: u32) {
|
||||
fn exec_ps(address: u32, count: u32, sequence: u32) {
|
||||
for (var i: u32 = 0u; i < count; i = i + 1u) {
|
||||
let t = address + i;
|
||||
let is_fetch = ((sequence >> (i * 2u + 1u)) & 1u) != 0u;
|
||||
let is_fetch = ((sequence >> (i * 2u)) & 1u) != 0u;
|
||||
if is_fetch {
|
||||
interpret_texture_fetch(t, false);
|
||||
} else {
|
||||
@@ -871,7 +893,13 @@ fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {
|
||||
// Use registers[OPOS_REG] as position; the procedural fallback above
|
||||
// seeded it so an un-interpreted shader still draws a recognisable
|
||||
// circle.
|
||||
out.position = vec4<f32>(registers[OPOS_REG].xyz, registers[OPOS_REG].w);
|
||||
var opos = vec4<f32>(registers[OPOS_REG].xyz, registers[OPOS_REG].w);
|
||||
// iterate-3S: guest VS position → host clip space (see translator.rs). When
|
||||
// the transform is unset (procedural fallback) pass through unchanged.
|
||||
if (draw_ctx.ndc_scale.x != 0.0 || draw_ctx.ndc_scale.y != 0.0) {
|
||||
opos = vec4<f32>(opos.xy * draw_ctx.ndc_scale + draw_ctx.ndc_offset * opos.w, opos.z, opos.w);
|
||||
}
|
||||
out.position = opos;
|
||||
out.color = vec4<f32>(registers[OCOLOR_REG].rgb + vec3<f32>(vb_live), registers[OCOLOR_REG].a);
|
||||
return out;
|
||||
}
|
||||
@@ -962,6 +990,9 @@ fn walk_cf_vs() {
|
||||
// No call stack — mark and continue.
|
||||
reject_mask |= REJECT_CF_CALL;
|
||||
}
|
||||
case CF_KIND_NOP: {
|
||||
// kNop padding / kMarkVsFetchDone hint — no-op, just advance.
|
||||
}
|
||||
default: { reject_mask |= REJECT_CF_JUMP; }
|
||||
}
|
||||
if stop { break; }
|
||||
|
||||
@@ -94,7 +94,9 @@ struct XenosDrawConstants {
|
||||
draw_index: u32,
|
||||
vertex_count: u32,
|
||||
prim_kind: u32,
|
||||
_pad: u32,
|
||||
vertex_base_dwords: u32,
|
||||
ndc_scale: vec2<f32>,
|
||||
ndc_offset: vec2<f32>,
|
||||
};
|
||||
|
||||
struct XenosConstants {
|
||||
@@ -113,9 +115,21 @@ struct XenosConstants {
|
||||
@group(1) @binding(0) var xenos_tex : texture_2d<f32>;
|
||||
@group(1) @binding(1) var xenos_samp : sampler;
|
||||
|
||||
// iterate-3T: real interpolator passthrough. The Xenos VS exports up to 16
|
||||
// interpolators (export index 0..15); the PS reads interpolator i from its
|
||||
// general register r[i]. We carry 8 interpolator vec4s (covers Sylpheed's
|
||||
// splash: r0=color, r1=texcoord). `color` retained as an alias of interp0 so
|
||||
// older single-color paths keep working.
|
||||
struct VsOut {
|
||||
@builtin(position) position: vec4<f32>,
|
||||
@location(0) color: vec4<f32>,
|
||||
@location(0) interp0: vec4<f32>,
|
||||
@location(1) interp1: vec4<f32>,
|
||||
@location(2) interp2: vec4<f32>,
|
||||
@location(3) interp3: vec4<f32>,
|
||||
@location(4) interp4: vec4<f32>,
|
||||
@location(5) interp5: vec4<f32>,
|
||||
@location(6) interp6: vec4<f32>,
|
||||
@location(7) interp7: vec4<f32>,
|
||||
};
|
||||
|
||||
struct FsOut {
|
||||
@@ -154,6 +168,14 @@ struct EmitCtx {
|
||||
stage: Stage,
|
||||
out: String,
|
||||
indent: usize,
|
||||
/// GPUBUG-114: dword stride of the most recent *full* vfetch, keyed by
|
||||
/// fetch-const register offset. A vfetch_mini carries stride=0 and reuses
|
||||
/// the address + stride of the preceding full vfetch of the same stream
|
||||
/// (canary ucode.h:733). Without this a mini color attribute indexes by its
|
||||
/// tight dword count instead of the real vertex stride → reads the wrong
|
||||
/// vertex's data (Sylpheed's background fill `0x36660986` read garbage →
|
||||
/// white instead of the intended color).
|
||||
last_full_stride: std::collections::HashMap<u32, u32>,
|
||||
}
|
||||
|
||||
impl EmitCtx {
|
||||
@@ -162,6 +184,7 @@ impl EmitCtx {
|
||||
stage,
|
||||
out: String::with_capacity(2048),
|
||||
indent: 0,
|
||||
last_full_stride: std::collections::HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,19 +221,74 @@ impl EmitCtx {
|
||||
self.push("var ps: f32 = 0.0;");
|
||||
match self.stage {
|
||||
Stage::Vertex => {
|
||||
// iterate-3T: host→guest vertex-index remap for primitives the
|
||||
// replay draws non-indexed as a flat triangle list. wgpu has no
|
||||
// QuadList/RectangleList topology, so the host issues 6 vertices
|
||||
// per quad/rect and we map them back to the guest's 4/3 source
|
||||
// vertices here (mirrors `primitive.rs` index rewrite, but in the
|
||||
// VS since the replay path is non-indexed):
|
||||
// QuadList(13): 6 host verts → guest [0,1,2, 0,2,3]
|
||||
// RectangleList(8): drawn as one triangle [0,1,2] (the 4th
|
||||
// corner needs cross-vertex synthesis — TODO). Note the remap below,
|
||||
// `(vidx / 3) * 3 + vidx % 3`, is currently the identity (gvidx == vidx).
|
||||
// Other prims pass through unchanged.
|
||||
self.push("var gvidx: u32 = vidx;");
|
||||
self.push("if (draw_ctx.prim_kind == 13u) {");
|
||||
self.indent += 1;
|
||||
self.push("let q = vidx % 6u; let qbase = (vidx / 6u) * 4u;");
|
||||
self.push("var lut = array<u32, 6>(0u, 1u, 2u, 0u, 2u, 3u);");
|
||||
self.push("gvidx = qbase + lut[q];");
|
||||
self.indent -= 1;
|
||||
self.push("} else if (draw_ctx.prim_kind == 8u) {");
|
||||
self.indent += 1;
|
||||
self.push("let t = vidx % 3u; let rbase = (vidx / 3u) * 3u;");
|
||||
self.push("gvidx = rbase + t;");
|
||||
self.indent -= 1;
|
||||
self.push("}");
|
||||
// Seed r0 with vertex index for simple shaders that read it.
|
||||
self.push("r[0] = vec4<f32>(f32(vidx), 0.0, 0.0, 1.0);");
|
||||
// Synthetic export slots — match the interpreter's layout so
|
||||
// the fallback path and translator path produce the same
|
||||
// visual output on shaders both support.
|
||||
self.push("r[0] = vec4<f32>(f32(gvidx), 0.0, 0.0, 1.0);");
|
||||
// iterate-3T: real export model. Xenos export index 62 = oPos;
|
||||
// indices 0..15 = interpolators. We hold position + 8
|
||||
// interpolator vec4s; `emit_export` writes the right slot keyed
|
||||
// on the export index.
|
||||
//
|
||||
// iterate-3AE (WHITE-TRIANGLE ROOT): interpolators a VS does NOT
|
||||
// export must default to ZERO, not white. The old `ointerp[0] =
|
||||
// (1,1,1,1)` was an iterate-3T debug convenience ("so a VS that
|
||||
// only exports position still yields a visible non-zero color")
|
||||
// — but it is a FAKE: it injects white that no guest value backs.
|
||||
// The transition/background draws use the position-only VS
|
||||
// `0xd4c14f46` (one vfetch → oPos; it exports NO color) paired
|
||||
// with PS `0xed732b5a` (`ocolor0 = interp0`). With the white
|
||||
// seed, interp0 stayed (1,1,1,1) → the fullscreen fill rendered
|
||||
// OPAQUE WHITE (the diagonal half-triangle artifact that flashed
|
||||
// before each splash logo and persisted across the dev-logo
|
||||
// transition). Canary shows a black background there because the
|
||||
// un-exported interpolator carries no white. Default to
|
||||
// (0,0,0,0): a position-only VS now contributes nothing visible
|
||||
// under its real (opaque or premultiplied) blend, matching
|
||||
// canary, while every VS that really exports interp0 (the logo
|
||||
// `0x03b7b020`, the `0x36660986` color fill) overwrites this seed
|
||||
// and is unaffected. RGB=0 → black fill; A=0 → premultiplied
|
||||
// overlays stay transparent.
|
||||
self.push("var opos: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);");
|
||||
self.push("var ocolor: vec4<f32> = vec4<f32>(1.0, 1.0, 1.0, 1.0);");
|
||||
self.push("var ointerp: array<vec4<f32>, 8>;");
|
||||
self.push("for (var i = 0u; i < 8u; i = i + 1u) { ointerp[i] = vec4<f32>(0.0, 0.0, 0.0, 0.0); }");
|
||||
}
|
||||
Stage::Pixel => {
|
||||
// Seed r0.xy with interpolated color lane so trivial shaders
|
||||
// that read r0 still produce something.
|
||||
self.push("r[0] = in.color;");
|
||||
self.push("var ocolor0: vec4<f32> = in.color;");
|
||||
// iterate-3T: the PS reads interpolator i from general register
|
||||
// r[i] (Xenos PS input GPR mapping). Seed r0..r7 from the VS's
|
||||
// interpolators so e.g. the logo PS's texcoord (r1) and color
|
||||
// (r0) arrive correctly; tfetch then samples at the real UV.
|
||||
self.push("r[0] = in.interp0;");
|
||||
self.push("r[1] = in.interp1;");
|
||||
self.push("r[2] = in.interp2;");
|
||||
self.push("r[3] = in.interp3;");
|
||||
self.push("r[4] = in.interp4;");
|
||||
self.push("r[5] = in.interp5;");
|
||||
self.push("r[6] = in.interp6;");
|
||||
self.push("r[7] = in.interp7;");
|
||||
self.push("var ocolor0: vec4<f32> = in.interp0;");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -237,6 +315,10 @@ impl EmitCtx {
|
||||
current_alloc = *kind;
|
||||
}
|
||||
ControlFlowInstruction::Exit => break,
|
||||
// Non-executing CF clauses: padding (`kNop`) and the
|
||||
// vertex-fetch-done hint (`kMarkVsFetchDone`). Skip them.
|
||||
ControlFlowInstruction::Nop
|
||||
| ControlFlowInstruction::MarkVsFetchDone => {}
|
||||
ControlFlowInstruction::LoopStart { .. }
|
||||
| ControlFlowInstruction::LoopEnd { .. } => return Err(reject::CF_LOOP),
|
||||
ControlFlowInstruction::CondJmp { .. } => return Err(reject::CF_COND),
|
||||
@@ -250,13 +332,41 @@ impl EmitCtx {
|
||||
match self.stage {
|
||||
Stage::Vertex => {
|
||||
self.push("var out: VsOut;");
|
||||
// iterate-3S: guest VS position → host clip space. The guest
|
||||
// emits either clip-space or (screen-space, clip disabled)
|
||||
// render-target-pixel coords; `ndc_scale`/`ndc_offset` (from
|
||||
// canary's GetHostViewportInfo, computed CPU-side per draw)
|
||||
// rescale XY into wgpu clip space with Y already flipped. When
|
||||
// the transform is unset (all-zero scale, procedural fallback)
|
||||
// pass the position through unchanged.
|
||||
self.push("if (draw_ctx.ndc_scale.x != 0.0 || draw_ctx.ndc_scale.y != 0.0) {");
|
||||
self.indent += 1;
|
||||
self.push("opos = vec4<f32>(opos.xy * draw_ctx.ndc_scale + draw_ctx.ndc_offset * opos.w, opos.z, opos.w);");
|
||||
self.indent -= 1;
|
||||
self.push("}");
|
||||
self.push("out.position = opos;");
|
||||
self.push("out.color = ocolor;");
|
||||
self.push("out.interp0 = ointerp[0];");
|
||||
self.push("out.interp1 = ointerp[1];");
|
||||
self.push("out.interp2 = ointerp[2];");
|
||||
self.push("out.interp3 = ointerp[3];");
|
||||
self.push("out.interp4 = ointerp[4];");
|
||||
self.push("out.interp5 = ointerp[5];");
|
||||
self.push("out.interp6 = ointerp[6];");
|
||||
self.push("out.interp7 = ointerp[7];");
|
||||
self.push("return out;");
|
||||
}
|
||||
Stage::Pixel => {
|
||||
self.push("var out: FsOut;");
|
||||
self.push("out.color0 = ocolor0;");
|
||||
// GPUBUG-115: saturate the color export to [0,1], flushing NaN
|
||||
// to 0 — exactly what canary does before writing a UNORM render
|
||||
// target (spirv_shader_translator.cc:3607 "Saturate, flushing
|
||||
// NaN to 0"). The Xenos RB clamps PS output for UNORM targets;
|
||||
// without this an out-of-range guest color (Sylpheed's
|
||||
// background fill exports a huge negative float `-32896.5` as a
|
||||
// fullscreen-clear value) writes garbage/NaN to the sRGB target
|
||||
// → renders white instead of the clamped black canary shows.
|
||||
// NOTE(review): `clamp(x,0,1)` yields 0 for NaN only if the backend lowers it to NaN-dropping min/max; WGSL does not guarantee NaN behaviour — confirm per backend.
|
||||
self.push("out.color0 = clamp(ocolor0, vec4<f32>(0.0), vec4<f32>(1.0));");
|
||||
self.push("return out;");
|
||||
}
|
||||
}
|
||||
@@ -284,7 +394,9 @@ impl EmitCtx {
|
||||
parsed.instructions[base + 1],
|
||||
parsed.instructions[base + 2],
|
||||
];
|
||||
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
|
||||
// sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0),
|
||||
// bit[1]=serialize (Xenos `ucode.h:226`).
|
||||
let is_fetch = ((sequence >> (i * 2)) & 1) != 0;
|
||||
if is_fetch {
|
||||
match decode_fetch(words) {
|
||||
FetchInstruction::Vertex(vf) => self.emit_vfetch(&vf)?,
|
||||
@@ -378,53 +490,185 @@ impl EmitCtx {
|
||||
}
|
||||
|
||||
fn emit_export(&mut self, dst_reg: u8, alloc: AllocKind, expr: &str, mask: u8) {
|
||||
// Xenos's export "register" indexing within an alloc range is
|
||||
// normally (alloc_base + offset). Since our CF stream doesn't
|
||||
// carry per-export slot offsets cleanly, use `alloc` to pick the
|
||||
// target.
|
||||
let lhs = match (self.stage, alloc) {
|
||||
(Stage::Vertex, AllocKind::Position) => "opos",
|
||||
(Stage::Vertex, AllocKind::Interpolators) => "ocolor",
|
||||
(Stage::Vertex, AllocKind::Colors) => "ocolor",
|
||||
(Stage::Vertex, _) => "ocolor", // fall through — any other alloc
|
||||
(Stage::Pixel, AllocKind::Colors) => "ocolor0",
|
||||
(Stage::Pixel, _) => "ocolor0",
|
||||
};
|
||||
let _ = dst_reg; // per-slot export indexing reserved for a richer v2
|
||||
self.emit_masked_write(lhs, expr, mask);
|
||||
// iterate-3T: real Xenos export-index model (replaces the `AllocKind`
|
||||
// heuristic, which collapsed every VS export to a single color slot and
|
||||
// dropped the texcoord interpolator → tfetch sampled (0,0) → flat).
|
||||
// When `export_data` is set the 6-bit vector_dest IS the export index:
|
||||
// VS: 62 = oPos, 63 = oPointSize/edge (ignored), 0..15 = interpolators.
|
||||
// PS: 0..3 = color render targets (we honor RT0).
|
||||
let _ = alloc;
|
||||
match self.stage {
|
||||
Stage::Vertex => {
|
||||
let lhs = if dst_reg == 62 {
|
||||
"opos".to_string()
|
||||
} else if dst_reg <= 15 {
|
||||
// Clamp to the 8 interpolator slots we carry; higher slots
|
||||
// are unused by Sylpheed's splash.
|
||||
let i = (dst_reg as usize).min(7);
|
||||
format!("ointerp[{i}u]")
|
||||
} else {
|
||||
// oPointSize (63) / unknown export slot — discard.
|
||||
return;
|
||||
};
|
||||
self.emit_masked_write(&lhs, expr, mask);
|
||||
}
|
||||
Stage::Pixel => {
|
||||
// Only RT0 (export index 0) is wired to the single host target.
|
||||
if dst_reg == 0 {
|
||||
self.emit_masked_write("ocolor0", expr, mask);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn emit_vfetch(&mut self, vf: &crate::ucode::fetch::VertexFetch) -> Result<(), &'static str> {
|
||||
// v1: treat all vertex fetches as R32G32B32A32_FLOAT, stride = 4
|
||||
// dwords. Matches the interpreter's MVP semantics; unlocks more
|
||||
// formats alongside the CPU texture cache's format expansion.
|
||||
// GPUBUG-107 (iterate-3S): decode the vertex FORMAT + dword STRIDE from
|
||||
// the vfetch instruction instead of hardcoding R32G32B32A32 (4 floats,
|
||||
// stride 4). Sylpheed's splash quads are `k_32_32_FLOAT` (2 floats,
|
||||
// stride 2); over-reading them put the next vertex's X into .w → a
|
||||
// negative W → the whole rectangle clipped behind the camera. We cover
|
||||
// the float vertex formats (the UI / screen-space draws); other formats
|
||||
// reject to the interpreter.
|
||||
//
|
||||
// GPUBUG-102: the fetch constant (xe_gpu_vertex_fetch_t,
|
||||
// xenos.h:1158-1172) holds the endian field in dword_1's low
|
||||
// 2 bits. Vertex data on Xbox 360 is big-endian; the host is
|
||||
// little-endian. Pre-fix, every dword was bitcast as-is →
|
||||
// vertex positions were byte-reversed garbage and any draw
|
||||
// that did reach the host produced clipped / NaN positions.
|
||||
let fetch_const = (vf.raw[0] >> 5) & 0x1F;
|
||||
// GPUBUG-102: the fetch constant holds the endian field in dword_1's
|
||||
// low 2 bits; Xbox 360 vertex data is big-endian, so `gpu_swap` undoes
|
||||
// it per component.
|
||||
// (comps, dwords_read) per format. Float formats are 1 dword/component;
|
||||
// iterate-3T adds the packed-16 `k_16_16` (format 25) used for the logo
|
||||
// UV interpolator — 2 components packed into ONE dword.
|
||||
#[derive(PartialEq)]
|
||||
enum Pack {
|
||||
Float, // N f32 lanes, N dwords
|
||||
Norm16x2, // 2× u16 normalized into [0,1], 1 dword (k_16_16)
|
||||
Norm8x4, // 4× u8 normalized into [0,1], 1 dword (k_8_8_8_8)
|
||||
}
|
||||
let (comps, dwords_read, pack): (u32, u32, Pack) = match vf.format {
|
||||
36 => (1, 1, Pack::Float), // k_32_FLOAT
|
||||
37 => (2, 2, Pack::Float), // k_32_32_FLOAT
|
||||
57 => (3, 3, Pack::Float), // k_32_32_32_FLOAT
|
||||
38 => (4, 4, Pack::Float), // k_32_32_32_32_FLOAT
|
||||
6 => (4, 1, Pack::Norm8x4), // k_8_8_8_8 (packed RGBA8 — GPUBUG-112)
|
||||
25 => (2, 1, Pack::Norm16x2), // k_16_16
|
||||
_ => return Err(reject::VFETCH_FMT),
|
||||
};
|
||||
// iterate-3X (GPUBUG-110): index the fetch-constant region by the full
|
||||
// `const_index*3 + const_index_sel` mapping (canary `ucode.h:700`),
|
||||
// packed as `const_index*6 + sel*2` dwords. The previous expression
|
||||
// `(vf.raw[0] >> 5) & 0x1F` read the *src_reg* bits, not the const
|
||||
// index — wrong for the endian term and the no-window fallback base.
|
||||
let const_off = vf.const_reg_offset();
|
||||
// GPUBUG-114: a full vfetch carries the real vertex dword stride; a
|
||||
// vfetch_mini reuses the address + stride of the preceding full vfetch
|
||||
// of the same stream (canary ucode.h:733). Track the last full stride
|
||||
// per fetch-const and inherit it for mini-fetches (stride field == 0).
|
||||
let stride = if vf.is_mini_fetch || vf.stride == 0 {
|
||||
*self
|
||||
.last_full_stride
|
||||
.get(&const_off)
|
||||
.unwrap_or(&dwords_read)
|
||||
} else {
|
||||
self.last_full_stride.insert(const_off, vf.stride as u32);
|
||||
vf.stride as u32
|
||||
};
|
||||
// iterate-3T: per-attribute dword offset within the vertex (vfetches
|
||||
// sharing one fetch constant read different attributes).
|
||||
let attr_off = vf.offset;
|
||||
let src_reg = vf.src_register & 0x7F;
|
||||
let dst_reg = vf.dest_register & 0x7F;
|
||||
// is_signed selects [-1,1] vs [0,1] for normalized integer formats.
|
||||
let signed = vf.is_signed;
|
||||
// Build the per-component reads; unread lanes default to 0/0/0/1 so an
|
||||
// XY-only position keeps W=1 (and Z=0).
|
||||
let lane = |i: u32| -> String {
|
||||
match pack {
|
||||
Pack::Float => {
|
||||
if i < comps {
|
||||
format!("bitcast<f32>(gpu_swap(vertex_buffer[addr + {i}u], endian))")
|
||||
} else if i == 3 {
|
||||
"1.0".to_string()
|
||||
} else {
|
||||
"0.0".to_string()
|
||||
}
|
||||
}
|
||||
Pack::Norm16x2 => {
|
||||
// One dword holds [u16 lo | u16 hi] after the endian swap.
|
||||
// Component 0 = low halfword, component 1 = high halfword.
|
||||
if i == 0 {
|
||||
if signed {
|
||||
"(max(f32(i32(w16 << 16u) >> 16u) / 32767.0, -1.0))".to_string()
|
||||
} else {
|
||||
"(f32(w16 & 0xFFFFu) / 65535.0)".to_string()
|
||||
}
|
||||
} else if i == 1 {
|
||||
if signed {
|
||||
"(max(f32(i32(w16) >> 16u) / 32767.0, -1.0))".to_string()
|
||||
} else {
|
||||
"(f32(w16 >> 16u) / 65535.0)".to_string()
|
||||
}
|
||||
} else if i == 3 {
|
||||
"1.0".to_string()
|
||||
} else {
|
||||
"0.0".to_string()
|
||||
}
|
||||
}
|
||||
Pack::Norm8x4 => {
|
||||
// One dword holds 4× u8 (canary spirv_shader_translator_fetch
|
||||
// k_8_8_8_8: comp0@bit0, comp1@bit8, comp2@bit16, comp3@bit24)
|
||||
// after the endian swap. All four channels present → normalize
|
||||
// to [0,1]. GPUBUG-112: this is the logo/background vertex
|
||||
// COLOR (RGBA8), previously misdecoded as k_16_16 (2 chans,
|
||||
// B forced 0) → white texture × (R,G,0) = yellow.
|
||||
let sh = i * 8;
|
||||
if signed {
|
||||
format!(
|
||||
"(max(f32(i32(w16 << {l}u) >> 24u) / 127.0, -1.0))",
|
||||
l = 24 - sh
|
||||
)
|
||||
} else {
|
||||
format!("(f32((w16 >> {sh}u) & 0xFFu) / 255.0)")
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
let read_bound = dwords_read - 1;
|
||||
// GPUBUG-108 (iterate-3S): for the captured-geometry path the CPU
|
||||
// uploads a vertex window that begins EXACTLY at the fetch base, so the
|
||||
// base within `vertex_buffer` is 0 and vertex i sits at `i * stride`.
|
||||
// The previous `abs_base - vertex_base_dwords` rebase recomputed the
|
||||
// base from `xenos_consts.fetch[]`, but that uniform carries the
|
||||
// *last-published* (per-frame) fetch constant, not this draw's — for
|
||||
// the splash it was stale (0x8a000002 vs the real 0x0adf… base), so the
|
||||
// rebase produced a huge out-of-window address, the bounds guard
|
||||
// failed, and every vertex kept its seed (vertex_index, 0, 0, 1) →
|
||||
// every quad collapsed to ~one pixel at the origin. Index from 0 when a
|
||||
// real window is present (`vertex_base_dwords != 0`); only the
|
||||
// synthetic/no-window fallback consults the uniform fetch constant.
|
||||
let endian_term = format!("xenos_consts.fetch[{}u] & 0x3u", const_off + 1);
|
||||
// For packed formats (k_16_16, k_8_8_8_8) we read one dword into `w16`
|
||||
// (post endian-swap) and the `lane()` exprs above unpack the channels.
|
||||
let w16_decl = if pack == Pack::Norm16x2 || pack == Pack::Norm8x4 {
|
||||
"let w16 = gpu_swap(vertex_buffer[addr], endian); "
|
||||
} else {
|
||||
""
|
||||
};
|
||||
self.push(&format!(
|
||||
"{{ let fc0 = xenos_consts.fetch[{fc0_idx}u]; \
|
||||
let fc1 = xenos_consts.fetch[{fc1_idx}u]; \
|
||||
let endian = fc1 & 0x3u; \
|
||||
let base = (fc0 & 0xFFFFFFFCu) >> 2u; \
|
||||
"{{ let endian = {endian_term}; \
|
||||
let vidx = u32(r[{src_reg}u].x); \
|
||||
let addr = base + vidx * 4u; \
|
||||
var base = 0u; \
|
||||
if (draw_ctx.vertex_base_dwords == 0u) {{ \
|
||||
base = (xenos_consts.fetch[{fc0_idx}u] & 0xFFFFFFFCu) >> 2u; \
|
||||
}} \
|
||||
let addr = base + vidx * {stride}u + {attr_off}u; \
|
||||
let n = arrayLength(&vertex_buffer); \
|
||||
if (addr + 3u < n) {{ \
|
||||
r[{dst_reg}u] = vec4<f32>( \
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 0u], endian)), \
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 1u], endian)), \
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 2u], endian)), \
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 3u], endian))); \
|
||||
if (addr + {read_bound}u < n) {{ \
|
||||
{w16_decl}\
|
||||
r[{dst_reg}u] = vec4<f32>({l0}, {l1}, {l2}, {l3}); \
|
||||
}} }}",
|
||||
fc0_idx = fetch_const * 2,
|
||||
fc1_idx = fetch_const * 2 + 1,
|
||||
fc0_idx = const_off,
|
||||
l0 = lane(0),
|
||||
l1 = lane(1),
|
||||
l2 = lane(2),
|
||||
l3 = lane(3),
|
||||
));
|
||||
Ok(())
|
||||
}
|
||||
@@ -477,6 +721,22 @@ fn src_operand(src_byte: u8, is_temp: bool, swizzle: u8, negate: bool) -> String
|
||||
}
|
||||
|
||||
fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option<String> {
|
||||
// Semantics mirror the runtime interpreter's `exec_vector_op`
|
||||
// (`shaders/xenos_interp.wgsl`), which in turn mirrors canary's
|
||||
// `AluVectorOpcode` (ucode.h:1001+). Side-effecting ops (kill*, setp_push)
|
||||
// need per-invocation state the AOT emitter doesn't track yet → still
|
||||
// `None` (interpreter fallback).
|
||||
let cmp4 = |op: &str| {
|
||||
format!(
|
||||
"vec4<f32>(select(0.0,1.0,{a}.x{op}{b}.x), select(0.0,1.0,{a}.y{op}{b}.y), select(0.0,1.0,{a}.z{op}{b}.z), select(0.0,1.0,{a}.w{op}{b}.w))"
|
||||
)
|
||||
};
|
||||
// CND* : per-lane select(c, b, a <cmp> 0).
|
||||
let cnd4 = |op: &str| {
|
||||
format!(
|
||||
"vec4<f32>(select({c}.x,{b}.x,{a}.x{op}0.0), select({c}.y,{b}.y,{a}.y{op}0.0), select({c}.z,{b}.z,{a}.z{op}0.0), select({c}.w,{b}.w,{a}.w{op}0.0))"
|
||||
)
|
||||
};
|
||||
let s = match op {
|
||||
vop::ADD => format!("({a} + {b})"),
|
||||
vop::MUL => format!("({a} * {b})"),
|
||||
@@ -485,37 +745,63 @@ fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option<String> {
|
||||
vop::MAD => format!("({a} * {b} + {c})"),
|
||||
vop::DOT4 => format!("vec4<f32>(dot({a}, {b}))"),
|
||||
vop::DOT3 => format!("vec4<f32>(dot({a}.xyz, {b}.xyz))"),
|
||||
vop::DOT2_ADD => format!(
|
||||
"vec4<f32>({a}.x * {b}.x + {a}.y * {b}.y + {c}.x)"
|
||||
),
|
||||
vop::SEQ => format!(
|
||||
"vec4<f32>(select(0.0,1.0,{a}.x=={b}.x), select(0.0,1.0,{a}.y=={b}.y), select(0.0,1.0,{a}.z=={b}.z), select(0.0,1.0,{a}.w=={b}.w))"
|
||||
),
|
||||
vop::SGT => format!(
|
||||
"vec4<f32>(select(0.0,1.0,{a}.x>{b}.x), select(0.0,1.0,{a}.y>{b}.y), select(0.0,1.0,{a}.z>{b}.z), select(0.0,1.0,{a}.w>{b}.w))"
|
||||
),
|
||||
vop::SGE => format!(
|
||||
"vec4<f32>(select(0.0,1.0,{a}.x>={b}.x), select(0.0,1.0,{a}.y>={b}.y), select(0.0,1.0,{a}.z>={b}.z), select(0.0,1.0,{a}.w>={b}.w))"
|
||||
),
|
||||
vop::SNE => format!(
|
||||
"vec4<f32>(select(0.0,1.0,{a}.x!={b}.x), select(0.0,1.0,{a}.y!={b}.y), select(0.0,1.0,{a}.z!={b}.z), select(0.0,1.0,{a}.w!={b}.w))"
|
||||
),
|
||||
vop::DOT2_ADD => format!("vec4<f32>({a}.x * {b}.x + {a}.y * {b}.y + {c}.x)"),
|
||||
vop::SEQ => cmp4("=="),
|
||||
vop::SGT => cmp4(">"),
|
||||
vop::SGE => cmp4(">="),
|
||||
vop::SNE => cmp4("!="),
|
||||
vop::CND_EQ => cnd4("=="),
|
||||
vop::CND_GE => cnd4(">="),
|
||||
vop::CND_GT => cnd4(">"),
|
||||
vop::FRC => format!("fract({a})"),
|
||||
vop::TRUNC => format!("trunc({a})"),
|
||||
vop::FLOOR => format!("floor({a})"),
|
||||
vop::MAX4 => format!("vec4<f32>(max(max({a}.x,{a}.y), max({a}.z,{a}.w)))"),
|
||||
// dst = (1, src0.y*src1.y, src0.z, src1.w) (canary kDst)
|
||||
vop::DST => format!("vec4<f32>(1.0, {a}.y * {b}.y, {a}.z, {b}.w)"),
|
||||
_ => return None,
|
||||
};
|
||||
Some(s)
|
||||
}
|
||||
|
||||
fn scalar_expr(op: u8, a: &str, b: &str, prev: &str) -> Option<String> {
|
||||
// Semantics mirror the runtime interpreter's `exec_scalar_op`
|
||||
// (`shaders/xenos_interp.wgsl`) / canary's `AluScalarOpcode`
|
||||
// (ucode.h:1001+). Side-effecting ops (setp*, kills*, maxas*) need
|
||||
// per-invocation predicate/kill/address state the AOT emitter doesn't
|
||||
// track yet → still `None` (interpreter fallback).
|
||||
let s = match op {
|
||||
sop::ADDS => format!("({a} + {b})"),
|
||||
sop::ADDS_PREV => format!("({a} + {prev})"),
|
||||
sop::MULS => format!("({a} * {b})"),
|
||||
sop::MULS_PREV => format!("({a} * {prev})"),
|
||||
// muls_prev2 / LIT emulation (canary kMulsPrev2): guard against
|
||||
// -FLT_MAX / non-finite ps & b, and b <= 0.
|
||||
sop::MULS_PREV2 => format!(
|
||||
"select({a} * {prev}, -3.4028235e38, {prev} == -3.4028235e38 || !(\
|
||||
{prev} == {prev}) || abs({prev}) > 3.4028235e38 || !({b} == {b}) || \
|
||||
abs({b}) > 3.4028235e38 || {b} <= 0.0)"
|
||||
),
|
||||
sop::MAXS => format!("max({a}, {b})"),
|
||||
sop::MINS => format!("min({a}, {b})"),
|
||||
sop::RCP => format!("xe_rcp({a})"),
|
||||
sop::SEQS => format!("select(0.0, 1.0, {a} == 0.0)"),
|
||||
sop::SGTS => format!("select(0.0, 1.0, {a} > 0.0)"),
|
||||
sop::SGES => format!("select(0.0, 1.0, {a} >= 0.0)"),
|
||||
sop::SNES => format!("select(0.0, 1.0, {a} != 0.0)"),
|
||||
sop::FRCS => format!("fract({a})"),
|
||||
sop::TRUNCS => format!("trunc({a})"),
|
||||
sop::FLOORS => format!("floor({a})"),
|
||||
sop::SUBS => format!("({a} - {b})"),
|
||||
sop::SUBS_PREV => format!("({a} - {prev})"),
|
||||
sop::EXP => format!("exp2({a})"),
|
||||
sop::LOG | sop::LOGC => format!("select(log2({a}), 0.0, {a} == 1.0)"),
|
||||
sop::RCP | sop::RCPC | sop::RCPF => format!("xe_rcp({a})"),
|
||||
sop::RSQ | sop::RSQC | sop::RSQF => {
|
||||
format!("select(0.0, inverseSqrt({a}), {a} > 0.0)")
|
||||
}
|
||||
sop::SQRT => format!("select(0.0, sqrt({a}), {a} >= 0.0)"),
|
||||
sop::SIN => format!("sin({a})"),
|
||||
sop::COS => format!("cos({a})"),
|
||||
sop::RETAIN_PREV => prev.to_string(),
|
||||
_ => return None,
|
||||
};
|
||||
@@ -528,17 +814,68 @@ mod tests {
|
||||
use crate::ucode::alu::{sop, vop};
|
||||
use crate::ucode::control_flow::ControlFlowInstruction;
|
||||
|
||||
/// iterate-3T: the real publisher-logo VS (`vs_key 0x03b7b020`, captured
|
||||
/// from the live boot) must now TRANSLATE — pre-3T it rejected with
|
||||
/// `vfetch_fmt` because (a) the format-6 color stream was unsupported (it was
|
||||
/// first labelled `k_16_16`; GPUBUG-112 later decoded it as `k_8_8_8_8`) and
|
||||
/// (b) exports used an AllocKind heuristic instead of the export index
|
||||
/// (62=oPos, 0/1=interpolators). This locks in format-6, per-attribute-offset
|
||||
/// and export-index handling so the UV reaches the PS (texcoord in r1) rather than collapsing to one color.
|
||||
#[test]
|
||||
fn real_logo_vs_translates_with_interpolators() {
|
||||
let ucode: [u32; 30] = [
|
||||
0x70153003, 0x00001200, 0xC2000000, 0x00001006, 0x00001200, 0xC4000000,
|
||||
0x00002007, 0x00002200, 0x00000000, 0x2DF82000, 0x00393A88, 0x00000006,
|
||||
0x05F81000, 0x4006060A, 0x00000306, 0x05F80000, 0x40253FC8, 0x00000406,
|
||||
0xC80F803E, 0x00000000, 0xC2020200, 0xC8038001, 0x00B0B000, 0xC2000000,
|
||||
0xC80F8000, 0x00000000, 0xC2010100, 0x00000000, 0x00000000, 0x00000000,
|
||||
];
|
||||
let p = crate::ucode::parse_shader(&ucode);
|
||||
let body = match translate(&p, Stage::Vertex) {
|
||||
Translation::Ok(b) => b,
|
||||
Translation::Reject(r) => panic!("logo VS rejected: {r}"),
|
||||
};
|
||||
// Position must come from the export-index-62 path (`opos`) and the
|
||||
|
|
||||
// UV/color interpolators must be exported as distinct slots.
|
||||
assert!(body.contains("opos ="), "no position export: {body}");
|
||||
assert!(body.contains("ointerp[0u]"), "no interp0 export: {body}");
|
||||
assert!(body.contains("ointerp[1u]"), "no interp1 export: {body}");
|
||||
// The packed format-6 attribute must unpack via the shared packed dword `w16`.
|
||||
assert!(body.contains("w16"), "no packed-16 unpack for k_16_16: {body}");
|
||||
}
|
||||
|
||||
/// The logo pixel shader (`ps_key 0x03b79001`) samples its texture at the
|
||||
/// interpolated texcoord register r1 — which the PS now seeds from the VS
|
||||
/// interpolator `in.interp1` (Xenos PS-input-GPR mapping). Verifies the UV
|
||||
/// chain so tfetch samples the real UV instead of (0,0).
|
||||
#[test]
|
||||
fn ps_seeds_interpolators_into_registers() {
|
||||
// An empty PS (a lone `Exit`, no instructions) — we only assert the preamble wiring.
|
||||
let p = crate::ucode::ParsedShader {
|
||||
cf: vec![ControlFlowInstruction::Exit],
|
||||
instructions: vec![],
|
||||
};
|
||||
let body = match translate(&p, Stage::Pixel) {
|
||||
Translation::Ok(b) => b,
|
||||
Translation::Reject(r) => panic!("trivial PS rejected: {r}"),
|
||||
};
|
||||
assert!(body.contains("r[1] = in.interp1;"), "PS must seed r1 from interp1: {body}");
|
||||
}
|
||||
|
||||
fn synthetic_trivial_shader() -> ParsedShader {
|
||||
// Single Exec clause: ALU add r0 = r0 + r0; scalar_op = RETAIN_PREV
|
||||
// with full write-mask on vector, zero on scalar. Alloc(Position)
|
||||
// precedes so the ALU's export (if it were one) would target oPos.
|
||||
// Word-0 bits 29-31 set so all three operands resolve as temps —
|
||||
// matches the prior assertion `r[0u] = (r[0u] + r[0u])`.
|
||||
let w0 = (1u32 << 29) | (1u32 << 30) | (1u32 << 31);
|
||||
let w2 = (vop::ADD as u32)
|
||||
| ((sop::RETAIN_PREV as u32) << 6)
|
||||
| (0xF << 12) // vector_write_mask
|
||||
| (0u32 << 16); // vector_dest = 0
|
||||
// GPUBUG-106 canary layout: dest/mask/scalar_opc in w0; vector_opc +
|
||||
// src_sel in w2. All three operands temps → r0.
|
||||
let w0 = (0u32) // vector_dest = 0
|
||||
| (0xFu32 << 16) // vector_write_mask = 0xF
|
||||
| ((sop::RETAIN_PREV as u32) << 26); // scalar_opc
|
||||
let w1 = 0u32;
|
||||
let w2 = ((vop::ADD as u32) << 24) // vector_opc
|
||||
| (1u32 << 31) // src1_sel = temp
|
||||
| (1u32 << 30) // src2_sel = temp
|
||||
| (1u32 << 29); // src3_sel = temp
|
||||
ParsedShader {
|
||||
cf: vec![
|
||||
ControlFlowInstruction::Alloc {
|
||||
@@ -554,7 +891,7 @@ mod tests {
|
||||
predicate_condition: false,
|
||||
},
|
||||
],
|
||||
instructions: vec![w0, 0, w2],
|
||||
instructions: vec![w0, w1, w2],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -642,19 +979,17 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn shader_using_c0_emits_xenos_consts_read() {
|
||||
// ALU: r0 = c0 + r0. src_a (low byte) is constant index 0;
|
||||
// src_b (next byte) is temp index 0. src_a_is_temp=false →
|
||||
// src1_sel-style bit at w0 bit 29 = 0; src_b_is_temp=true →
|
||||
// bit 30 = 1. (src_c left as 0/temp; unused.)
|
||||
let w0 = 0x00u32 // src_a = c0
|
||||
| (0x00u32 << 8) // src_b = r0
|
||||
| (0x00u32 << 16) // src_c
|
||||
| (0u32 << 29) // src_a_is_temp = false (constant)
|
||||
| (1u32 << 30); // src_b_is_temp = true (register)
|
||||
let w2 = (vop::ADD as u32)
|
||||
| ((sop::RETAIN_PREV as u32) << 6)
|
||||
| (0xF << 12)
|
||||
| (0u32 << 16);
|
||||
// ALU: r0 = c0 + r0. GPUBUG-106 canary layout. src_a = src1 (w2
|
||||
// 16:23), src_b = src2 (w2 8:15). src1_sel (w2 bit31) = 0 → c0;
|
||||
// src2_sel (w2 bit30) = 1 → r0.
|
||||
let w0 = (0u32) // vector_dest = 0
|
||||
| (0xFu32 << 16) // vector_write_mask
|
||||
| ((sop::RETAIN_PREV as u32) << 26); // scalar_opc
|
||||
let w2 = ((vop::ADD as u32) << 24) // vector_opc
|
||||
| (0u32 << 16) // src1_reg = 0 → c0
|
||||
| (0u32 << 8) // src2_reg = 0 → r0
|
||||
| (0u32 << 31) // src1_sel = 0 (constant)
|
||||
| (1u32 << 30); // src2_sel = 1 (temp)
|
||||
let shader = ParsedShader {
|
||||
cf: vec![
|
||||
ControlFlowInstruction::Alloc {
|
||||
@@ -695,9 +1030,16 @@ mod tests {
|
||||
let mut ctx = EmitCtx::new(Stage::Vertex);
|
||||
let vf = crate::ucode::fetch::VertexFetch {
|
||||
fetch_const: 0,
|
||||
const_index_sel: 0,
|
||||
src_register: 0,
|
||||
dest_register: 0,
|
||||
dest_write_mask: 0xF,
|
||||
format: 38, // k_32_32_32_32_FLOAT (4 floats)
|
||||
stride: 4,
|
||||
offset: 0,
|
||||
is_signed: false,
|
||||
is_normalized: true,
|
||||
is_mini_fetch: false,
|
||||
raw: [0; 3],
|
||||
};
|
||||
ctx.emit_vfetch(&vf).expect("emit_vfetch");
|
||||
@@ -705,6 +1047,70 @@ mod tests {
|
||||
assert!(body.contains("gpu_swap("), "emitted vfetch body: {body}");
|
||||
}
|
||||
|
||||
fn vf(format: u8, stride: u8, offset: u32, mini: bool) -> crate::ucode::fetch::VertexFetch { // test helper: VertexFetch varying only format/stride/offset/mini
|
||||
crate::ucode::fetch::VertexFetch {
|
||||
fetch_const: 0, // const_index 0 ...
|
||||
const_index_sel: 0, // ... sub-slot 0, so every fetch built here shares one fetch constant
|
||||
src_register: 0, // vertex index is read from r0
|
||||
dest_register: 0, // fetched value is written to r0
|
||||
dest_write_mask: 0xF, // all four lanes
|
||||
format,
|
||||
stride,
|
||||
offset,
|
||||
is_signed: false, // unsigned decode path
|
||||
is_normalized: true,
|
||||
is_mini_fetch: mini,
|
||||
raw: [0; 3], // zeroed raw instruction words
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vfetch_k8888_unpacks_four_channels() {
|
||||
// GPUBUG-112: VertexFormat 6 = k_8_8_8_8 (4× u8 normalized, 1 dword),
|
||||
// NOT k_16_16. All four channels (R,G,B,A) must be unpacked so a
|
||||
|
|
||||
// vertex COLOR keeps its blue channel (white texture × white color =
|
||||
// white, not yellow).
|
||||
let mut ctx = EmitCtx::new(Stage::Vertex);
|
||||
ctx.emit_vfetch(&vf(6, 6, 3, false)).expect("emit"); // format 6, stride 6, attribute offset 3, full (non-mini) fetch
|
||||
let body = ctx.finish();
|
||||
// `vf` builds an unsigned fetch, so each 8-bit channel is divided by 255.0:
|
||||
// expect exactly four such reads from the one packed dword `w16`.
|
||||
assert!(body.contains("let w16 ="), "needs packed dword: {body}");
|
||||
assert_eq!(body.matches("/ 255.0").count(), 4, "four 8-bit channels: {body}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vfetch_mini_inherits_full_stride() {
|
||||
// GPUBUG-114: a vfetch_mini (stride field 0) inherits the stride of the
|
||||
// preceding full vfetch of the same stream (canary ucode.h:733). Emit a full
|
||||
|
|
||||
// fetch (stride 7) then a mini fetch on the same fetch constant (both 0) and
|
||||
// assert the mini indexes by stride 7, not its tight dword count (4).
|
||||
let mut ctx = EmitCtx::new(Stage::Vertex);
|
||||
ctx.emit_vfetch(&vf(57, 7, 0, false)).expect("full"); // k_32_32_32_FLOAT, full fetch: records stride 7
|
||||
ctx.emit_vfetch(&vf(38, 0, 3, true)).expect("mini"); // k_32_32_32_32_FLOAT, mini: stride field 0, offset 3
|
||||
let body = ctx.finish();
|
||||
assert!(body.contains("vidx * 7u + 3u"), "mini must inherit stride 7: {body}");
|
||||
assert!(!body.contains("vidx * 4u"), "mini must not use tight stride 4: {body}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ps_color_export_is_saturated() {
|
||||
// GPUBUG-115: the PS color export must be clamped to [0,1] (canary
|
||||
// saturates before UNORM RT write) so an out-of-range guest color
|
||||
|
|
||||
// doesn't write garbage/white to the sRGB target.
|
||||
let p = crate::ucode::ParsedShader { // empty PS: a lone `Exit`, no instructions
|
||||
cf: vec![ControlFlowInstruction::Exit],
|
||||
instructions: vec![],
|
||||
};
|
||||
let body = match translate(&p, Stage::Pixel) {
|
||||
Translation::Ok(b) => b,
|
||||
Translation::Reject(r) => panic!("PS rejected: {r}"),
|
||||
};
|
||||
assert!(
|
||||
body.contains("clamp(ocolor0, vec4<f32>(0.0), vec4<f32>(1.0))"), // the clamp must be emitted even for an empty shader
|
||||
"PS must saturate color export: {body}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn loop_clause_rejected() {
|
||||
let shader = ParsedShader {
|
||||
@@ -722,9 +1128,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn unsupported_op_rejected() {
|
||||
let w2 = (29u32) // VOP_MAX_A, not in v1 subset
|
||||
| ((sop::RETAIN_PREV as u32) << 6)
|
||||
| (0xF << 12);
|
||||
// GPUBUG-106 layout: vector_write_mask in w0 (16:19), vector_opc in
|
||||
// w2 (24:28). MAX_A (29) is outside the supported subset → reject.
|
||||
let w0 = (0xFu32 << 16) | ((sop::RETAIN_PREV as u32) << 26);
|
||||
let w2 = (29u32) << 24; // VOP_MAX_A
|
||||
let shader = ParsedShader {
|
||||
cf: vec![ControlFlowInstruction::Exec {
|
||||
address: 0,
|
||||
@@ -734,7 +1141,7 @@ mod tests {
|
||||
predicated: false,
|
||||
predicate_condition: false,
|
||||
}],
|
||||
instructions: vec![0, 0, w2],
|
||||
instructions: vec![w0, 0, w2],
|
||||
};
|
||||
assert!(matches!(
|
||||
translate(&shader, Stage::Vertex),
|
||||
|
||||
@@ -71,33 +71,50 @@ pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
|
||||
let w0 = words[0];
|
||||
let w1 = words[1];
|
||||
let w2 = words[2];
|
||||
// GPUBUG-106 (iterate-3S): correct the dword field map to match canary's
|
||||
// `AluInstruction` union (ucode.h:2036-2086). Pre-fix this read the
|
||||
// dest/mask/export/scalar-opcode out of `w2`, but they live in `w0`; the
|
||||
// vector opcode + source registers live in `w2`, and swizzle/negate/pred
|
||||
// in `w1`. The misread made every *export* ALU decode with
|
||||
// `vector_write_mask=0` → no oPos/oColor export emitted → the translated VS
|
||||
// collapsed every vertex to the clip origin (degenerate, nothing drawn).
|
||||
//
|
||||
// w0: vector_dest(0:5) vector_dest_rel(6) abs_constants(7)
|
||||
// scalar_dest(8:13) scalar_dest_rel(14) export_data(15)
|
||||
// vector_write_mask(16:19) scalar_write_mask(20:23)
|
||||
// vector_clamp(24) scalar_clamp(25) scalar_opc(26:31)
|
||||
// w1: src3_swiz(0:7) src2_swiz(8:15) src1_swiz(16:23)
|
||||
// src3/2/1_reg_negate(24/25/26) pred_condition(27) is_predicated(28)
|
||||
// w2: src3_reg(0:7) src2_reg(8:15) src1_reg(16:23)
|
||||
// vector_opc(24:28) src3/2/1_sel(29/30/31)
|
||||
//
|
||||
// Our (a,b,c) operands map to canary's (src1,src2,src3).
|
||||
AluInstruction {
|
||||
vector_opcode: (w2 & 0x3F) as u8,
|
||||
scalar_opcode: ((w2 >> 6) & 0x3F) as u8,
|
||||
vector_dest: ((w2 >> 16) & 0x7F) as u8,
|
||||
scalar_dest: ((w2 >> 24) & 0x7F) as u8,
|
||||
vector_write_mask: ((w2 >> 12) & 0xF) as u8,
|
||||
scalar_write_mask: ((w2 >> 8) & 0xF) as u8,
|
||||
vector_dest_is_export: ((w2 >> 23) & 1) != 0,
|
||||
scalar_src_is_ps: ((w0 >> 26) & 1) != 0,
|
||||
src_a: (w0 & 0xFF) as u8,
|
||||
src_b: ((w0 >> 8) & 0xFF) as u8,
|
||||
src_c: ((w0 >> 16) & 0xFF) as u8,
|
||||
// Word-0 bits 29-31 are the per-operand temp-vs-constant
|
||||
// selector (canary `src3_sel`/`src2_sel`/`src1_sel`,
|
||||
// ucode.h:2078-2086). Our `src_a` is canary's third operand
|
||||
// (low byte of w0), so its selector is bit 29.
|
||||
src_a_is_temp: ((w0 >> 29) & 1) != 0,
|
||||
src_b_is_temp: ((w0 >> 30) & 1) != 0,
|
||||
src_c_is_temp: ((w0 >> 31) & 1) != 0,
|
||||
src_a_swiz: (w1 & 0xFF) as u8,
|
||||
vector_opcode: ((w2 >> 24) & 0x1F) as u8,
|
||||
scalar_opcode: ((w0 >> 26) & 0x3F) as u8,
|
||||
vector_dest: (w0 & 0x3F) as u8,
|
||||
scalar_dest: ((w0 >> 8) & 0x3F) as u8,
|
||||
vector_write_mask: ((w0 >> 16) & 0xF) as u8,
|
||||
scalar_write_mask: ((w0 >> 20) & 0xF) as u8,
|
||||
vector_dest_is_export: ((w0 >> 15) & 1) != 0,
|
||||
// Not a real microcode bit — the scalar pipe selects `ps` implicitly
|
||||
// via the *_PREV opcodes, which `scalar_expr` handles by opcode.
|
||||
scalar_src_is_ps: false,
|
||||
src_a: ((w2 >> 16) & 0xFF) as u8,
|
||||
src_b: ((w2 >> 8) & 0xFF) as u8,
|
||||
src_c: (w2 & 0xFF) as u8,
|
||||
// sel==1 → operand is a temp register; sel==0 → ALU constant.
|
||||
src_a_is_temp: ((w2 >> 31) & 1) != 0,
|
||||
src_b_is_temp: ((w2 >> 30) & 1) != 0,
|
||||
src_c_is_temp: ((w2 >> 29) & 1) != 0,
|
||||
src_a_swiz: ((w1 >> 16) & 0xFF) as u8,
|
||||
src_b_swiz: ((w1 >> 8) & 0xFF) as u8,
|
||||
src_c_swiz: ((w1 >> 16) & 0xFF) as u8,
|
||||
src_a_negate: ((w1 >> 24) & 1) != 0,
|
||||
src_c_swiz: (w1 & 0xFF) as u8,
|
||||
src_a_negate: ((w1 >> 26) & 1) != 0,
|
||||
src_b_negate: ((w1 >> 25) & 1) != 0,
|
||||
src_c_negate: ((w1 >> 26) & 1) != 0,
|
||||
predicated: ((w0 >> 27) & 1) != 0,
|
||||
predicate_condition: ((w0 >> 28) & 1) != 0,
|
||||
src_c_negate: ((w1 >> 24) & 1) != 0,
|
||||
predicated: ((w1 >> 28) & 1) != 0,
|
||||
predicate_condition: ((w1 >> 27) & 1) != 0,
|
||||
raw: words,
|
||||
}
|
||||
}
|
||||
@@ -225,19 +242,24 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn decode_extracts_opcodes_and_dests() {
|
||||
// Build a minimal ALU word:
|
||||
// vector_opcode = ADD (0), scalar_opcode = RCP (22),
|
||||
// vector_dest = 3, scalar_dest = 7, vector_write_mask = 0xF
|
||||
let w2 = (vop::ADD as u32)
|
||||
| ((sop::RCP as u32) << 6)
|
||||
| (0xF << 12) // vector_write_mask
|
||||
| (3u32 << 16) // vector_dest
|
||||
| (7u32 << 24); // scalar_dest
|
||||
let alu = decode_alu([0, 0, w2]);
|
||||
// GPUBUG-106: correct canary field map. w0 carries dest/mask/scalar_opc;
|
||||
// w2 carries vector_opc + source regs.
|
||||
// vector_opcode = ADD (0) → w2 bits 24:28
|
||||
// scalar_opcode = RCP (22) → w0 bits 26:31
|
||||
// vector_dest = 3 → w0 bits 0:5, scalar_dest = 7 → w0 bits 8:13
|
||||
// vector_write_mask = 0xF → w0 bits 16:19, export_data → w0 bit 15
|
||||
let w0 = 3u32 // vector_dest
|
||||
| (7u32 << 8) // scalar_dest
|
||||
| (1u32 << 15) // export_data
|
||||
| (0xFu32 << 16) // vector_write_mask
|
||||
| ((sop::RCP as u32) << 26); // scalar_opc
|
||||
let w2 = (vop::ADD as u32) << 24; // vector_opc
|
||||
let alu = decode_alu([w0, 0, w2]);
|
||||
assert_eq!(alu.vector_opcode, vop::ADD);
|
||||
assert_eq!(alu.scalar_opcode, sop::RCP);
|
||||
assert_eq!(alu.vector_dest, 3);
|
||||
assert_eq!(alu.scalar_dest, 7);
|
||||
assert_eq!(alu.vector_write_mask, 0xF);
|
||||
assert!(alu.vector_dest_is_export);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,7 +43,15 @@ pub enum ControlFlowInstruction {
|
||||
Return,
|
||||
/// `kAlloc` — pre-allocate export registers (position, interpolators, colors).
|
||||
Alloc { size: u32, kind: AllocKind },
|
||||
/// Exit the shader (terminal).
|
||||
/// `kNop` — fills space in the CF block; executes nothing, does not end
|
||||
/// the shader. (Xenos opcode 0.)
|
||||
Nop,
|
||||
/// `kMarkVsFetchDone` — hint that no more vertex fetches will be performed.
|
||||
/// (Xenos opcode 15.) Non-terminating.
|
||||
MarkVsFetchDone,
|
||||
/// Exit the shader (terminal). Synthesized — Xenos has no dedicated exit
|
||||
/// opcode; the shader ends after an `Exec`/`CondExec` clause with the
|
||||
/// END bit set (`is_end`). Retained for callers/tests that reference it.
|
||||
Exit,
|
||||
/// Unknown / unhandled opcode.
|
||||
Unknown { opcode: u8 },
|
||||
@@ -88,42 +96,66 @@ pub fn decode_cf_pair(word0: u32, word1: u32, word2: u32) -> (ControlFlowInstruc
|
||||
fn decode_single(payload: u64) -> ControlFlowInstruction {
|
||||
// Top 4 bits of the 48-bit payload.
|
||||
let opcode = ((payload >> 44) & 0xF) as u8;
|
||||
// Predicate bit + condition live at the 28..30 range for exec/jmp. Rough
|
||||
// extraction — good enough for the interpreter, which logs unknowns.
|
||||
let predicated = ((payload >> 28) & 1) != 0;
|
||||
let predicate_condition = ((payload >> 29) & 1) != 0;
|
||||
|
||||
// GPUBUG-103 (iterate-3P): clause-level predication is determined by the
|
||||
// *opcode*, not by free bits. The 48-bit CF payload is word0 = bits 0..31,
|
||||
// word1 = bits 32..47. Per canary `ucode.h`:
|
||||
// * `ControlFlowExecInstruction` (kExec/kExecEnd, opcodes 1/2): NOT
|
||||
// predicate-gated — it runs unconditionally.
|
||||
// * `ControlFlowCondExecInstruction` (kCondExec/kCondExecEnd, 3/4): gated
|
||||
// by a *bool constant*, `condition_` at word1 bit 10 = payload bit 42.
|
||||
// We don't model bool-constant gating in the WGSL paths (the bool is
|
||||
// virtually always set for these), so treat as unconditional.
|
||||
// * `ControlFlowCondExecPredInstruction` (kCondExecPred/...End/Clean...,
|
||||
// 5/6/13/14): gated by the *predicate register*; `condition_` at word1
|
||||
// bit 9 = payload bit 41.
|
||||
// The prior code read bits 28/29 (which fall inside `sequence_`/`vc_hi_`)
|
||||
// and stamped `predicated=true` on plenty of plain `kExec` clauses — which
|
||||
// made the P7 translator reject EVERY splash VS as `cf_cond`, forcing the
|
||||
// interpreter (placeholder geometry) for all draws.
|
||||
let is_pred_gated = matches!(opcode, 5 | 6 | 13 | 14);
|
||||
let predicated = is_pred_gated;
|
||||
let predicate_condition = is_pred_gated && ((payload >> 41) & 1) != 0;
|
||||
|
||||
// Xenos `ControlFlowOpcode` (canary `ucode.h:86-160`):
|
||||
// 0 kNop, 1 kExec, 2 kExecEnd, 3 kCondExec, 4 kCondExecEnd,
|
||||
// 5 kCondExecPred, 6 kCondExecPredEnd, 7 kLoopStart, 8 kLoopEnd,
|
||||
// 9 kCondCall, 10 kReturn, 11 kCondJmp, 12 kAlloc,
|
||||
// 13 kCondExecPredClean, 14 kCondExecPredCleanEnd, 15 kMarkVsFetchDone.
|
||||
// All exec variants share the address(12)/count(3)/sequence(12) layout
|
||||
// of `ControlFlowExecInstruction`; the `*End` variants terminate the
|
||||
// shader. (Prior table was off-by-one — it mapped 0→Exec and 1→Exit,
|
||||
// so a real `kExec` clause was misread as a terminal `Exit`, truncating
|
||||
// the CF block and dropping every `tfetch` in it.)
|
||||
let exec = |is_end: bool| ControlFlowInstruction::Exec {
|
||||
address: (payload & 0xFFF) as u32,
|
||||
count: ((payload >> 12) & 0x7) as u32,
|
||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||
is_end,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
};
|
||||
match opcode {
|
||||
0 => ControlFlowInstruction::Exec {
|
||||
address: (payload & 0xFFF) as u32,
|
||||
count: ((payload >> 12) & 0x7) as u32,
|
||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||
is_end: false,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
},
|
||||
1 => ControlFlowInstruction::Exit,
|
||||
2 => ControlFlowInstruction::Exec {
|
||||
address: (payload & 0xFFF) as u32,
|
||||
count: ((payload >> 12) & 0x7) as u32,
|
||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||
is_end: true,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
},
|
||||
6 => ControlFlowInstruction::LoopStart {
|
||||
0 => ControlFlowInstruction::Nop,
|
||||
1 => exec(false),
|
||||
2 => exec(true),
|
||||
3 => exec(false),
|
||||
4 => exec(true),
|
||||
5 => exec(false),
|
||||
6 => exec(true),
|
||||
7 => ControlFlowInstruction::LoopStart {
|
||||
address: (payload & 0x3FF) as u32,
|
||||
loop_id: ((payload >> 16) & 0x1F) as u32,
|
||||
},
|
||||
7 => ControlFlowInstruction::LoopEnd {
|
||||
8 => ControlFlowInstruction::LoopEnd {
|
||||
address: (payload & 0x3FF) as u32,
|
||||
loop_id: ((payload >> 16) & 0x1F) as u32,
|
||||
},
|
||||
8 => ControlFlowInstruction::CondCall {
|
||||
9 => ControlFlowInstruction::CondCall {
|
||||
target: (payload & 0x3FF) as u32,
|
||||
},
|
||||
9 => ControlFlowInstruction::Return,
|
||||
10 => ControlFlowInstruction::CondJmp {
|
||||
10 => ControlFlowInstruction::Return,
|
||||
11 => ControlFlowInstruction::CondJmp {
|
||||
target: (payload & 0x3FF) as u32,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
@@ -132,6 +164,9 @@ fn decode_single(payload: u64) -> ControlFlowInstruction {
|
||||
size: (payload & 0x7) as u32,
|
||||
kind: AllocKind::from_bits(((payload >> 4) & 0x7) as u32),
|
||||
},
|
||||
13 => exec(false),
|
||||
14 => exec(true),
|
||||
15 => ControlFlowInstruction::MarkVsFetchDone,
|
||||
other => ControlFlowInstruction::Unknown { opcode: other },
|
||||
}
|
||||
}
|
||||
@@ -141,12 +176,49 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn opcode_exit_decodes() {
|
||||
// opcode 1 (Exit) in bits 44..47 of A's 48-bit payload.
|
||||
fn opcode_nop_and_exec_decode() {
|
||||
// Xenos opcode 0 = kNop (non-terminating padding).
|
||||
let payload: u64 = 0u64 << 44;
|
||||
let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32);
|
||||
assert_eq!(decode_cf_pair(hi, lo, 0).0, ControlFlowInstruction::Nop);
|
||||
// Xenos opcode 1 = kExec (executes instructions; NOT a terminal exit).
|
||||
let payload: u64 = 1u64 << 44;
|
||||
let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32);
|
||||
let cf = decode_cf_pair(hi, lo, 0).0;
|
||||
assert_eq!(cf, ControlFlowInstruction::Exit);
|
||||
match decode_cf_pair(hi, lo, 0).0 {
|
||||
ControlFlowInstruction::Exec { is_end, .. } => assert!(!is_end),
|
||||
other => panic!("opcode 1 should be non-end Exec, got {other:?}"),
|
||||
}
|
||||
// Xenos opcode 15 = kMarkVsFetchDone (non-terminating hint).
|
||||
let payload: u64 = 15u64 << 44;
|
||||
let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32);
|
||||
assert_eq!(
|
||||
decode_cf_pair(hi, lo, 0).0,
|
||||
ControlFlowInstruction::MarkVsFetchDone
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn real_logo_shader_has_tfetch_clauses() {
|
||||
// The publisher-logo pixel shader E59B2B3DA4AA9008 (captured from the
|
||||
// canary oracle, byte-identical to the microcode our guest IM_LOADs).
|
||||
|
|
||||
// Regression for iterate-3M: the old off-by-one opcode table decoded
|
||||
// its leading `kExec` (opcode 1) as a terminal `Exit`, truncating the
|
||||
// CF block so the `tfetch2D` never appeared → flat splash.
|
||||
let ucode: [u32; 24] = [
|
||||
0x00011002, 0x00001200, 0xC4000000, 0x00004003, 0x00002200, 0x00000000,
|
||||
0x10082021, 0x1F1FF688, 0x00004000, 0xC8080001, 0x001B1B00, 0xC1020000,
|
||||
0xC8070000, 0x00C0C000, 0xC1020000, 0xC8070001, 0x00C01B00, 0xC1000100,
|
||||
0xC80F8000, 0x00000000, 0xC2010100, 0x00000000, 0x00000000, 0x00000000,
|
||||
];
|
||||
let p = crate::ucode::parse_shader(&ucode);
|
||||
let exec_clauses = p // number of CF entries that decoded as `Exec`
|
||||
.cf
|
||||
.iter()
|
||||
.filter(|c| matches!(c, ControlFlowInstruction::Exec { .. }))
|
||||
.count();
|
||||
assert!(exec_clauses >= 1, "expected >=1 Exec clause, cf={:?}", p.cf); // CF block must not be truncated to nothing
|
||||
let slots = crate::shader_metrics::tfetch_slots(&p);
|
||||
assert!(!slots.is_empty(), "expected tfetch slots, got none; cf={:?}", p.cf); // the texture fetch must be discoverable
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -17,17 +17,64 @@ pub enum FetchInstruction {
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct VertexFetch {
    /// Vertex fetch `const_index` (5 bits, w0[20:24]), i.e. 0..=31. This is
    /// NOT the full fetch-constant index: that one is
    /// `const_index * 3 + const_index_sel` (canary `ucode.h:700`) and spans
    /// 0..=95. Call [`VertexFetch::const_reg_offset`] to get the dword offset
    /// into the fetch-constant register region.
    pub fetch_const: u8,
    /// `const_index_sel` (2 bits, w0[25:26]) — iterate-3X (GPUBUG-110). Picks
    /// one of the three 2-dword vertex-fetch constants that share a 6-dword
    /// register group. Ignoring it always reads sub-slot 0, which misses the
    /// real vertex-buffer base whenever a shader uses sub-slot 1 or 2 (the
    /// publisher logo uses `const_index=31, sel=2`).
    pub const_index_sel: u8,
    /// Source register index (the `r#` holding the vertex index).
    pub src_register: u8,
    /// Register that receives the fetched value.
    pub dest_register: u8,
    /// 4-bit destination write mask.
    pub dest_write_mask: u8,
    /// `xenos::VertexFormat` (6 bits, dword1[16:21]) — iterate-3S
    /// (GPUBUG-107). Selects the component count and packing. Before this
    /// field existed the translator assumed `k_32_32_32_32_FLOAT` (4 floats,
    /// stride 4) and over-strode 2-float UI quads (`k_32_32_FLOAT`): the next
    /// vertex's X leaked into `.w`, W went negative, and the whole rectangle
    /// was clipped behind the camera.
    pub format: u8,
    /// Stride between consecutive vertices, in dwords (dword2[0:7]).
    pub stride: u8,
    /// Dword offset of THIS attribute inside one vertex (iterate-3T).
    /// `decode_fetch` extracts it from dword2 bits 8..=15. Several vfetches
    /// can share one fetch constant; the offset is what makes each of them
    /// read its own attribute instead of all reading offset 0.
    /// NOTE(review): canary's `VertexFetchInstruction` is described as having
    /// a 23-bit offset field — confirm whether the upper bits must be decoded.
    pub offset: u32,
    /// Signed vs unsigned interpretation of packed integer formats. Mirrors
    /// canary `fomat_comp_all`, word1 bit 12 (ucode.h:757). (GPUBUG-113: this
    /// used to be read from word1 bit 24, which lies inside `exp_adjust`.)
    pub is_signed: bool,
    /// `true` when the format is normalized. Canary exposes this as
    /// `num_format_all == 0`, word1 bit 13 (ucode.h:758): a set bit means
    /// integer (un-normalized), a clear bit means normalized; the normalized
    /// sense is what gets stored here. (GPUBUG-113: was word1 bit 25.)
    pub is_normalized: bool,
    /// Canary word1 bit 30 (ucode.h:764). A mini-fetch inherits the address
    /// AND STRIDE of the preceding full vfetch on the same stream, and its own
    /// `stride` field is 0. Consumers need this flag so a `vfetch_mini` color
    /// attribute is indexed by the real vertex stride rather than by its tight
    /// dword count.
    pub is_mini_fetch: bool,
    /// The three undecoded instruction dwords.
    pub raw: [u32; 3],
}

impl VertexFetch {
    /// Returns the dword offset of this fetch's 2-dword constant inside the
    /// fetch-constant register region (`CONST_BASE_FETCH`).
    ///
    /// Vertex fetch constants are packed three to a 6-dword group, so the
    /// offset is `const_index * 6 + const_index_sel * 2`. This matches canary
    /// `ucode.h:700` (`fetch_constant_index = const_index*3 + sel`) with each
    /// constant occupying 2 dwords.
    pub fn const_reg_offset(&self) -> u32 {
        // Start of the 6-dword group selected by `const_index`.
        let group_base = u32::from(self.fetch_const) * 6;
        // 2-dword sub-slot inside that group.
        let slot_in_group = u32::from(self.const_index_sel) * 2;
        group_base + slot_in_group
    }
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct TextureFetch {
|
||||
/// Texture fetch constant index (0..=31).
|
||||
@@ -54,23 +101,47 @@ pub mod op {
|
||||
}
|
||||
|
||||
pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction {
|
||||
// Fetch dword0 bitfields (Xenos `ucode.h:740-749` vfetch / `844-845`
|
||||
// tfetch): opcode_value:5, src_reg:6, src_reg_am:1, dst_reg:6,
|
||||
// dst_reg_am:1, (fetch_valid_only|must_be_one):1, const_index:5 @ bit20,
|
||||
// ... The prior decoder read `const_index` from bit 5 (which is actually
|
||||
// `src_reg`), so every fetch reported the wrong fetch-constant slot — the
|
||||
// logo `tfetch2D ..., tf0` was read as `tf1`, and slot 1's empty constant
|
||||
// failed to decode → no texture. The texture-fetch `dimension` lives in
|
||||
// dword2 bits 14..15, not dword1.
|
||||
let w0 = words[0];
|
||||
let w1 = words[1];
|
||||
let w2 = words[2];
|
||||
let opcode = (w0 & 0x1F) as u8;
|
||||
match opcode {
|
||||
op::VERTEX_FETCH => FetchInstruction::Vertex(VertexFetch {
|
||||
fetch_const: ((w0 >> 5) & 0x1F) as u8,
|
||||
src_register: ((w0 >> 17) & 0x7F) as u8,
|
||||
dest_register: ((w0 >> 10) & 0x7F) as u8,
|
||||
dest_write_mask: ((w1 >> 23) & 0xF) as u8,
|
||||
fetch_const: ((w0 >> 20) & 0x1F) as u8,
|
||||
const_index_sel: ((w0 >> 25) & 0x3) as u8,
|
||||
src_register: ((w0 >> 5) & 0x3F) as u8,
|
||||
dest_register: ((w0 >> 12) & 0x3F) as u8,
|
||||
dest_write_mask: (w1 & 0xF) as u8,
|
||||
// dword1[16:21] = VertexFormat. dword2: stride[0:7],
|
||||
// offset (in dwords) [8:?] — empirically the attribute offset of
|
||||
// the textured logo VS lands in dword2[8:15] (pos@4, UV@3,
|
||||
// 3-float@0 in a 6-dword vertex). signed/normalized live higher.
|
||||
format: ((w1 >> 16) & 0x3F) as u8,
|
||||
stride: (w2 & 0xFF) as u8,
|
||||
offset: (w2 >> 8) & 0xFF,
|
||||
// GPUBUG-113: canary ucode.h:757-758,764 — signed=fomat_comp_all
|
||||
// (w1 bit12), normalized=(num_format_all==0) (w1 bit13),
|
||||
// mini-fetch=(w1 bit30). The previous bit24/25 reads landed inside
|
||||
// `exp_adjust`, so signedness/normalization were effectively random.
|
||||
is_signed: ((w1 >> 12) & 1) != 0,
|
||||
is_normalized: ((w1 >> 13) & 1) == 0,
|
||||
is_mini_fetch: ((w1 >> 30) & 1) != 0,
|
||||
raw: words,
|
||||
}),
|
||||
op::TEXTURE_FETCH => FetchInstruction::Texture(TextureFetch {
|
||||
fetch_const: ((w0 >> 5) & 0x1F) as u8,
|
||||
src_register: ((w0 >> 17) & 0x7F) as u8,
|
||||
dest_register: ((w0 >> 10) & 0x7F) as u8,
|
||||
dest_write_mask: ((w1 >> 23) & 0xF) as u8,
|
||||
dimension: ((w1 >> 29) & 0x3) as u8,
|
||||
fetch_const: ((w0 >> 20) & 0x1F) as u8,
|
||||
src_register: ((w0 >> 5) & 0x3F) as u8,
|
||||
dest_register: ((w0 >> 12) & 0x3F) as u8,
|
||||
dest_write_mask: (w1 & 0xF) as u8,
|
||||
dimension: ((w2 >> 14) & 0x3) as u8,
|
||||
raw: words,
|
||||
}),
|
||||
_ => FetchInstruction::Unknown { opcode, raw: words },
|
||||
@@ -83,8 +154,9 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn decode_vertex_fetch() {
|
||||
// opcode=0 (vertex), fetch_const=5, src=2, dest=7.
|
||||
let w0 = 0u32 | (5 << 5) | (7 << 10) | (2 << 17);
|
||||
// opcode=0 (vertex). Xenos dword0: src_reg@bit5, dst_reg@bit12,
|
||||
// const_index@bit20. fetch_const=5, src=2, dest=7.
|
||||
let w0 = 0u32 | (2 << 5) | (7 << 12) | (5 << 20);
|
||||
let v = decode_fetch([w0, 0, 0]);
|
||||
match v {
|
||||
FetchInstruction::Vertex(vf) => {
|
||||
@@ -96,13 +168,69 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn vertex_fetch_const_index_sel_and_reg_offset() {
    // iterate-3X (GPUBUG-110). The real publisher-logo vfetch has
    // w0 = 0x2DF82000, which encodes const_index=31 and const_index_sel=2.
    // Its fetch constant therefore sits at dword offset `31*6 + 2*2 = 190`
    // (reg 0x48BE) and not at `31*6 = 186` (reg 0x48BA, the unused 0x1 slot).
    // When the sel field was dropped the logo geometry resolved as
    // "no vertex buffer".
    let logo = match decode_fetch([0x2DF8_2000, 0, 0]) {
        FetchInstruction::Vertex(vf) => vf,
        other => panic!("expected Vertex, got {other:?}"),
    };
    assert_eq!(logo.fetch_const, 31, "const_index");
    assert_eq!(logo.const_index_sel, 2, "const_index_sel");
    assert_eq!(logo.const_reg_offset(), 190, "reg offset = 31*6 + 2*2");

    // With sel=0 the offset degenerates to the legacy `fetch_const*6`
    // (back-compat).
    let FetchInstruction::Vertex(legacy) = decode_fetch([5u32 << 20, 0, 0]) else {
        panic!("expected Vertex");
    };
    assert_eq!(legacy.const_index_sel, 0);
    assert_eq!(legacy.const_reg_offset(), 30);
}
|
||||
|
||||
#[test]
fn vertex_fetch_signed_normalized_mini_bits() {
    // GPUBUG-113, canary ucode.h:757-758,764:
    //   is_signed     = fomat_comp_all        (w1 bit 12)
    //   is_normalized = (num_format_all == 0) (w1 bit 13)
    //   is_mini_fetch =                       (w1 bit 30)
    // Each bit is checked in isolation.
    let decode_w1 = |w1: u32| {
        let FetchInstruction::Vertex(vf) = decode_fetch([0, w1, 0]) else {
            panic!("vertex");
        };
        vf
    };

    // All-zero dword1: unsigned, normalized, full (non-mini) fetch.
    let baseline = decode_w1(0);
    assert!(!baseline.is_signed);
    assert!(baseline.is_normalized);
    assert!(!baseline.is_mini_fetch);

    // bit 12 set → signed.
    let signed = decode_w1(1 << 12);
    assert!(signed.is_signed);

    // bit 13 set (num_format_all=1) → NOT normalized.
    let integer = decode_w1(1 << 13);
    assert!(!integer.is_normalized);

    // bit 30 set → mini fetch.
    let mini = decode_w1(1 << 30);
    assert!(mini.is_mini_fetch);

    // The previously (wrongly) used bits 24/25 must have no effect on
    // signed/normalized.
    let old_signed_bit = decode_w1(1 << 24);
    assert!(!old_signed_bit.is_signed);
    let old_normalized_bit = decode_w1(1 << 25);
    assert!(old_normalized_bit.is_normalized);
}
|
||||
|
||||
#[test]
|
||||
fn decode_texture_fetch() {
|
||||
let w0 = 1u32 | (3 << 5) | (4 << 10) | (1 << 17);
|
||||
let t = decode_fetch([w0, (2u32 << 29), 0]);
|
||||
// opcode=1 (texture). const_index@bit20=3, src@bit5=1, dst@bit12=4.
|
||||
// dimension lives in dword2 bits 14..15.
|
||||
let w0 = 1u32 | (1 << 5) | (4 << 12) | (3 << 20);
|
||||
let w2 = 2u32 << 14;
|
||||
let t = decode_fetch([w0, 0, w2]);
|
||||
match t {
|
||||
FetchInstruction::Texture(tf) => {
|
||||
assert_eq!(tf.fetch_const, 3);
|
||||
assert_eq!(tf.src_register, 1);
|
||||
assert_eq!(tf.dest_register, 4);
|
||||
assert_eq!(tf.dimension, 2);
|
||||
}
|
||||
other => panic!("expected Texture, got {other:?}"),
|
||||
|
||||
@@ -48,6 +48,9 @@ pub mod cf_kind {
|
||||
pub const COND_JMP: u32 = 6;
|
||||
pub const COND_CALL: u32 = 7;
|
||||
pub const RETURN: u32 = 8;
|
||||
/// Non-executing CF clause: `kNop` padding or `kMarkVsFetchDone` hint.
|
||||
/// The WGSL CF walker treats this as a no-op (advance, do not reject).
|
||||
pub const NOP: u32 = 9;
|
||||
pub const UNKNOWN: u32 = 15;
|
||||
}
|
||||
|
||||
@@ -136,6 +139,7 @@ fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) {
|
||||
}
|
||||
CondCall { target } => (cf_kind::COND_CALL, target, 0),
|
||||
Return => (cf_kind::RETURN, 0, 0),
|
||||
Nop | MarkVsFetchDone => (cf_kind::NOP, 0, 0),
|
||||
Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0),
|
||||
}
|
||||
}
|
||||
@@ -164,9 +168,11 @@ pub struct ParsedShader {
|
||||
}
|
||||
|
||||
/// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire
|
||||
/// microcode buffer (control flow + instructions). Heuristic: CF dword count
|
||||
/// is encoded in the first word's low 12 bits of the last exec clause —
|
||||
/// canary iterates until it hits a clause of kind `Exit`. We do the same.
|
||||
/// microcode buffer (control flow + instructions). The CF block is implicitly
|
||||
/// bounded: we walk clause-pair rows until one terminates the shader (an
|
||||
/// `Exec`/`CondExec` clause with the END bit set, per Xenos). Everything after
|
||||
/// that row is the instruction block; exec/loop addresses are then rebased to
|
||||
/// be relative to it.
|
||||
pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
|
||||
let mut cf = Vec::new();
|
||||
// CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's
|
||||
@@ -175,22 +181,50 @@ pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
|
||||
while i + 2 < raw_dwords.len() {
|
||||
let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]);
|
||||
let (first, second) = a;
|
||||
let seen_exit = matches!(
|
||||
first,
|
||||
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
||||
) || matches!(
|
||||
second,
|
||||
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
||||
);
|
||||
// The CF block ends after the clause that terminates the shader: an
|
||||
// `Exec` with the END bit set (Xenos `kExecEnd`/`kCondExec*End`), a
|
||||
// synthetic `Exit`, or an `Unknown` opcode (decode ran off the CF
|
||||
// block into instruction data — stop defensively). `Nop` padding
|
||||
// does NOT terminate. (Previously this stopped on the first `Exit`,
|
||||
// but with the corrected opcode table opcode 1 is `kExec`, not exit,
|
||||
// so real exec clauses kept the parse going as intended.)
|
||||
let terminates = |cf: &ControlFlowInstruction| {
|
||||
matches!(
|
||||
cf,
|
||||
ControlFlowInstruction::Exec { is_end: true, .. }
|
||||
| ControlFlowInstruction::Exit
|
||||
| ControlFlowInstruction::Unknown { .. }
|
||||
)
|
||||
};
|
||||
let seen_end = terminates(&first) || terminates(&second);
|
||||
cf.push(first);
|
||||
cf.push(second);
|
||||
i += 3;
|
||||
if seen_exit {
|
||||
if seen_end {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Everything after `i` dwords is the instruction block.
|
||||
let instructions = raw_dwords[i..].to_vec();
|
||||
// Xenos exec/loop `address` fields are absolute instruction-triple indices
|
||||
// counted from shader dword 0, but `instructions` here begins *after* the
|
||||
// CF block. Rebase those addresses to be relative to the instruction block
|
||||
// (subtract the CF triple count) so `address * 3` indexes `instructions`
|
||||
// directly. (Without this, every exec read 3 dwords too far per CF triple —
|
||||
// the publisher-logo `tfetch` triple was skipped → flat splash.)
|
||||
let cf_triples = (i / 3) as u32;
|
||||
for clause in cf.iter_mut() {
|
||||
match clause {
|
||||
ControlFlowInstruction::Exec { address, .. } => {
|
||||
*address = address.saturating_sub(cf_triples);
|
||||
}
|
||||
ControlFlowInstruction::LoopStart { address, .. }
|
||||
| ControlFlowInstruction::LoopEnd { address, .. } => {
|
||||
*address = address.saturating_sub(cf_triples);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
ParsedShader { cf, instructions }
|
||||
}
|
||||
|
||||
@@ -235,15 +269,19 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn trivial_exit_clause_stops_parsing() {
|
||||
// Two clauses: [NOP (kind=0), EXIT (kind=1)] encoded per canary.
|
||||
// Exit clause is opcode 1 in the top 4 bits of the upper 16 bits.
|
||||
let w0 = 0u32; // clause A body
|
||||
let w1 = (1u32 << 12) << 16; // upper 16 bits = 0x1000 → opcode=1 (EXIT) for clause A
|
||||
let w2 = 0u32;
|
||||
let p = parse_shader(&[w0, w1, w2, 0xDEAD_BEEF]);
|
||||
fn exec_end_clause_stops_parsing() {
|
||||
// Row: clause B = kExecEnd (opcode 2) terminates the CF block.
|
||||
// 48-bit payload of B occupies hi16(word1) + word2; opcode lives in
|
||||
// bits 44..47 of that payload. Put opcode 2 there: payload bit 45 set
|
||||
// for the `2` → (2 << 44). In B's framing, bits 16..47 come from
|
||||
// word2, so word2 bit (44-16)=28 region holds the opcode nibble.
|
||||
let b_payload: u64 = 2u64 << 44; // kExecEnd
|
||||
// B = lo16 from hi16(word1), hi from word2. Reconstruct word1/word2.
|
||||
let word1 = ((b_payload & 0xFFFF) as u32) << 16; // B's low 16 bits → hi16(word1)
|
||||
let word2 = ((b_payload >> 16) & 0xFFFF_FFFF) as u32;
|
||||
let p = parse_shader(&[0, word1, word2, 0xDEAD_BEEF]);
|
||||
assert!(!p.cf.is_empty());
|
||||
// Exit detected → remaining dword is instruction data.
|
||||
// ExecEnd detected in the first row → remaining dword is instruction data.
|
||||
assert_eq!(p.instructions, vec![0xDEAD_BEEF]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@ xenia-cpu = { workspace = true }
|
||||
xenia-vfs = { workspace = true }
|
||||
xenia-hid = { workspace = true }
|
||||
xenia-gpu = { workspace = true }
|
||||
xenia-apu = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
metrics = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
|
||||
@@ -182,7 +182,7 @@ pub fn register_exports(state: &mut KernelState) {
|
||||
state.register_export(Xboxkrnl, 0x01F7, "XAudioGetVoiceCategoryVolumeChangeMask", stub_return_zero);
|
||||
state.register_export(Xboxkrnl, 0x01F8, "XAudioGetVoiceCategoryVolume", stub_success);
|
||||
state.register_export(Xboxkrnl, 0x0224, "XMACreateContext", xma_create_context);
|
||||
state.register_export(Xboxkrnl, 0x0226, "XMAReleaseContext", stub_success);
|
||||
state.register_export(Xboxkrnl, 0x0226, "XMAReleaseContext", xma_release_context);
|
||||
|
||||
// Crypto
|
||||
state.register_export(Xboxkrnl, 0x0192, "XeCryptSha", stub_success);
|
||||
@@ -486,12 +486,20 @@ fn ke_query_performance_frequency(ctx: &mut PpcContext, _mem: &GuestMemory, _sta
|
||||
ctx.gpr[3] = 50_000_000; // 50 MHz
|
||||
}
|
||||
|
||||
fn ke_query_system_time(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
|
||||
fn ke_query_system_time(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
let time_ptr = ctx.gpr[3] as u32;
|
||||
if time_ptr != 0 {
|
||||
let fake_time: u64 = 132_500_000_000_000_000; // ~2021 FILETIME
|
||||
mem.write_u32(time_ptr, (fake_time >> 32) as u32);
|
||||
mem.write_u32(time_ptr + 4, fake_time as u32);
|
||||
// ITERATE-2J — advance with the same deterministic clock the
|
||||
// KeTimeStampBundle uses (1 global_clock unit ≈ 100 ns) so a guest
|
||||
// that polls KeQuerySystemTime for elapsed time also sees forward
|
||||
// progress instead of a frozen constant. FILETIME base (~2021) +
|
||||
// 100-ns-unit clock.
|
||||
const FILETIME_BASE: u64 = 132_500_000_000_000_000;
|
||||
let hw_id = state.scheduler.current_hw_id().unwrap_or(0);
|
||||
let now = state.now_basis_at(hw_id);
|
||||
let system_time = FILETIME_BASE.wrapping_add(now);
|
||||
mem.write_u32(time_ptr, (system_time >> 32) as u32);
|
||||
mem.write_u32(time_ptr + 4, system_time as u32);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -696,9 +704,36 @@ fn mm_create_kernel_stack(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut K
|
||||
}
|
||||
}
|
||||
|
||||
/// Translates a guest-virtual address to a physical address, taking the
/// source region into account.
///
/// Modeled on canary's `Memory::GetPhysicalAddress` +
/// `PhysicalHeap::GetPhysicalAddress`
/// (`xenia-canary/src/xenia/memory.cc:528-545` and `:2317-2326`), whose
/// physical-heap half reads:
/// ```c
/// address -= heap_base_;
/// if (heap_base_ >= 0xE0000000) { address += 0x1000; }
/// return address;
/// ```
/// The physical heaps at 0xA0000000 / 0xC0000000 / 0xE0000000 all alias one
/// 512 MB physical window, so subtracting the heap base is the same as
/// masking with `0x1FFFFFFF`. The single region-specific adjustment is the
/// `+0x1000` host-address offset applied to the 0xE0000000+ 4 KB mirror
/// (`memory.h:368-372`, `host_address_offset` for `heap_base >= 0xE0000000`).
/// Addresses already inside the low 512 MB are returned unchanged by the
/// mask, which is how this function treats non-physical inputs as well.
///
/// Returns `virt & 0x1FFF_FFFF`, plus `0x1000` when `virt >= 0xE000_0000`.
pub(crate) fn translate_physical_address(virt: u32) -> u32 {
    /// Keeps the low 29 bits: the 512 MB physical window.
    const PHYSICAL_WINDOW_MASK: u32 = 0x1FFF_FFFF;
    /// First address of the 4 KB-page mirror region.
    const MIRROR_BASE: u32 = 0xE000_0000;
    /// Extra offset applied to addresses inside the mirror region.
    const MIRROR_OFFSET: u32 = 0x1000;

    let region_offset = if virt < MIRROR_BASE { 0 } else { MIRROR_OFFSET };
    // Cannot overflow: the masked value is at most 0x1FFF_FFFF.
    (virt & PHYSICAL_WINDOW_MASK) + region_offset
}
|
||||
|
||||
fn mm_get_physical_address(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
|
||||
// r3 = virtual address -> return physical address
|
||||
ctx.gpr[3] &= 0x1FFF_FFFF; // Mask to 512MB physical
|
||||
// r3 = virtual address -> return physical address.
|
||||
// Region-aware, mirroring canary (see `translate_physical_address`).
|
||||
ctx.gpr[3] = translate_physical_address(ctx.gpr[3] as u32) as u64;
|
||||
}
|
||||
|
||||
fn mm_query_address_protect(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
|
||||
@@ -980,6 +1015,43 @@ fn open_vfs_file(
|
||||
// see a null handle later and trigger `XamShowDirtyDiscErrorUI`.
|
||||
let path = crate::path::object_attributes_to_vfs_path(mem, obj_attrs_ptr)
|
||||
.unwrap_or_default();
|
||||
// AUDIT-2.BF — synthetic silph::WorkerCtx spawn. AUDIT-058/059
|
||||
// identified that ours never activates the 6-level static caller
|
||||
// ladder that ends in `sub_825070F0`, so the four worker threads
|
||||
// it would normally spawn (entries 0x82506528/58/88/B8) never run.
|
||||
// Canary's chain originally fires right after `DiscImageDevice::
|
||||
// ResolvePath("\\dat\\movie")` (audit-058); ours never opens
|
||||
// `dat/movie` because tid=13 wedges before reaching it. We
|
||||
// therefore trigger on the first `dat/*` open — the earliest
|
||||
// such open in ours is `dat/files.tbl` (immediately preceding
|
||||
// tid=12/13 spawn at audit-059 round 1).
|
||||
//
|
||||
// **Round 18 finding** (this commit): when the workers are
|
||||
// spawned runnable, they fault almost immediately (`PC=0` at
|
||||
// cycle ~5.5M on the hw thread carrying worker_3), preempting
|
||||
// ours' boot before the normal guest threads even spawn. The
|
||||
// ctx layout from audit-059 round 5 is incomplete — at least
|
||||
// one of `[+0x28]`/`[+0x2C]`/`[+0x30]` (the three foreign-
|
||||
// arena pointers) must be populated for the worker bodies to
|
||||
// run. Synthesising those is a fresh investigation (round 19+).
|
||||
//
|
||||
// Until then the synth path is **opt-in**: set
|
||||
// `XENIA_SILPH_SYNTH=1` to enable the runnable spawn (will
|
||||
// crash boot), or `XENIA_SILPH_SYNTH=suspend` to spawn but keep
|
||||
// them in `Blocked(Suspended)` (lets boot complete with the
|
||||
// ctx materialised in memory for downstream probes). Default:
|
||||
// disabled — preserves the existing boot trajectory.
|
||||
if !state.silph_synth_done && path.starts_with("dat/") {
|
||||
match std::env::var("XENIA_SILPH_SYNTH").as_deref() {
|
||||
Ok("1") | Ok("run") | Ok("runnable") => {
|
||||
let _ = crate::silph_synth::spawn_silph_workers(state, mem, false);
|
||||
}
|
||||
Ok("suspend") | Ok("suspended") => {
|
||||
let _ = crate::silph_synth::spawn_silph_workers(state, mem, true);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
if path.is_empty() && obj_attrs_ptr == 0 {
|
||||
if handle_out != 0 {
|
||||
mem.write_u32(handle_out, 0);
|
||||
@@ -1443,20 +1515,35 @@ fn nt_query_information_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mu
|
||||
*size
|
||||
};
|
||||
|
||||
// Root-of-device opens (`game:\`, `cache:\`, `partition0`) strip to
|
||||
// an empty string post-prefix — see `open_vfs_file`'s synth path.
|
||||
// Games query these as directories (DirectoryObject probe), and
|
||||
// reporting `Directory=0` makes Sylpheed treat the open as "found a
|
||||
// non-directory where I expected a directory" and call
|
||||
// `XamShowDirtyDiscErrorUI`. Canary's `NtQueryInformationFile` pulls
|
||||
// the real file-system entry's kind; we key on path shape since we
|
||||
// don't model directory entries.
|
||||
let is_directory = path.is_empty()
|
||||
|| path.ends_with('/')
|
||||
|| path.ends_with(':');
|
||||
// Snapshot what we need from the handle, then drop the borrow so we can
|
||||
// re-resolve the path against the VFS for its real attribute byte.
|
||||
let path = path.clone();
|
||||
let size = live_size;
|
||||
let position = *position;
|
||||
|
||||
// Pull the REAL GDFX attribute byte (canary `disc_image_device.cc:154`)
|
||||
// for disc-backed handles by re-resolving the stored path. Root-of-device
|
||||
// opens (`game:\`, `cache:\`, `partition0`) strip to an empty string and
|
||||
// synth-stub opens have no VFS entry — for those we fall back to the
|
||||
// path-shape heuristic. Games query these as directories (DirectoryObject
|
||||
// probe), and reporting `Directory=0` makes Sylpheed treat the open as
|
||||
// "found a non-directory where I expected a directory" and call
|
||||
// `XamShowDirtyDiscErrorUI`.
|
||||
let vfs_attributes: Option<u32> = if path.is_empty() {
|
||||
None
|
||||
} else {
|
||||
state
|
||||
.vfs
|
||||
.as_ref()
|
||||
.and_then(|vfs| vfs.stat(&path).ok())
|
||||
.map(|e| e.attributes)
|
||||
.filter(|&a| a != 0)
|
||||
};
|
||||
let is_directory = match vfs_attributes {
|
||||
Some(a) => (a & 0x10) != 0,
|
||||
None => path.is_empty() || path.ends_with('/') || path.ends_with(':'),
|
||||
};
|
||||
|
||||
// `FILE_ATTRIBUTE_DIRECTORY` (NT / Xbox) — advertised in
|
||||
// `FileNetworkOpenInformation.FileAttributes`; Sylpheed's async-I/O
|
||||
// worker queries with class=34 and the calling code checks this bit
|
||||
@@ -1495,10 +1582,13 @@ fn nt_query_information_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mu
|
||||
}
|
||||
mem.write_u64(file_info + 32, size);
|
||||
mem.write_u64(file_info + 40, size);
|
||||
let attrs = if is_directory {
|
||||
FILE_ATTRIBUTE_DIRECTORY
|
||||
} else {
|
||||
FILE_ATTRIBUTE_NORMAL
|
||||
// Prefer the real GDFX attribute byte; fall back to the
|
||||
// DIRECTORY/NORMAL split for root-of-device and synth-stub
|
||||
// handles that have no VFS entry.
|
||||
let attrs = match vfs_attributes {
|
||||
Some(a) => a,
|
||||
None if is_directory => FILE_ATTRIBUTE_DIRECTORY,
|
||||
None => FILE_ATTRIBUTE_NORMAL,
|
||||
};
|
||||
mem.write_u32(file_info + 48, attrs);
|
||||
mem.write_u32(file_info + 52, 0); // pad
|
||||
@@ -1562,6 +1652,79 @@ fn nt_set_information_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut
|
||||
return;
|
||||
}
|
||||
|
||||
// XFileRenameInformation (10): move the backing file to a new path.
|
||||
// Sylpheed's asset-cache decompresses each packed resource to a staging
|
||||
// `cache:\<hash><tail>.tmp` then renames it into its final nested path
|
||||
// `cache:\<hash>\<dir>\<file>`. Without an actual host-FS rename the
|
||||
// nested target stays empty, the later read-back of the decompressed
|
||||
// asset (e.g. the title logo texture `\69d8e45c\e\534ffea`) misses, and
|
||||
// the logo never loads. Mirror canary `xboxkrnl_io_info.cc:226`
|
||||
// (`X_FILE_RENAME_INFORMATION{ replace_existing@0, root_dir_handle@4,
|
||||
// ansi_string@8 }` → `file->Rename(TranslateAnsiPath(ansi_string))`).
|
||||
if info_class == 10 {
|
||||
// Read the target path from the embedded ANSI_STRING at info_ptr+8.
|
||||
let target_raw = match crate::path::read_ansi_string(mem, info_ptr + 8) {
|
||||
Some(s) if !s.is_empty() => s,
|
||||
_ => {
|
||||
const STATUS_OBJECT_NAME_INVALID: u64 = 0xC000_0033;
|
||||
ctx.gpr[3] = STATUS_OBJECT_NAME_INVALID;
|
||||
return;
|
||||
}
|
||||
};
|
||||
// Resolve the destination against the host cache backing dir. We only
|
||||
// support renames within the writable `cache:` mount (the only place
|
||||
// a guest can create files); disc/synth entries are read-only.
|
||||
let new_host = state.resolve_cache_path(&target_raw);
|
||||
// Current backing host path of the handle.
|
||||
let old_host = match state.objects.get(&handle) {
|
||||
Some(KernelObject::File { host_path: Some(hp), .. }) => Some(hp.clone()),
|
||||
Some(KernelObject::File { .. }) => None,
|
||||
_ => {
|
||||
ctx.gpr[3] = STATUS_INVALID_HANDLE;
|
||||
return;
|
||||
}
|
||||
};
|
||||
let status: u64 = match (old_host, new_host) {
|
||||
(Some(old), Some(new)) => {
|
||||
if let Some(parent) = new.parent() {
|
||||
let _ = std::fs::create_dir_all(parent);
|
||||
}
|
||||
match std::fs::rename(&old, &new) {
|
||||
Ok(()) => {
|
||||
// Update the handle so subsequent I/O targets the new
|
||||
// host path + guest path.
|
||||
if let Some(KernelObject::File { path, host_path, .. }) =
|
||||
state.objects.get_mut(&handle)
|
||||
{
|
||||
*path = crate::path::normalize_path(&target_raw);
|
||||
*host_path = Some(new.clone());
|
||||
}
|
||||
tracing::info!(
|
||||
"NtSetInformationFile rename cache {:?} -> {:?} ({:?})",
|
||||
old, new, target_raw
|
||||
);
|
||||
STATUS_SUCCESS
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"NtSetInformationFile rename {:?} -> {:?} failed: {}",
|
||||
old, new, e
|
||||
);
|
||||
STATUS_UNSUCCESSFUL
|
||||
}
|
||||
}
|
||||
}
|
||||
// Non-cache (read-only VFS) source/target: acknowledge without a
|
||||
// host move, matching the prior permissive behaviour.
|
||||
_ => STATUS_SUCCESS,
|
||||
};
|
||||
if iosb_ptr != 0 {
|
||||
write_io_status_block(mem, iosb_ptr, status as u32, info_length);
|
||||
}
|
||||
ctx.gpr[3] = status;
|
||||
return;
|
||||
}
|
||||
|
||||
// Handle lookup.
|
||||
let Some(KernelObject::File { size, position, host_path, .. }) = state.objects.get_mut(&handle) else {
|
||||
ctx.gpr[3] = STATUS_INVALID_HANDLE;
|
||||
@@ -1701,7 +1864,18 @@ fn nt_query_full_attributes_file(ctx: &mut PpcContext, mem: &GuestMemory, state:
|
||||
mem.write_u32(out + 28, filetime as u32);
|
||||
mem.write_u64(out + 32, entry.size);
|
||||
mem.write_u64(out + 40, entry.size);
|
||||
let attrs: u32 = if entry.is_directory { 0x10 } else { 0x80 };
|
||||
// Use the REAL GDFX attribute byte forwarded by the VFS
|
||||
// (canary `disc_image_device.cc:154`) instead of a
|
||||
// path-shape guess. Disc rips never carry a 0-attribute
|
||||
// entry, but guard anyway so a synthesised/legacy entry
|
||||
// still advertises a sane DIRECTORY/NORMAL split.
|
||||
let attrs: u32 = if entry.attributes != 0 {
|
||||
entry.attributes
|
||||
} else if entry.is_directory {
|
||||
0x10
|
||||
} else {
|
||||
0x80
|
||||
};
|
||||
mem.write_u32(out + 48, attrs);
|
||||
mem.write_u32(out + 52, 0);
|
||||
}
|
||||
@@ -1822,6 +1996,7 @@ fn nt_query_directory_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut
|
||||
is_directory: e.is_directory,
|
||||
size: e.size,
|
||||
offset: e.offset,
|
||||
attributes: e.attributes,
|
||||
})
|
||||
})
|
||||
.collect(),
|
||||
@@ -1872,7 +2047,12 @@ fn nt_query_directory_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut
|
||||
mem.write_u64(base + 0x20, 0);
|
||||
mem.write_u64(base + 0x28, entry.size);
|
||||
mem.write_u64(base + 0x30, entry.size);
|
||||
let attrs = if entry.is_directory {
|
||||
// Real GDFX attribute byte (canary `disc_image_device.cc:154`);
|
||||
// fall back to the directory/normal split only for legacy entries
|
||||
// that carry no attribute bits.
|
||||
let attrs = if entry.attributes != 0 {
|
||||
entry.attributes
|
||||
} else if entry.is_directory {
|
||||
FILE_ATTRIBUTE_DIRECTORY
|
||||
} else {
|
||||
FILE_ATTRIBUTE_NORMAL
|
||||
@@ -1940,14 +2120,29 @@ fn nt_close(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
|
||||
// so a later scheduler round doesn't try to signal a dead handle.
|
||||
// `disarm_timer` is a no-op for non-timer handles.
|
||||
state.disarm_timer(handle);
|
||||
// AUDIT-059 R34: return the slot to the recycle FIFO so a later
|
||||
// `alloc_handle` mints the same ID (matching canary's slab).
|
||||
state.release_handle_slot(handle);
|
||||
}
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
fn nt_create_event(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
// r3 = handle_ptr, r4 = obj_attrs, r5 = event_type, r6 = initial_state
|
||||
// r3 = handle_ptr, r4 = obj_attrs, r5 = event_type, r6 = initial_state.
|
||||
//
|
||||
// Xenon DISPATCHER_HEADER `Type` (NT convention):
|
||||
// 0 = NotificationEvent (manual-reset)
|
||||
// 1 = SynchronizationEvent (auto-reset)
|
||||
// Canary: `xboxkrnl_threading.cc:668` `ev->Initialize(!event_type, !!initial_state)`
|
||||
// with `XEvent::Initialize(bool manual_reset, ...)` (xevent.cc:25) and
|
||||
// `InitializeNative` (xevent.cc:41 `case 0x00: manual_reset_ = true`).
|
||||
// So `manual_reset = (event_type == 0)`. The Ke-path
|
||||
// (`ensure_dispatcher_object`) was already correct; the Nt-path here was
|
||||
// inverted, mis-classifying Sylpheed's per-frame VSync gate (type=1 auto +
|
||||
// initial=1) as manual-reset+signaled → it stayed signaled forever and
|
||||
// tid=1's main loop spun ~2800x canary's 60Hz.
|
||||
let handle_ptr = ctx.gpr[3] as u32;
|
||||
let manual_reset = ctx.gpr[5] != 0;
|
||||
let manual_reset = ctx.gpr[5] == 0;
|
||||
let signaled = ctx.gpr[6] != 0;
|
||||
let handle = state.alloc_handle_for(KernelObject::Event {
|
||||
manual_reset,
|
||||
@@ -1961,6 +2156,9 @@ fn nt_create_event(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelSt
|
||||
mem,
|
||||
"NtCreateEvent",
|
||||
);
|
||||
// ITERATE-2C Phase D — audit-049 auto-signal POC. Env-gated; no-op
|
||||
// when `XENIA_SILPH_UI_AUTOSIGNAL_DELAY` is unset.
|
||||
state.maybe_register_silph_autosignal(handle, ctx, mem);
|
||||
if handle_ptr != 0 {
|
||||
mem.write_u32(handle_ptr, handle);
|
||||
}
|
||||
@@ -2048,7 +2246,7 @@ fn nt_set_timer_ex(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelSt
|
||||
// timebase separately (immutable borrow) before any mutation of the
|
||||
// object to keep the borrow-checker happy.
|
||||
let hw_id = state.scheduler.current_hw_id().unwrap_or(0);
|
||||
let now = state.scheduler.ctx(hw_id).timebase;
|
||||
let now = state.now_basis_at(hw_id);
|
||||
|
||||
// Read signed i64 due_time (big-endian hi/lo — same pattern as
|
||||
// parse_timeout). Negative = relative-from-now, positive = absolute
|
||||
@@ -2758,10 +2956,12 @@ fn vd_initialize_ring_buffer(ctx: &mut PpcContext, _mem: &GuestMemory, state: &m
|
||||
// packets directly into ring memory at the current WPTR (the GPU
|
||||
// backend lives on a worker thread under `--gpu-thread` so we can't
|
||||
// read its `ring.base` from the kernel side without a channel hop).
|
||||
// Per canary: size_log2 is log2(size in BYTES), so size in dwords =
|
||||
// 2^size_log2 / 4 = 1 << (size_log2 - 2).
|
||||
// Per canary `CommandProcessor::InitializeRingBuffer`: the ring is
|
||||
// `1 << (size_log2 + 3)` bytes = `1 << (size_log2 + 1)` dwords (`r4` is
|
||||
// log2 of the size in quadwords). Kept in sync with
|
||||
// `GpuSystem::initialize_ring_buffer`. (Currently bookkeeping-only.)
|
||||
state.ring_base = ptr;
|
||||
state.ring_size_dwords = if size_log2 >= 2 { 1u32 << (size_log2 - 2) } else { 0 };
|
||||
state.ring_size_dwords = 1u32 << (size_log2 + 1);
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
@@ -2872,52 +3072,86 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
// xboxkrnl_video.cc:479. Currently skipped (see below).
|
||||
let _ = fetch_dwords; // silence unused — will be live again under the deferred path
|
||||
|
||||
// The original M2b path zero-filled buffer_ptr (in the system command
|
||||
// buffer) and bumped WPTR by 64 to expose the game's own ring writes.
|
||||
// Keep that untouched — the game still expects buffer_ptr to be a
|
||||
// skippable scratch area, and the bump still exposes any game-batched
|
||||
// PM4 packets for the drain.
|
||||
// iterate-2V: mirror xenia-canary `VdSwap_entry` (xboxkrnl_video.cc:518-548)
|
||||
// FAITHFULLY. The game reserves 64 dwords (256 bytes) in the primary ring
|
||||
// at `buffer_ptr`; canary writes a `PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0)`
|
||||
// fetch-constant patch followed by `PM4_TYPE3(PM4_XE_SWAP)`, then pads with
|
||||
// NOPs — and **NEVER touches `CP_RB_WPTR`**. The game advances the primary
|
||||
// ring write-pointer itself via its own doorbell once it has finished
|
||||
// populating the reserved slot, so VdSwap only fills the bytes.
|
||||
//
|
||||
// iterate-2V FIX (the bug this removes): a prior revision bumped the
|
||||
// primary ring `CP_RB_WPTR` out-of-band here (`extend_write_ptr_by(64)`).
|
||||
// But `buffer_ptr` (~0x4add6efc) is NOT inside the primary ring (base
|
||||
// ~0x4adcd000, 8192 dwords) — it lives ~10k dwords past the ring base, in the
|
||||
// renderer indirect-buffer region. The bogus WPTR bump pushed the GPU
|
||||
// read-pointer PAST the guest's real write-pointer, the drain treated the
|
||||
// overshoot as a circular wrap, and **re-executed the splash's draw
|
||||
// indirect-buffers ~2×** — inflating draws to 78 (real splash ≈ 28; 12
|
||||
// INDIRECT_BUFFERs vs the real 6). Canary's `VdSwap_entry` writes the
|
||||
// block and returns; the swap-complete CP interrupt comes only from the
|
||||
// game's own in-stream `PM4_INTERRUPT` packets, never from VdSwap.
|
||||
if buffer_ptr != 0 {
|
||||
for i in 0..64u32 {
|
||||
mem.write_u32(buffer_ptr + i * 4, xenia_gpu::pm4::make_packet_type2());
|
||||
let mut off = 0u32;
|
||||
let mut put = |i: &mut u32, v: u32| {
|
||||
mem.write_u32(buffer_ptr + *i * 4, v);
|
||||
*i += 1;
|
||||
};
|
||||
// PM4_TYPE0 fetch-constant slot-0 patch (6 dwords payload). The
|
||||
// base_address field is patched to the physical frontbuffer so the
|
||||
// bloom/blur "sample frame N for frame N+1" path reads the right page.
|
||||
let mut patched = fetch_dwords;
|
||||
patched[1] = (patched[1] & 0x0000_0FFF) | ((frontbuffer_addr >> 12) << 12);
|
||||
put(
|
||||
&mut off,
|
||||
xenia_gpu::pm4::make_packet_type0(
|
||||
xenia_gpu::gpu_system::CONST_BASE_FETCH as u16,
|
||||
6,
|
||||
),
|
||||
);
|
||||
for d in patched {
|
||||
put(&mut off, d);
|
||||
}
|
||||
// PM4_TYPE3(PM4_XE_SWAP, 4 dwords): signature, frontbuffer_phys, w, h.
|
||||
put(
|
||||
&mut off,
|
||||
xenia_gpu::pm4::make_packet_type3(xenia_gpu::pm4::PM4_XE_SWAP, 4),
|
||||
);
|
||||
put(&mut off, xenia_gpu::pm4::SWAP_SIGNATURE);
|
||||
put(&mut off, frontbuffer_addr);
|
||||
put(&mut off, width);
|
||||
put(&mut off, height);
|
||||
// Pad the remainder with NOP (Type-2) packets.
|
||||
while off < 64 {
|
||||
put(&mut off, xenia_gpu::pm4::make_packet_type2());
|
||||
}
|
||||
}
|
||||
state.gpu.extend_write_ptr_by(64);
|
||||
// NOTE: We deliberately do NOT bump `CP_RB_WPTR` here (see the iterate-2V
|
||||
// comment above). The drain below consumes only the packets the game has
|
||||
// legitimately advanced the write-pointer over.
|
||||
|
||||
// GPUBUG-DRAIN-001: notify the swap directly.
|
||||
//
|
||||
// Per xenia-canary `VdSwap_entry` (xboxkrnl_video.cc:438-521), the
|
||||
// textbook approach is to inject `PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0)`
|
||||
// (fetch-constant slot-0 patch for the Sylpheed bloom/blur "frame N+1"
|
||||
// sample) followed by `PM4_TYPE3(PM4_XE_SWAP)` directly into the
|
||||
// primary ring at WPTR, then let the natural drain consume them.
|
||||
//
|
||||
// That works in **pure lockstep** (drain runs at every kernel callback
|
||||
// boundary, ring has at most a few hundred packets pending). It
|
||||
// **does not** work under `--parallel` (CPU + GPU ring contention) —
|
||||
// observed empirically: vd_swap's `drain_to_current_wptr` consumes
|
||||
// 8-10 million game-batched IB packets in the 900 ms inline-deadline
|
||||
// window without reaching our tail-injected PM4_XE_SWAP. Under
|
||||
// threaded backend the worker has the same deadline. Either:
|
||||
// (a) the safety-net direct notify (below) fires and gets the swap
|
||||
// counted — but if the worker *eventually* drains past our
|
||||
// injected packet later it would double-count,
|
||||
// (b) we extend the deadline so far that vd_swap blocks for many
|
||||
// seconds — unreasonable for a kernel callback.
|
||||
//
|
||||
// Skip the ring injection unconditionally and post `notify_xe_swap`
|
||||
// directly. The drain still runs (game packets execute as normal).
|
||||
// **Trade-off**: the slot-0 fetch-constant patch is deferred —
|
||||
// tracked as GPUBUG-FETCH-PATCH-001. Sylpheed currently has draws=0,
|
||||
// so a stale slot 0 has no observable effect.
|
||||
// Drain the ring up to whatever the game has actually submitted; any
|
||||
// in-stream `PM4_INTERRUPT` / draw packets execute in order. The
|
||||
// reserved-slot PM4_XE_SWAP is consumed by the GPU only once the game
|
||||
// advances its own doorbell over it. The swap-counter safety net below
|
||||
// keeps host swap bookkeeping live in the meantime.
|
||||
let drained = state.gpu.drain_to_current_wptr(mem);
|
||||
tracing::debug!(drained, "VdSwap: drained PM4 packets");
|
||||
|
||||
// Direct swap notification. Inline mode bumps `swaps_seen`
|
||||
// synchronously; threaded mode posts a `GpuCommand::NotifyXeSwap`
|
||||
// and the worker bumps it asynchronously.
|
||||
// Safety net: if the drain did NOT reach our PM4_XE_SWAP this call (e.g.
|
||||
// an undersized inline deadline left game-batched packets pending), still
|
||||
// bump the host swap counter so the UI present + swap stats stay live.
|
||||
// Skip when the in-stream PM4_XE_SWAP already recorded this frontbuffer
|
||||
// (avoids double-counting). This path does NOT raise a CP interrupt.
|
||||
if frontbuffer_addr != 0 && width > 0 && height > 0 {
|
||||
state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
|
||||
let already_swapped = state
|
||||
.gpu
|
||||
.as_inline_mut()
|
||||
.map(|g| g.last_swap.map(|s| s.frontbuffer_phys) == Some(frontbuffer_addr))
|
||||
.unwrap_or(false);
|
||||
if !already_swapped {
|
||||
state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
|
||||
}
|
||||
}
|
||||
|
||||
// The remaining vd_swap work (UI publish: shader blobs, constants,
|
||||
@@ -2955,27 +3189,34 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
);
|
||||
ui.publish_assets(blobs, constants);
|
||||
|
||||
// P5: try to decode the primary texture (fetch constant slot 0).
|
||||
// Slot 0 is the convention most games use for their main bound
|
||||
// texture at draw time; full N-slot binding waits for P6+. If the
|
||||
// slot is unset or the format isn't supported (magenta stub kicks
|
||||
// in host-side), we skip.
|
||||
//
|
||||
// Texture fetch constants live at `CONST_BASE_FETCH + slot*6` in
|
||||
// the register file; we read the 6 dwords, decode the key, hit
|
||||
// the CPU cache (with page-version freshness), and clone the
|
||||
// decoded bytes across the bridge.
|
||||
const TEX_SLOT: u32 = 0;
|
||||
let mut fetch6 = [0u32; 6];
|
||||
for (i, slot) in fetch6.iter_mut().enumerate() {
|
||||
*slot = gpu_inline
|
||||
.register_file
|
||||
.read(xenia_gpu::gpu_system::CONST_BASE_FETCH + TEX_SLOT * 6 + i as u32);
|
||||
}
|
||||
let published = if let Some(key) = xenia_gpu::texture_cache::decode_fetch_constant(fetch6)
|
||||
{
|
||||
// Span over the entire tiled texture footprint to pick the
|
||||
// max page version covering it.
|
||||
// P5b: publish the texture the last draw's *active pixel shader*
|
||||
// actually sampled. The GPU draw handler decodes the PS's real
|
||||
// `tfetch` fetch-constant slots into `last_draw_textures`; we publish
|
||||
// the first (the UI binds a single texture today). When the last draw
|
||||
// used a flat (no-tfetch) shader the list is empty, so we fall back to
|
||||
// the legacy slot-0 probe to preserve behavior on flat-only frames.
|
||||
// The legacy single-texture `publish_texture` bridge wants
|
||||
// `(TextureKey, bytes)`; `last_draw_textures` now also carries the
|
||||
// content version (for the per-draw host-cache re-upload). Drop it here.
|
||||
let published = gpu_inline
|
||||
.last_draw_textures
|
||||
.first()
|
||||
.map(|(k, _v, b)| (*k, b.clone()))
|
||||
.or_else(|| {
|
||||
// Fallback: probe fetch constant slot 0 directly. Texture fetch
|
||||
// constants live at `CONST_BASE_FETCH + slot*6` in the register
|
||||
// file; read 6 dwords, decode the key, hit the CPU cache with
|
||||
// page-version freshness, clone the bytes across the bridge.
|
||||
const TEX_SLOT: u32 = 0;
|
||||
let mut fetch6 = [0u32; 6];
|
||||
for (i, slot) in fetch6.iter_mut().enumerate() {
|
||||
*slot = gpu_inline
|
||||
.register_file
|
||||
.read(xenia_gpu::gpu_system::CONST_BASE_FETCH + TEX_SLOT * 6 + i as u32);
|
||||
}
|
||||
let key = xenia_gpu::texture_cache::decode_fetch_constant(fetch6)?;
|
||||
// Span over the entire tiled texture footprint to pick the max
|
||||
// page version covering it.
|
||||
let bi = key.format.block_info();
|
||||
let span_bytes = (key.pitch_texels as u32)
|
||||
* (key.height as u32)
|
||||
@@ -2993,12 +3234,20 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
});
|
||||
metrics::gauge!("gpu.texture_cache.entries")
|
||||
.set(gpu_inline.texture_cache.len() as f64);
|
||||
ui.publish_texture(published);
|
||||
|
||||
// iterate-3O: publish this frame's captured per-draw geometry and
|
||||
// reset the accumulator for the next frame. The UI replays these as
|
||||
// real guest draws (real vertices + prim type) instead of synthetic
|
||||
// placeholder shapes. `frame_captures` is `Some` only under `--ui`.
|
||||
if let Some(caps) = gpu_inline.frame_captures.as_mut() {
|
||||
let drained = std::mem::take(caps);
|
||||
metrics::counter!("gpu.geometry.published").increment(drained.len() as u64);
|
||||
ui.publish_geometry(drained);
|
||||
}
|
||||
}
|
||||
// Notify the UI.
|
||||
if let Some(ui) = state.ui.clone() {
|
||||
@@ -3044,13 +3293,18 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
// safer to cap the read at the known total size to avoid OOB.
|
||||
let mut tiled = Vec::with_capacity(total_tiled_bytes);
|
||||
let mut ok = true;
|
||||
// The frontbuffer is a guest *physical* address; project onto the
|
||||
// committed backing window (see `xenia_gpu::physical_to_backing`)
|
||||
// so the present reads the pixels the GPU resolved, not a stale /
|
||||
// zero mirror page.
|
||||
let fb_backing = xenia_gpu::physical_to_backing(swap.frontbuffer_phys);
|
||||
for i in 0..total_tiled_bytes {
|
||||
// read_u8 is cheap — the VirtualMemory handler returns 0
|
||||
// for unmapped pages so we get a recognisable dark frame
|
||||
// rather than a crash if the address turned out bogus.
|
||||
let addr = swap.frontbuffer_phys.wrapping_add(i as u32);
|
||||
let addr = fb_backing.wrapping_add(i as u32);
|
||||
tiled.push(mem.read_u8(addr));
|
||||
if addr < swap.frontbuffer_phys {
|
||||
if addr < fb_backing {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
@@ -3144,6 +3398,7 @@ fn xaudio_register_render_driver(ctx: &mut PpcContext, mem: &GuestMemory, state:
|
||||
callback_pc,
|
||||
callback_arg,
|
||||
wrapped_callback_arg: wrapped,
|
||||
submitted_frames: 0,
|
||||
};
|
||||
let Some(index) = state.xaudio.register(client) else {
|
||||
tracing::warn!("XAudioRegisterRenderDriverClient: client table full");
|
||||
@@ -3252,18 +3507,75 @@ fn xaudio_unregister_render_driver(ctx: &mut PpcContext, _mem: &GuestMemory, sta
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
/// Mirrors canary `XAudioSubmitRenderDriverFrame_entry` →
|
||||
/// `AudioSystem::SubmitFrame(driver_ptr & 0xFFFF, samples)`:
|
||||
/// the guest render-driver mixer (`sub_824DC350`) calls this once per audio
|
||||
/// frame with `r3 = driver_id` (`0x4155_xxxx`) and `r4 = sample buffer`.
|
||||
/// Canary forwards `samples` to the client's `AudioDriver`; the driver's
|
||||
/// playback-completion callback later releases the client semaphore, which is
|
||||
/// the buffer-consumed pacing our XAudio callback ticker
|
||||
/// (`tick_instr` + `try_inject_audio_callback`) already drives. SubmitFrame
|
||||
/// returns void and the caller discards r3 / reads no field SubmitFrame
|
||||
/// writes, so to stay faithful we only validate the client index and count the frame
|
||||
/// (observational; never read back by the guest). Always returns
|
||||
/// `X_ERROR_SUCCESS`, matching canary. Deterministic: only this guest-driven
|
||||
/// export mutates state; no wall-clock, no host thread.
|
||||
fn xaudio_submit_render_driver_frame(
|
||||
ctx: &mut PpcContext,
|
||||
_mem: &GuestMemory,
|
||||
_state: &mut KernelState,
|
||||
state: &mut KernelState,
|
||||
) {
|
||||
let driver_id = ctx.gpr[3] as u32;
|
||||
let index = (driver_id & XAUDIO_DRIVER_INDEX_MASK) as usize;
|
||||
let registered = state.xaudio.record_submit(index);
|
||||
if !registered {
|
||||
// Canary logs and submits silence to keep the callback chain alive
|
||||
// for an unregistered/invalid index; our ticker keeps the chain
|
||||
// alive independently, so a debug log suffices.
|
||||
tracing::debug!(
|
||||
driver_id = format_args!("{driver_id:#010x}"),
|
||||
index,
|
||||
"XAudioSubmitRenderDriverFrame: unregistered client index"
|
||||
);
|
||||
} else if state.xaudio.submitted_frames(index) == 1 {
|
||||
tracing::info!(
|
||||
driver_id = format_args!("{driver_id:#010x}"),
|
||||
index,
|
||||
"XAudioSubmitRenderDriverFrame: first frame submitted by guest mixer"
|
||||
);
|
||||
}
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
fn xma_create_context(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
|
||||
let handle = state.alloc_handle();
|
||||
tracing::info!("XMACreateContext: handle={:#x}", handle);
|
||||
ctx.gpr[3] = handle as u64;
|
||||
/// Mirrors xenia-canary `XMACreateContext_entry(lpdword_t context_out_ptr)`:
|
||||
/// allocate a context from the register-mapped array, write its guest pointer
|
||||
/// to `*context_out_ptr`, and return `X_STATUS_SUCCESS` (or `X_STATUS_NO_MEMORY`
|
||||
/// when the 320-slot array is exhausted).
|
||||
fn xma_create_context(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
let out_ptr = ctx.gpr[3] as u32;
|
||||
let context_ptr = state.xma.lock().unwrap().allocate_context();
|
||||
if out_ptr != 0 {
|
||||
mem.write_u32(out_ptr, context_ptr);
|
||||
}
|
||||
tracing::info!(
|
||||
out_ptr = format_args!("{out_ptr:#010x}"),
|
||||
context_ptr = format_args!("{context_ptr:#010x}"),
|
||||
"XMACreateContext"
|
||||
);
|
||||
ctx.gpr[3] = if context_ptr == 0 {
|
||||
0xC000_0017 // X_STATUS_NO_MEMORY
|
||||
} else {
|
||||
0 // X_STATUS_SUCCESS
|
||||
};
|
||||
}
|
||||
|
||||
/// Mirrors xenia-canary `XMAReleaseContext_entry(lpvoid_t context_ptr)`:
|
||||
/// free the context slot and return 0.
|
||||
fn xma_release_context(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
|
||||
let context_ptr = ctx.gpr[3] as u32;
|
||||
state.xma.lock().unwrap().release_context(context_ptr);
|
||||
tracing::info!(context_ptr = format_args!("{context_ptr:#010x}"), "XMAReleaseContext");
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
// ===== Xex =====
|
||||
@@ -3472,7 +3784,7 @@ pub(crate) fn parse_timeout(state: &KernelState, timeout_ptr: u32, mem: &GuestMe
|
||||
return Some(Some(0)); // poll
|
||||
}
|
||||
let hw_id = state.scheduler.current_hw_id().unwrap_or(0);
|
||||
let now = state.scheduler.ctx(hw_id).timebase;
|
||||
let now = state.now_basis_at(hw_id);
|
||||
// Negative = relative, positive = absolute wall-clock. Our timebase is a
|
||||
// plain instruction counter, so we treat all timeouts as "time-units
|
||||
// after now" regardless of sign, using the magnitude.
|
||||
@@ -4159,7 +4471,8 @@ fn nt_yield_execution(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut Ker
|
||||
}
|
||||
|
||||
fn ke_resume_thread(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
|
||||
let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
|
||||
let raw = ctx.gpr[3] as u32;
|
||||
let handle = resolve_pseudo_handle(state, raw);
|
||||
match state.scheduler.find_by_handle(handle) {
|
||||
Some(r) => {
|
||||
state.scheduler.resume_ref(r);
|
||||
@@ -4175,13 +4488,18 @@ fn nt_resume_thread(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelS
|
||||
// r3 = handle, r4 = prev_suspend_count_ptr
|
||||
let handle = ctx.gpr[3] as u32;
|
||||
let prev_ptr = ctx.gpr[4] as u32;
|
||||
let prev = state
|
||||
.scheduler
|
||||
.find_by_handle(handle)
|
||||
.map(|r| state.scheduler.resume_ref(r))
|
||||
.unwrap_or(0);
|
||||
if prev_ptr != 0 {
|
||||
mem.write_u32(prev_ptr, prev);
|
||||
match state.scheduler.find_by_handle(handle) {
|
||||
Some(r) => {
|
||||
let prev = state.scheduler.resume_ref(r);
|
||||
if prev_ptr != 0 {
|
||||
mem.write_u32(prev_ptr, prev);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
if prev_ptr != 0 {
|
||||
mem.write_u32(prev_ptr, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
ctx.gpr[3] = STATUS_SUCCESS;
|
||||
}
|
||||
@@ -4780,12 +5098,14 @@ mod tests {
|
||||
is_directory: false,
|
||||
size: 0x1000,
|
||||
offset: 0,
|
||||
attributes: 0x81, // NORMAL | READONLY
|
||||
},
|
||||
xenia_vfs::VfsEntry {
|
||||
name: "dat".into(),
|
||||
is_directory: true,
|
||||
size: 0,
|
||||
offset: 0,
|
||||
attributes: 0x11, // DIRECTORY | READONLY
|
||||
},
|
||||
// A grandchild — must NOT appear in root enumeration.
|
||||
xenia_vfs::VfsEntry {
|
||||
@@ -4793,6 +5113,7 @@ mod tests {
|
||||
is_directory: false,
|
||||
size: 0x2000,
|
||||
offset: 0,
|
||||
attributes: 0x81,
|
||||
},
|
||||
],
|
||||
}));
|
||||
@@ -4819,9 +5140,11 @@ mod tests {
|
||||
// NextEntryOffset.
|
||||
let mut cursor: u32 = 0;
|
||||
let mut names: Vec<String> = Vec::new();
|
||||
let mut attrs: Vec<u32> = Vec::new();
|
||||
loop {
|
||||
let entry_base = buf + cursor;
|
||||
let name_len = mem.read_u32(entry_base + 0x3C) as usize;
|
||||
attrs.push(mem.read_u32(entry_base + 0x38));
|
||||
let mut bytes = Vec::with_capacity(name_len);
|
||||
for i in 0..name_len as u32 {
|
||||
bytes.push(mem.read_u8(entry_base + 0x40 + i));
|
||||
@@ -4834,6 +5157,12 @@ mod tests {
|
||||
cursor += next;
|
||||
}
|
||||
assert_eq!(names, vec!["default.xex", "dat"]);
|
||||
// The real GDFX attribute byte must be forwarded verbatim: the file
|
||||
// reports NORMAL|READONLY (no DIRECTORY bit), the directory reports
|
||||
// DIRECTORY|READONLY.
|
||||
assert_eq!(attrs, vec![0x81, 0x11]);
|
||||
assert_eq!(attrs[0] & 0x10, 0, "file must not advertise DIRECTORY");
|
||||
assert_ne!(attrs[1] & 0x10, 0, "dir must advertise DIRECTORY");
|
||||
// A second call on the same handle must return NO_MORE_FILES —
|
||||
// the cursor has advanced past the end.
|
||||
ctx.gpr[3] = handle as u64;
|
||||
@@ -5406,6 +5735,67 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// `NtSetInformationFile` class 10 (`XFileRenameInformation`) must move
|
||||
/// the backing host file to the new `cache:` path and update the handle.
|
||||
/// Mirrors Sylpheed's asset-cache `.tmp` → `\<hash>\<dir>\<file>` move;
|
||||
/// without it the nested target stays empty and the decompressed asset
|
||||
/// (logo texture) never reads back. Faithful to canary `file->Rename`.
|
||||
#[test]
|
||||
fn nt_set_information_file_rename_moves_cache_file() {
|
||||
let (mut ctx, mut mem, mut state) = fresh();
|
||||
// Real temp cache root + a staging `.tmp` file with known bytes.
|
||||
let root = std::env::temp_dir().join(format!("xenia-rs-rename-test-{}", std::process::id()));
|
||||
let _ = std::fs::remove_dir_all(&root);
|
||||
std::fs::create_dir_all(&root).unwrap();
|
||||
let old_host = root.join("69d8e45ce534ffea.tmp");
|
||||
std::fs::write(&old_host, b"LOGOTEX!").unwrap();
|
||||
state.cache_root = Some(root.clone());
|
||||
// Open handle whose backing host_path is the staging file.
|
||||
let handle = state.alloc_handle_for(KernelObject::File {
|
||||
path: "69d8e45ce534ffea.tmp".to_string(),
|
||||
size: 8,
|
||||
position: 0,
|
||||
data: Arc::new(Vec::new()),
|
||||
dir_enum_pos: None,
|
||||
host_path: Some(old_host.clone()),
|
||||
});
|
||||
// X_FILE_RENAME_INFORMATION { replace@0, root_dir@4, ANSI_STRING@8 }.
|
||||
// ANSI_STRING { len u16, max u16, buf u32 } at info_ptr+8; buffer holds
|
||||
// the target path "cache:\69d8e45c\e\534ffea".
|
||||
let info_ptr = SCRATCH_BASE + 0x100;
|
||||
let str_buf = SCRATCH_BASE + 0x200;
|
||||
let target = b"cache:\\69d8e45c\\e\\534ffea";
|
||||
for (i, b) in target.iter().enumerate() {
|
||||
mem.write_u8(str_buf + i as u32, *b);
|
||||
}
|
||||
mem.write_u32(info_ptr, 0); // replace_existing
|
||||
mem.write_u32(info_ptr + 4, 0); // root_dir_handle
|
||||
mem.write_u16(info_ptr + 8, target.len() as u16); // ANSI_STRING.Length
|
||||
mem.write_u16(info_ptr + 10, target.len() as u16); // MaximumLength
|
||||
mem.write_u32(info_ptr + 12, str_buf); // Buffer
|
||||
let iosb_ptr = SCRATCH_BASE + 0x140;
|
||||
ctx.gpr[3] = handle as u64;
|
||||
ctx.gpr[4] = iosb_ptr as u64;
|
||||
ctx.gpr[5] = info_ptr as u64;
|
||||
ctx.gpr[6] = 16;
|
||||
ctx.gpr[7] = 10; // XFileRenameInformation
|
||||
nt_set_information_file(&mut ctx, &mut mem, &mut state);
|
||||
assert_eq!(ctx.gpr[3], STATUS_SUCCESS);
|
||||
// Staging file gone; nested target exists with the same bytes.
|
||||
let new_host = root.join("69d8e45c").join("e").join("534ffea");
|
||||
assert!(!old_host.exists(), "staging .tmp should be moved away");
|
||||
assert_eq!(std::fs::read(&new_host).unwrap(), b"LOGOTEX!");
|
||||
// Handle now points at the new host + guest path.
|
||||
match state.objects.get(&handle) {
|
||||
Some(KernelObject::File { host_path: Some(hp), path, .. }) => {
|
||||
assert_eq!(hp, &new_host);
|
||||
assert_eq!(path, "cache:/69d8e45c/e/534ffea");
|
||||
}
|
||||
_ => panic!("file handle lost or host_path missing"),
|
||||
}
|
||||
let _ = std::fs::remove_dir_all(&root);
|
||||
}
|
||||
|
||||
/// Read-only VFS — truncating to a different size must fail with
|
||||
/// `STATUS_UNSUCCESSFUL`, matching Canary's error path when
|
||||
/// `file->SetLength(...)` can't honour the request.
|
||||
@@ -6353,4 +6743,23 @@ mod tests {
|
||||
assert!(resolved.ends_with("etc/foo"));
|
||||
std::fs::remove_dir_all(&dir).ok();
|
||||
}
|
||||
|
||||
/// `MmGetPhysicalAddress` must be region-aware, matching canary's
|
||||
/// `PhysicalHeap::GetPhysicalAddress`: the 0xE0000000+ 4 KB mirror gets a
|
||||
/// `+0x1000` host-address-offset; every other region is a flat
|
||||
/// `& 0x1FFFFFFF` mask.
|
||||
#[test]
|
||||
fn mm_get_physical_address_region_aware() {
|
||||
// 0xE0000000 mirror: canary `address - heap_base` (== `addr & 0x1FFFFFFF`)
|
||||
// then `+ 0x1000`.
|
||||
assert_eq!(translate_physical_address(0xE000_0000), 0x0000_1000);
|
||||
assert_eq!(translate_physical_address(0xE000_5000), 0x0000_6000);
|
||||
assert_eq!(translate_physical_address(0xFFFF_F000), 0x1FFF_F000 + 0x1000);
|
||||
// 0xA0000000 / 0xC0000000 physical heaps: flat mask, no offset.
|
||||
assert_eq!(translate_physical_address(0xA000_0000), 0x0000_0000);
|
||||
assert_eq!(translate_physical_address(0xC012_3000), 0x0012_3000);
|
||||
// Already-physical (< 0x20000000) is unchanged; other virtual addresses get the flat mask.
|
||||
assert_eq!(translate_physical_address(0x0012_3000), 0x0012_3000);
|
||||
assert_eq!(translate_physical_address(0x4012_3000), 0x0012_3000);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,6 +30,12 @@ use xenia_cpu::ThreadRef;
|
||||
pub const INTERRUPT_SOURCE_VSYNC: u32 = 0;
|
||||
pub const INTERRUPT_SOURCE_CP: u32 = 1;
|
||||
|
||||
/// The processor the graphics ISR impersonates for a v-sync interrupt.
|
||||
/// Canary hard-codes this: `MarkVblank` → `DispatchInterruptCallback(0, 2)`
|
||||
/// (graphics_system.cc:478). CP interrupts instead use the bit index of the
|
||||
/// `PM4_INTERRUPT` `cpu_mask`.
|
||||
pub const VSYNC_TARGET_CPU: u8 = 2;
|
||||
|
||||
/// Guest-registered V-sync / graphics-interrupt callback (from
|
||||
/// `VdSetGraphicsInterruptCallback`).
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
@@ -145,9 +151,16 @@ pub type PendingLocalIrq = [std::sync::atomic::AtomicU8;
|
||||
pub struct InterruptState {
|
||||
/// Registered callback (set by `VdSetGraphicsInterruptCallback`).
|
||||
pub callback: Option<GraphicsInterruptCallback>,
|
||||
/// Bounded FIFO of pending interrupt sources awaiting injection.
|
||||
/// Push-back on queue, pop-front on inject. Over-cap pushes drop.
|
||||
pub pending: VecDeque<u32>,
|
||||
/// Bounded FIFO of pending interrupts awaiting injection, as
|
||||
/// `(source, target_cpu)`. Push-back on queue, pop-front on inject.
|
||||
/// Over-cap pushes drop. `target_cpu` is the processor the graphics
|
||||
/// ISR must impersonate (canary `XThread::SetActiveCpu` / the
|
||||
/// `DispatchInterruptCallback(source, cpu)` argument): the bit index
|
||||
/// of the CP `PM4_INTERRUPT` `cpu_mask` for source=1, and a fixed `2`
|
||||
/// for vsync (canary `DispatchInterruptCallback(0, 2)`). The ISR reads
|
||||
/// it from the PCR (`[r13+268]`) to clear the matching per-CPU bit of
|
||||
/// the swap-acknowledge fence.
|
||||
pub pending: VecDeque<(u32, u8)>,
|
||||
/// When `Some`, some HW thread is currently running a callback; on
|
||||
/// return-to-sentinel we restore this and clear the flag.
|
||||
pub saved: Option<SavedCallbackCtx>,
|
||||
@@ -170,6 +183,28 @@ pub struct InterruptState {
|
||||
/// ticker. `tick_vsync_instr` diffs against this to advance
|
||||
/// `vsync_accumulator`.
|
||||
pub last_instr_count: u64,
|
||||
/// **iterate-3AJ — present-anchored vsync.** Set `true` once the guest
|
||||
/// has presented at least one frame (a `VdSwap`). Before this, the
|
||||
/// vsync ticker uses the legacy fixed instruction-quantum cadence so
|
||||
/// the boot present-loop bootstrap (iterate-2W) still gets the vsyncs
|
||||
/// it needs *before* the first present. After this, vsync is anchored
|
||||
/// to the guest's real present rate (≈1 vblank per present, as on real
|
||||
/// hardware where the title double-buffers at vblank), with only a
|
||||
/// small capped instruction-quantum *fallback* for frames where the
|
||||
/// guest genuinely stops presenting (heavy asset load). This stops the
|
||||
/// proxy from firing ~66 vsyncs during one heavy load frame, which
|
||||
/// collapsed the splash-logo intro fade-in (the guest's vsync counter
|
||||
/// jumped 0→66 in one frame instead of ramping smoothly).
|
||||
pub vsync_present_anchored: bool,
|
||||
/// Last observed guest present (`VdSwap`) count. `tick_vsync_instr`
|
||||
/// diffs the live count against this each call to emit one vblank per
|
||||
/// new present once `vsync_present_anchored` is set.
|
||||
pub last_present_count: u64,
|
||||
/// How many *fallback* (non-present-driven) vsyncs have fired in the
|
||||
/// current dry (no-present) window. Reset to 0 whenever a present
|
||||
/// occurs. Capped at [`DRY_FALLBACK_CAP`] so one heavy non-presenting
|
||||
/// frame cannot fire a long burst of vsyncs (the fade-in regression).
|
||||
pub dry_fallback_fired: u32,
|
||||
/// Wall-clock anchor for the production v-sync ticker. `None` until
|
||||
/// the first `tick_vsync_wallclock` call (lazy init so unit tests
|
||||
/// that never invoke that function don't construct an Instant).
|
||||
@@ -195,6 +230,21 @@ pub struct InterruptState {
|
||||
/// determinism.
|
||||
pub const VSYNC_INSTR_PERIOD: u64 = 150_000;
|
||||
|
||||
/// **iterate-3AJ — present-anchored vsync fallback.**
|
||||
///
|
||||
/// Once the guest is in its present loop (`vsync_present_anchored`), each
|
||||
/// guest present emits exactly one vblank — vsync *is* the present cadence,
|
||||
/// as on real Xbox 360 hardware where the title double-buffers at vblank.
|
||||
/// For a frame where the guest stops presenting (e.g. the ~1.1 s splash
|
||||
/// asset-load), we still need *some* vsyncs to keep timers / the present
|
||||
/// loop alive, but firing one per [`VSYNC_INSTR_PERIOD`] would reproduce the
|
||||
/// ~66-vsync spike that collapsed the fade-in. So the fallback fires one
|
||||
/// vblank per `VSYNC_INSTR_PERIOD` of *non-presenting* instructions, but at
|
||||
/// most [`DRY_FALLBACK_CAP`] per dry window (the counter resets on each
|
||||
/// present). A heavy load frame therefore advances the guest vsync counter
|
||||
/// by ≤ `DRY_FALLBACK_CAP` (a small ramp like canary's 0/5/10/2/1…), not 66.
|
||||
pub const DRY_FALLBACK_CAP: u32 = 4;
|
||||
|
||||
/// Wall-clock period for the **production** v-sync ticker. 16.667 ms
|
||||
/// targets exactly 60 Hz. KRNBUG-D08 — converting from the
|
||||
/// instruction-count proxy fixes the `--parallel` rate drop while
|
||||
@@ -211,8 +261,9 @@ impl InterruptState {
|
||||
});
|
||||
}
|
||||
|
||||
/// Queue an interrupt for the next safe injection point.
|
||||
pub fn queue_interrupt(&mut self, source: u32) {
|
||||
/// Queue an interrupt for the next safe injection point. `cpu` is the
|
||||
/// processor the ISR must impersonate (see `pending`).
|
||||
pub fn queue_interrupt(&mut self, source: u32, cpu: u8) {
|
||||
if self.callback.is_none() {
|
||||
self.dropped += 1;
|
||||
return;
|
||||
@@ -221,37 +272,102 @@ impl InterruptState {
|
||||
self.dropped += 1;
|
||||
return;
|
||||
}
|
||||
self.pending.push_back(source);
|
||||
self.pending.push_back((source, cpu));
|
||||
}
|
||||
|
||||
/// Peek at the next pending source without removing it.
|
||||
pub fn peek_next(&self) -> Option<u32> {
|
||||
self.pending.front().copied()
|
||||
self.pending.front().map(|&(source, _)| source)
|
||||
}
|
||||
|
||||
/// Peek at the target CPU of the next pending interrupt.
|
||||
pub fn peek_next_cpu(&self) -> Option<u8> {
|
||||
self.pending.front().map(|&(_, cpu)| cpu)
|
||||
}
|
||||
|
||||
/// Pop the next pending source (called by the injector after it has
|
||||
/// committed to dispatching it).
|
||||
pub fn take_next(&mut self) -> Option<u32> {
|
||||
self.pending.pop_front()
|
||||
self.pending.pop_front().map(|(source, _)| source)
|
||||
}
|
||||
|
||||
/// **Legacy** — instruction-count v-sync ticker. Kept for unit tests
|
||||
/// that need a deterministic clock source. Production code calls
|
||||
/// `tick_vsync_wallclock` instead. Returns `true` if at least one
|
||||
/// v-sync was queued.
|
||||
pub fn tick_vsync_instr(&mut self, current_instr_count: u64) -> bool {
|
||||
/// **Present-anchored** instruction-paced v-sync ticker (the lockstep
|
||||
/// production path; also used by unit tests for a deterministic clock).
|
||||
///
|
||||
/// `current_instr_count` is the running retired-instruction count.
|
||||
/// `present_count` is the guest's running `VdSwap` count (monotonic).
|
||||
///
|
||||
/// Two regimes:
|
||||
///
|
||||
/// 1. **Bootstrap** (`!vsync_present_anchored`, i.e. before the guest's
|
||||
/// first present): legacy fixed-quantum cadence — one vsync per
|
||||
/// [`VSYNC_INSTR_PERIOD`] retired instructions. The boot present loop
|
||||
/// (iterate-2W) needs vsyncs delivered *before* it can present, so
|
||||
/// this regime is unchanged from the original ticker. The first
|
||||
/// observed present flips `vsync_present_anchored`.
|
||||
///
|
||||
/// 2. **Present-anchored** (after the first present): one vblank per
|
||||
/// guest present (vsync *is* the present cadence on real hardware),
|
||||
/// plus a small capped instruction-quantum fallback ([`DRY_FALLBACK_CAP`]
|
||||
/// per dry window) so a frame where the guest stops presenting (heavy
|
||||
/// asset load) still ticks a *few* vsyncs — not ~66, which collapsed
|
||||
/// the splash fade-in.
|
||||
///
|
||||
/// Returns `true` if at least one v-sync was queued.
|
||||
pub fn tick_vsync_instr(&mut self, current_instr_count: u64, present_count: u64) -> bool {
|
||||
let delta = current_instr_count.saturating_sub(self.last_instr_count);
|
||||
self.last_instr_count = current_instr_count;
|
||||
self.vsync_accumulator = self.vsync_accumulator.saturating_add(delta);
|
||||
if self.vsync_accumulator < VSYNC_INSTR_PERIOD {
|
||||
return false;
|
||||
|
||||
let new_presents = present_count.saturating_sub(self.last_present_count);
|
||||
self.last_present_count = present_count;
|
||||
if new_presents > 0 {
|
||||
self.vsync_present_anchored = true;
|
||||
}
|
||||
let periods = self.vsync_accumulator / VSYNC_INSTR_PERIOD;
|
||||
self.vsync_accumulator %= VSYNC_INSTR_PERIOD;
|
||||
for _ in 0..periods {
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
|
||||
// Regime 1 — bootstrap: legacy fixed instruction quantum. Preserves
|
||||
// the iterate-2W present-loop bootstrap exactly (vsyncs must fire
|
||||
// before the guest can present).
|
||||
if !self.vsync_present_anchored {
|
||||
if self.vsync_accumulator < VSYNC_INSTR_PERIOD {
|
||||
return false;
|
||||
}
|
||||
let periods = self.vsync_accumulator / VSYNC_INSTR_PERIOD;
|
||||
self.vsync_accumulator %= VSYNC_INSTR_PERIOD;
|
||||
for _ in 0..periods {
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
true
|
||||
|
||||
// Regime 2 — present-anchored.
|
||||
let mut queued = false;
|
||||
|
||||
if new_presents > 0 {
|
||||
// One vblank per guest present. `queue_interrupt` caps the FIFO,
|
||||
// so a burst of presents in one round can't flood. A fresh
|
||||
// present resets the dry-window state.
|
||||
for _ in 0..new_presents {
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
}
|
||||
self.vsync_accumulator = 0;
|
||||
self.dry_fallback_fired = 0;
|
||||
queued = true;
|
||||
} else if self.vsync_accumulator >= VSYNC_INSTR_PERIOD
|
||||
&& self.dry_fallback_fired < DRY_FALLBACK_CAP
|
||||
{
|
||||
// Dry frame (no present this tick): the guest stopped presenting
|
||||
// (heavy load). Tick a *capped* number of fallback vsyncs so
|
||||
// timers/the present loop stay alive without re-introducing the
|
||||
// ~66-vsync spike. Consume one period per fired vsync so the
|
||||
// accumulator paces the few fallbacks.
|
||||
self.vsync_accumulator -= VSYNC_INSTR_PERIOD;
|
||||
self.dry_fallback_fired += 1;
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
queued = true;
|
||||
}
|
||||
|
||||
queued
|
||||
}
|
||||
|
||||
/// **Production** — wall-clock v-sync ticker. Fires
|
||||
@@ -288,7 +404,7 @@ impl InterruptState {
|
||||
self.last_vsync_instant = Some(anchor + advance);
|
||||
let to_queue = (periods as usize).min(INTERRUPT_QUEUE_CAP);
|
||||
for _ in 0..to_queue {
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
}
|
||||
true
|
||||
}
|
||||
@@ -306,7 +422,7 @@ mod tests {
|
||||
#[test]
fn queue_interrupt_drops_without_callback() {
    // No callback registered: the interrupt must be counted as dropped and
    // never reach the pending FIFO.
    let mut state = InterruptState::default();
    state.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
    assert!(state.pending.is_empty());
    assert_eq!(state.dropped, 1);
}
|
||||
@@ -315,9 +431,9 @@ mod tests {
|
||||
fn queue_interrupt_fifo_preserves_order() {
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_CP);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_CP, 2);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
assert_eq!(s.dropped, 0);
|
||||
// FIFO: take_next hands them out in push order.
|
||||
assert_eq!(s.take_next(), Some(INTERRUPT_SOURCE_VSYNC));
|
||||
@@ -331,11 +447,11 @@ mod tests {
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
for _ in 0..INTERRUPT_QUEUE_CAP {
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
}
|
||||
// Over-cap: drops rather than evicting the oldest.
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
assert_eq!(s.dropped, 2);
|
||||
assert_eq!(s.pending.len(), INTERRUPT_QUEUE_CAP);
|
||||
}
|
||||
@@ -345,9 +461,10 @@ mod tests {
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
assert_eq!(VSYNC_INSTR_PERIOD, 150_000);
|
||||
assert!(!s.tick_vsync_instr(VSYNC_INSTR_PERIOD - 1));
|
||||
// present_count = 0 → bootstrap regime (legacy fixed quantum).
|
||||
assert!(!s.tick_vsync_instr(VSYNC_INSTR_PERIOD - 1, 0));
|
||||
assert!(s.pending.is_empty());
|
||||
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD));
|
||||
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD, 0));
|
||||
assert_eq!(s.peek_next(), Some(INTERRUPT_SOURCE_VSYNC));
|
||||
}
|
||||
|
||||
@@ -357,10 +474,59 @@ mod tests {
|
||||
// be delivered, not lost.
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD * 3 + 10));
|
||||
// present_count = 0 → bootstrap regime drains all 3 periods at once.
|
||||
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD * 3 + 10, 0));
|
||||
assert_eq!(s.pending.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
fn tick_vsync_instr_present_anchors_after_first_present() {
    // iterate-3AJ: after the guest's first present, vsync follows presents
    // (one vblank each) instead of the fixed instruction quantum.
    let mut state = InterruptState::default();
    state.set_callback(0x1000, 0xAB);

    // Bootstrap regime (present_count == 0): one full period → one vsync.
    let fired_in_bootstrap = state.tick_vsync_instr(VSYNC_INSTR_PERIOD, 0);
    assert!(fired_in_bootstrap);
    assert_eq!(state.pending.len(), 1);
    let _ = state.take_next();

    // The first observed present anchors the ticker and yields exactly one
    // vblank for that present.
    let fired_on_present = state.tick_vsync_instr(VSYNC_INSTR_PERIOD * 2, 1);
    assert!(fired_on_present);
    assert!(state.vsync_present_anchored);
    assert_eq!(state.pending.len(), 1);
    let _ = state.take_next();
}
|
||||
|
||||
#[test]
fn tick_vsync_instr_heavy_dry_frame_capped_not_spiking() {
    // iterate-3AJ regression guard. A heavy frame that never presents
    // retires ~10M instructions; the old ticker turned that into ~66 vsyncs
    // (10M / 150k) at once, jumping the guest vsync counter 0→66 and
    // skipping the fade-in. The present-anchored ticker must stop at
    // DRY_FALLBACK_CAP for the whole dry window.
    let mut state = InterruptState::default();
    state.set_callback(0x1000, 0xAB);

    // One present puts the ticker into anchored mode; discard its vblank.
    let mut retired: u64 = VSYNC_INSTR_PERIOD;
    assert!(state.tick_vsync_instr(retired, 1));
    while state.take_next().is_some() {}

    // 100 chunks × 100k instructions = 10M, present count held at 1 (as
    // coord_pre_round would tick it). Tally every fallback vsync queued.
    let mut fallback_vsyncs = 0usize;
    for _ in 0..100 {
        retired += 100_000;
        if !state.tick_vsync_instr(retired, 1) {
            continue;
        }
        fallback_vsyncs += std::iter::from_fn(|| state.take_next()).count();
    }

    assert_eq!(
        fallback_vsyncs, DRY_FALLBACK_CAP as usize,
        "a heavy dry frame must cap fallback vsyncs at DRY_FALLBACK_CAP, \
         not fire ~66"
    );
}
|
||||
|
||||
#[test]
|
||||
fn tick_vsync_wallclock_first_call_sets_anchor() {
|
||||
// First call seeds the anchor and never fires. KRNBUG-D08:
|
||||
|
||||
@@ -3,6 +3,7 @@ pub mod exports;
|
||||
pub mod interrupts;
|
||||
pub mod objects;
|
||||
pub mod path;
|
||||
pub mod silph_synth;
|
||||
pub mod state;
|
||||
pub mod thread;
|
||||
pub mod ui_bridge;
|
||||
|
||||
@@ -13,7 +13,7 @@ use xenia_memory::{GuestMemory, MemoryAccess};
|
||||
/// u16 Length
|
||||
/// u16 MaximumLength
|
||||
/// u32 Buffer (guest pointer)
|
||||
fn read_ansi_string(mem: &GuestMemory, ptr: u32) -> Option<String> {
|
||||
pub fn read_ansi_string(mem: &GuestMemory, ptr: u32) -> Option<String> {
|
||||
if ptr == 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
280
crates/xenia-kernel/src/silph_synth.rs
Normal file
280
crates/xenia-kernel/src/silph_synth.rs
Normal file
@@ -0,0 +1,280 @@
|
||||
//! AUDIT-2.BF — synthetic spawn of the silph::WorkerCtx worker quartet.
|
||||
//!
|
||||
//! AUDIT-058/059 traced a 6-level static-caller ladder
|
||||
//! (`sub_824F7800 ← sub_824F7CD0 ← sub_824F8398 ← sub_821B55D8 ← sub_821B6DF4`,
|
||||
//! topped by virtual-dispatch from `sub_82172BA0+0x1E8`) that activates
|
||||
//! `sub_825070F0` in canary at ~1× / 30 s, kicking off four worker threads
|
||||
//! initialised against a single ~0x440-byte ctx. In ours none of those PCs
|
||||
//! fire (audit-059 round 9 confirmed sub_821B6DF4 = 0×, real chain entry =
|
||||
//! virtual-dispatch from sub_82172BA0+0x1E8 hits wrong-vtable slot).
|
||||
//!
|
||||
//! Rather than chase the wrong-vtable break, this module reproduces the end
|
||||
//! state directly: at the first observation of a load-bearing VFS path
|
||||
//! (`dat/movie`), we synthesise the ctx structure in guest memory per audit-
|
||||
//! 059 round 5's live hexdump and spawn the four worker entry points the
|
||||
//! same way AUDIT-048's audio host-pump spawns its dedicated client worker.
|
||||
//!
|
||||
//! The ctx is opaque to the workers — only fields they dereference matter.
|
||||
//! Per round 5 dump (`audit-runs/audit-059-handle-disambiguation/round5-ctx-
|
||||
//! dump/canary.log`):
|
||||
//!
|
||||
//! +0x00 vtable = 0x8200A1E8 (XEX .rdata, valid in both engines)
|
||||
//! +0x04 self = ctx
|
||||
//! +0x08 intrusive head= ctx
|
||||
//! +0x0C init flag = 1
|
||||
//! +0x10 packed byte = 0x01000000
|
||||
//! +0x18 float ~1.0 = 0x3F7FCCCC
|
||||
//! +0x1C float ~1.0 = 0x3F802D83
|
||||
//! +0x24 flag = 1
|
||||
//! +0x28..+0x30 = three foreign pointers, NULL initially
|
||||
//! +0x54..+0x84 = 4× X_KEVENT auto-reset, state=0
|
||||
//! +0x94..+0xC4 = 4× X_KEVENT manual-reset, state=1
|
||||
//! +0x210..+0x250 = 4-entry intrusive work-ring, empty
|
||||
//!
|
||||
//! Worker entries (each takes r3 = ctx_ptr):
|
||||
//! 0x82506528, 0x82506558, 0x82506588, 0x825065B8
|
||||
|
||||
use xenia_cpu::scheduler::{BlockReason, SpawnParams};
|
||||
use xenia_cpu::ThreadRef;
|
||||
use xenia_memory::{GuestMemory, MemoryAccess};
|
||||
|
||||
use crate::objects::KernelObject;
|
||||
use crate::state::{GuestMemoryPcr, KernelState};
|
||||
use crate::thread::allocate_thread_image;
|
||||
|
||||
/// XEX `.rdata` vtable for the silph::WorkerCtx singleton (audit-059 round 5).
|
||||
const SILPH_CTX_VTABLE: u32 = 0x8200_A1E8;
|
||||
|
||||
/// 4-element fixed entry table — guest text PCs for the four worker bodies.
|
||||
const SILPH_WORKER_ENTRIES: [u32; 4] = [
|
||||
0x8250_6528,
|
||||
0x8250_6558,
|
||||
0x8250_6588,
|
||||
0x8250_65B8,
|
||||
];
|
||||
|
||||
/// Round 0x440 up to a page-ish so the ctx alloc never straddles a page
|
||||
/// boundary in heap_alloc's bookkeeping. Round 20 grew the alloc from 0x500
|
||||
/// to 0x800 to make room for a synthesised sub-object at +0x300 and its
|
||||
/// 32-slot vtable at +0x500 (= ctx + 0x500..0x580). Round 21 retains the
|
||||
/// embedded sub-object but drops the synthesized vtable (we now point at
|
||||
/// canary's real XEX-resident sub-vtable directly), so the 0x500..0x580
|
||||
/// region is unused but harmless.
|
||||
const SILPH_CTX_SIZE: u32 = 0x800;
|
||||
|
||||
/// Offset within the ctx allocation of the synthetic sub-object referenced
|
||||
/// at `[ctx+0x2C]`. Canary's sub-object sits ~0x300 bytes above the ctx and
|
||||
/// varies per-instance; we keep it embedded in the same alloc so a single
|
||||
/// `heap_alloc` covers everything.
|
||||
const SILPH_SUBOBJ_OFFSET: u32 = 0x300;
|
||||
|
||||
/// XEX `.rdata` VA of canary's real sub-object vtable (audit-059 round 21).
|
||||
/// Discovered by:
|
||||
/// 1. Probing canary at `pc=0x82506B08` (= `sub_82506B08`, method 35 of
|
||||
/// the WorkerCtx vtable, the first sub-object method called by every
|
||||
/// `sub_82506528/58/88/B8` worker entry).
|
||||
/// 2. Capturing `[ctx+0x2C]` from the JIT-prolog dump (= sub-object VA
|
||||
/// in canary's heap).
|
||||
/// 3. Re-running with `--audit_jit_prolog_mem_dump=<sub-obj VA>` to read
|
||||
/// `[sub-object + 0]` = sub-vtable VA = **`0x8200A168`**.
|
||||
/// PE inspection confirms slot 15 (called via `[r11+0x3C]` at
|
||||
/// `sub_82506B08+0x44`) = `sub_824FCCC8` and slot 17 (`[r11+0x44]` at
|
||||
/// `sub_82506B08+0x70`) = `sub_824FCE38`. Both are real game methods in
|
||||
/// the same `.text` region as the rest of the worker dispatch surface.
|
||||
const SILPH_SUB_VTABLE_SOURCE_VA: u32 = 0x8200_A168;
|
||||
|
||||
/// Round-19 XEX-resident wrapper constant observed at `[ctx+0x30]` in every
|
||||
/// canary ctx (audit-059 round 7). Same value for all four ctxes — opaque
|
||||
/// pointer / handle the worker passes through without dereferencing.
|
||||
const SILPH_CTX_FIELD_30_CONST: u32 = 0xBE56_8F00;
|
||||
|
||||
/// 64 KiB worker stack (mirrors AUDIT-048 audio worker), half of canary's
|
||||
/// 128 KiB default.
|
||||
const SILPH_WORKER_STACK: u32 = 0x10_000;
|
||||
|
||||
/// Idempotently synthesise the silph::WorkerCtx and spawn the four worker
|
||||
/// threads it normally drives.
|
||||
///
|
||||
/// `suspended` controls whether the spawned threads enter the runqueue as
|
||||
/// `Ready` (false) or as `Blocked(Suspended)` (true). Use `true` for
|
||||
/// diagnostic baselines where you want the ctx materialised in guest memory
|
||||
/// for downstream probes but don't want the worker bodies executing (e.g.
|
||||
/// when round-5 ctx fields like the foreign-arena pointers at +0x28/+0x2C/
|
||||
/// +0x30 are still NULL and the workers would fault on first dereference).
|
||||
///
|
||||
/// Returns the ctx VA on the first call; on subsequent calls returns the
|
||||
/// cached VA without re-spawning. Failures inside spawn are logged but the
|
||||
/// `synth_done` latch is still flipped so we don't retry-loop.
|
||||
///
|
||||
/// Mirrors the AUDIT-048 audio-worker spawn pattern in
|
||||
/// `xaudio_register_render_driver` (`exports.rs:3122`).
|
||||
pub fn spawn_silph_workers(
|
||||
state: &mut KernelState,
|
||||
mem: &GuestMemory,
|
||||
suspended: bool,
|
||||
) -> Option<u32> {
|
||||
if state.silph_synth_done {
|
||||
return Some(state.silph_synth_ctx);
|
||||
}
|
||||
state.silph_synth_done = true;
|
||||
|
||||
let Some(ctx) = state.heap_alloc(SILPH_CTX_SIZE, mem) else {
|
||||
tracing::warn!("silph_synth: heap_alloc({:#x}) failed for ctx", SILPH_CTX_SIZE);
|
||||
return None;
|
||||
};
|
||||
state.silph_synth_ctx = ctx;
|
||||
|
||||
// Zero the entire ctx page first — heap_alloc returns freshly mapped
|
||||
// memory but we want the audit-059-round-5 layout to be canonical
|
||||
// regardless of any future allocator behaviour change.
|
||||
for off in (0..SILPH_CTX_SIZE).step_by(4) {
|
||||
mem.write_u32(ctx + off, 0);
|
||||
}
|
||||
|
||||
// ---- Header scalars (per audit-059 round 5 hexdump) ----
|
||||
mem.write_u32(ctx + 0x00, SILPH_CTX_VTABLE);
|
||||
mem.write_u32(ctx + 0x04, ctx); // self
|
||||
mem.write_u32(ctx + 0x08, ctx); // intrusive list head pointing at self
|
||||
mem.write_u32(ctx + 0x0C, 0x0000_0001); // init flag / refcount
|
||||
mem.write_u32(ctx + 0x10, 0x0100_0000); // packed byte field
|
||||
mem.write_u32(ctx + 0x18, 0x3F7F_CCCC); // float ~1.0 (UI rate A)
|
||||
mem.write_u32(ctx + 0x1C, 0x3F80_2D83); // float ~1.0 (UI rate B)
|
||||
mem.write_u32(ctx + 0x24, 0x0000_0001);
|
||||
|
||||
// +0x28..+0x30 = three foreign pointers.
|
||||
// +0x28 — canary's first-fire snapshot has NULL here. Round-19 fault
|
||||
// analysis shows worker bodies don't dereference this on
|
||||
// first entry, so we leave it NULL too.
|
||||
// +0x2C — sub-object pointer. Worker bodies do
|
||||
// `lwz r3,44(rN); lwz r11,0(r3); lwz r11,60(r11); bctrl`,
|
||||
// i.e. virtual-dispatch through slot 15 of the sub-object's
|
||||
// vtable. Point this at our synthesised sub-object embedded
|
||||
// at ctx + SILPH_SUBOBJ_OFFSET.
|
||||
// +0x30 — XEX-resident wrapper constant 0xBE568F00 (round 7). Opaque
|
||||
// but identical across all four canary ctxes.
|
||||
let subobj_ptr = ctx + SILPH_SUBOBJ_OFFSET;
|
||||
mem.write_u32(ctx + 0x2C, subobj_ptr);
|
||||
mem.write_u32(ctx + 0x30, SILPH_CTX_FIELD_30_CONST);
|
||||
|
||||
// ---- Embedded sub-object at +0x300 ----
|
||||
// Round-21 pivot: instead of synthesising a stub vtable that returns
|
||||
// NULL from every slot, point `[sub_object + 0]` directly at canary's
|
||||
// real XEX-resident sub-vtable VA. The vtable bytes are part of the
|
||||
// same static image both engines map, so referring to it costs zero
|
||||
// guest memory and gives the workers a working virtual-method surface
|
||||
// (slot 15 = sub_824FCCC8, slot 17 = sub_824FCE38, plus 29 other real
|
||||
// methods). Round-19 disassembly shows worker bodies only touch the
|
||||
// sub-object's vtable; the rest of the sub-object is opaque so we
|
||||
// leave it zero-filled.
|
||||
mem.write_u32(subobj_ptr, SILPH_SUB_VTABLE_SOURCE_VA);
|
||||
|
||||
// ---- 4× X_KEVENT auto-reset at +0x54/+0x64/+0x74/+0x84, state = 0 ----
|
||||
// X_DISPATCH_HEADER layout (canary xobject.h:35):
|
||||
// +0x00 type (u8: 0=manual-event, 1=auto-event, 2=mutant, ...)
|
||||
// +0x01 abandoned (u8)
|
||||
// +0x02 size (u8 dwords)
|
||||
// +0x03 inserted (u8)
|
||||
// +0x04 signal_state (u32 BE)
|
||||
// +0x08..+0x0F list_head (two pointers — self-link = empty list)
|
||||
for i in 0..4u32 {
|
||||
let off = ctx + 0x54 + (i * 0x10);
|
||||
mem.write_u8(off, 1); // type = auto-reset Event
|
||||
mem.write_u32(off + 4, 0); // signal_state = 0
|
||||
// List head self-link denotes empty waiter list.
|
||||
mem.write_u32(off + 8, off + 8);
|
||||
mem.write_u32(off + 12, off + 8);
|
||||
}
|
||||
// ---- 4× X_KEVENT manual-reset at +0x94..+0xC4, state = 1 (pre-signaled) ----
|
||||
for i in 0..4u32 {
|
||||
let off = ctx + 0x94 + (i * 0x10);
|
||||
mem.write_u8(off, 0); // type = manual-reset Event
|
||||
mem.write_u32(off + 4, 1); // signal_state = 1 (pre-signaled)
|
||||
mem.write_u32(off + 8, off + 8);
|
||||
mem.write_u32(off + 12, off + 8);
|
||||
}
|
||||
|
||||
// ---- 4-entry intrusive work-ring at +0x210, initially empty ----
|
||||
// Each entry: [+0]=0x01000000 [+4]=0 [+8]=self_ptr [+0xC]=self_ptr.
|
||||
for i in 0..4u32 {
|
||||
let off = ctx + 0x210 + (i * 0x10);
|
||||
mem.write_u32(off, 0x0100_0000);
|
||||
mem.write_u32(off + 4, 0);
|
||||
mem.write_u32(off + 8, off + 8);
|
||||
mem.write_u32(off + 12, off + 8);
|
||||
}
|
||||
|
||||
// +0x250 "XEN"-tagged descriptors and +0x2E0 resource-index table left
|
||||
// zero — they may be populated lazily by the workers themselves.
|
||||
|
||||
// ---- Spawn the 4 worker guest threads ----
|
||||
use std::sync::atomic::Ordering;
|
||||
let mut spawned = 0usize;
|
||||
for (i, &entry) in SILPH_WORKER_ENTRIES.iter().enumerate() {
|
||||
let Some(image) = allocate_thread_image(state, mem, SILPH_WORKER_STACK, 0) else {
|
||||
tracing::warn!("silph_synth: allocate_thread_image failed for worker {}", i);
|
||||
continue;
|
||||
};
|
||||
let tid = state.next_thread_id.fetch_add(1, Ordering::Relaxed);
|
||||
let handle = state.alloc_handle_for(KernelObject::Thread {
|
||||
id: tid,
|
||||
hw_id: None,
|
||||
exit_code: None,
|
||||
waiters: Vec::new(),
|
||||
});
|
||||
let tls_slot_count = state.next_tls_index.load(Ordering::Relaxed);
|
||||
let params = SpawnParams {
|
||||
entry,
|
||||
start_context: ctx, // r3 = ctx_ptr
|
||||
stack_base: image.stack_base,
|
||||
stack_size: image.stack_size,
|
||||
pcr_base: image.pcr_base,
|
||||
tls_base: image.tls_base,
|
||||
thread_handle: handle,
|
||||
guest_tid: tid,
|
||||
create_suspended: suspended,
|
||||
is_initial: false,
|
||||
tls_slot_count,
|
||||
affinity_mask: 0,
|
||||
priority: 0,
|
||||
ideal_processor: None,
|
||||
};
|
||||
match state.scheduler.spawn(params, &mut GuestMemoryPcr(mem)) {
|
||||
Ok(hw_id) => {
|
||||
if let Some(KernelObject::Thread { hw_id: slot, .. }) =
|
||||
state.objects.get_mut(&handle)
|
||||
{
|
||||
*slot = Some(hw_id);
|
||||
}
|
||||
let tref = ThreadRef::new(
|
||||
hw_id,
|
||||
(state.scheduler.slots[hw_id as usize].runqueue.len() - 1) as u16,
|
||||
);
|
||||
state.silph_synth_handles[i] = Some(handle);
|
||||
state.silph_synth_refs[i] = Some(tref);
|
||||
spawned += 1;
|
||||
tracing::info!(
|
||||
"silph_synth: spawned worker {} tid={} handle={:#x} entry={:#010x} ctx={:#010x}",
|
||||
i, tid, handle, entry, ctx
|
||||
);
|
||||
}
|
||||
Err(_) => {
|
||||
tracing::warn!(
|
||||
"silph_synth: scheduler.spawn failed for worker {} entry={:#010x}",
|
||||
i, entry
|
||||
);
|
||||
}
|
||||
}
|
||||
// Avoid an unused-variable warning if BlockReason isn't referenced.
|
||||
let _ = BlockReason::WaitAny {
|
||||
handles: Vec::new(),
|
||||
deadline: None,
|
||||
};
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
"silph_synth: ctx={:#010x} workers_spawned={}/4",
|
||||
ctx, spawned
|
||||
);
|
||||
|
||||
Some(ctx)
|
||||
}
|
||||
@@ -17,6 +17,16 @@ impl PcrWriter for GuestMemoryPcr<'_> {
|
||||
// `GuestMemory::write_u32` takes `&self` post-M2 trait flip; the
|
||||
// wrapping `&'a GuestMemory` is sufficient.
|
||||
self.0.write_u32(pcr_base + 0x2C, hw_id as u32);
|
||||
// PRCB.current_cpu byte at PCR+0x10C (prcb_data@0x100 + current_cpu@0xC).
|
||||
// Canary writes `GetFakeCpuNumber(affinity)` here (xthread.cc:847
|
||||
// `pcr->prcb_data.current_cpu = cpu_index`), which equals the HW thread
|
||||
// id we already compute. Guest spin-barriers (e.g. sub_824D1328, used by
|
||||
// the audio/update pump threads at entries 0x824D2878/0x824D2940) index a
|
||||
// per-HW-thread occupancy array by `lbz r11, 268(r13)` = this byte. Left
|
||||
// unwritten it stayed 0 for every thread, so all threads collided on
|
||||
// slot 0 and the multi-thread rendezvous signature never assembled —
|
||||
// the pump threads spun forever and never fired their KeSetEvent loops.
|
||||
self.0.write_u8(pcr_base + 0x10C, hw_id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,6 +66,18 @@ pub struct KernelState {
|
||||
/// publish; observers (the kernel object table) are guarded by
|
||||
/// their own synchronization.
|
||||
next_handle: std::sync::atomic::AtomicU32,
|
||||
/// AUDIT-059 R34: FIFO free list of closed handle slots, mirroring
|
||||
/// canary's slab/free-list `ObjectTable`. Without this, ours' bump
|
||||
/// allocator monotonically grows so a recycled slot in canary
|
||||
/// (e.g. `F8000098` reused 130× per 30s) corresponds to a fresh,
|
||||
/// never-reused slot in ours — the kernel-object identity drifts.
|
||||
/// Recycling closes that gap and (per AUDIT-042 / R30) may
|
||||
/// side-effect-unwedge γ-cluster #2 by letting silph signals land
|
||||
/// on the same handle slot the wait registered for. Population is
|
||||
/// gated on `KernelState::release_handle_slot` (only IDs in
|
||||
/// `[HANDLE_BASE, 0xF000_0000)` are recycled — synthetic XAudio
|
||||
/// handles at `0xF000_0000+` are reserved and must never be reused).
|
||||
free_handles: std::collections::VecDeque<u32>,
|
||||
/// Scheduler managing all emulated HW threads + their per-slot
|
||||
/// runqueues. Starts empty — the app installs the initial guest thread
|
||||
/// on slot 0 via `KernelState::install_initial_thread` once it has the
|
||||
@@ -139,6 +161,11 @@ pub struct KernelState {
|
||||
/// graphics interrupts is enforced by the injector's
|
||||
/// `is_in_callback()` guard.
|
||||
pub xaudio: crate::xaudio::XAudioState,
|
||||
/// Register-mapped XMA context array (apu stage 1). Shared with the
|
||||
/// `0x7FEA0000` MMIO region installed by the app and with the
|
||||
/// `XMACreateContext`/`XMAReleaseContext` exports, so it lives behind an
|
||||
/// `Arc<Mutex<…>>`. Stage 1 records kicks; stage 3 will decode them.
|
||||
pub xma: std::sync::Arc<std::sync::Mutex<xenia_apu::XmaDecoder>>,
|
||||
/// AUDIT-032 Plan B (default true). When true, the round prologue
|
||||
/// runs the XAudio ticker + `try_inject_audio_callback`. Pre-fix this
|
||||
/// was off by default because injection used random-victim selection
|
||||
@@ -197,6 +224,17 @@ pub struct KernelState {
|
||||
/// only). Used by `xex_get_procedure_address` to resolve ordinals back
|
||||
/// to callable thunks.
|
||||
thunks_by_ordinal: HashMap<(ModuleId, u16), u32>,
|
||||
|
||||
/// Perf (Tier-A #4): inclusive [min, max] guest-address band that
|
||||
/// contains every registered import thunk. Import thunks sit in a
|
||||
/// small contiguous region of the XEX; almost every executing PC is
|
||||
/// ordinary guest code OUTSIDE this band. The per-slot-visit prologue
|
||||
/// looks up `thunk_map.get(&pc)` (a `HashMap<u32,…>` → `hash_one` per
|
||||
/// call, ~3.2M visits boot-to-splash). Range-rejecting against this
|
||||
/// band first turns the common (non-thunk) case into a pair of integer
|
||||
/// compares and skips the hash entirely. `None` until the first thunk
|
||||
/// is registered (no band → reject everything, matching an empty map).
|
||||
thunk_addr_band: Option<(u32, u32)>,
|
||||
/// First-Pixels diagnostic latch. Set the first time
|
||||
/// `RtlRaiseException` fires with code `0xE06D7363` (MSVC C++ throw)
|
||||
/// so the deep stack-walk + `runtime_error` decode in
|
||||
@@ -279,6 +317,17 @@ pub struct KernelState {
|
||||
/// Settable via `--audit-r3-dump-bytes` /
|
||||
/// `XENIA_AUDIT_R3_DUMP_BYTES`.
|
||||
pub audit_r3_dump_bytes: Option<u32>,
|
||||
/// iterate-2E — diagnostic pointer-chase. `(reg, off)`: on every
|
||||
/// `AUDIT-PC-PROBE` fire, treat `gpr[reg]` as a base object pointer,
|
||||
/// dump its first 64 bytes, then follow `[base+off]` to a sub-object
|
||||
/// (e.g. a stream/file object held in a work item), dump ITS first 64
|
||||
/// bytes, then follow `[[base+off]+0]` to the sub-object's vtable and
|
||||
/// dump the first 48 u32 slots. Designed to capture the live work-item
|
||||
/// + stream object + vtable at `sub_824510E0` entry (r4 = work item,
|
||||
/// stream at +36, vtable[28] = the "is-read-done?" predicate) BEFORE
|
||||
/// the pool recycles the slot. Read-only; lockstep digest unaffected.
|
||||
/// Settable via `XENIA_AUDIT_DEREF=<reg>:<off>` (e.g. `4:36`).
|
||||
pub audit_deref: Option<(u8, u32)>,
|
||||
/// M12 — diagnostic. PCs at which to emit a structured JSONL record
|
||||
/// per fire, designed for diffing against xenia-canary's
|
||||
/// `--log_lr_on_pc` patch output. Each line carries
|
||||
@@ -299,6 +348,65 @@ pub struct KernelState {
|
||||
pub dump_addrs: Vec<u32>,
|
||||
/// `--dump-section=BASE:LEN:PATH` end-of-run snapshot, page-gated by `is_mapped`.
|
||||
pub dump_section: Option<(u32, u32, std::path::PathBuf)>,
|
||||
/// AUDIT-2.BF — synthetic silph::WorkerCtx spawn one-shot latch. Set on
|
||||
/// first call to [`crate::silph_synth::spawn_silph_workers`] (triggered
|
||||
/// by the first observation of a load-bearing VFS path such as
|
||||
/// `dat/movie`), then reused — subsequent triggers are no-ops.
|
||||
pub silph_synth_done: bool,
|
||||
/// AUDIT-2.BF — VA of the synthesised silph::WorkerCtx. Zero before the
|
||||
/// first spawn; set to the ctx base by `spawn_silph_workers`. Held on
|
||||
/// the kernel state so future export hooks can find it (no caller does
|
||||
/// yet — placeholder for round 19+ wiring).
|
||||
pub silph_synth_ctx: u32,
|
||||
/// AUDIT-2.BF — kernel handles for the 4 synthetic worker threads.
|
||||
pub silph_synth_handles: [Option<u32>; 4],
|
||||
/// AUDIT-2.BF — `ThreadRef` cache for the 4 synthetic workers.
|
||||
pub silph_synth_refs: [Option<xenia_cpu::ThreadRef>; 4],
|
||||
/// ITERATE-2C Phase D — auto-signal delay for silph::UImpl
|
||||
/// `NtCreateEvent` calls (see [`Self::maybe_register_silph_autosignal`]).
|
||||
/// `None` = feature disabled; populated once from
|
||||
/// `XENIA_SILPH_UI_AUTOSIGNAL_DELAY=<u64>` at construction.
|
||||
pub silph_autosignal_delay: Option<u64>,
|
||||
/// ITERATE-2C Phase D — pending auto-signal queue. Drained each
|
||||
/// outer round by [`Self::fire_due_silph_autosignals`].
|
||||
pub silph_autosignal_pending: Vec<AutoSignalPending>,
|
||||
/// ITERATE-2C Phase D — most recent `stats.instruction_count`
|
||||
/// deposited by the scheduler loop (see
|
||||
/// [`Self::set_now_cycle_hint`]). Used by
|
||||
/// [`Self::maybe_register_silph_autosignal`] to compute absolute
|
||||
/// deadlines, since `nt_create_event` doesn't see `ExecStats`.
|
||||
pub last_cycle_hint: u64,
|
||||
/// ITERATE-2C Phase D — one-shot diagnostic latch. Flipped by
|
||||
/// [`Self::fire_due_silph_autosignals`] on the first visit where
|
||||
/// the pending queue is non-empty but no entry is due yet.
|
||||
pub silph_autosignal_diag_logged: bool,
|
||||
/// ITERATE-2J — guest VA of the `KeTimeStampBundle` block (xboxkrnl
|
||||
/// data export ordinal 0x00AD). Set during the import-patch pass in
|
||||
/// `xenia-app`. Zero until then. The guest's worker-hub channel
|
||||
/// dispatch loop polls `[block+0x10]` (`tick_count`, milliseconds) and
|
||||
/// gates dispatch on a `tick_count + 66` deadline; if the block is
|
||||
/// never re-written that deadline never elapses and the hub spins
|
||||
/// forever (the tid14 0x109c starvation gate). The run loop ticks this
|
||||
/// block every round from the deterministic `global_clock` via
|
||||
/// [`Self::update_timestamp_bundle`].
|
||||
pub timestamp_bundle_addr: u32,
|
||||
|
||||
/// Perf (Tier-B #5) throttle state for [`Self::update_timestamp_bundle`].
|
||||
/// Holds the `clock` value at which the bundle was last actually written;
|
||||
/// `u64::MAX` is the "never written" sentinel (forces the first write).
|
||||
/// `AtomicU64` (not `Cell`) so the `&self` update path stays `Sync` for
|
||||
/// the parallel `Arc<Mutex<KernelState>>` usage. Only ever advanced
|
||||
/// forward under the kernel lock, so `Relaxed` ordering is sufficient and
|
||||
/// the sequence is deterministic.
|
||||
timestamp_bundle_last_clock: std::sync::atomic::AtomicU64,
|
||||
}
|
||||
|
||||
/// ITERATE-2C Phase D — one queued auto-signal. `deadline_cycle` is
/// absolute (cycle hint at register time + configured delay).
///
/// Plain `Copy` value type; `PartialEq`/`Eq` are derived so queue entries
/// can be compared directly (e.g. in assertions and de-duplication).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct AutoSignalPending {
    /// Kernel handle looked up in the object table when the entry fires.
    pub handle: u32,
    /// Absolute cycle at (or after) which the entry is considered due.
    pub deadline_cycle: u64,
}
|
||||
|
||||
impl KernelState {
|
||||
@@ -324,6 +432,7 @@ impl KernelState {
|
||||
let mut state = Self {
|
||||
exports: HashMap::new(),
|
||||
next_handle: AtomicU32::new(0x1000),
|
||||
free_handles: std::collections::VecDeque::new(),
|
||||
scheduler,
|
||||
next_tls_index: AtomicU32::new(0),
|
||||
cs_waiters: HashMap::new(),
|
||||
@@ -345,6 +454,9 @@ impl KernelState {
|
||||
ui: None,
|
||||
interrupts: crate::interrupts::InterruptState::default(),
|
||||
xaudio: crate::xaudio::XAudioState::default(),
|
||||
// apu stage 1 — un-initialized until the app reserves the context
|
||||
// array and calls `xma.lock().init(va, phys)`.
|
||||
xma: std::sync::Arc::new(std::sync::Mutex::new(xenia_apu::XmaDecoder::new())),
|
||||
// AUDIT-032: dedicated audio worker per client (Plan B in
|
||||
// `xaudio_register_render_driver`) — not victim hijack, so safe
|
||||
// to enable by default. Previously gated off because the
|
||||
@@ -355,6 +467,7 @@ impl KernelState {
|
||||
audit: HandleAudit::default(),
|
||||
reservations,
|
||||
thunks_by_ordinal: HashMap::new(),
|
||||
thunk_addr_band: None,
|
||||
cxx_throw_logged: false,
|
||||
ring_base: 0,
|
||||
ring_size_dwords: 0,
|
||||
@@ -365,10 +478,23 @@ impl KernelState {
|
||||
audit_pc_probe_pcs: std::collections::HashSet::new(),
|
||||
audit_mem_read_addr: None,
|
||||
audit_r3_dump_bytes: None,
|
||||
audit_deref: None,
|
||||
lr_trace_pcs: std::collections::HashSet::new(),
|
||||
lr_trace_writer: None,
|
||||
dump_addrs: Vec::new(),
|
||||
dump_section: None,
|
||||
silph_synth_done: false,
|
||||
silph_synth_ctx: 0,
|
||||
silph_synth_handles: [None; 4],
|
||||
silph_synth_refs: [None; 4],
|
||||
silph_autosignal_delay: std::env::var("XENIA_SILPH_UI_AUTOSIGNAL_DELAY")
|
||||
.ok()
|
||||
.and_then(|v| v.parse::<u64>().ok()),
|
||||
silph_autosignal_pending: Vec::new(),
|
||||
last_cycle_hint: 0,
|
||||
silph_autosignal_diag_logged: false,
|
||||
timestamp_bundle_addr: 0,
|
||||
timestamp_bundle_last_clock: std::sync::atomic::AtomicU64::new(u64::MAX),
|
||||
};
|
||||
crate::exports::register_exports(&mut state);
|
||||
crate::xam::register_exports(&mut state);
|
||||
@@ -488,6 +614,25 @@ impl KernelState {
|
||||
/// emits each ordinal once per module).
|
||||
pub fn register_thunk(&mut self, module: ModuleId, ordinal: u16, address: u32) {
    self.thunks_by_ordinal.insert((module, ordinal), address);
    // Perf (Tier-A #4): keep the inclusive [lo, hi] thunk address band in
    // step with the map so the hot prologue can range-reject non-thunk PCs
    // before hashing. An empty band starts out as the single new address.
    let (lo, hi) = self.thunk_addr_band.unwrap_or((address, address));
    self.thunk_addr_band = Some((lo.min(address), hi.max(address)));
}
|
||||
|
||||
/// Perf (Tier-A #4). Cheap pre-filter for the per-slot-visit import-thunk
|
||||
/// dispatch: `false` guarantees `pc` is NOT a registered thunk (so the
|
||||
/// caller can skip the `thunk_map.get(&pc)` hash). `true` means `pc` lies
|
||||
/// within the registered thunk address band and the map must be consulted
|
||||
/// for an exact match. Conservative — never a false negative.
|
||||
#[inline]
|
||||
pub fn pc_in_thunk_band(&self, pc: u32) -> bool {
|
||||
match self.thunk_addr_band {
|
||||
Some((lo, hi)) => pc >= lo && pc <= hi,
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolve a `(module, ordinal)` to its registered thunk address.
|
||||
@@ -642,12 +787,39 @@ impl KernelState {
|
||||
}
|
||||
|
||||
pub fn alloc_handle(&mut self) -> u32 {
|
||||
// AUDIT-059 R34: prefer recycling a closed slot (FIFO, matching
|
||||
// canary's `ObjectTable` slab) before bumping. The Arc<Mutex<
|
||||
// KernelState>> already serializes us; no extra synchronization.
|
||||
if let Some(slot) = self.free_handles.pop_front() {
|
||||
return slot;
|
||||
}
|
||||
// M2.4: lock-free fetch_add. Relaxed is sufficient — IDs are
|
||||
// opaque tokens; no payload is sequenced against the counter.
|
||||
self.next_handle
|
||||
.fetch_add(4, std::sync::atomic::Ordering::Relaxed)
|
||||
}
|
||||
|
||||
/// AUDIT-059 R34. Return a freshly-closed handle slot to the FIFO
|
||||
/// recycle queue. No-op for the synthetic XAudio range (`>= 0xF000_0000`,
|
||||
/// AUDIT-048) and the reserved `< 0x1000` band. Call site: `nt_close`'s
|
||||
/// `objects.remove` branch when refcount reaches zero.
|
||||
///
|
||||
/// ABA guard (subsystem-audit 2026-06-12): never recycle a slot that a
|
||||
/// thread is still parked on. Without this, a closed slot could be
|
||||
/// re-minted for a new object and a signal on that new object would wake
|
||||
/// the stale waiter that was blocked on the OLD object at the same slot.
|
||||
/// Such a slot is simply leaked (it stays out of `free_handles`),
|
||||
/// reproducing the pre-R34 bump-only behaviour for that rare case.
|
||||
pub fn release_handle_slot(&mut self, handle: u32) {
|
||||
if handle < 0x1000 || handle >= 0xF000_0000 {
|
||||
return;
|
||||
}
|
||||
if self.scheduler.any_thread_waiting_on(handle) {
|
||||
return;
|
||||
}
|
||||
self.free_handles.push_back(handle);
|
||||
}
|
||||
|
||||
pub fn alloc_handle_for(&mut self, obj: KernelObject) -> u32 {
|
||||
let h = self.alloc_handle();
|
||||
self.objects.insert(h, obj);
|
||||
@@ -752,6 +924,216 @@ impl KernelState {
|
||||
self.audit.record_wake(handle, entry);
|
||||
}
|
||||
|
||||
/// ITERATE-2C Phase D — deposit the latest scheduler instruction
|
||||
/// count so `nt_create_event` can compute absolute auto-signal
|
||||
/// deadlines. Called once per outer round from the app's
|
||||
/// `coord_pre_round` site. No-op when the feature env is unset.
|
||||
pub fn set_now_cycle_hint(&mut self, now_cycle: u64) {
|
||||
self.last_cycle_hint = now_cycle;
|
||||
}
|
||||
|
||||
/// ITERATE-2J — tick the `KeTimeStampBundle` block (xboxkrnl ordinal
|
||||
/// 0x00AD) from the deterministic monotonic clock so the guest sees a
|
||||
/// clock that *advances*.
|
||||
///
|
||||
/// `clock` is the scheduler's `global_clock` — a pure function of
|
||||
/// retired guest instructions (see [`Self::now_basis_at`] /
|
||||
/// `Scheduler::global_clock`). Lockstep floors it up to
|
||||
/// `stats.instruction_count` each round; parallel sums per-block
|
||||
/// retired counts. Using it (rather than wall-clock) keeps every
|
||||
/// guest-visible time value a deterministic function of guest progress,
|
||||
/// so lockstep stays byte-reproducible.
|
||||
///
|
||||
/// ## Cadence
|
||||
/// The existing kernel time math (`parse_timeout` in `exports.rs`)
|
||||
/// already treats **1 `global_clock` unit ≈ 100 ns**: it converts a
|
||||
/// signed 100-ns `LARGE_INTEGER` timeout to a deadline by dividing the
|
||||
/// magnitude by 100 and adding it to `now` (= `global_clock`). To stay
|
||||
/// coherent with that, this method uses the same scale:
|
||||
///
|
||||
/// * `interrupt_time` / `system_time` (100-ns units): `clock` (with a
|
||||
/// FILETIME epoch base added to `system_time`).
|
||||
/// * `tick_count` (milliseconds): `clock / INSTRUCTIONS_PER_MS` where
|
||||
/// `INSTRUCTIONS_PER_MS = 10_000` (10_000 × 100 ns = 1 ms).
|
||||
///
|
||||
/// At 10_000 clock-units/ms, the guest's `tick_count + 66` ms hub
|
||||
/// deadline elapses by ~660_000 retired instructions — very early in a
|
||||
/// ~1 B-instruction boot — while a 16 ms `KeWait` timeout
|
||||
/// (`parse_timeout`: 160_000 units) still resolves to 16 ms of
|
||||
/// tick_count, so no timeout collapses to "instant". The two readers
|
||||
/// share one scale.
|
||||
pub fn update_timestamp_bundle(&self, mem: &GuestMemory, clock: u64) {
|
||||
let block = self.timestamp_bundle_addr;
|
||||
if block == 0 {
|
||||
return;
|
||||
}
|
||||
const INSTRUCTIONS_PER_MS: u64 = 10_000;
|
||||
// Perf (Tier-B #5): the bundle is updated once per scheduler round
|
||||
// (~every 7 retired instructions), but the four guest BE memory
|
||||
// writes are ~8.6% of boot-to-splash. `clock` is the retired-
|
||||
// instruction count, so consecutive rounds rewrite essentially the
|
||||
// same staircase. Throttle to a 0.25 ms quantum: only re-write when
|
||||
// `clock` advanced by >= INSTRUCTIONS_PER_MS/4 (2500 units) since the
|
||||
// last write. This keeps `tick_count` (ms, changes every 10_000
|
||||
// units) ALWAYS fresh and `interrupt_time`/`system_time` monotone at
|
||||
// 0.25 ms granularity — finer than any guest deadline math needs
|
||||
// (`parse_timeout` works in whole ms; the hub gate is `+66 ms`). The
|
||||
// fade-in (3AH-proven vsync-counter driven, NOT this bundle) is
|
||||
// untouched. Throttle threshold is well below 1 ms so no guest-
|
||||
// visible ms boundary is ever skipped.
|
||||
const BUNDLE_QUANTUM: u64 = INSTRUCTIONS_PER_MS / 4; // 2500 units = 0.25 ms
|
||||
{
|
||||
use std::sync::atomic::Ordering;
|
||||
let last = self.timestamp_bundle_last_clock.load(Ordering::Relaxed);
|
||||
// Always allow the first write (last == u64::MAX sentinel) and any
|
||||
// write that crosses the quantum. Never go backwards.
|
||||
if last != u64::MAX && clock < last.saturating_add(BUNDLE_QUANTUM) {
|
||||
return;
|
||||
}
|
||||
self.timestamp_bundle_last_clock
|
||||
.store(clock, Ordering::Relaxed);
|
||||
}
|
||||
// FILETIME epoch base (~2021) so `system_time` is a plausible
|
||||
// absolute wall-clock; matches the constant used by
|
||||
// `ke_query_system_time`. interrupt_time is "since boot" so it
|
||||
// starts at the clock origin (no epoch offset).
|
||||
const FILETIME_BASE: u64 = 132_500_000_000_000_000;
|
||||
let interrupt_time: u64 = clock;
|
||||
let system_time: u64 = FILETIME_BASE.wrapping_add(clock);
|
||||
let tick_count: u32 = (clock / INSTRUCTIONS_PER_MS) as u32;
|
||||
// BE writes (write_u64/write_u32 use to_be_bytes) — guest is BE.
|
||||
mem.write_u64(block, interrupt_time); // +0x00 interrupt_time
|
||||
mem.write_u64(block + 0x08, system_time); // +0x08 system_time
|
||||
mem.write_u32(block + 0x10, tick_count); // +0x10 tick_count (ms)
|
||||
mem.write_u32(block + 0x14, 0); // +0x14 padding
|
||||
}
|
||||
|
||||
/// ITERATE-2C Phase D — register a freshly-allocated event for
|
||||
/// auto-signal after the configured delay, **iff** the creating
|
||||
/// thread matches the silph::UImpl tid=13 chain that wedges in
|
||||
/// audit-049. Filter:
|
||||
///
|
||||
/// * Env `XENIA_SILPH_UI_AUTOSIGNAL_DELAY` set (= delay non-None)
|
||||
/// * Frame-1 LR (the guest caller's post-bl PC, walked one step up
|
||||
/// from the live thunk-wrapper frame) is in
|
||||
/// `[0x821CB15C, 0x821CB160]` — this is the `NtCreateEvent` call
|
||||
/// site inside `sub_821CB030+0x128`. The live `ctx.lr` is the
|
||||
/// thunk wrapper's return slot (e.g. `0x824a9f6c`), so we walk
|
||||
/// one back-chain step to reach the actual guest caller.
|
||||
/// * Creating thread's `start_entry == 0x821748F0` (silph trampoline)
|
||||
/// * Creating thread's `start_context == 0x4024a840`
|
||||
///
|
||||
/// On match, the handle is queued with `deadline = last_cycle_hint +
|
||||
/// delay`. Drained by [`Self::fire_due_silph_autosignals`] from the
|
||||
/// outer scheduler loop.
|
||||
pub fn maybe_register_silph_autosignal(
|
||||
&mut self,
|
||||
handle: u32,
|
||||
ctx: &PpcContext,
|
||||
mem: &GuestMemory,
|
||||
) {
|
||||
let Some(delay) = self.silph_autosignal_delay else {
|
||||
return;
|
||||
};
|
||||
let Some((entry, start_ctx)) = self.scheduler.current_thread_entry_and_ctx() else {
|
||||
return;
|
||||
};
|
||||
if entry != 0x821748F0 || start_ctx != 0x4024_a840 {
|
||||
return;
|
||||
}
|
||||
let frames = walk_guest_back_chain(ctx.gpr[1] as u32, ctx.lr as u32, mem, 2);
|
||||
let caller_lr = match frames.get(1) {
|
||||
Some((_, lr)) => *lr,
|
||||
None => return,
|
||||
};
|
||||
if !(0x821CB15C..=0x821CB160).contains(&caller_lr) {
|
||||
return;
|
||||
}
|
||||
let deadline = self.last_cycle_hint.saturating_add(delay);
|
||||
self.silph_autosignal_pending
|
||||
.push(AutoSignalPending { handle, deadline_cycle: deadline });
|
||||
tracing::info!(
|
||||
"silph autosignal: scheduled handle={:#x} caller_lr={:#x} for cycle {} (now={}, delay={})",
|
||||
handle,
|
||||
caller_lr,
|
||||
deadline,
|
||||
self.last_cycle_hint,
|
||||
delay,
|
||||
);
|
||||
}
|
||||
|
||||
/// ITERATE-2C Phase D — drain pending entries whose deadline has
|
||||
/// passed. Each fires by setting `Event { signaled = true }` and
|
||||
/// invoking the existing `wake_eligible_waiters` to release blocked
|
||||
/// waiters. No-op when the queue is empty (the common case).
|
||||
pub fn fire_due_silph_autosignals(&mut self, now_cycle: u64) {
|
||||
if self.silph_autosignal_pending.is_empty() {
|
||||
return;
|
||||
}
|
||||
let any_due = self
|
||||
.silph_autosignal_pending
|
||||
.iter()
|
||||
.any(|p| p.deadline_cycle <= now_cycle);
|
||||
if !any_due {
|
||||
// Diagnostic for the Phase D POC: log first time we visit
|
||||
// with a non-empty queue but nothing due yet.
|
||||
if !self.silph_autosignal_diag_logged {
|
||||
self.silph_autosignal_diag_logged = true;
|
||||
if let Some(first) = self.silph_autosignal_pending.first() {
|
||||
tracing::info!(
|
||||
"silph autosignal: tick (first visit, none due) now={} pending={} first_deadline={}",
|
||||
now_cycle,
|
||||
self.silph_autosignal_pending.len(),
|
||||
first.deadline_cycle,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut i = 0;
|
||||
while i < self.silph_autosignal_pending.len() {
|
||||
if self.silph_autosignal_pending[i].deadline_cycle <= now_cycle {
|
||||
let p = self.silph_autosignal_pending.swap_remove(i);
|
||||
let prev = match self.objects.get_mut(&p.handle) {
|
||||
Some(KernelObject::Event { signaled, .. }) => {
|
||||
let was = *signaled;
|
||||
*signaled = true;
|
||||
Some(was)
|
||||
}
|
||||
_ => None,
|
||||
};
|
||||
tracing::info!(
|
||||
"silph autosignal: firing handle={:#x} prev_signaled={:?} at cycle {}",
|
||||
p.handle,
|
||||
prev,
|
||||
now_cycle,
|
||||
);
|
||||
self.audit_signal(p.handle, 0, "silph_autosignal", prev.unwrap_or(false) as u64);
|
||||
crate::exports::wake_eligible_waiters(self, p.handle);
|
||||
// do not advance i — swap_remove pulled a new entry into i
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Perf gate (Tier-A quick-win #3). `true` iff any of the four
|
||||
/// per-slot-visit diagnostic probe registries
|
||||
/// (`ctor_probe_pcs` / `branch_probe_pcs` / `audit_pc_probe_pcs`
|
||||
/// / `lr_trace_pcs`) holds at least one PC. The common headless
|
||||
/// run leaves all four empty, so the prologue can skip the four
|
||||
/// `fire_*_if_match` calls entirely with this single predicted
|
||||
/// branch — avoiding 4× call overhead per slot-visit (~3.2M
|
||||
/// visits over boot-to-splash) when no probe is configured.
|
||||
/// Purely a fast-path guard; each `fire_*` still re-checks its own
|
||||
/// set, so behaviour is identical whether or not the caller gates.
|
||||
#[inline]
|
||||
pub fn any_probe_active(&self) -> bool {
|
||||
!self.ctor_probe_pcs.is_empty()
|
||||
|| !self.branch_probe_pcs.is_empty()
|
||||
|| !self.audit_pc_probe_pcs.is_empty()
|
||||
|| !self.lr_trace_pcs.is_empty()
|
||||
}
|
||||
|
||||
/// Diagnostic. If the live PC for HW slot `hw_id` is in
|
||||
/// `self.ctor_probe_pcs`, emit a single `CTOR-PROBE` line with
|
||||
/// the current cycle, tid, hw_id, sp, r3, lr, plus an 8-frame
|
||||
@@ -918,6 +1300,38 @@ impl KernelState {
|
||||
}
|
||||
println!("{}", out);
|
||||
}
|
||||
// iterate-2E — pointer-chase: dump base object (gpr[reg]), the
|
||||
// sub-object it holds at [base+off], and that sub-object's vtable
|
||||
// slots. Captures the live work-item + stream + vtable[28] at
|
||||
// sub_824510E0 before the pool recycles the slot. Read-only.
|
||||
if let Some((reg, deref_off)) = self.audit_deref {
|
||||
use std::fmt::Write as _;
|
||||
let base = ctx.gpr[reg as usize] as u32;
|
||||
let dump64 = |label: &str, p: u32| {
|
||||
let mut s = String::with_capacity(256);
|
||||
let _ = write!(&mut s, "AUDIT-DEREF {} ptr={:#010x}", label, p);
|
||||
let mut o: u32 = 0;
|
||||
while o < 64 {
|
||||
let _ = write!(&mut s, " +0x{:02x}={:#010x}", o, mem.read_u32(p.wrapping_add(o)));
|
||||
o += 4;
|
||||
}
|
||||
println!("{}", s);
|
||||
};
|
||||
println!("AUDIT-DEREF-HEAD pc={:#010x} tid={} cycle={} reg=r{} off=0x{:x}", pc, tid, cycle, reg, deref_off);
|
||||
dump64("item", base);
|
||||
let sub = mem.read_u32(base.wrapping_add(deref_off));
|
||||
dump64("sub", sub);
|
||||
let vt = mem.read_u32(sub); // [sub+0] = vtable
|
||||
// Dump 48 vtable slots so slot 28 (+0x70) and slot 36 (+0x90) show.
|
||||
let mut s = String::with_capacity(512);
|
||||
let _ = write!(&mut s, "AUDIT-DEREF vtable={:#010x}", vt);
|
||||
let mut slot: u32 = 0;
|
||||
while slot < 48 {
|
||||
let _ = write!(&mut s, " [{}]={:#010x}", slot, mem.read_u32(vt.wrapping_add(slot * 4)));
|
||||
slot += 1;
|
||||
}
|
||||
println!("{}", s);
|
||||
}
|
||||
}
|
||||
|
||||
/// M12 — diagnostic. If the live PC for HW slot `hw_id` is in
|
||||
@@ -1045,6 +1459,30 @@ impl KernelState {
|
||||
self.pending_timer_fires.first().map(|&(d, _)| d)
|
||||
}
|
||||
|
||||
/// Coherent "now" basis for deadline arithmetic — the scheduler's
|
||||
/// single monotonic `global_clock`, in BOTH execution modes.
|
||||
///
|
||||
/// Per-thread `ctx(hw_id).timebase` is NOT a sound "now" for deadline
|
||||
/// arithmetic: in `--parallel` workers extract/zero their slots while
|
||||
/// stepping unlocked, and in **lockstep** a parked/poll thread has
|
||||
/// `running_idx == None` so `ctx()` returns `idle_ctx` (timebase 0).
|
||||
/// Either way a `parse_timeout` reading the per-thread basis can see 0
|
||||
/// (or a stale value) and register `deadline = 0 + relative`, a value
|
||||
/// permanently in the past, which `coord_idle_advance` then re-arms
|
||||
/// forever (the timebase-desync livelock; the render-gate root). The
|
||||
/// `global_clock` is a deterministic function of retired guest
|
||||
/// instructions (per-round `stats.instruction_count` floor-ups in
|
||||
/// lockstep, per-block retired counts in parallel), so it is coherent,
|
||||
/// monotonic, never zero after boot, and bit-reproducible across two
|
||||
/// cold lockstep runs.
|
||||
///
|
||||
/// The `hw_id` argument is retained for call-site clarity (which slot a
|
||||
/// caller would conceptually be "asking about") but is no longer read —
|
||||
/// the basis is global.
|
||||
pub fn now_basis_at(&self, _hw_id: u8) -> u64 {
|
||||
self.scheduler.global_clock()
|
||||
}
|
||||
|
||||
/// Fire every timer whose deadline is `<= now`, where `now` is the
/// scheduler's `global_clock` obtained via [`Self::now_basis_at`] (not
/// slot 0's per-thread timebase).
|
||||
/// For each fire: mark the timer `signaled=true`, clear its
|
||||
@@ -1053,7 +1491,7 @@ impl KernelState {
|
||||
/// fired — the caller uses this to decide whether the scheduler round
|
||||
/// needs a follow-up `advance_to_next_wake_if_due` step.
|
||||
pub fn fire_due_timers(&mut self) -> bool {
|
||||
let now = self.scheduler.ctx(0).timebase;
|
||||
let now = self.now_basis_at(0);
|
||||
let mut fired = false;
|
||||
loop {
|
||||
let Some(&(deadline, handle)) = self.pending_timer_fires.first() else {
|
||||
|
||||
@@ -57,6 +57,11 @@ pub fn allocate_thread_image(
|
||||
mem.write_u32(pcr_base, tls_base);
|
||||
mem.write_u32(pcr_base + 0x2C, hw_thread_id as u32);
|
||||
mem.write_u32(pcr_base + 0x100, 0x1000);
|
||||
// +0x10C prcb_data.current_cpu — canary `pcr->prcb_data.current_cpu`
|
||||
// (PRCB@0x100 + current_cpu@0xC). Guest spin-barriers index a
|
||||
// per-HW-thread slot array by `lbz r11, 268(r13)` = this byte; it
|
||||
// must equal the HW thread id (== PCR+0x2C). See state.rs PcrWriter.
|
||||
mem.write_u8(pcr_base + 0x10C, hw_thread_id);
|
||||
mem.write_u32(pcr_base + 0x150, 0);
|
||||
|
||||
Some(ThreadImage {
|
||||
|
||||
@@ -14,6 +14,7 @@ use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, AtomicU64};
|
||||
|
||||
use xenia_gpu::draw_capture::DrawCapture;
|
||||
use xenia_gpu::texture_cache::TextureKey;
|
||||
use xenia_gpu::xenos_constants::XenosConstantsBlock;
|
||||
use xenia_hid::GamepadState;
|
||||
@@ -133,6 +134,14 @@ pub struct UiBridge {
|
||||
/// reverts to its magenta stub.
|
||||
pub publish_texture:
|
||||
Arc<dyn Fn(Option<(TextureKey, Vec<u8>)>) + Send + Sync>,
|
||||
/// iterate-3O real-render slice: at each `VdSwap`, the kernel hands the
|
||||
/// UI the per-draw geometry captured this frame (one [`DrawCapture`] per
|
||||
/// `PM4_DRAW_INDX*`), including the real guest vertex window. The UI
|
||||
/// replays them through the Xenos wgpu pipeline so the splash renders its
|
||||
/// actual geometry instead of synthetic placeholder shapes. Empty in the
|
||||
/// degenerate case (no draws or capture disabled).
|
||||
pub publish_geometry:
|
||||
Arc<dyn Fn(Vec<DrawCapture>) + Send + Sync>,
|
||||
}
|
||||
|
||||
impl UiBridge {
|
||||
@@ -182,4 +191,9 @@ impl UiBridge {
|
||||
pub fn publish_texture(&self, tex: Option<(TextureKey, Vec<u8>)>) {
    // Forward straight to the UI-installed callback.
    let publish = self.publish_texture.as_ref();
    publish(tex);
}
|
||||
|
||||
/// Hand this frame's captured per-draw geometry to the UI.
|
||||
pub fn publish_geometry(&self, caps: Vec<DrawCapture>) {
|
||||
(self.publish_geometry)(caps);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,6 +35,14 @@ pub const XAUDIO_MAX_CLIENTS: usize = 8;
|
||||
/// no-op anyway).
|
||||
pub const XAUDIO_SYNTHETIC_HANDLE_BASE: u32 = 0xF000_0000;
|
||||
|
||||
/// The scheduler's deadlock force-wake skips waiters parked solely on
|
||||
/// handles at/above [`xenia_cpu::scheduler::SYNTHETIC_PARK_HANDLE_FLOOR`]
|
||||
/// so it never destroys a parked audio worker. Keep these in lockstep:
|
||||
/// every `synthetic_park_handle` must fall inside that protected range.
|
||||
const _: () = assert!(
|
||||
XAUDIO_SYNTHETIC_HANDLE_BASE >= xenia_cpu::scheduler::SYNTHETIC_PARK_HANDLE_FLOOR
|
||||
);
|
||||
|
||||
/// Compute the synthetic park-handle for client slot `i`.
|
||||
pub const fn synthetic_park_handle(i: usize) -> u32 {
|
||||
XAUDIO_SYNTHETIC_HANDLE_BASE | (i as u32)
|
||||
@@ -68,6 +76,16 @@ pub struct XAudioClient {
|
||||
/// [audio_system.cc:225-228](../../../../xenia-canary/src/xenia/apu/audio_system.cc#L225-L228)
|
||||
/// + [audio_system.cc:139-141](../../../../xenia-canary/src/xenia/apu/audio_system.cc#L139-L141).
|
||||
pub wrapped_callback_arg: u32,
|
||||
/// Count of frames the guest has handed us via
|
||||
/// `XAudioSubmitRenderDriverFrame` for this client. Canary's
|
||||
/// `AudioSystem::SubmitFrame` forwards the sample buffer to the client's
|
||||
/// driver, whose playback completion later releases the client semaphore
|
||||
/// — the pacing our callback ticker emulates. The guest mixer
|
||||
/// (`sub_824DC350`) discards SubmitFrame's return and reads no field it
|
||||
/// writes, so this counter is purely observational (logging / liveness),
|
||||
/// never read back by the guest. Deterministic: incremented only inside
|
||||
/// the guest-driven export call.
|
||||
pub submitted_frames: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -138,6 +156,35 @@ impl XAudioState {
|
||||
self.clients.get(index).copied().flatten()
|
||||
}
|
||||
|
||||
/// Faithful counterpart to canary `AudioSystem::SubmitFrame`: the guest
|
||||
/// driver client `index` handed us one frame of samples. Canary forwards
|
||||
/// `samples` to the client's `AudioDriver`, whose playback-completion
|
||||
/// callback later releases the client semaphore — the buffer-consumed
|
||||
/// pacing our [`tick_instr`]/[`try_inject_audio_callback`] path already
|
||||
/// emulates. SubmitFrame itself returns void and the guest mixer
|
||||
/// (`sub_824DC350`) reads no field from it, so all we faithfully need to
|
||||
/// do is validate the client and account the frame. Returns `true` iff
|
||||
/// `index` is a registered client (canary submits silence / warns
|
||||
/// otherwise). Deterministic — only the guest-driven export mutates this.
|
||||
pub fn record_submit(&mut self, index: usize) -> bool {
|
||||
match self.clients.get_mut(index) {
|
||||
Some(Some(c)) => {
|
||||
c.submitted_frames = c.submitted_frames.saturating_add(1);
|
||||
true
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn submitted_frames(&self, index: usize) -> u64 {
|
||||
self.clients
|
||||
.get(index)
|
||||
.copied()
|
||||
.flatten()
|
||||
.map(|c| c.submitted_frames)
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
pub fn any_registered(&self) -> bool {
|
||||
self.clients.iter().any(|c| c.is_some())
|
||||
}
|
||||
@@ -230,6 +277,7 @@ mod tests {
|
||||
callback_pc: 0x8200_0000 + arg,
|
||||
callback_arg: arg,
|
||||
wrapped_callback_arg: 0x4000_0000 + arg,
|
||||
submitted_frames: 0,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -89,6 +89,14 @@ pub struct GuestMemory {
|
||||
mem_watch_addrs: Vec<u32>,
|
||||
/// Count of fires observed (for tests / hand-off telemetry).
|
||||
mem_watch_count: AtomicU64,
|
||||
/// Monotonic count of MMIO accesses (every scalar load/store that
|
||||
/// resolves to a registered MMIO region bumps this by 1). A pure,
|
||||
/// deterministic function of guest execution — the superblock runner
|
||||
/// samples it before/after each block to detect an MMIO touch and
|
||||
/// end the run there (so MMIO ordering vs other HW threads stays at
|
||||
/// the same fine lockstep granularity as before). Relaxed because the
|
||||
/// lockstep path is single-threaded and only needs monotonicity.
|
||||
mmio_access_count: AtomicU64,
|
||||
}
|
||||
|
||||
/// Greatest common bit-mask such that `(a & m) == (b & m)` for every bit
|
||||
@@ -133,9 +141,26 @@ impl GuestMemory {
|
||||
writes_total: AtomicU64::new(0),
|
||||
mem_watch_addrs: Vec::new(),
|
||||
mem_watch_count: AtomicU64::new(0),
|
||||
mmio_access_count: AtomicU64::new(0),
|
||||
})
|
||||
}
|
||||
|
||||
/// Monotonic count of MMIO accesses since boot. Used by the superblock
|
||||
/// runner to detect that a just-executed block touched MMIO (so it can
|
||||
/// end the superblock there and keep MMIO ordering at lockstep
|
||||
/// granularity). Deterministic function of guest execution.
|
||||
#[inline]
|
||||
pub fn mmio_access_count(&self) -> u64 {
|
||||
self.mmio_access_count
|
||||
.load(std::sync::atomic::Ordering::Relaxed)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn bump_mmio_access(&self) {
|
||||
self.mmio_access_count
|
||||
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Current version watermark for the page containing `addr`. Bumped by
|
||||
/// any write through `write_u8/16/32/64`. Not affected by MMIO writes
|
||||
/// (those don't touch the backing texture memory).
|
||||
@@ -357,7 +382,8 @@ impl GuestMemory {
|
||||
/// from `GuestMemory` without a wider plumbing change).
|
||||
pub fn write_bulk(&self, addr: u32, buf: &[u8]) {
|
||||
let len = buf.len() as u32;
|
||||
let old_lane = self.capture_mem_watch_old(addr, len);
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, len) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(buf.as_ptr(), ptr, buf.len());
|
||||
@@ -374,7 +400,7 @@ impl GuestMemory {
|
||||
// the page works.
|
||||
self.bump_page_version(page * PAGE_SIZE);
|
||||
}
|
||||
self.check_mem_watch(addr, len, old_lane);
|
||||
if watch { self.check_mem_watch(addr, len, old_lane); }
|
||||
}
|
||||
|
||||
/// Check if a guest address has been allocated/committed. Acquire load
|
||||
@@ -487,6 +513,7 @@ impl MemoryAccess for GuestMemory {
|
||||
// MMIO dispatch must come first — a byte read at an MMIO-mapped
|
||||
// address should invoke the callback, not the backing memory.
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
return (mmio.read_callback)(addr) as u8;
|
||||
}
|
||||
if !self.is_mapped(addr) { return 0; }
|
||||
@@ -497,6 +524,7 @@ impl MemoryAccess for GuestMemory {
|
||||
#[inline]
|
||||
fn read_u16(&self, addr: u32) -> u16 {
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
(mmio.read_callback)(addr) as u16
|
||||
} else if !self.is_mapped(addr) {
|
||||
0
|
||||
@@ -509,6 +537,7 @@ impl MemoryAccess for GuestMemory {
|
||||
#[inline]
|
||||
fn read_u32(&self, addr: u32) -> u32 {
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
(mmio.read_callback)(addr)
|
||||
} else if !self.is_mapped(addr) {
|
||||
0
|
||||
@@ -521,6 +550,7 @@ impl MemoryAccess for GuestMemory {
|
||||
#[inline]
|
||||
fn read_u64(&self, addr: u32) -> u64 {
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
let hi = (mmio.read_callback)(addr) as u64;
|
||||
let lo = (mmio.read_callback)(addr.wrapping_add(4)) as u64;
|
||||
(hi << 32) | lo
|
||||
@@ -536,23 +566,31 @@ impl MemoryAccess for GuestMemory {
|
||||
// MMIO dispatch first — a byte write at an MMIO-mapped address
|
||||
// must invoke the callback, not the backing memory.
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
(mmio.write_callback)(addr, val as u32);
|
||||
return;
|
||||
}
|
||||
if !self.is_mapped(addr) { return; }
|
||||
let old_lane = self.capture_mem_watch_old(addr, 1);
|
||||
// Perf (Tier-A #1): the mem-watch capture/report pair are out-of-line
|
||||
// calls; on the common (no-watch) path each was a real call that
|
||||
// immediately returned. Gate both behind one predicted branch so the
|
||||
// hot store does no call work unless a watch is actually armed.
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, 1) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe { *ptr = val };
|
||||
self.bump_page_version(addr);
|
||||
self.check_mem_watch(addr, 1, old_lane);
|
||||
if watch { self.check_mem_watch(addr, 1, old_lane); }
|
||||
}
|
||||
|
||||
fn write_u16(&self, addr: u32, val: u16) {
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
(mmio.write_callback)(addr, val as u32);
|
||||
} else if !self.is_mapped(addr) {
|
||||
} else {
|
||||
let old_lane = self.capture_mem_watch_old(addr, 2);
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, 2) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 2);
|
||||
@@ -564,16 +602,18 @@ impl MemoryAccess for GuestMemory {
|
||||
if (addr & 0xFFF) >= (PAGE_SIZE - 1) {
|
||||
self.bump_page_version(addr.wrapping_add(1));
|
||||
}
|
||||
self.check_mem_watch(addr, 2, old_lane);
|
||||
if watch { self.check_mem_watch(addr, 2, old_lane); }
|
||||
}
|
||||
}
|
||||
|
||||
fn write_u32(&self, addr: u32, val: u32) {
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
(mmio.write_callback)(addr, val);
|
||||
} else if !self.is_mapped(addr) {
|
||||
} else {
|
||||
let old_lane = self.capture_mem_watch_old(addr, 4);
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, 4) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 4);
|
||||
@@ -582,17 +622,19 @@ impl MemoryAccess for GuestMemory {
|
||||
if (addr & 0xFFF) >= (PAGE_SIZE - 3) {
|
||||
self.bump_page_version(addr.wrapping_add(3));
|
||||
}
|
||||
self.check_mem_watch(addr, 4, old_lane);
|
||||
if watch { self.check_mem_watch(addr, 4, old_lane); }
|
||||
}
|
||||
}
|
||||
|
||||
fn write_u64(&self, addr: u32, val: u64) {
|
||||
if let Some(mmio) = self.find_mmio(addr) {
|
||||
self.bump_mmio_access();
|
||||
(mmio.write_callback)(addr, (val >> 32) as u32);
|
||||
(mmio.write_callback)(addr.wrapping_add(4), val as u32);
|
||||
} else if !self.is_mapped(addr) {
|
||||
} else {
|
||||
let old_lane = self.capture_mem_watch_old(addr, 8);
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, 8) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 8);
|
||||
@@ -601,7 +643,7 @@ impl MemoryAccess for GuestMemory {
|
||||
if (addr & 0xFFF) >= (PAGE_SIZE - 7) {
|
||||
self.bump_page_version(addr.wrapping_add(7));
|
||||
}
|
||||
self.check_mem_watch(addr, 8, old_lane);
|
||||
if watch { self.check_mem_watch(addr, 8, old_lane); }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -181,10 +181,11 @@ impl App {
|
||||
y += line_h;
|
||||
let (fbw, fbh) = rs.frontbuffer_size();
|
||||
let render_line = format!(
|
||||
"Render: xdispatch: xlated={:>5} interp={:>5} xlated-pipelines={:>3} tex-cache={:>3} fb={}x{}",
|
||||
"Render: xdispatch: xlated={:>5} interp={:>5} xlated-pipelines={:>3} real-geo={:>5} tex-cache={:>3} fb={}x{}",
|
||||
rs.xenos_dispatches_translator,
|
||||
rs.xenos_dispatches_interpreter,
|
||||
rs.translated_pipeline_count(),
|
||||
rs.real_geometry_draws(),
|
||||
rs.host_texture_count(),
|
||||
fbw,
|
||||
fbh,
|
||||
@@ -368,53 +369,28 @@ impl ApplicationHandler<SwapEvent> for App {
|
||||
.map(|s| s.frame_index)
|
||||
.unwrap_or(0);
|
||||
if frame_idx != self.last_xenos_swap_frame {
|
||||
rs.clear_frontbuffer([0.04, 0.04, 0.06, 1.0]);
|
||||
// iterate-3AE: clear to BLACK, matching canary's
|
||||
// splash background. The old navy `[0.04,0.04,0.06]`
|
||||
// was an iterate-3S debug placeholder never matched
|
||||
// to the guest. The splash background-fill draw is a
|
||||
// full-screen Xbox-360 RectangleList (3 verts → a HW
|
||||
// rectangle covering the whole screen); the UI replay
|
||||
// draws it as a single triangle (the 4th implied
|
||||
// corner isn't synthesized), so only the diagonal
|
||||
// half is covered. With a navy clear the uncovered
|
||||
// half showed a navy diagonal in the brief
|
||||
// pre/inter-logo transition frames (where that fill
|
||||
// is the only coverage). Canary's background there is
|
||||
// black, and the guest's fill itself resolves to
|
||||
// black, so a black clear makes the uncovered half
|
||||
// match — the transition is uniformly black like the
|
||||
// oracle. (Full RectangleList→rectangle expansion is
|
||||
// the deeper fix and a separate follow-up; under a
|
||||
// black clear the half-coverage is invisible.)
|
||||
rs.clear_frontbuffer([0.0, 0.0, 0.0, 1.0]);
|
||||
self.last_xenos_swap_frame = frame_idx;
|
||||
}
|
||||
let delta = (draws_total - already) as u32;
|
||||
let (verts_hint, prim_kind, vs_key, ps_key) = self
|
||||
.last_swap_info
|
||||
.map(|s| {
|
||||
(
|
||||
s.last_draw_vertex_count.max(3),
|
||||
s.last_draw_prim,
|
||||
s.vs_blob_key,
|
||||
s.ps_blob_key,
|
||||
)
|
||||
})
|
||||
.unwrap_or((3, 4, 0, 0));
|
||||
// Look up blobs + constants from the bridge and
|
||||
// pack into the WGSL-interpreter layout. Empty
|
||||
// slices produce zero-clause packed buffers — the
|
||||
// WGSL walker short-circuits and the placeholder
|
||||
// export path still renders.
|
||||
let raw_vs: Vec<u32> = self
|
||||
.handles
|
||||
.shader_blobs
|
||||
.lock()
|
||||
.ok()
|
||||
.and_then(|g| g.get(&vs_key).cloned())
|
||||
.unwrap_or_default();
|
||||
let raw_ps: Vec<u32> = self
|
||||
.handles
|
||||
.shader_blobs
|
||||
.lock()
|
||||
.ok()
|
||||
.and_then(|g| g.get(&ps_key).cloned())
|
||||
.unwrap_or_default();
|
||||
let parsed_vs = xenia_gpu::ucode::parse_shader(&raw_vs);
|
||||
let parsed_ps = xenia_gpu::ucode::parse_shader(&raw_ps);
|
||||
// First time we see a blob key, run the static
|
||||
// metrics analyzer. Keyed on (stage_tag, blob_key)
|
||||
// because the guest can reuse a key across stages.
|
||||
if self.seen_shader_blobs.insert((0u8, vs_key)) {
|
||||
xenia_gpu::shader_metrics::emit_for(&parsed_vs, "vs");
|
||||
}
|
||||
if self.seen_shader_blobs.insert((1u8, ps_key)) {
|
||||
xenia_gpu::shader_metrics::emit_for(&parsed_ps, "ps");
|
||||
}
|
||||
let vs_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_vs);
|
||||
let ps_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_ps);
|
||||
let constants = self
|
||||
.handles
|
||||
.xenos_constants
|
||||
@@ -431,19 +407,72 @@ impl ApplicationHandler<SwapEvent> for App {
|
||||
.ok()
|
||||
.and_then(|g| g.clone());
|
||||
rs.bind_primary_texture(tex_payload);
|
||||
rs.dispatch_xenos_draws(
|
||||
already,
|
||||
delta,
|
||||
verts_hint,
|
||||
prim_kind,
|
||||
vs_key,
|
||||
ps_key,
|
||||
&parsed_vs,
|
||||
&parsed_ps,
|
||||
&vs_packed,
|
||||
&ps_packed,
|
||||
&constants,
|
||||
);
|
||||
|
||||
// iterate-3O real-render slice: prefer replaying the
|
||||
// *real* captured guest geometry. The kernel publishes
|
||||
// one `DrawCapture` per `PM4_DRAW_INDX*` this frame
|
||||
// (real vertices + prim type + shader keys). Fall back
|
||||
// to the legacy synthetic dispatch only when no capture
|
||||
// is available (e.g. capture disabled), so we never
|
||||
// regress to a blank screen.
|
||||
let captures: Vec<xenia_gpu::draw_capture::DrawCapture> = self
|
||||
.handles
|
||||
.geometry
|
||||
.lock()
|
||||
.map(|g| g.clone())
|
||||
.unwrap_or_default();
|
||||
let blobs: std::collections::HashMap<u32, Vec<u32>> = self
|
||||
.handles
|
||||
.shader_blobs
|
||||
.lock()
|
||||
.map(|g| g.clone())
|
||||
.unwrap_or_default();
|
||||
if !captures.is_empty() {
|
||||
rs.dispatch_xenos_captures(
|
||||
&captures,
|
||||
&blobs,
|
||||
&constants,
|
||||
&mut self.seen_shader_blobs,
|
||||
);
|
||||
} else {
|
||||
// Legacy synthetic-geometry fallback (placeholder).
|
||||
let (verts_hint, prim_kind, vs_key, ps_key) = self
|
||||
.last_swap_info
|
||||
.map(|s| {
|
||||
(
|
||||
s.last_draw_vertex_count.max(3),
|
||||
s.last_draw_prim,
|
||||
s.vs_blob_key,
|
||||
s.ps_blob_key,
|
||||
)
|
||||
})
|
||||
.unwrap_or((3, 4, 0, 0));
|
||||
let raw_vs = blobs.get(&vs_key).cloned().unwrap_or_default();
|
||||
let raw_ps = blobs.get(&ps_key).cloned().unwrap_or_default();
|
||||
let parsed_vs = xenia_gpu::ucode::parse_shader(&raw_vs);
|
||||
let parsed_ps = xenia_gpu::ucode::parse_shader(&raw_ps);
|
||||
if self.seen_shader_blobs.insert((0u8, vs_key)) {
|
||||
xenia_gpu::shader_metrics::emit_for(&parsed_vs, "vs");
|
||||
}
|
||||
if self.seen_shader_blobs.insert((1u8, ps_key)) {
|
||||
xenia_gpu::shader_metrics::emit_for(&parsed_ps, "ps");
|
||||
}
|
||||
let vs_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_vs);
|
||||
let ps_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_ps);
|
||||
rs.dispatch_xenos_draws(
|
||||
already,
|
||||
delta,
|
||||
verts_hint,
|
||||
prim_kind,
|
||||
vs_key,
|
||||
ps_key,
|
||||
&parsed_vs,
|
||||
&parsed_ps,
|
||||
&vs_packed,
|
||||
&ps_packed,
|
||||
&constants,
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Self::ingest_frontbuffer(
|
||||
|
||||
@@ -18,6 +18,7 @@ use std::sync::Mutex;
|
||||
|
||||
use crossbeam_utils::atomic::AtomicCell;
|
||||
use winit::event_loop::EventLoopProxy;
|
||||
use xenia_gpu::draw_capture::DrawCapture;
|
||||
use xenia_gpu::texture_cache::TextureKey;
|
||||
use xenia_gpu::xenos_constants::XenosConstantsBlock;
|
||||
use xenia_hid::GamepadState;
|
||||
@@ -66,6 +67,10 @@ pub struct UiHandles {
|
||||
/// fetch-constant slot 0 into linear bytes that the UI should
|
||||
/// upload into the host cache and bind at `@group(1) @binding(0)`.
|
||||
pub primary_texture: Arc<Mutex<Option<(TextureKey, Vec<u8>)>>>,
|
||||
/// iterate-3O: the most recent frame's captured per-draw geometry. The
|
||||
/// redraw path drains this to replay real guest draws. Replaced wholesale
|
||||
/// each `VdSwap`.
|
||||
pub geometry: Arc<Mutex<Vec<DrawCapture>>>,
|
||||
}
|
||||
|
||||
/// Swap event posted by the CPU-side `VdSwap` handler via
|
||||
@@ -89,6 +94,7 @@ pub fn build(proxy: EventLoopProxy<SwapEvent>) -> (UiHandles, UiBridge) {
|
||||
let xenos_constants = Arc::new(Mutex::new(XenosConstantsBlock::default()));
|
||||
let primary_texture: Arc<Mutex<Option<(TextureKey, Vec<u8>)>>> =
|
||||
Arc::new(Mutex::new(None));
|
||||
let geometry: Arc<Mutex<Vec<DrawCapture>>> = Arc::new(Mutex::new(Vec::new()));
|
||||
|
||||
let kernel_bridge = UiBridge {
|
||||
gamepad: {
|
||||
@@ -144,6 +150,14 @@ pub fn build(proxy: EventLoopProxy<SwapEvent>) -> (UiHandles, UiBridge) {
|
||||
}
|
||||
})
|
||||
},
|
||||
publish_geometry: {
|
||||
let geo = Arc::clone(&geometry);
|
||||
Arc::new(move |caps| {
|
||||
if let Ok(mut lock) = geo.lock() {
|
||||
*lock = caps;
|
||||
}
|
||||
})
|
||||
},
|
||||
};
|
||||
|
||||
let handles = UiHandles {
|
||||
@@ -155,6 +169,7 @@ pub fn build(proxy: EventLoopProxy<SwapEvent>) -> (UiHandles, UiBridge) {
|
||||
shader_blobs,
|
||||
xenos_constants,
|
||||
primary_texture,
|
||||
geometry,
|
||||
};
|
||||
(handles, kernel_bridge)
|
||||
}
|
||||
|
||||
@@ -84,6 +84,9 @@ pub struct RenderState {
|
||||
/// the shader, or (c) we're running the slow interpreter path.
|
||||
pub xenos_dispatches_translator: u64,
|
||||
pub xenos_dispatches_interpreter: u64,
|
||||
/// iterate-3O: running total of replayed draws that carried a real guest
|
||||
/// vertex window (vs. the procedural fallback). Surfaced on the HUD.
|
||||
real_geometry_draws: u64,
|
||||
/// One-shot latch so we emit a tracing::info! on the **first** real
|
||||
/// draw dispatch rather than spamming every frame. Pairs with the
|
||||
/// "first translator compile" latch below.
|
||||
@@ -447,6 +450,7 @@ impl RenderState {
|
||||
fallback_rgb: [0.06, 0.06, 0.09],
|
||||
xenos_pipeline,
|
||||
xenos_draws_rendered: 0,
|
||||
real_geometry_draws: 0,
|
||||
xenos_dispatches_translator: 0,
|
||||
xenos_dispatches_interpreter: 0,
|
||||
first_dispatch_logged: false,
|
||||
@@ -657,26 +661,39 @@ impl RenderState {
|
||||
draw_index: idx,
|
||||
vertex_count: vertex_count_hint.max(3),
|
||||
prim_kind,
|
||||
// Synthetic fallback path: no real vertex window.
|
||||
vertex_base_dwords: 0,
|
||||
// No real geometry → no NDC transform (procedural positions are
|
||||
// already in clip space).
|
||||
ndc_scale: [0.0, 0.0],
|
||||
ndc_offset: [0.0, 0.0],
|
||||
};
|
||||
// Synthetic visualizer path (legacy): no captured render state, so
|
||||
// use the opaque default.
|
||||
let rstate = crate::xenos_pipeline::RenderState::OPAQUE;
|
||||
if use_translated
|
||||
&& let Some(p) = self.xenos_pipeline.translated_pipeline(vs_key, ps_key) {
|
||||
self.xenos_pipeline.render_one_with_pipeline(
|
||||
&self.queue,
|
||||
&mut encoder,
|
||||
&self.frontbuffer_view,
|
||||
req,
|
||||
p,
|
||||
);
|
||||
metrics::counter!("gpu.shader.use", "path" => "translator")
|
||||
.increment(1);
|
||||
served_translator += 1;
|
||||
continue;
|
||||
}
|
||||
&& self.xenos_pipeline.render_one_translated(
|
||||
&self.device,
|
||||
&self.queue,
|
||||
&mut encoder,
|
||||
&self.frontbuffer_view,
|
||||
req,
|
||||
vs_key,
|
||||
ps_key,
|
||||
rstate,
|
||||
)
|
||||
{
|
||||
metrics::counter!("gpu.shader.use", "path" => "translator").increment(1);
|
||||
served_translator += 1;
|
||||
continue;
|
||||
}
|
||||
self.xenos_pipeline.render_one(
|
||||
&self.device,
|
||||
&self.queue,
|
||||
&mut encoder,
|
||||
&self.frontbuffer_view,
|
||||
req,
|
||||
rstate,
|
||||
);
|
||||
metrics::counter!("gpu.shader.use", "path" => "interpreter").increment(1);
|
||||
served_interpreter += 1;
|
||||
@@ -707,12 +724,201 @@ impl RenderState {
|
||||
}
|
||||
}
|
||||
|
||||
/// iterate-3O real-render slice: replay a batch of *real* captured guest
|
||||
/// draws. Unlike [`dispatch_xenos_draws`] (synthetic placeholder geometry),
|
||||
/// each [`DrawCapture`] carries the actual guest vertex window, primitive
|
||||
/// type, host vertex count, and the real (vs, ps) keys. Per capture we:
|
||||
/// 1. upload the captured guest vertex bytes into `vertex_buffer` (b4),
|
||||
/// 2. upload the matching VS/PS microcode + per-frame constants,
|
||||
/// 3. render through the translated (P7) pipeline if it compiled, else
|
||||
/// the interpreter — with `vertex_base_dwords` set so the shader
|
||||
/// rebases its absolute fetch address into the uploaded window.
|
||||
///
|
||||
/// Returns the number of captures that had a real vertex window (vs. the
|
||||
/// procedural fallback), for HUD reporting. `shader_blobs` / `constants`
|
||||
/// come from the bridge; `seen` records which blobs have had static
|
||||
/// metrics emitted (one-shot per blob, matching the legacy path).
|
||||
pub fn dispatch_xenos_captures(
|
||||
&mut self,
|
||||
captures: &[xenia_gpu::draw_capture::DrawCapture],
|
||||
shader_blobs: &std::collections::HashMap<u32, Vec<u32>>,
|
||||
constants: &xenia_gpu::xenos_constants::XenosConstantsBlock,
|
||||
seen: &mut std::collections::HashSet<(u8, u32)>,
|
||||
) -> u32 {
|
||||
if captures.is_empty() {
|
||||
return 0;
|
||||
}
|
||||
let mut real_count = 0u32;
|
||||
// iterate-3X (GPUBUG-111): each captured draw uploads its OWN vertex
|
||||
// window + per-draw constants + shader via `queue.write_buffer`. In
|
||||
// wgpu all `write_buffer` calls staged before a single `queue.submit`
|
||||
// are applied *before any* command in that submit executes — so a single
|
||||
// encoder for the whole batch made every draw read only the LAST draw's
|
||||
// vertex buffer / uniforms (the splash logo quad sampled the fullscreen
|
||||
// background quad's vertices → nothing rendered where the logo was).
|
||||
// Submit ONE encoder PER draw so each draw's writes land before its own
|
||||
// pass. The frontbuffer uses `LoadOp::Load`, so per-draw submits still
|
||||
// composite over each other exactly like before.
|
||||
for cap in captures {
|
||||
// iterate-3T: bind this draw's REAL decoded texture (keyed off the
|
||||
// active PS's tfetch slot, attached in `gpu_system`) so the textured
|
||||
// logo samples the artwork. `None` reverts to the magenta stub for
|
||||
// flat draws. Each `set_texture_view` rebuilds the tex bind group;
|
||||
// the subsequent `render_one*` reads it, so per-draw binding works
|
||||
// even though all draws share one encoder.
|
||||
{
|
||||
let Self {
|
||||
device,
|
||||
queue,
|
||||
xenos_pipeline,
|
||||
host_texture_cache,
|
||||
..
|
||||
} = self;
|
||||
match cap.textures.first() {
|
||||
Some((key, version, bytes)) => {
|
||||
// iterate-3AD: use the decoder's real content `version`
|
||||
// (from `span_max_version`) so the host cache re-uploads
|
||||
// when the guest fills MORE of an evolving atlas. The
|
||||
// publisher and the 2nd splash logo share one K8888
|
||||
// surface (base 0x4dbee000); the 2nd logo's texels land
|
||||
// AFTER the first upload. With the old hardcoded
|
||||
// `version_when_uploaded = 1`, the same `TextureKey`
|
||||
// never re-uploaded, so the 2nd logo sampled its (then
|
||||
// still-zero) atlas region as black. The real version
|
||||
// increases as the guest writes, triggering re-upload.
|
||||
let cached = xenia_gpu::texture_cache::CachedTexture {
|
||||
key: *key,
|
||||
version_when_uploaded: *version,
|
||||
bytes: bytes.clone(),
|
||||
};
|
||||
host_texture_cache.upload(device, queue, &cached);
|
||||
if let Some(view) = host_texture_cache.view_for(key) {
|
||||
xenos_pipeline.set_texture_view(device, Some(view));
|
||||
}
|
||||
}
|
||||
None => xenos_pipeline.set_texture_view(device, None),
|
||||
}
|
||||
}
|
||||
let raw_vs = shader_blobs.get(&cap.vs_key).cloned().unwrap_or_default();
|
||||
let raw_ps = shader_blobs.get(&cap.ps_key).cloned().unwrap_or_default();
|
||||
let parsed_vs = xenia_gpu::ucode::parse_shader(&raw_vs);
|
||||
let parsed_ps = xenia_gpu::ucode::parse_shader(&raw_ps);
|
||||
if seen.insert((0u8, cap.vs_key)) {
|
||||
xenia_gpu::shader_metrics::emit_for(&parsed_vs, "vs");
|
||||
}
|
||||
if seen.insert((1u8, cap.ps_key)) {
|
||||
xenia_gpu::shader_metrics::emit_for(&parsed_ps, "ps");
|
||||
}
|
||||
let vs_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_vs);
|
||||
let ps_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_ps);
|
||||
// Upload this draw's shader + constants + real vertex window.
|
||||
self.xenos_pipeline.upload_shader_and_constants(
|
||||
&self.queue,
|
||||
&vs_packed,
|
||||
&ps_packed,
|
||||
constants,
|
||||
);
|
||||
if cap.has_real_vertices && !cap.vertex_dwords.is_empty() {
|
||||
self.xenos_pipeline
|
||||
.upload_vertex_data(&self.queue, &cap.vertex_dwords);
|
||||
real_count += 1;
|
||||
}
|
||||
let use_translated = cap.vs_key != 0
|
||||
&& cap.ps_key != 0
|
||||
&& ensure_translated_pipeline(
|
||||
&mut self.xenos_pipeline,
|
||||
&self.device,
|
||||
cap.vs_key,
|
||||
cap.ps_key,
|
||||
&parsed_vs,
|
||||
&parsed_ps,
|
||||
);
|
||||
let base = if cap.has_real_vertices {
|
||||
cap.window_base_dwords
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let req = DrawRequest {
|
||||
draw_index: cap.draw_index,
|
||||
vertex_count: cap.host_vertex_count.max(3),
|
||||
prim_kind: cap.prim_code,
|
||||
vertex_base_dwords: base,
|
||||
// iterate-3S: apply the per-draw guest viewport → host NDC
|
||||
// transform only when we have real geometry (otherwise the
|
||||
// procedural fallback already emits clip-space positions).
|
||||
ndc_scale: if cap.has_real_vertices { cap.ndc_scale } else { [0.0, 0.0] },
|
||||
ndc_offset: if cap.has_real_vertices { cap.ndc_offset } else { [0.0, 0.0] },
|
||||
};
|
||||
// iterate-3Y: replay this draw's real color/blend/write-mask state
|
||||
// (captured from `RB_BLENDCONTROL0` / `RB_COLOR_MASK`) so overlays
|
||||
// composite the way the guest intends instead of opaquely
|
||||
// overwriting the logo.
|
||||
let rstate = crate::xenos_pipeline::RenderState {
|
||||
blend_control: cap.blend_control,
|
||||
color_mask: cap.color_mask,
|
||||
};
|
||||
let mut encoder = self
|
||||
.device
|
||||
.create_command_encoder(&wgpu::CommandEncoderDescriptor {
|
||||
label: Some("xenos capture replay (per-draw)"),
|
||||
});
|
||||
let served_translated = use_translated
|
||||
&& self.xenos_pipeline.render_one_translated(
|
||||
&self.device,
|
||||
&self.queue,
|
||||
&mut encoder,
|
||||
&self.frontbuffer_view,
|
||||
req,
|
||||
cap.vs_key,
|
||||
cap.ps_key,
|
||||
rstate,
|
||||
);
|
||||
if served_translated {
|
||||
self.xenos_dispatches_translator =
|
||||
self.xenos_dispatches_translator.saturating_add(1);
|
||||
} else {
|
||||
self.xenos_pipeline.render_one(
|
||||
&self.device,
|
||||
&self.queue,
|
||||
&mut encoder,
|
||||
&self.frontbuffer_view,
|
||||
req,
|
||||
rstate,
|
||||
);
|
||||
self.xenos_dispatches_interpreter =
|
||||
self.xenos_dispatches_interpreter.saturating_add(1);
|
||||
}
|
||||
self.queue.submit(std::iter::once(encoder.finish()));
|
||||
}
|
||||
self.xenos_draws_rendered = self
|
||||
.xenos_draws_rendered
|
||||
.saturating_add(captures.len() as u64);
|
||||
self.real_geometry_draws = self
|
||||
.real_geometry_draws
|
||||
.saturating_add(real_count as u64);
|
||||
if !self.first_dispatch_logged {
|
||||
self.first_dispatch_logged = true;
|
||||
tracing::info!(
|
||||
captures = captures.len(),
|
||||
real_vertex_draws = real_count,
|
||||
"first Xenos capture batch replayed (real geometry)"
|
||||
);
|
||||
}
|
||||
real_count
|
||||
}
|
||||
|
||||
/// Count of distinct translator pipelines compiled so far. Surfaced
|
||||
/// on the HUD as `xlated-pipelines=N` to make "is P7 working?" observable.
|
||||
pub fn translated_pipeline_count(&self) -> usize {
|
||||
self.xenos_pipeline.translated_pipeline_count()
|
||||
}
|
||||
|
||||
/// Running count of captured draws that carried a real vertex window
|
||||
/// (surfaced on the HUD as `real-geo=N`). Updated by [`dispatch_xenos_captures`],
/// which adds each batch's count with a saturating add, so the value is
/// cumulative across frames rather than per-frame.
|
||||
pub fn real_geometry_draws(&self) -> u64 {
|
||||
self.real_geometry_draws
|
||||
}
|
||||
|
||||
/// Clear the frontbuffer to `[r,g,b,a]` in linear space. Matches the
|
||||
/// fallback clear the outer swapchain render does so the two stages
|
||||
/// agree on "no draws yet = dark navy".
|
||||
|
||||
@@ -36,7 +36,142 @@ struct DrawConstants {
|
||||
draw_index: u32,
|
||||
vertex_count: u32,
|
||||
prim_kind: u32,
|
||||
_pad: u32,
|
||||
/// iterate-3O: guest dword base of the uploaded `vertex_buffer` window.
|
||||
/// The WGSL subtracts this from the absolute vertex-fetch address.
|
||||
vertex_base_dwords: u32,
|
||||
/// iterate-3S: guest→host NDC XY transform (mirrors canary
|
||||
/// `GetHostViewportInfo`). `clip.xy = pos.xy * ndc_scale + ndc_offset*pos.w`.
|
||||
/// Y is pre-flipped for wgpu. 16 bytes so the block stays 16-byte aligned.
|
||||
ndc_scale: [f32; 2],
|
||||
ndc_offset: [f32; 2],
|
||||
}
|
||||
|
||||
/// iterate-3Y: the per-draw host color/blend/write-mask render state, decoded
|
||||
/// from the guest registers (`RB_BLENDCONTROL0` / `RB_COLOR_MASK`). Used both
|
||||
/// as part of the pipeline-cache key and to build the `wgpu::ColorTargetState`.
|
||||
/// Mirrors canary's `GetColorBlendStateForRenderTarget` (D3D12
|
||||
/// `pipeline_cache.cc`): the factors come straight from `RB_BLENDCONTROL`,
|
||||
/// and a zero write-mask forces the no-blend `One,Zero` equation.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
|
||||
pub struct RenderState {
|
||||
/// `RB_BLENDCONTROL0` raw value (RT0). `0x00010001` (One,Zero / One,Zero,
|
||||
/// Add) is the opaque case.
/// Bit layout as decoded by `color_target`: color src factor = bits 0..=4,
/// color op = bits 5..=7, color dst factor = bits 8..=12, alpha src factor =
/// bits 16..=20, alpha op = bits 21..=23, alpha dst factor = bits 24..=28.
|
||||
pub blend_control: u32,
|
||||
/// RT0 nibble of `RB_COLOR_MASK` (bit0=R … bit3=A). 0 = write nothing.
/// Only the low four bits are consulted when building the write mask.
|
||||
pub color_mask: u8,
|
||||
}
|
||||
|
||||
impl RenderState {
|
||||
/// Fully-opaque, all-channels state (the legacy fixed behaviour). Used for
|
||||
/// procedural/synthetic draws that have no captured guest state.
|
||||
pub const OPAQUE: RenderState = RenderState {
|
||||
blend_control: 0x0001_0001,
|
||||
color_mask: 0xF,
|
||||
};
|
||||
|
||||
/// Map a Xenos `BlendFactor` (5-bit field) to a wgpu `BlendFactor`,
|
||||
/// mirroring canary `kBlendFactorMap` (D3D12 `pipeline_cache.cc:1504`).
|
||||
fn map_factor(f: u32) -> wgpu::BlendFactor {
|
||||
match f {
|
||||
0 => wgpu::BlendFactor::Zero,
|
||||
1 => wgpu::BlendFactor::One,
|
||||
4 => wgpu::BlendFactor::Src,
|
||||
5 => wgpu::BlendFactor::OneMinusSrc,
|
||||
6 => wgpu::BlendFactor::SrcAlpha,
|
||||
7 => wgpu::BlendFactor::OneMinusSrcAlpha,
|
||||
8 => wgpu::BlendFactor::Dst,
|
||||
9 => wgpu::BlendFactor::OneMinusDst,
|
||||
10 => wgpu::BlendFactor::DstAlpha,
|
||||
11 => wgpu::BlendFactor::OneMinusDstAlpha,
|
||||
12 => wgpu::BlendFactor::Constant,
|
||||
13 => wgpu::BlendFactor::OneMinusConstant,
|
||||
14 => wgpu::BlendFactor::Constant,
|
||||
15 => wgpu::BlendFactor::OneMinusConstant,
|
||||
16 => wgpu::BlendFactor::SrcAlphaSaturated,
|
||||
// 2/3 and >16 are undefined on Xenos; canary maps to Zero.
|
||||
_ => wgpu::BlendFactor::Zero,
|
||||
}
|
||||
}
|
||||
|
||||
/// Map a Xenos `BlendFactor` for the *alpha* channel, mirroring canary
|
||||
/// `kBlendFactorAlphaMap` (color-mode factors collapse to alpha).
|
||||
fn map_factor_alpha(f: u32) -> wgpu::BlendFactor {
|
||||
match f {
|
||||
4 => wgpu::BlendFactor::SrcAlpha,
|
||||
5 => wgpu::BlendFactor::OneMinusSrcAlpha,
|
||||
8 => wgpu::BlendFactor::DstAlpha,
|
||||
9 => wgpu::BlendFactor::OneMinusDstAlpha,
|
||||
other => Self::map_factor(other),
|
||||
}
|
||||
}
|
||||
|
||||
fn map_op(o: u32) -> wgpu::BlendOperation {
|
||||
match o {
|
||||
0 => wgpu::BlendOperation::Add,
|
||||
1 => wgpu::BlendOperation::Subtract,
|
||||
2 => wgpu::BlendOperation::Min,
|
||||
3 => wgpu::BlendOperation::Max,
|
||||
4 => wgpu::BlendOperation::ReverseSubtract,
|
||||
_ => wgpu::BlendOperation::Add,
|
||||
}
|
||||
}
|
||||
|
||||
/// Build the `wgpu::ColorTargetState` for this draw.
|
||||
fn color_target(&self, format: wgpu::TextureFormat) -> wgpu::ColorTargetState {
|
||||
let bc = self.blend_control;
|
||||
let color_src = bc & 0x1F;
|
||||
let color_op = (bc >> 5) & 0x7;
|
||||
let color_dst = (bc >> 8) & 0x1F;
|
||||
let alpha_src = (bc >> 16) & 0x1F;
|
||||
let alpha_op = (bc >> 21) & 0x7;
|
||||
let alpha_dst = (bc >> 24) & 0x1F;
|
||||
|
||||
// wgpu requires `blend: None` when nothing would be written; also the
|
||||
// `One,Zero,Add` identity is the opaque case (canary's no-blend), which
|
||||
// we express as `blend: None` so it's a plain overwrite.
|
||||
let is_opaque = color_src == 1
|
||||
&& color_dst == 0
|
||||
&& color_op == 0
|
||||
&& alpha_src == 1
|
||||
&& alpha_dst == 0
|
||||
&& alpha_op == 0;
|
||||
let blend = if is_opaque {
|
||||
None
|
||||
} else {
|
||||
Some(wgpu::BlendState {
|
||||
color: wgpu::BlendComponent {
|
||||
src_factor: Self::map_factor(color_src),
|
||||
dst_factor: Self::map_factor(color_dst),
|
||||
operation: Self::map_op(color_op),
|
||||
},
|
||||
alpha: wgpu::BlendComponent {
|
||||
src_factor: Self::map_factor_alpha(alpha_src),
|
||||
dst_factor: Self::map_factor_alpha(alpha_dst),
|
||||
operation: Self::map_op(alpha_op),
|
||||
},
|
||||
})
|
||||
};
|
||||
|
||||
let mut write_mask = wgpu::ColorWrites::empty();
|
||||
if self.color_mask & 0x1 != 0 {
|
||||
write_mask |= wgpu::ColorWrites::RED;
|
||||
}
|
||||
if self.color_mask & 0x2 != 0 {
|
||||
write_mask |= wgpu::ColorWrites::GREEN;
|
||||
}
|
||||
if self.color_mask & 0x4 != 0 {
|
||||
write_mask |= wgpu::ColorWrites::BLUE;
|
||||
}
|
||||
if self.color_mask & 0x8 != 0 {
|
||||
write_mask |= wgpu::ColorWrites::ALPHA;
|
||||
}
|
||||
|
||||
wgpu::ColorTargetState {
|
||||
format,
|
||||
blend,
|
||||
write_mask,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Submitted to [`XenosPipeline::render_one`] to render one captured draw.
|
||||
@@ -48,6 +183,13 @@ pub struct DrawRequest {
|
||||
pub vertex_count: u32,
|
||||
/// Xenos primitive-type code; shader may branch on it in P3b+.
|
||||
pub prim_kind: u32,
|
||||
/// iterate-3O: guest dword base of the per-draw vertex window uploaded to
|
||||
/// `vertex_buffer` (b4). 0 = no real vertex window (procedural fallback).
|
||||
pub vertex_base_dwords: u32,
|
||||
/// iterate-3S: guest→host NDC XY transform (Y pre-flipped). When all-zero
|
||||
/// the shader leaves the position untransformed (procedural fallback).
|
||||
pub ndc_scale: [f32; 2],
|
||||
pub ndc_offset: [f32; 2],
|
||||
}
|
||||
|
||||
/// Reasonable upper bound on a single shader blob (dwords). Most Xbox 360
|
||||
@@ -57,7 +199,16 @@ const UCODE_BUFFER_MAX_DWORDS: u64 = 16 * 1024; // 64 KB each for VS & PS
|
||||
const VERTEX_BUFFER_MAX_BYTES: u64 = 16 * 1024 * 1024;
|
||||
|
||||
pub struct XenosPipeline {
|
||||
/// Interpreter pipeline with the legacy fixed (alpha-blend) state. Kept as
|
||||
/// the default; per-state variants are built lazily in `interp_cache`.
|
||||
pipeline: wgpu::RenderPipeline,
|
||||
/// iterate-3Y: the interpreter WGSL module, retained so per-render-state
|
||||
/// interpreter pipelines can be compiled on demand.
|
||||
interp_shader: wgpu::ShaderModule,
|
||||
/// iterate-3Y: interpreter pipelines keyed on the per-draw `RenderState`
|
||||
/// (blend + write mask), so flat/alpha/opaque draws composite correctly
|
||||
/// even when their (vs,ps) didn't translate.
|
||||
interp_cache: std::collections::HashMap<RenderState, wgpu::RenderPipeline>,
|
||||
draw_ctx_buffer: wgpu::Buffer,
|
||||
constants_buffer: wgpu::Buffer,
|
||||
vs_ucode_buffer: wgpu::Buffer,
|
||||
@@ -78,7 +229,12 @@ pub struct XenosPipeline {
|
||||
/// so every (vs, ps) pair gets compiled once and re-used for every
|
||||
/// subsequent draw. Interpreter pipeline remains the fallback.
|
||||
pipeline_layout: wgpu::PipelineLayout,
|
||||
translated_cache: std::collections::HashMap<(u32, u32), wgpu::RenderPipeline>,
|
||||
/// iterate-3Y: cached translator pipelines keyed on the shader pair AND the
|
||||
/// per-draw render state, so the same (vs,ps) with different blend/mask
|
||||
/// composites correctly. The translated WGSL module is itself cached per
|
||||
/// (vs,ps) so re-translation only happens once.
|
||||
translated_cache: std::collections::HashMap<(u32, u32, RenderState), wgpu::RenderPipeline>,
|
||||
translated_modules: std::collections::HashMap<(u32, u32), wgpu::ShaderModule>,
|
||||
pub target_format: wgpu::TextureFormat,
|
||||
}
|
||||
|
||||
@@ -193,7 +349,9 @@ impl XenosPipeline {
|
||||
draw_index: 0,
|
||||
vertex_count: 3,
|
||||
prim_kind: 4,
|
||||
_pad: 0,
|
||||
vertex_base_dwords: 0,
|
||||
ndc_scale: [0.0, 0.0],
|
||||
ndc_offset: [0.0, 0.0],
|
||||
};
|
||||
let draw_ctx_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
|
||||
label: Some("xenos draw ctx"),
|
||||
@@ -242,8 +400,13 @@ impl XenosPipeline {
|
||||
usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST,
|
||||
view_formats: &[],
|
||||
});
|
||||
// Magenta (255, 0, 255, 255) so a missing-texture read visibly stands
|
||||
// out on-screen when the interpreter does sample it.
|
||||
// iterate-3Y: transparent black (0,0,0,0). When a textured draw's
|
||||
// real texture can't be resolved (e.g. its sampler slot is shadowed by
|
||||
// a vertex-fetch constant), sampling a *transparent* texel makes the
|
||||
// draw a no-op under its real premultiplied-alpha blend — instead of
|
||||
// fabricating an opaque magenta that overpaints everything (the old
|
||||
// debug stub). This removes a fake rather than adding one: we never
|
||||
// invent visible pixels for an unresolved texture.
|
||||
queue.write_texture(
|
||||
wgpu::ImageCopyTexture {
|
||||
texture: &dummy_tex,
|
||||
@@ -251,7 +414,7 @@ impl XenosPipeline {
|
||||
origin: wgpu::Origin3d::ZERO,
|
||||
aspect: wgpu::TextureAspect::All,
|
||||
},
|
||||
&[0xFFu8, 0x00, 0xFF, 0xFF],
|
||||
&[0x00u8, 0x00, 0x00, 0x00],
|
||||
wgpu::ImageDataLayout {
|
||||
offset: 0,
|
||||
bytes_per_row: Some(4),
|
||||
@@ -359,6 +522,8 @@ impl XenosPipeline {
|
||||
|
||||
Self {
|
||||
pipeline,
|
||||
interp_shader: shader,
|
||||
interp_cache: std::collections::HashMap::new(),
|
||||
draw_ctx_buffer,
|
||||
constants_buffer,
|
||||
vs_ucode_buffer,
|
||||
@@ -371,31 +536,22 @@ impl XenosPipeline {
|
||||
dummy_view,
|
||||
pipeline_layout: layout,
|
||||
translated_cache: std::collections::HashMap::new(),
|
||||
translated_modules: std::collections::HashMap::new(),
|
||||
target_format,
|
||||
}
|
||||
}
|
||||
|
||||
/// P7 — does the translator cache already have a pipeline for this
|
||||
/// (vs, ps) pair?
|
||||
/// P7 — has the translator already produced a WGSL *module* for this
|
||||
/// (vs, ps) pair? (A per-render-state pipeline may still need building.)
|
||||
pub fn has_translated(&self, vs_blob_key: u32, ps_blob_key: u32) -> bool {
|
||||
self.translated_cache
|
||||
self.translated_modules
|
||||
.contains_key(&(vs_blob_key, ps_blob_key))
|
||||
}
|
||||
|
||||
/// P7 — fetch a cached translator pipeline. `None` if not yet built.
|
||||
pub fn translated_pipeline(
|
||||
&self,
|
||||
vs_blob_key: u32,
|
||||
ps_blob_key: u32,
|
||||
) -> Option<&wgpu::RenderPipeline> {
|
||||
self.translated_cache
|
||||
.get(&(vs_blob_key, ps_blob_key))
|
||||
}
|
||||
|
||||
/// P7 — compile a translator-produced WGSL module into a
|
||||
/// `wgpu::RenderPipeline` and insert it into the cache keyed on
|
||||
/// `(vs_blob_key, ps_blob_key)`. Returns `true` on success. Duplicate
|
||||
/// inserts are no-ops. Emits `gpu.shader.compile_ok` on success.
|
||||
/// P7 — compile a translator-produced WGSL module and cache it keyed on
|
||||
/// `(vs_blob_key, ps_blob_key)`. The actual `RenderPipeline` (which also
|
||||
/// depends on the per-draw blend/mask state) is built lazily by
|
||||
/// [`render_one_translated`]. Returns `true` on success.
|
||||
pub fn insert_translated(
|
||||
&mut self,
|
||||
device: &wgpu::Device,
|
||||
@@ -404,7 +560,7 @@ impl XenosPipeline {
|
||||
wgsl: &str,
|
||||
) -> bool {
|
||||
let key = (vs_blob_key, ps_blob_key);
|
||||
if self.translated_cache.contains_key(&key) {
|
||||
if self.translated_modules.contains_key(&key) {
|
||||
return true;
|
||||
}
|
||||
let shader = match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
||||
@@ -420,31 +576,42 @@ impl XenosPipeline {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
self.translated_modules.insert(key, shader);
|
||||
metrics::counter!("gpu.shader.compile_ok").increment(1);
|
||||
true
|
||||
}
|
||||
|
||||
/// iterate-3Y: ensure a translator pipeline exists for `(vs,ps,rstate)`,
|
||||
/// building it from the cached module + the per-draw color/blend target.
|
||||
fn ensure_translated_for_state(
|
||||
&mut self,
|
||||
device: &wgpu::Device,
|
||||
vs_key: u32,
|
||||
ps_key: u32,
|
||||
rstate: RenderState,
|
||||
) -> bool {
|
||||
let pkey = (vs_key, ps_key, rstate);
|
||||
if self.translated_cache.contains_key(&pkey) {
|
||||
return true;
|
||||
}
|
||||
let Some(module) = self.translated_modules.get(&(vs_key, ps_key)) else {
|
||||
return false;
|
||||
};
|
||||
let target = rstate.color_target(self.target_format);
|
||||
let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
|
||||
label: Some("xenos translated pipeline"),
|
||||
layout: Some(&self.pipeline_layout),
|
||||
vertex: wgpu::VertexState {
|
||||
module: &shader,
|
||||
module,
|
||||
entry_point: "vs_main",
|
||||
compilation_options: Default::default(),
|
||||
buffers: &[],
|
||||
},
|
||||
fragment: Some(wgpu::FragmentState {
|
||||
module: &shader,
|
||||
module,
|
||||
entry_point: "fs_main",
|
||||
compilation_options: Default::default(),
|
||||
targets: &[Some(wgpu::ColorTargetState {
|
||||
format: self.target_format,
|
||||
blend: Some(wgpu::BlendState {
|
||||
color: wgpu::BlendComponent {
|
||||
src_factor: wgpu::BlendFactor::SrcAlpha,
|
||||
dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha,
|
||||
operation: wgpu::BlendOperation::Add,
|
||||
},
|
||||
alpha: wgpu::BlendComponent::OVER,
|
||||
}),
|
||||
write_mask: wgpu::ColorWrites::ALL,
|
||||
})],
|
||||
targets: &[Some(target)],
|
||||
}),
|
||||
primitive: wgpu::PrimitiveState {
|
||||
topology: wgpu::PrimitiveTopology::TriangleList,
|
||||
@@ -460,30 +627,78 @@ impl XenosPipeline {
|
||||
multiview: None,
|
||||
cache: None,
|
||||
});
|
||||
self.translated_cache.insert(key, pipeline);
|
||||
metrics::counter!("gpu.shader.compile_ok").increment(1);
|
||||
self.translated_cache.insert(pkey, pipeline);
|
||||
true
|
||||
}
|
||||
|
||||
/// Render one draw with the translator-produced pipeline instead of
|
||||
/// the interpreter. Mirrors [`render_one`] except the bound pipeline
|
||||
/// is swapped for `pipeline`.
|
||||
pub fn render_one_with_pipeline(
|
||||
&self,
|
||||
/// iterate-3Y: ensure an interpreter pipeline exists for `rstate`.
|
||||
fn ensure_interp_for_state(&mut self, device: &wgpu::Device, rstate: RenderState) {
|
||||
if self.interp_cache.contains_key(&rstate) {
|
||||
return;
|
||||
}
|
||||
let target = rstate.color_target(self.target_format);
|
||||
let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
|
||||
label: Some("xenos interp pipeline (per-state)"),
|
||||
layout: Some(&self.pipeline_layout),
|
||||
vertex: wgpu::VertexState {
|
||||
module: &self.interp_shader,
|
||||
entry_point: "vs_main",
|
||||
compilation_options: Default::default(),
|
||||
buffers: &[],
|
||||
},
|
||||
fragment: Some(wgpu::FragmentState {
|
||||
module: &self.interp_shader,
|
||||
entry_point: "fs_main",
|
||||
compilation_options: Default::default(),
|
||||
targets: &[Some(target)],
|
||||
}),
|
||||
primitive: wgpu::PrimitiveState {
|
||||
topology: wgpu::PrimitiveTopology::TriangleList,
|
||||
strip_index_format: None,
|
||||
front_face: wgpu::FrontFace::Ccw,
|
||||
cull_mode: None,
|
||||
polygon_mode: wgpu::PolygonMode::Fill,
|
||||
unclipped_depth: false,
|
||||
conservative: false,
|
||||
},
|
||||
depth_stencil: None,
|
||||
multisample: wgpu::MultisampleState::default(),
|
||||
multiview: None,
|
||||
cache: None,
|
||||
});
|
||||
self.interp_cache.insert(rstate, pipeline);
|
||||
}
|
||||
|
||||
/// iterate-3Y: render one draw through the translator pipeline built for
|
||||
/// this draw's render state. Returns `false` if no module is cached for
|
||||
/// `(vs,ps)` (caller should fall back to the interpreter).
|
||||
pub fn render_one_translated(
|
||||
&mut self,
|
||||
device: &wgpu::Device,
|
||||
queue: &wgpu::Queue,
|
||||
encoder: &mut wgpu::CommandEncoder,
|
||||
target_view: &wgpu::TextureView,
|
||||
req: DrawRequest,
|
||||
pipeline: &wgpu::RenderPipeline,
|
||||
) {
|
||||
vs_key: u32,
|
||||
ps_key: u32,
|
||||
rstate: RenderState,
|
||||
) -> bool {
|
||||
if !self.ensure_translated_for_state(device, vs_key, ps_key, rstate) {
|
||||
return false;
|
||||
}
|
||||
let cb = DrawConstants {
|
||||
draw_index: req.draw_index,
|
||||
vertex_count: req.vertex_count.max(3),
|
||||
prim_kind: req.prim_kind,
|
||||
_pad: 0,
|
||||
vertex_base_dwords: req.vertex_base_dwords,
|
||||
ndc_scale: req.ndc_scale,
|
||||
ndc_offset: req.ndc_offset,
|
||||
};
|
||||
queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb));
|
||||
|
||||
let pipeline = self
|
||||
.translated_cache
|
||||
.get(&(vs_key, ps_key, rstate))
|
||||
.expect("just ensured");
|
||||
let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
|
||||
label: Some("xenos translated draw"),
|
||||
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
|
||||
@@ -503,6 +718,7 @@ impl XenosPipeline {
|
||||
pass.set_bind_group(1, &self.tex_bind_group, &[]);
|
||||
let rounded = req.vertex_count.div_ceil(3) * 3;
|
||||
pass.draw(0..rounded.max(3), 0..1);
|
||||
true
|
||||
}
|
||||
|
||||
/// Number of distinct translator pipelines cached. Surfaced to the HUD.
|
||||
@@ -594,22 +810,34 @@ impl XenosPipeline {
|
||||
queue.write_buffer(&self.vertex_buffer, 0, &bytes[..bytes.len().min(max)]);
|
||||
}
|
||||
|
||||
/// Render one captured draw.
|
||||
/// Render one captured draw through the interpreter, using the per-draw
|
||||
/// `rstate` (blend/write-mask) so flat draws composite correctly even
|
||||
/// when their (vs,ps) didn't translate. `RenderState::OPAQUE` reproduces
|
||||
/// the legacy fixed behaviour for procedural/synthetic draws.
|
||||
pub fn render_one(
|
||||
&self,
|
||||
&mut self,
|
||||
device: &wgpu::Device,
|
||||
queue: &wgpu::Queue,
|
||||
encoder: &mut wgpu::CommandEncoder,
|
||||
target_view: &wgpu::TextureView,
|
||||
req: DrawRequest,
|
||||
rstate: RenderState,
|
||||
) {
|
||||
self.ensure_interp_for_state(device, rstate);
|
||||
let cb = DrawConstants {
|
||||
draw_index: req.draw_index,
|
||||
vertex_count: req.vertex_count.max(3),
|
||||
prim_kind: req.prim_kind,
|
||||
_pad: 0,
|
||||
vertex_base_dwords: req.vertex_base_dwords,
|
||||
ndc_scale: req.ndc_scale,
|
||||
ndc_offset: req.ndc_offset,
|
||||
};
|
||||
queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb));
|
||||
|
||||
let pipeline = self
|
||||
.interp_cache
|
||||
.get(&rstate)
|
||||
.expect("just ensured");
|
||||
let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
|
||||
label: Some("xenos draw"),
|
||||
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
|
||||
@@ -624,7 +852,7 @@ impl XenosPipeline {
|
||||
timestamp_writes: None,
|
||||
occlusion_query_set: None,
|
||||
});
|
||||
pass.set_pipeline(&self.pipeline);
|
||||
pass.set_pipeline(pipeline);
|
||||
pass.set_bind_group(0, &self.bind_group, &[]);
|
||||
pass.set_bind_group(1, &self.tex_bind_group, &[]);
|
||||
let rounded = req.vertex_count.div_ceil(3) * 3;
|
||||
@@ -638,6 +866,6 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn draw_constants_layout_matches_wgsl_uniform() {
|
||||
assert_eq!(std::mem::size_of::<DrawConstants>(), 16);
|
||||
assert_eq!(std::mem::size_of::<DrawConstants>(), 32);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,6 +31,9 @@ impl VfsDevice for HostPathDevice {
|
||||
is_directory: metadata.is_dir(),
|
||||
size: metadata.len(),
|
||||
offset: 0,
|
||||
// Host FS carries no Xbox attribute byte; synthesise the
|
||||
// DIRECTORY/NORMAL split like canary's HostPathDevice.
|
||||
attributes: if metadata.is_dir() { 0x10 } else { 0x80 },
|
||||
});
|
||||
}
|
||||
Ok(entries)
|
||||
@@ -49,6 +52,7 @@ impl VfsDevice for HostPathDevice {
|
||||
is_directory: metadata.is_dir(),
|
||||
size: metadata.len(),
|
||||
offset: 0,
|
||||
attributes: if metadata.is_dir() { 0x10 } else { 0x80 },
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,6 +29,11 @@ const GDFX_MAGIC: &[u8; 20] = b"MICROSOFT*XBOX*MEDIA";
|
||||
/// File attribute: directory
|
||||
const FILE_ATTRIBUTE_DIRECTORY: u8 = 0x10;
|
||||
|
||||
/// File attribute: read-only. Canary OR's this into every GDFX entry's
|
||||
/// attribute byte because a pressed disc is inherently read-only
|
||||
/// (`disc_image_device.cc:154`: `attributes | kFileAttributeReadOnly`).
|
||||
const FILE_ATTRIBUTE_READONLY: u8 = 0x01;
|
||||
|
||||
/// Known game partition offsets to try
|
||||
const LIKELY_OFFSETS: &[u64] = &[
|
||||
0x0000_0000,
|
||||
@@ -131,6 +136,11 @@ impl DiscImageDevice {
|
||||
|
||||
let name = String::from_utf8_lossy(&buffer[p + 14..p + 14 + name_length]).to_string();
|
||||
let is_directory = (attributes & FILE_ATTRIBUTE_DIRECTORY) != 0;
|
||||
// Match canary: the on-disc attribute byte (DIRECTORY/HIDDEN/SYSTEM/
|
||||
// ARCHIVE/NORMAL bits as authored) OR the implicit READONLY bit for
|
||||
// pressed media. We forward the FULL byte, not a path-shape guess, so
|
||||
// attribute queries report exactly what the disc records.
|
||||
let attributes = (attributes | FILE_ATTRIBUTE_READONLY) as u32;
|
||||
let file_offset = self.game_offset + sector * SECTOR_SIZE;
|
||||
let full_path = if prefix.is_empty() {
|
||||
name.clone()
|
||||
@@ -143,6 +153,7 @@ impl DiscImageDevice {
|
||||
is_directory,
|
||||
size: length,
|
||||
offset: file_offset,
|
||||
attributes,
|
||||
});
|
||||
|
||||
// Descend into subdirectories. Zero-length directory entries exist
|
||||
@@ -260,4 +271,73 @@ mod tests {
|
||||
.expect("read_file on nested path");
|
||||
assert!(!bytes.is_empty(), "nested read returned empty buffer");
|
||||
}
|
||||
|
||||
/// Build a one-node GDFX directory buffer in memory and parse it with
|
||||
/// `collect_entries`, asserting the real on-disc attribute byte is
|
||||
/// forwarded into `VfsEntry.attributes` (with READONLY OR'd in, matching
|
||||
/// canary `disc_image_device.cc:154`) rather than synthesised from the
|
||||
/// path shape.
|
||||
fn parse_single_entry(name: &str, on_disc_attr: u8) -> VfsEntry {
|
||||
// GDFX dirent: node_l(u16) node_r(u16) sector(u32) length(u32)
|
||||
// attributes(u8) name_length(u8) name(bytes). The directory bit
|
||||
// gates subdirectory descent; use length=0 so a "directory" entry
|
||||
// is treated as an empty leaf and we don't recurse off the buffer.
|
||||
let mut buf = Vec::new();
|
||||
buf.extend_from_slice(&0u16.to_le_bytes()); // node_l
|
||||
buf.extend_from_slice(&0u16.to_le_bytes()); // node_r
|
||||
buf.extend_from_slice(&0u32.to_le_bytes()); // sector
|
||||
buf.extend_from_slice(&0u32.to_le_bytes()); // length (0 => leaf)
|
||||
buf.push(on_disc_attr); // attributes
|
||||
buf.push(name.len() as u8); // name_length
|
||||
buf.extend_from_slice(name.as_bytes());
|
||||
|
||||
let mut dev = DiscImageDevice {
|
||||
name: "test".into(),
|
||||
path: std::path::PathBuf::new(),
|
||||
game_offset: 0,
|
||||
entries: Vec::new(),
|
||||
};
|
||||
// `file` is only touched when descending into a non-empty directory;
|
||||
// our length=0 entries never recurse, so a dummy handle is fine.
|
||||
let mut file = std::fs::File::open("/dev/null").expect("open /dev/null");
|
||||
dev.collect_entries(&mut file, &buf, 0, "").expect("parse");
|
||||
assert_eq!(dev.entries.len(), 1);
|
||||
dev.entries.into_iter().next().unwrap()
|
||||
}
|
||||
|
||||
#[test]
fn directory_entry_reports_directory_attribute() {
    // A dirent authored with DIRECTORY (0x10) must decode as a directory and
    // carry both the DIRECTORY bit and the implicit READONLY bit.
    const DIRECTORY_BIT: u32 = 0x10;
    const READONLY_BIT: u32 = 0x01;

    let entry = parse_single_entry("dat", FILE_ATTRIBUTE_DIRECTORY);
    let attrs = entry.attributes;

    assert!(entry.is_directory, "directory bit not decoded");
    assert_ne!(
        attrs & DIRECTORY_BIT,
        0,
        "FILE_ATTRIBUTE_DIRECTORY must be set for a directory entry"
    );
    assert_ne!(attrs & READONLY_BIT, 0, "READONLY must be OR'd in (canary)");
}
|
||||
|
||||
#[test]
fn file_entry_has_no_directory_attribute() {
    // A dirent authored with NORMAL (0x80) is a plain file: no DIRECTORY
    // bit, NORMAL kept, and READONLY still OR'd in.
    const READONLY_BIT: u32 = 0x01;
    const DIRECTORY_BIT: u32 = 0x10;
    const NORMAL_BIT: u32 = 0x80;

    let entry = parse_single_entry("default.xex", 0x80);
    let attrs = entry.attributes;

    assert!(!entry.is_directory, "non-directory misdecoded as directory");
    assert_eq!(
        attrs & DIRECTORY_BIT,
        0,
        "FILE_ATTRIBUTE_DIRECTORY must be clear for a file entry"
    );
    assert_ne!(attrs & NORMAL_BIT, 0, "NORMAL bit must be preserved");
    assert_ne!(attrs & READONLY_BIT, 0, "READONLY must be OR'd in (canary)");
}
|
||||
|
||||
#[test]
fn archive_and_hidden_bits_are_preserved() {
    // ARCHIVE(0x20) | HIDDEN(0x02) authored on disc must survive intact, and
    // the entry must still pick up the implicit READONLY bit like every
    // other GDFX entry.
    let e = parse_single_entry("save.dat", 0x20 | 0x02);
    assert!(!e.is_directory, "non-directory misdecoded as directory");
    assert_eq!(e.attributes & 0x20, 0x20, "ARCHIVE bit dropped");
    assert_eq!(e.attributes & 0x02, 0x02, "HIDDEN bit dropped");
    assert_eq!(e.attributes & 0x10, 0, "spurious DIRECTORY bit");
    assert_ne!(e.attributes & 0x01, 0, "READONLY must be OR'd in (canary)");
}
|
||||
}
|
||||
|
||||
@@ -22,6 +22,16 @@ pub struct VfsEntry {
|
||||
pub is_directory: bool,
|
||||
pub size: u64,
|
||||
pub offset: u64,
|
||||
/// Xbox `FILE_ATTRIBUTE_*` bitmask for this entry, sourced from the
|
||||
/// backing device's real on-disc metadata rather than inferred from
|
||||
/// the path shape. For GDFX disc images this is the on-disc attribute
|
||||
/// byte at dirent offset +12 OR'd with `FILE_ATTRIBUTE_READONLY`
|
||||
/// (matches xenia-canary `disc_image_device.cc:154`:
|
||||
/// `entry->attributes_ = attributes | kFileAttributeReadOnly`).
|
||||
///
|
||||
/// Bit layout (canary `vfs/entry.h:66-76`): READONLY=0x01, HIDDEN=0x02,
|
||||
/// SYSTEM=0x04, DIRECTORY=0x10, ARCHIVE=0x20, NORMAL=0x80.
|
||||
pub attributes: u32,
|
||||
}
|
||||
|
||||
/// Trait for VFS device implementations (XISO, STFS, host path, etc.)
|
||||
|
||||
Reference in New Issue
Block a user