Compare commits
162 Commits
c694bb3f43
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ac2f89a7bb | ||
|
|
2a8ff9515d | ||
|
|
25704c5811 | ||
|
|
49f3eafa15 | ||
|
|
7bc9e3acac | ||
|
|
e428ce33aa | ||
|
|
b03192c772 | ||
|
|
56ffa40a6a | ||
|
|
d8766c6242 | ||
|
|
77034b6cbf | ||
|
|
9028021936 | ||
|
|
5af792c9fc | ||
|
|
85d1603124 | ||
|
|
38d8871e8d | ||
|
|
81c90f9a53 | ||
|
|
ab4fe211e5 | ||
|
|
0209e88f0a | ||
|
|
4ff08f6116 | ||
|
|
3bd77ab506 | ||
|
|
1d6c51fbf8 | ||
|
|
bd5753311e | ||
|
|
89f5f7e4a9 | ||
|
|
fd68285210 | ||
|
|
70120465a3 | ||
|
|
e061e21851 | ||
|
|
690943ceef | ||
|
|
412ba858b4 | ||
|
|
08d41cf2fc | ||
|
|
de5a15ecfb | ||
|
|
c03f2bc9e2 | ||
|
|
d9e40d3564 | ||
|
|
978a6950d1 | ||
|
|
cc54ca8e64 | ||
|
|
76dfe7fd7a | ||
|
|
7ed6192b7b | ||
|
|
5d2401f9c5 | ||
|
|
d736a1dc12 | ||
|
|
91a7df5f6a | ||
|
|
b78e6fd205 | ||
|
|
50a488776f | ||
|
|
2cce044516 | ||
|
|
a1a7265f29 | ||
|
|
58f416c284 | ||
|
|
c51f51f9cb | ||
|
|
79697ddf4e | ||
|
|
7675035082 | ||
|
|
556a8c387a | ||
|
|
bef9793aec | ||
|
|
a6208a1249 | ||
|
|
19659d7f76 | ||
|
|
33e49e70c8 | ||
|
|
1a892d4641 | ||
|
|
451b3b28fe | ||
|
|
3e2fc1ec88 | ||
|
|
6a070bedc6 | ||
|
|
7108d6d131 | ||
|
|
48eed258f0 | ||
|
|
f84e947547 | ||
|
|
6440261e2e | ||
|
|
2a9fd1fc86 | ||
|
|
9d45efe5d5 | ||
|
|
07068e7616 | ||
|
|
38f78c88a8 | ||
|
|
691404e36e | ||
|
|
b54aa48d10 | ||
|
|
eb71fe8daf | ||
|
|
866855000c | ||
|
|
27d3608174 | ||
|
|
b82919bdd0 | ||
|
|
d1105aafae | ||
|
|
0e95e38813 | ||
|
|
7a1b6b3306 | ||
|
|
aa3f1d344f | ||
|
|
c7fccccbc6 | ||
|
|
6f851a2083 | ||
|
|
780e854c2f | ||
|
|
104078dc29 | ||
|
|
8fc1b1dfed | ||
|
|
fceaa81f46 | ||
|
|
e7d0fcf2c9 | ||
|
|
537d789deb | ||
|
|
8723d6826b | ||
|
|
a07784349d | ||
|
|
ec2d955dbd | ||
|
|
c5c6713419 | ||
|
|
78ea81c12a | ||
|
|
1b74db6fa7 | ||
|
|
82f3d611e2 | ||
|
|
0590bffdd9 | ||
|
|
1f416aaa2e | ||
|
|
62f673d094 | ||
|
|
9ab986ec09 | ||
|
|
caa37fc595 | ||
|
|
09c6c927bd | ||
|
|
f1166d0f75 | ||
|
|
9de18a9eec | ||
|
|
4029041618 | ||
|
|
1f9696ad47 | ||
|
|
261480616c | ||
|
|
ebfd18a64e | ||
|
|
2d223eee69 | ||
|
|
9827b03f1a | ||
|
|
a7155f4571 | ||
|
|
8b9fddc488 | ||
|
|
112202c2b9 | ||
|
|
5ece5e315f | ||
|
|
99e7814836 | ||
|
|
0f2a26c460 | ||
|
|
68c0ee55ce | ||
|
|
d96986a10e | ||
|
|
9f88e275b8 | ||
|
|
d39d0bab4d | ||
|
|
05f2f72c71 | ||
|
|
6fe2cbf251 | ||
|
|
6ba8f83c30 | ||
|
|
538fa5ab74 | ||
|
|
49bf74fae6 | ||
|
|
26b98975c3 | ||
|
|
f6a444b9d1 | ||
|
|
5c45108249 | ||
|
|
d945aeae83 | ||
|
|
49103bb898 | ||
|
|
16993bb8af | ||
|
|
20a730d69e | ||
|
|
82a9bff934 | ||
|
|
bf8208e88c | ||
|
|
145a7a4019 | ||
|
|
e18a0a40b8 | ||
|
|
f424132a5b | ||
|
|
f3ebaba5c9 | ||
|
|
7609dcd406 | ||
|
|
2be25bdd41 | ||
|
|
d4f6ea787b | ||
|
|
3d8e2ced2e | ||
|
|
52ece4bd86 | ||
|
|
cedee3c385 | ||
|
|
a8c918cf9e | ||
|
|
52b05b127f | ||
|
|
6b9de17925 | ||
|
|
64e8ecbfd0 | ||
|
|
197d76c44e | ||
|
|
d51b9346df | ||
|
|
75544fa9db | ||
|
|
147daa0721 | ||
|
|
ca5b90b700 | ||
|
|
c9f194dda1 | ||
|
|
d75c4edf67 | ||
|
|
a107ac9ae7 | ||
|
|
d4e227eeab | ||
|
|
af54eb28bd | ||
|
|
24d347436a | ||
|
|
4538fa9e70 | ||
|
|
bae9305982 | ||
|
|
b1285ba560 | ||
|
|
79eb52c378 | ||
|
|
5f0d6487ea | ||
|
|
f1fadb5398 | ||
|
|
45e15d7885 | ||
|
|
c36cca14f9 | ||
|
|
e9b2b57a44 | ||
|
|
e2b8860e10 | ||
|
|
f166d061be |
13
.gitignore
vendored
13
.gitignore
vendored
@@ -1,4 +1,13 @@
|
||||
/target/
|
||||
target/
|
||||
*.iso
|
||||
*.xiso
|
||||
*.db
|
||||
*.db
|
||||
|
||||
# Audit reports / pre-pass findings (local artifacts, not source)
|
||||
audit-out/
|
||||
audit-*.md
|
||||
|
||||
# Run logs from stress harnesses and ad-hoc captures
|
||||
*.stdout
|
||||
*.stderr
|
||||
*.log
|
||||
|
||||
4487
Cargo.lock
generated
4487
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
21
Cargo.toml
21
Cargo.toml
@@ -12,6 +12,7 @@ members = [
|
||||
"crates/xenia-hid",
|
||||
"crates/xenia-debugger",
|
||||
"crates/xenia-analysis",
|
||||
"crates/xenia-ui",
|
||||
"crates/xenia-app",
|
||||
]
|
||||
|
||||
@@ -33,10 +34,17 @@ xenia-apu = { path = "crates/xenia-apu" }
|
||||
xenia-hid = { path = "crates/xenia-hid" }
|
||||
xenia-debugger = { path = "crates/xenia-debugger" }
|
||||
xenia-analysis = { path = "crates/xenia-analysis" }
|
||||
xenia-ui = { path = "crates/xenia-ui" }
|
||||
|
||||
# External dependencies
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "registry"] }
|
||||
tracing-appender = "0.2"
|
||||
tracing-chrome = "0.7"
|
||||
tracing-error = "0.2"
|
||||
metrics = "0.24"
|
||||
metrics-util = "0.19"
|
||||
pprof = { version = "0.14", features = ["flamegraph", "protobuf-codec"] }
|
||||
bitflags = "2"
|
||||
byteorder = "1"
|
||||
thiserror = "2"
|
||||
@@ -44,4 +52,13 @@ anyhow = "1"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
aes = "0.8"
|
||||
rusqlite = { version = "0.31", features = ["bundled"] }
|
||||
duckdb = { version = "1", features = ["bundled"] }
|
||||
|
||||
# UI / rendering / input (used by xenia-ui and xenia-app with --ui)
|
||||
winit = "0.30"
|
||||
wgpu = "22"
|
||||
gilrs = "0.11"
|
||||
pollster = "0.3"
|
||||
crossbeam-utils = "0.8"
|
||||
crossbeam-channel = "0.5"
|
||||
bytemuck = { version = "1", features = ["derive"] }
|
||||
|
||||
6071
audit-findings.md
Normal file
6071
audit-findings.md
Normal file
File diff suppressed because it is too large
Load Diff
629
audit-report-2026-04-29.md
Normal file
629
audit-report-2026-04-29.md
Normal file
@@ -0,0 +1,629 @@
|
||||
# PPC Instruction Audit — Triaged Report (2026-04-29)
|
||||
|
||||
**Status**: audit complete. **No code modified.** This file is the fix-order plan for the follow-up session.
|
||||
**Source of truth**: detailed bug entries (one heading per PPCBUG ID) live in `audit-findings.md`. This file references every entry by ID so nothing is lost — it does not duplicate the per-bug detail.
|
||||
|
||||
## Counts
|
||||
|
||||
- **Total findings**: 253 PPCBUG IDs, of which 5 are explicitly retracted/withdrawn (PPCBUG-220, 222, 226, 482, 483 — see Notes section).
|
||||
- **Net findings**: ~248 actionable.
|
||||
- **Severity breakdown** (rough):
|
||||
- HIGH: ~55 (~22%)
|
||||
- MEDIUM: ~75 (~30%)
|
||||
- LOW (test gaps + cosmetic + informational): ~118 (~48%)
|
||||
|
||||
## Headline findings (most likely Sylpheed-renderer-blockers)
|
||||
|
||||
1. **PPCBUG-107 cascade** — `ReservationTable::invalidate_for_write` defined and unit-tested but never called from any of the **50+ store opcodes** in the interpreter. Under `--parallel`, every cross-thread atomic via `lwarx`/`stwcx.` is silently broken: spinlocks succeed without exclusion, atomic counters race, condition-variable handshakes never sync. Plausible direct cause of the 4-worker-thread renderer plateau (`project_xenia_rs_sylpheed_stage3_2026_04_29.md`). **Fix is mechanical**: one-line `if t.has_active_reservers() { t.invalidate_for_write(ea) }` before every `mem.write_*` in interpreter.rs.
|
||||
|
||||
2. **PPCBUG-053+054 cascade** — `bcx`/`bclrx` CTR zero-test compares all 64 bits; `mtspr CTR` writes full 64-bit GPR. Combined with PPCBUG-006 (`negx` poisons GPR upper 32) → **`neg; mtctr; bdnz` loops run forever**.
|
||||
|
||||
3. **8 decoder/field-extraction bugs collapse into 6 missing accessors** + 1 wrong sh64 formula + 1 missing decode_op6 dot-form entry. The disassembler already has correct local versions. Single mechanical sweep.
|
||||
|
||||
4. **PPCBUG-046 (`clrldi r3, r4, 32`)** — the canonical zero-extend-low-32 idiom is currently a no-op. Emitted constantly by 32-bit-ABI compilers.
|
||||
|
||||
5. **PPCBUG-510** — `stvewx128` corrupts 12 adjacent bytes per call.
|
||||
|
||||
6. **PPCBUG-424/425** — `vmaddfp128`/`vmaddcfp128` operand swap. Every D3D vertex/pixel shader using FMA with non-aliased operands gets wrong arithmetic.
|
||||
|
||||
7. **PPCBUG-360/363** — `vperm128` uses wrong control vector (every D3D shader swizzle); `vpkd3d128` missing post-pack permutation (canonical D3D vertex-pack `pack=1` always wrong).
|
||||
|
||||
8. **PPCBUG-275/420-422** — VC-form and VMX128_R-form `rc_bit()` reads bit 0 instead of bit 21/27 → **CR6 never updated for ANY VMX vector compare dot form**. Breaks every `vcmpequb. + bc CR6_all_true` early-exit loop in audio mixing, font rendering, string ops.
|
||||
|
||||
## Recommended fix order
|
||||
|
||||
The phases below are the recommended fix order for the follow-up session. Each phase is **independently mergeable**; later phases may reveal that earlier phases unblocked their symptoms (e.g. P1 by itself could be sufficient to break open the Sylpheed renderer plateau).
|
||||
|
||||
After each phase: `cargo test --workspace --release` (must stay at 506+ pass) AND `xenia-rs check sylpheed.iso -n 100M` (must not regress against the 2026-04-29 addis-fix baseline of `swaps=2`). The acid test is whether `draws > 0` opens after P1 or P2.
|
||||
|
||||
---
|
||||
|
||||
### Phase 1 — Cross-thread atomicity (PPCBUG-107 cascade)
|
||||
|
||||
**Why first**: highest confidence smoking-gun for the renderer plateau. Single, mechanical, low-risk fix. Largest leverage relative to size.
|
||||
|
||||
**Coupled — must land together**:
|
||||
- PPCBUG-107 (root: missing call from stores)
|
||||
- PPCBUG-130 (9 byte/halfword stores)
|
||||
- PPCBUG-140, 141, 142, 143, 144 (5 word stores: stw/stwu/stwx/stwux/stwbrx)
|
||||
- PPCBUG-150 (5 doubleword stores: std/stdu/stdx/stdux/stdbrx)
|
||||
- PPCBUG-160 (3 multiple/string stores: stmw/stswi/stswx)
|
||||
- PPCBUG-167 (9 FP stores)
|
||||
- PPCBUG-511, 512, 513, 514 (16 VMX stores)
|
||||
|
||||
**Independent but related**:
|
||||
- PPCBUG-151 (stwcx/stdcx reservation width discriminator) — separate fix; add `reservation_width: u8` to PpcContext.
|
||||
- PPCBUG-108 (legacy per-context path: cross-thread invalidation impossible) — informational; --reservations-table mode bypasses.
|
||||
|
||||
**Approach** — one PR adds `if t.has_active_reservers() { t.invalidate_for_write(ea) }` before every `mem.write_*` call site. Scope:
|
||||
```
|
||||
mem.write_u8 / write_u16 / write_u32 / write_u64 / write_f32 / write_f64
|
||||
mem.write_vec128 / write_vec128_aligned (for VMX)
|
||||
```
|
||||
~38 sites total. Add 1+ targeted concurrency tests (lwarx + cross-thread plain store + stwcx., expect EQ=0).
|
||||
|
||||
---
|
||||
|
||||
### Phase 2 — Decoder/field-extraction structural sweep
|
||||
|
||||
**Why second**: single mechanical sweep, fixes 12 distinct HIGH-severity findings, unblocks correct execution of compiler-emitted code. Disassembler already has correct local extraction logic — promote/port.
|
||||
|
||||
**Coupled — same commit**:
|
||||
- PPCBUG-040 + PPCBUG-560 — fix `sh64()` bit order AND fix the test helper that was masking it
|
||||
- PPCBUG-046 + PPCBUG-561 — promote `mb_md()` from `disasm.rs:1256` to `decoder.rs`; replace 6 inline-formula sites in interpreter.rs (rldicl/rldicr/rldic/rldimi/rldcl/rldcr)
|
||||
- PPCBUG-275 + PPCBUG-276 + PPCBUG-420 + PPCBUG-421 + PPCBUG-422 + PPCBUG-562 — add `vc_rc_bit()` (PPC bit 21) and `vx128r_rc_bit()` (PPC bit 27); replace `instr.rc_bit()` at all VMX compare dot-form sites
|
||||
- PPCBUG-315 + PPCBUG-563 — add `vx128_4_z()`, `vx128_4_imm()`; fix `vrlimi128`
|
||||
- PPCBUG-361 + PPCBUG-565 — add `vx128_5_sh()`; fix `vsldoi128`
|
||||
- PPCBUG-362 + PPCBUG-564 — add `vx128_p_perm()`; fix `vpermwi128`
|
||||
- PPCBUG-423 + PPCBUG-600 — add 5 odd-key entries to `decode_op6` key4 for `vcmp*fp128.` dot forms
|
||||
|
||||
**Independent in this phase**:
|
||||
- PPCBUG-360 — `vperm128` reads VC from `vd128()` instead of VX128_2 VC field at integer bits 6-8. Fix at the call site (or add `vx128_2_vc()` accessor).
|
||||
- PPCBUG-363 + PPCBUG-369 — `vpkd3d128` missing post-pack permutation; add the `pack`/`shift` field handling per Canary.
|
||||
|
||||
**Test fixture updates required** (PPCBUG-560 lesson) — once `sh64()` is fixed, verify all `disasm_goldens.rs` test helpers encode shifts ISA-correctly. Don't trust the existing fixtures blindly.
|
||||
|
||||
---
|
||||
|
||||
### Phase 3 — Other HIGH bugs (single targeted fixes)
|
||||
|
||||
**Independent**:
|
||||
- PPCBUG-510 — `stvewx128` corrupting 12 bytes per call. Direct fix: align EA to word, write only 4 bytes.
|
||||
- PPCBUG-424 — `vmaddfp128` operand order: change `ai.mul_add(bi, di)` → `ai.mul_add(di, bi)`.
|
||||
- PPCBUG-425 — `vmaddcfp128` operand order similarly.
|
||||
- PPCBUG-053 + PPCBUG-054 — `bcx`/`bclrx` CTR zero-test (32-bit) + `mtspr CTR` truncation (defensive firewall). Coupled.
|
||||
- PPCBUG-640 — `fmt_bc` spurious condition suffix on pure `bdnz`/`bdz`. Port the `fmt_bclr` pattern.
|
||||
- PPCBUG-641 — `lwsync` shows as `sync` in disassembler (re-assessment of PPCBUG-088). Same fix.
|
||||
|
||||
---
|
||||
|
||||
### Phase 4 — 32-bit ABI writeback truncation sweep
|
||||
|
||||
**Why this phase**: cross-cutting, mechanical. Once ALL writebacks truncate via `as u32 as u64`, the systemic 32-bit-ABI invariant is restored and most CR0/CA helper-correctness concerns become moot.
|
||||
|
||||
#### 4a — Active poisoning (every execution corrupts GPR upper bits)
|
||||
|
||||
These bugs corrupt GPR upper bits **regardless** of whether upstream sources are clean — typically because the implementation applies Rust's `!u64` (full 64-bit NOT) somewhere:
|
||||
- PPCBUG-006 (negx — `(!ra).wrapping_add(1)`)
|
||||
- PPCBUG-008 (subfex — `(!ra).wrapping_add(rb).wrapping_add(ca)`)
|
||||
- PPCBUG-018 (subfzex)
|
||||
- PPCBUG-019 (subfmex)
|
||||
- PPCBUG-028 (orcx — `rs | !rb`)
|
||||
- PPCBUG-029 (norx — `!(rs | rb)` — the canonical `not` mnemonic, hot path)
|
||||
- PPCBUG-030 (nandx)
|
||||
- PPCBUG-031 (eqvx — `!(rs ^ rb)` — common `eqv rA, rA, rA` set-to-all-ones)
|
||||
- PPCBUG-033 (andcx via `!rb`)
|
||||
- PPCBUG-034 (extsbx — `as i8 as i64 as u64`)
|
||||
- PPCBUG-035 (extshx)
|
||||
|
||||
#### 4b — Same-shape-as-addis (latent under clean inputs, active when upstream is poisoned)
|
||||
|
||||
- PPCBUG-001 (addi), PPCBUG-002 (addic), PPCBUG-003 (addicx), PPCBUG-005 (subficx), PPCBUG-007 (subfcx CA), PPCBUG-008 (subfex CA — also in 4a)
|
||||
- PPCBUG-004 (mulli), PPCBUG-009 (mullwx)
|
||||
- PPCBUG-010 + PPCBUG-011 (divwx writeback + CR0 — **must land together**, not independently)
|
||||
- PPCBUG-041 + PPCBUG-042 + PPCBUG-043 (srawx/srawix writeback + CR0 coupling — **must land together**)
|
||||
- PPCBUG-095, 096, 097, 098 (lha/lhax/lhau/lhaux halfword sign-extension)
|
||||
- PPCBUG-105 (lwa/lwax/lwaux — note: 64-bit-mode-only; less common in 32-bit-ABI binaries)
|
||||
|
||||
#### 4c — Latent writeback (only triggers if 4a/4b are unfixed)
|
||||
|
||||
These can be fixed in the same sweep but won't fire under clean inputs:
|
||||
- PPCBUG-012, 013, 014, 015, 016, 017 (addx/addcx/addex/addzex/addmex/subfx)
|
||||
- PPCBUG-032 (andx/orx/xorx)
|
||||
|
||||
#### 4d — CR0 32-bit-ABI compare (cross-cutting catch-all)
|
||||
|
||||
PPCBUG-020 documents the catch-all; the per-opcode locations are referenced from there:
|
||||
- PPCBUG-020 (catch-all in groups 2-5)
|
||||
- PPCBUG-023 (andisx)
|
||||
- PPCBUG-024 (rlwinmx), PPCBUG-025 (rlwimix), PPCBUG-026 (rlwnmx)
|
||||
- PPCBUG-036 (extsbx), PPCBUG-037 (extshx) — **must land with PPCBUG-034/035**
|
||||
- PPCBUG-044 (slwx/srwx)
|
||||
|
||||
**Fix shape** — at every Rc=1 path, change `update_cr_signed(0, result as i64)` to `update_cr_signed(0, result as u32 as i32 as i64)`. Once 4a/4b/4c land, both forms become equivalent and 4d becomes belt-and-suspenders (still recommended for resilience).
|
||||
|
||||
---
|
||||
|
||||
### Phase 5 — FPU correctness (graphics middleware impact)
|
||||
|
||||
#### 5a — Round-to-int and FPSCR.RN
|
||||
|
||||
- PPCBUG-221 + PPCBUG-227 (`round_to_i64` NearestEven broken near 2^52 — must land together; `round_to_i32` delegates)
|
||||
- PPCBUG-201 (FPSCR.RN not honored for double arithmetic)
|
||||
- PPCBUG-432 (vrfin/vrfin128 round-half-away-from-zero vs round-to-nearest-even)
|
||||
|
||||
#### 5b — VXISI / NaN / SNaN handling for FMA family
|
||||
|
||||
- PPCBUG-181, 182 (single fmaddsx/fmsubsx/fnmaddsx/fnmsubsx VXISI)
|
||||
- PPCBUG-202, 203, 204 (double fmaddx/fmsubx/fnmaddx/fnmsubx VXISI — esp. 203 hot for Newton-Raphson)
|
||||
- PPCBUG-183, 205 (fnmadd/fnmsub Rust unary `-` flips NaN sign — fix: skip negation on NaN)
|
||||
- PPCBUG-186 (SNaN priority for FMA)
|
||||
- PPCBUG-128 (lfs SNaN quietening — bit-manipulation widening helper needed)
|
||||
|
||||
#### 5c — Inexact / FPSCR exception bits
|
||||
|
||||
- PPCBUG-180 (single XX/FR/FI never set), PPCBUG-200 (double XX/FR/FI never set)
|
||||
- PPCBUG-223 (fcmpo VXSNAN/VXVC), PPCBUG-224 (fcfidx XX), PPCBUG-225 (frspx XX/FR/FI), PPCBUG-229 (fctidx/fctidzx XX/FX), PPCBUG-230 (fctiwx/fctiwzx XX/FX), PPCBUG-231 (frspx SNaN host dependency)
|
||||
- PPCBUG-165 + PPCBUG-166 + PPCBUG-168 (stfs* FPSCR + RN + SNaN)
|
||||
|
||||
#### 5d — Subnormal flush (FPSCR.NI / VSCR.NJ)
|
||||
|
||||
- PPCBUG-185 (FPU NI subnormal flush not modeled)
|
||||
- PPCBUG-435, 436, 437 (VMX NJ subnormal flush — vaddfp/vsubfp/vmulfp128, vmsum3fp128/vmsum4fp128 product intermediates, vmaddfp/vmaddfp128/vmaddcfp128/vnmsubfp128 outputs)
|
||||
|
||||
#### 5e — Estimate precision (vs hardware ~12-bit)
|
||||
|
||||
- PPCBUG-184 (fres)
|
||||
- PPCBUG-428..431 (vrefp, vrsqrtefp, vexptefp, vlogefp — same shape as fres)
|
||||
|
||||
#### 5f — VMX float compares + saturation
|
||||
|
||||
- PPCBUG-426, 427 (vnmsubfp/vnmsubfp128 double-rounding)
|
||||
- PPCBUG-433 (vctsxs/vcfpsxws128 NaN saturate to INT_MIN)
|
||||
|
||||
---
|
||||
|
||||
### Phase 6 — Other MEDIUM correctness
|
||||
|
||||
- PPCBUG-021 (overflow.rs OE checks at bit 63 — sub-register ops; partly covered by P4)
|
||||
- PPCBUG-022 (`mulld_ov` missing INT_MIN × -1)
|
||||
- PPCBUG-027 (rlwimix upper-32 ISA-deviation — auto-resolves once P4 lands)
|
||||
- PPCBUG-039 (cntlzdx 32-bit-ABI counts upper-zero — only matters if emitted)
|
||||
- PPCBUG-063 (trap pc-after-advance)
|
||||
- PPCBUG-064 (sc LEV field)
|
||||
- PPCBUG-065 (twi 31, r0, IMM typed-trap — relevant to Sylpheed C++ throw work, see `project_xenia_rs_sylpheed_throw_2026_04_28.md`)
|
||||
- PPCBUG-068 (mcrfs VX summary recomputation)
|
||||
- PPCBUG-078 (mtmsrd L=1 partial MSR-write)
|
||||
- PPCBUG-080 (mfvscr zero upper 96 bits)
|
||||
- PPCBUG-123 + PPCBUG-124 + PPCBUG-161 + PPCBUG-566 (XER TBC for lswx/stswx — coupled; add `xer_tbc: u8` to PpcContext, wire into xer()/set_xer(); enables lswx and stswx)
|
||||
- PPCBUG-125 (lmw RA-in-destination skip)
|
||||
- PPCBUG-126 + PPCBUG-162 (lswi/stswi `instr.rb()` → `instr.nb()`)
|
||||
- PPCBUG-487 + PPCBUG-495 (vsum* operand naming)
|
||||
- PPCBUG-515 (lvebx/lvehx/lvewx vs Canary divergence — document; xenia-rs is more ISA-faithful)
|
||||
- PPCBUG-516 (lvsr sh=0 case — add comment + debug_assert)
|
||||
- PPCBUG-601 (decode_op6 overlapping windows — document the invariant)
|
||||
- PPCBUG-642 (fmt_bcctr extended forms)
|
||||
- PPCBUG-643 + PPCBUG-644 (SIMM/D-form decimal vs hex — alignment with Canary disassembly)
|
||||
- PPCBUG-367 (vupkhpx/vupklpx channel replication vs zero-extend)
|
||||
- PPCBUG-368 (vpkpx pack_pixel_555 channel assignment unverified)
|
||||
- PPCBUG-366 (vspltisb/vspltish sign-extension idiom — fragile, not wrong)
|
||||
|
||||
---
|
||||
|
||||
### Phase 7 — Frozen-snapshot drift (separate sweep)
|
||||
|
||||
8 opcodes' frozen snapshots in `ppc-manual/<cat>/<op>.md` differ from live code:
|
||||
- PPCBUG-066 (td/tdi/tw/twi)
|
||||
- PPCBUG-117 (ldarx)
|
||||
- PPCBUG-145 (stwcx)
|
||||
- PPCBUG-560 (already-listed: rldicl test helper bit-order)
|
||||
- Plus the implicit drift in addicx (PPCBUG-003), andisx (PPCBUG-023), cmp/cmpi (PPCBUG-050), extsbx/extshx (PPCBUG-036/037, PPCBUG-032 in batch 1)
|
||||
|
||||
**Recommendation**: regenerate frozen snapshots from current code for the entire ppc-manual after Phases 1-4 land. Add a CI check that compares snapshots vs live code on every PR.
|
||||
|
||||
---
|
||||
|
||||
### Phase 8 — Test gap closure (broad)
|
||||
|
||||
Single PR per group is overkill; recommend bundling test additions with each Phase 1-6 PR (test the bug being fixed). The remaining LOW IDs are pure-test-gap entries — list:
|
||||
|
||||
- PPCBUG-045 (shift), 047 (rld), 055 (branch), 067 (trap+sc), 070 (CR logical)
|
||||
- PPCBUG-081, 082, 083, 084, 085 (SPR/MSR/TB/FPSCR/VSCR moves), 089 (cache+sync)
|
||||
- PPCBUG-091 (lbz), 100 (lha), 109, 110, 111 (lwa/lwbrx/lwarx), 118 (ld), 127 (lmw/lswi/lswx), 129 (lfs/lfd)
|
||||
- PPCBUG-132 (stb/sth), 146, 147 (stw/stwcx), 153 (std/stdcx), 163 (stmw/stswi/stswx), 171 (stfs/stfd)
|
||||
- PPCBUG-187 (FPU single), 208 (FPU double), 228 (FPU misc convert)
|
||||
- PPCBUG-240 (VMX add/sub), 243 (VMX sat helpers)
|
||||
- PPCBUG-277, 278, 279 (VMX compare/min/max/avg)
|
||||
- PPCBUG-316, 317, 320, 321, 322, 323, 324, 325 (VMX shift/rotate/logical)
|
||||
- PPCBUG-370, 371, 372, 373, 374, 375, 376, 377, 378 (VMX permute/pack)
|
||||
- PPCBUG-438, 439, 440 (VMX float compare/round/convert)
|
||||
- PPCBUG-490, 491, 492, 493, 494 (VMX multiply-sum)
|
||||
- PPCBUG-517, 518, 519 (VMX load/store)
|
||||
- PPCBUG-567 (decoder accessors)
|
||||
- PPCBUG-604 (decoder dispatch tables)
|
||||
- PPCBUG-649, 650, 652 (golden fixtures for branches/VMX128)
|
||||
|
||||
---
|
||||
|
||||
## Notes & administrative
|
||||
|
||||
### Withdrawn / retracted
|
||||
|
||||
- **PPCBUG-220** — `fctiwx` strict-`>` threshold actually correct (`i32::MAX` exactly representable in f64). Retracted by group-31 subagent.
|
||||
- **PPCBUG-222** — `fctidx` positive-overflow sentinel `0x7FFF_FFFF_FFFF_FFFF` is the correct ISA value. Retracted.
|
||||
- **PPCBUG-226** — FPRF 5-bit codes for fcmpu/fcmpo are correct per PowerISA. Retracted.
|
||||
- **PPCBUG-482** — `vmhaddshs` shift `>>15` is correct per spec snapshots. Retracted.
|
||||
- **PPCBUG-483** — `vmhraddshs` shift `>>15` is correct per spec snapshots. Retracted.
|
||||
|
||||
### Wontfix / informational (not retracted but no fix needed)
|
||||
|
||||
- **PPCBUG-038** — extswx ISA-correct, intentional 64-bit sign-extension. Document the asymmetry with extsb/extsh after PPCBUG-034/035 land.
|
||||
- **PPCBUG-090, 099, 152** — invalid-form (rD==rA) silently destroys load/store result. Per ISA: undefined behavior. No compiler emits these; matches Canary. Optional `debug_assert!`.
|
||||
- **PPCBUG-106, 115, 131, 169, 170, 206, 207, 318, 319, 364, 365, 434, 651, 653, 645, 646, 648** — informational confirmations that the implementation is correct, no change needed.
|
||||
- **PPCBUG-069** — test comment OX(so)=0 is wrong but the assert is correct.
|
||||
- **PPCBUG-602, 603, 605** — undocumented decoder dispatch quirks; correct but should add comments.
|
||||
- **PPCBUG-647, 654** — disassembler edge-case behavior on invalid encodings; not-a-bug for valid input.
|
||||
|
||||
### Coupling matrix (must-land-together)
|
||||
|
||||
| Group | IDs | Reason |
|
||||
|---|---|---|
|
||||
| divwx | 010, 011 | Quotient zero-extension changes the CR0 sign view |
|
||||
| srawx/srawix | 041, 042, 043 | Writeback truncation invalidates the CR0 view |
|
||||
| extsbx/extshx | 034+036, 035+037 | Same coupling shape as srawx |
|
||||
| sh64 | 040, 560 | Test helper is wrong in the inverse direction |
|
||||
| mb_md sweep | 046, 561 | Promote disasm.rs accessor first |
|
||||
| VC-form Rc | 275, 276, 420, 421, 562 | All consume the same new accessor |
|
||||
| VMX128_R Rc | 422, 562 | Same accessor sweep |
|
||||
| vrlimi128 | 315, 563 | Field accessor + caller fix |
|
||||
| vsldoi128 | 361, 565 | Field accessor + caller fix |
|
||||
| vpermwi128 | 362, 564 | Field accessor + caller fix |
|
||||
| vcmp*fp128. | 423, 600 | decode_op6 odd keys + opcode mapping |
|
||||
| XER TBC | 123, 124, 161, 566 | Add field, wire xer()/set_xer(), enables lswx/stswx |
|
||||
| round_to_i64 | 221, 227 | round_to_i32 delegates |
|
||||
| stfs FPSCR | 165, 166, 168 | Single fix shape covers all three |
|
||||
|
||||
### Dependency on the addis fix
|
||||
|
||||
The addis fix (`project_xenia_rs_addis_signext_root_cause_2026_04_29.md`) is already in place. Phase 4 generalizes that fix systematically; without it, the writeback-truncation invariant would still be incomplete.
|
||||
|
||||
### Anticipated impact on the Sylpheed renderer plateau
|
||||
|
||||
Strong candidates for direct cause of the plateau:
|
||||
- **PPCBUG-107** — broken atomics. Workers wait forever on never-signaled events; classical broken-spinlock symptom.
|
||||
- **PPCBUG-053+054** — broken `bdnz` loops; could explain workers parked indefinitely.
|
||||
- **PPCBUG-046 (`clrldi r3, r4, 32`)** — pollution propagation in 32-bit ABI; could break any pointer-clean-up sequence.
|
||||
|
||||
After applying Phase 1 alone, run `xenia-rs check sylpheed.iso -n 4B --parallel` and check whether `draws > 0`. If yes, the plateau was atomics; if no, proceed to P2/P3.
|
||||
|
||||
---
|
||||
|
||||
## Progress log
|
||||
|
||||
### P1 — Cross-thread atomicity sweep (merged 2026-05-01, HEAD ca5b90b)
|
||||
|
||||
**PPCBUGs fixed**: 107, 130, 140, 141, 142, 143, 144, 150, 160, 167, 511, 512, 513, 514, 151, 108. Plus review-fix additions: dcbz, dcbz128, stswi two-line, stswx two-line (merged in review-fix commit c9f194d).
|
||||
|
||||
**Gate results**:
|
||||
- `cargo test --workspace --release`: 449 passed, 0 failed
|
||||
- `-n 100M` lockstep: swaps=2, clean
|
||||
- `-n 100M --parallel --reservations-table`: swaps=2, clean
|
||||
- **Acid test** `-n 4B --parallel --reservations-table`: swaps=2, draws=**0**, no RtlRaiseException, no panics
|
||||
|
||||
**Conclusion**: P1 did NOT unblock the Sylpheed renderer. `draws` remains 0. The renderer plateau is not caused by broken cross-thread atomics alone. Proceeding to P2 (decoder/field-extraction sweep). The strongest remaining candidate per the plan is PPCBUG-046 (`clrldi r3, r4, 32` no-op).
|
||||
|
||||
---
|
||||
|
||||
### P2 — Decoder/field-extraction structural sweep (merged 2026-05-01, HEAD see `git log master --oneline -1`)
|
||||
|
||||
**PPCBUGs fixed**: 040, 046, 275, 276, 315, 360, 361, 362, 363, 369, 420, 421, 422, 423, 560, 561, 562, 563, 564, 565, 600.
|
||||
|
||||
**Batches**:
|
||||
- Batch 1: PPCBUG-040+560 — sh64() bit-order fix (XS-form SH split) + rldicl test helper encoding
|
||||
- Batch 2: PPCBUG-046+561 — mb_md() accessor; all 6 rld* MB fields corrected (clrldi was a no-op)
|
||||
- Batch 3: PPCBUG-275+276+420+421+422+423+562+600 — vc_rc_bit()/vx128r_rc_bit() Rc accessors; 13 vcmp interpreter sites; 5 decode_op6 dot-form entries
|
||||
- Batch 4: PPCBUG-315+563 — vrlimi128 vx128_4_z/imm field extraction
|
||||
- Batch 5: PPCBUG-361+565 — vsldoi128 vx128_5_sh field extraction
|
||||
- Batch 6: PPCBUG-362+564 — vpermwi128 vx128_p_perm field extraction
|
||||
- Batch 7: PPCBUG-360 — vperm128 vc128_2() accessor (was erroneously vd128())
|
||||
- Batch 8: PPCBUG-363+369 — vpkd3d128 post-pack permutation (MakePermuteMask tables from canary)
|
||||
|
||||
**Gate results**:
|
||||
- `cargo test --workspace --release`: 201 (cpu) + 6 (disasm goldens) + 144 + 76 + 16 + 8 + … passed, 0 failed
|
||||
- Independent code reviewer: all 9 check items OK
|
||||
- `-n 100M` lockstep smoke: ISO not available in CI environment; last known good at P1 HEAD was swaps=2
|
||||
- **Acid test** `-n 4B --parallel --reservations-table`: pending (ISO not in CI environment)
|
||||
|
||||
**Conclusion**: All P2 fixes applied and reviewed. Decoder field extraction is now correct for all audited VMX128 and MD/XS-form instructions. Whether P2 unblocks the renderer (`draws > 0`) requires the sylpheed.iso acid test on the user's machine. PPCBUG-046 (clrldi no-op fix) was the highest-probability P2 renderer-unblock candidate. Next: P3 — isolated HIGH bugs (PPCBUG-510, 424/425, 053+054, 640, 641).
|
||||
|
||||
---
|
||||
|
||||
### P3 — Isolated HIGH bugs (merged 2026-05-02, HEAD f3ebaba)
|
||||
|
||||
**PPCBUGs fixed**: 053+054 (coupled CTR 32-bit), 424+425 (vmaddfp128/vmaddcfp128 operand swap), 510 (stvewx128 corruption), 640+650 (bdnz/bdz suffix), 641+649 (sync/lwsync), **700 (NEW)**.
|
||||
|
||||
**Batches**:
|
||||
- Batch 1: PPCBUG-510 — stvewx128 16-byte corruption fixed (word-align EA, extract lane, write 4 bytes)
|
||||
- Batch 2: PPCBUG-424+425 — vmaddfp128/vmaddcfp128 operand swap to VA*VD+VB; also includes a partial PPCBUG-700 fix (va128 PPC[11-15] only)
|
||||
- Batch 3: PPCBUG-053+054 — bcx/bclrx 32-bit CTR compare + mtspr CTR truncation
|
||||
- Batch 4: PPCBUG-640+650 — fmt_bc spurious bdnzge/bdzge suffix gated on `!uncond`
|
||||
- Batch 5: PPCBUG-641+649 — sync/lwsync L-field disambiguation
|
||||
- Phase review fix: **PPCBUG-700 (NEW)** — VMX128 register accessors (va128/vb128/vd128/vx128r_rc_bit) rewritten to canary's bitfield positions. Audit's "confirmed-clean" line-2958 assessment was based on miscounting LSB-first packed C++ bitfields. Per canary (`xenia-canary/src/xenia/cpu/ppc/ppc_decode_data.h:484-663`):
|
||||
- VA128 = PPC[11-15] | PPC[26]<<5 | PPC[21]<<6 (3 fields, 7 bits)
|
||||
- VB128 = PPC[16-20] | PPC[30-31]<<5
|
||||
- VD128 = PPC[6-10] | PPC[28-29]<<5
|
||||
- VX128_R Rc = PPC[25] (host bit 6) — NOT PPC[27] as PPCBUG-422 prescribed
|
||||
Affects 30+ VMX128 opcodes; production game code with VR>=32 was silently mis-decoded. Speculative `key4_dt` dot-form dispatch in `decode_op6` removed (canary has no separate dot-form opcodes for VX128_R). New PPCBUG-700 entry added to `audit-findings.md` Phase C4 invalidating audit line 2958.
|
||||
|
||||
**Gate results**:
|
||||
- `cargo test --workspace --release`: **470 passed, 0 failed** (up from 467 baseline at P3 start; 3 new CTR regression tests added)
|
||||
- Independent code reviewer: 1 BLOCKING issue (PPCBUG-700 above) — addressed before merge
|
||||
- `-n 100M` lockstep smoke: ISO not in CI; checked locally during development
|
||||
- **Acid test** `-n 4B --parallel --reservations-table`: **deferred to end of all phases** per user direction
|
||||
|
||||
**Conclusion**: All P3 fixes applied + reviewed + reviewer's blocking concern resolved. Phase 3 also produced one HIGH discovery (PPCBUG-700) that the audit had missed. Total fixes: 6 commits, 7 distinct PPCBUG groups. Next: P4 — 32-bit ABI writeback truncation sweep, ~30 IDs across 4a-4d sub-sections.
|
||||
|
||||
---
|
||||
|
||||
### P4 — 32-bit ABI writeback truncation sweep (merged 2026-05-02, HEAD d945aea)
|
||||
|
||||
**PPCBUGs fixed**: ~43 IDs across the 4a/4b/4c/4d sub-sections.
|
||||
- 4a active poisoning: 006 (negx), 008 (subfex), 018 (subfzex), 019 (subfmex), 028 (orcx), 029 (norx), 030 (nandx), 031 (eqvx), 033 (andcx)
|
||||
- 4a/4d coupled: 034+035+036+037 (extsbx/extshx writeback + CR0)
|
||||
- 4b immediate ALU: 001 (addi), 002 (addic), 003 (addicx), 004 (mulli), 005 (subficx), 007 (subfcx CA)
|
||||
- 4b mul/div + srawx coupled: 009 (mullwx), 010+011 (divwx + CR0), 041+042+043 (srawx/srawix + CR0)
|
||||
- 4b loads: 095-098 (lha/lhax/lhau/lhaux), 105 (lwa/lwax/lwaux)
|
||||
- 4c latent: 012-017 (addx/addcx/addex/addzex/addmex/subfx), 032 (andx/orx/xorx CR0)
|
||||
- 4d CR0 catch-all: 020 (in mulhwx/mulhwux/divwux/andx/orx/xorx/cntlzwx etc.), 023 (andisx), 024 (rlwinmx), 025 (rlwimix), 026 (rlwnmx), 044 (slwx/srwx)
|
||||
|
||||
**Batches**:
|
||||
- Batch 1 (e18a0a4): 4a active poisoning NOT/SUB family — 9 PPCBUGs
|
||||
- Batch 2 (145a7a4): 4a/4d coupled extsbx+extshx+CR0 — 4 PPCBUGs (must land together)
|
||||
- Batch 3 (bf8208e): 4b immediate ALU — 6 PPCBUGs
|
||||
- Batch 4 (82a9bff): 4b mul/div + srawx coupled — 6 PPCBUGs (two coupling groups)
|
||||
- Batch 5 (20a730d): 4b halfword + lwa loads — 5 PPCBUGs
|
||||
- Batch 6 (16993bb): 4c latent + 4d CR0 catch-all — ~13 PPCBUGs
|
||||
- Review-fix (49103bb): subfx/subfcx OE predicate + mulli test rigor
|
||||
|
||||
**Phase invariants restored**: every 32-bit ABI GPR write zero-extends from a u32 result, every CR0 update views the result as i32, every CA bit comes from a 32-bit unsigned compare. Downstream 64-bit unsigned compares (the addis-incident shape) can no longer be fed polluted upper bits from any of the 40+ touched ALU sites. The frozen-snapshot drift detected in PPCBUG-003 (addicx CR0) and PPCBUG-023 (andisx CR0) is also resolved.
|
||||
|
||||
**Review findings**:
|
||||
- BLOCKING issue caught: subfx and subfcx OE handlers in batch 6 still used the legacy `sum_overflow_64` helper. The helper compares the 32-bit `true_diff` against a u64 view of the result; any legitimate i32::MIN result (bit 31 set) spuriously triggered OV=1. Fixed in 49103bb with two new discriminating regression tests.
|
||||
- Minor caught: `mulli_overflow_wraps_to_32` was a rubber-stamp test — for the chosen inputs it wrote 0 both before and after the fix. Redesigned to use polluted-upper-bits inputs that genuinely discriminate.
|
||||
|
||||
**Gate results**:
|
||||
- `cargo test --workspace --release`: **494 passed, 0 failed** (up from 470 at P3 merge; 24 new regression tests across the batches)
|
||||
- 64-bit ABI ops verified untouched: rldicl/rldicr/rldic/rldimi/rldcl/rldcr, sldx/srdx/sradx/sradix, mulhdx/mulhdux/mulldx, divdx/divdux, cntlzdx, extswx
|
||||
- **Acid test** `-n 4B --parallel --reservations-table`: deferred per user direction
|
||||
|
||||
**Conclusion**: P4 is the largest ABI-correctness sweep of the audit. The systemic invariant is restored. Next: P5 — FPU correctness (~30 IDs).
|
||||
|
||||
---
|
||||
|
||||
### P5 — FPU correctness (merged 2026-05-02, HEAD d39d0ba)
|
||||
|
||||
**PPCBUGs fixed**: 21 IDs across the 5a-5f sub-sections.
|
||||
- 5a (round-to-int): 221+227 (round_to_i64 NearestEven near 2^52, coupled), 432 (vrfin round-to-even)
|
||||
- 5b (FMA VXISI + NaN sign): 181, 182 (single fmaddsx/fmsubsx VXISI), 202, 203 (double fmaddx/fmsubx/fnmaddx/fnmsubx VXISI), 183, 205 (NaN sign preservation in fnmaddx/fnmsubx and *sx siblings)
|
||||
- 5c (XX-on-inexact): 223 (verified already correct), 224 (fcfidx XX), 225 (frspx XX), 229 (fctidx/fctidzx XX), 230 (fctiwx/fctiwzx XX)
|
||||
- 5d (subnormal flush): 435 (vaddfp/vsubfp/vmulfp128 missing flush), 436 (vmsum3fp128/vmsum4fp128 per-product flush), 437 (vmaddfp family output flush)
|
||||
- 5e (estimate precision): 184 (fresx canary parity via f32 input quantization)
|
||||
- 5f (saturation + single-FMA): 426 (vnmsubfp single FMA), 427 (vnmsubfp128 single FMA), 433 (vctsxs NaN→INT_MIN)
|
||||
|
||||
**Batches**:
|
||||
- Batch 1 (f6a444b): 5a round-to-int + vrfin
|
||||
- Batch 2 (26b9897): 5b FMA — new `check_invalid_fma_add` helper in fpscr.rs derives VXISI from input properties
|
||||
- Batch 3 (49bf74f): 5c XX bit on conversions
|
||||
- Batch 4 (538fa5a): 5d VSCR.NJ unconditional flush (matches Canary; Xbox 360 always boots NJ=1)
|
||||
- Batch 5 (6ba8f83): 5e fresx pre-quantize input
|
||||
- Batch 6 (6fe2cbf): 5f single-FMA + vctsxs NaN
|
||||
- Review-fix nit (05f2f72): vrfin → stdlib `f32::round_ties_even()`
|
||||
|
||||
**Deferred for focused sub-batches** (Status: open in audit-findings.md):
|
||||
- PPCBUG-201 (FPSCR.RN for double arithmetic) — requires MXCSR set/restore wrappers around 10+ FPU arms
|
||||
- PPCBUG-185 (FPSCR.NI flush for scalar FPU) — requires NI bit constant + post-op flush wrapper
|
||||
- PPCBUG-180 + PPCBUG-200 (XX/FR/FI in update_after_op) — requires pre-vs-post-round comparison
|
||||
|
||||
**Review findings**:
|
||||
- Independent reviewer verdict: **MERGE-READY**. No blocking issues.
|
||||
- Two non-blocking minor follow-ups noted: (a) `check_invalid_fma_add` doesn't catch the finite-product-overflow + infinite-b cancellation half of PPCBUG-202 (audit-acknowledged as rare); (b) vrfin used inline tie-breaker — replaced with stdlib `round_ties_even()` in 05f2f72.
|
||||
|
||||
**Gate results**:
|
||||
- `cargo test --workspace --release`: **498 passed, 0 failed** (up from 494 at P4 merge; 4 net new regression tests across the batches)
|
||||
- **Acid test** `-n 4B --parallel --reservations-table`: deferred per user direction
|
||||
|
||||
**Conclusion**: P5 covers the FPU correctness foundation (round-to-int, VXISI, NaN preservation, XX bit, subnormal flush). Three substantive items deferred. Next: P6 — Other MEDIUM correctness (overflow.rs sweep, trap PC-after-advance, sc LEV, twi typed-trap, etc.).
|
||||
|
||||
---
|
||||
|
||||
### P6 — Other MEDIUM correctness (merged 2026-05-02, HEAD 112202c)
|
||||
|
||||
**PPCBUGs fixed**: 13 IDs across the misc-MEDIUM scope.
|
||||
- Trap/sc/typed-trap (063/064/065): on Trap, the PC stays at the CIA (the trapping instruction's address); the `sc` LEV field is logged; for `twi 31, r0, SIMM`, the SIMM type code is logged.
|
||||
- XER TBC infrastructure (123/124/161/566): new `xer_tbc: u8` field in `PpcContext`, wired into `xer()`/`set_xer()`; enables `lswx`/`stswx` (which were permanent no-ops without the TBC infrastructure).
|
||||
- Load-multiple cleanups (125/126/162): `lmw` skips writes to RA when in [RT..32) per ISA; `lswi`/`stswi` use `instr.nb()` instead of misnamed `instr.rb()`.
|
||||
- SPR/MSR/VSCR (068/078/080): `mcrfs` now recomputes the VX summary bit; `mtmsrd L=1` does the partial MSR write per ISA; `mfvscr` zero-extends the VSCR word into the upper 96 bits of VD.
|
||||
- Verification/auto-resolved (022/021/027/039): `mulld_ov` test confirms `checked_mul` handles INT_MIN*-1 correctly (audit's "missing" claim was incorrect); 021/027 auto-resolved by P4; 039 wontfix per audit.
|
||||
|
||||
**Batches**:
|
||||
- Batch 1 (d96986a): trap/sc semantics
|
||||
- Batch 2 (68c0ee5): XER TBC + load-multiple cleanups
|
||||
- Batch 3 (0f2a26c): SPR/MSR/VSCR
|
||||
- Batch 4 (99e7814): mulld_ov verification
|
||||
- Review-fix nit (5ece5e3): mcrfs uses existing `fpscr::VX_ALL` constant
|
||||
|
||||
**Deferred (Status: open in audit-findings.md)**:
|
||||
- Structural enum extensions (no consumer yet): `StepResult::HypervisorCall` for PPCBUG-064 sc 2 routing; `StepResult::Trap { type_code: u16 }` for PPCBUG-065 typed-trap C++ exception class routing — relevant if/when SEH dispatch lands.
|
||||
- Cosmetic/test-coverage: PPCBUG-642 (fmt_bcctr ISA-undefined edge), 643/644 (SIMM/D-form decimal vs hex — would re-baseline all goldens), 367/368 (vupkhpx/vpkpx channels), 487/495 (vsum naming), 515/516 (lvebx/lvsr docs), 601 (decode_op6 invariant doc).
|
||||
|
||||
**Review findings**: independent reviewer verdict was LGTM on all 4 commits, one cosmetic nit (use existing `fpscr::VX_ALL` instead of duplicate inline mask) applied immediately in 5ece5e3. No blocking issues. Reviewer specifically verified: trap-PC change against all `StepResult::Trap` consumers (none rely on `ctx.pc` for the faulting address); XER TBC field initialization through the single `PpcContext::new()` path that `Default` delegates to; `Vec128` lane ordering for `mfvscr` zero-extend.
|
||||
|
||||
**Gate results**:
|
||||
- `cargo test --workspace --release`: **498 passed, 0 failed**
|
||||
- **Acid test** `-n 4B --parallel --reservations-table`: deferred per user direction
|
||||
|
||||
**Conclusion**: P6 closes the misc-MEDIUM scope. All correctness fixes in scope have landed; structural enum extensions and cosmetic items are explicitly deferred and tracked. Remaining phases: P7 (frozen-snapshot drift, 8 opcodes), P8 (test gap closure, ~50 IDs).
|
||||
|
||||
---
|
||||
|
||||
### P7 — Frozen-snapshot drift sweep (2026-05-02, manual regen — no xenia-rs code change)
|
||||
|
||||
**PPCBUGs fixed**: 3 IDs.
|
||||
- PPCBUG-066: ppc-manual/branch/{td,tdi,tw,twi}.md — old unconditional-trap stub replaced with current TO-field-evaluating implementation snippet.
|
||||
- PPCBUG-117: ppc-manual/memory/ldarx.md — refreshed to current reservation_line/reservation_table model.
|
||||
- PPCBUG-145: ppc-manual/memory/stwcx.md — same reservation refresh.
|
||||
|
||||
**Methodology**: ran `python3 ppc-manual/generator/generate_manual.py` (the existing idempotent generator that scrapes xenia-rs and xenia-canary source for each opcode and emits a Markdown page). Output: 350 family pages updated, 598-key index.json refreshed.
|
||||
|
||||
**Verification**: post-regen `grep` confirms (a) the old "For now, just trace and continue" stub is gone from every page; (b) modern constructs (`trap::evaluate`, the current reservation pattern) appear in the trap and reservation pages.
|
||||
|
||||
**Note on scope**: the `ppc-manual/` directory is not versioned in `xenia-rs/.git`. The regen is therefore "done by running the script" with no commit landing in this repo. Documented for posterity here.
|
||||
|
||||
**Implicit drift cleared by earlier phases**: addicx (PPCBUG-003 fixed in P4), andisx (PPCBUG-023 fixed in P4), cmp/cmpi (PPCBUG-050 — no code change required; manual snapshot now reflects current behavior), extsbx/extshx (PPCBUG-036/037 fixed in P4 batch 2), 32 in batch 1 — all auto-resolved by re-running the generator after P1-P6.
|
||||
|
||||
**Conclusion**: P7 is functionally complete. No xenia-rs code change. Next: P8 — test gap closure.
|
||||
|
||||
---
|
||||
|
||||
### P8 — Test gap closure (merged 2026-05-02, HEAD 4029041)
|
||||
|
||||
**PPCBUGs closed**: 38 IDs across the test-gap LOW scope (audit listed ~50; 38 closed, ~12 remain Status: open as test-gap-only items that don't block functionality).
|
||||
|
||||
**Closed**:
|
||||
- Branch/CR/SPR/sync: 055, 067, 070, 081, 082, 083, 084, 085, 089
|
||||
- Loads: 091, 100, 109, 110, 111, 118, 127, 129
|
||||
- Stores: 132, 146, 147, 153, 163, 171
|
||||
- FPU: 187, 208, 228
|
||||
- VMX integer: 240, 277
|
||||
- VMX shift/rotate/logical: 316, 320, 321, 323
|
||||
- VMX permute: 370
|
||||
- VMX float compare/round/convert: 438, 439, 440
|
||||
- VMX multiply-add: 490
|
||||
- VMX load/store: 517
|
||||
|
||||
**Remaining open** (LOW test-gap, non-blocking): 045, 047, 066, 088 (PPCBUG-088 disasm-only test gap), 117, 145, 279, 317, 322, 324, 325, 371-378, 491-494, 518, 519, 567. These can stay open until a focused test-coverage sprint, or be closed incidentally during ongoing work.
|
||||
|
||||
**Batches**:
|
||||
- Batch 1 (9827b03): branch/CR-logical/SPR/MSR/FPSCR/sync — 12 tests
|
||||
- Batch 2 (2d223ee): load/store base + XER-TBC-driven lswx/stswx — 15 tests
|
||||
- Batch 3 (ebfd18a): FPU + VMX float — 14 tests; reviewer caught a VX-form encoding nit (XO at bit 0 not bit 1) during this batch and the author re-encoded all VX/VC tests before commit
|
||||
- Batch 4 (2614806): VMX integer/permute/load-store — 12 tests
|
||||
- Review-fix nit (1f9696a): test rename `vmsum3fp_horizontal_3lane_sum` → `vmaddfp_lane_fma` (test body actually exercised vmaddfp)
|
||||
|
||||
**Review findings**: independent reviewer verdict was LGTM on all 4 batches with no blocking issues. Every hand-encoded raw was mechanically cross-checked against canary's `INSTRUCTION(0x..., ..., kVX|kVC|kX|kA, ...)` base raw — no encoding mismatches. The XER-TBC-driven `lswx`/`stswx` tests are particularly load-bearing: they exercise the new infrastructure landed in P6 (68c0ee5); both opcodes were permanent no-ops pre-P6.
|
||||
|
||||
**Gate results**:
|
||||
- `cargo test --workspace --release`: **551 passed, 0 failed** (up from 498 at the P6 merge, unchanged through P7, which landed no code — 53 net new tests; the one `vmsum3fp_…` rename is -1+1 = net 0)
|
||||
- **Acid test** `-n 4B --parallel --reservations-table`: deferred per user direction
|
||||
|
||||
**Conclusion**: P8 closes the meaningful test-coverage gaps for opcode groups that previously had near-zero unit tests. Combined with the regression tests embedded in P1-P6 commits, the test suite now exercises every primary opcode form (branch, CR, SPR, FPU, VMX integer, VMX float, VMX load/store, scalar load/store) at least once. Remaining LOW test-gap items can be closed incrementally without blocking the audit's functional fixes.
|
||||
|
||||
---
|
||||
|
||||
### Post-P8 — End-to-end review + acid test (2026-05-02)
|
||||
|
||||
**End-to-end reviewer findings** (cross-cutting after all 8 phases):
|
||||
|
||||
1. **BLOCKING-LIKELY**: `lwa`/`lwax`/`lwaux` were converted to zero-extend in P4 batch 5 (PPCBUG-105 "minimal-fix"); reviewer flagged this as ISA-deviating. Per PowerISA, "Load Word and Algebraic" must sign-extend. Hotfix landed at HEAD f1166d0 — restored `as i32 as i64 as u64` form, updated test from `lwa_high_bit_set_zero_extends_upper` to `lwa_sign_extends_to_i64`.
|
||||
2. **Cosmetic** `fpscr.rs:289` duplicate-branch typo in `round_single_toward_zero` — both branches were `adj_bits - 1`. Replaced with the unconditional form + comment. HEAD 09c6c92.
|
||||
3. **Minor** reservation table's `active_reservers` counter is slot-occupancy, not reserver-count — once dirtied via cross-line-collision displacement, stores eternally pay the `invalidate_for_write` Acquire-load cost. Correctness-preserving (counter is upper bound), but performance can degrade. Documented; deferred to a focused performance sub-batch.
|
||||
4. **Asymmetric** `extswx` is the only sign-extend opcode left at 64-bit ABI (P4 converted every other extsXx to 32-bit). Per PPCBUG-038 (audit `wontfix`), this matches ISA's documented "argument-register canonicalization in 64-bit mode" intent. No code change. Reviewer flagged the asymmetry — accepted.
|
||||
|
||||
**Acid test result** (`xenia-rs check sylpheed.iso -n 4000000000 --parallel --reservations-table`, 2026-05-02 12:28→12:46):
|
||||
- Exit code: 0 (clean termination, no panics, no RtlRaiseException, no halts)
|
||||
- swaps=1 (frame=1 XE_SWAP, fb=0x4b0d7000, 1280×720)
|
||||
- draws=0
|
||||
- 14 ExCreateThread spawns, 2 worker exits via LR sentinel
|
||||
- The renderer plateau is **NOT unblocked** by the cumulative P1-P8 correctness fixes
|
||||
- Note: the binary tested was pre-lwa-hotfix (built before commit f1166d0). The lwa change is unlikely to affect Sylpheed (compilers don't emit `lwa` in 32-bit-ABI code), but a re-run after the hotfix would be the conservative confirmation.
|
||||
|
||||
**Implication**: the renderer plateau (`draws=0`) has a non-PPC-correctness root cause. The audit was justified on correctness grounds independent of the renderer (the PPCBUGs are real bugs, well-grounded against canary), but the cumulative ~161 PPCBUG fixes do not unblock the specific Sylpheed-rendering issue. Next investigation tracks should focus on:
|
||||
- Graphics-pipeline-side issues (EDRAM resolve gaps per `project_xenia_rs_edram_resolve_gap.md`, RT readback)
|
||||
- Kernel HLE divergences (event signaling, timer queues, file system)
|
||||
- The unresolved BST-validation paradox documented in `project_xenia_rs_sylpheed_event_chain_2026_04_29.md` (sub_82175E68 registers 0x828F3F68 in the BST but the validator doesn't find it eight instructions later)
|
||||
|
||||
These are out of scope for the PPC instruction audit.
|
||||
|
||||
---
|
||||
|
||||
## Index — every PPCBUG referenced (in numerical order)
|
||||
|
||||
This list intentionally includes every ID found in `audit-findings.md` so nothing is dropped. For each entry's full description / file:line / fix snippet / test recommendation, see the corresponding `### PPCBUG-NNN` heading in `audit-findings.md`.
|
||||
|
||||
001-022 (batch 1: integer ALU): 001, 002, 003, 004, 005, 006, 007, 008, 009, 010, 011, 012, 013, 014, 015, 016, 017, 018, 019, 020, 021, 022.
|
||||
|
||||
023 (batch 2 group 6 logic immediate): 023.
|
||||
|
||||
024-027 (batch 2 group 9 word rotate): 024, 025, 026, 027.
|
||||
|
||||
028-033 (batch 2 group 7 logic register): 028, 029, 030, 031, 032, 033.
|
||||
|
||||
034-039 (batch 2 group 8 sign-extend / count-leading-zeros): 034, 035, 036, 037, 038, 039.
|
||||
|
||||
040-045 (batch 2 group 11 shift): 040, 041, 042, 043, 044, 045.
|
||||
|
||||
046-047 (batch 2 group 10 doubleword rotate): 046, 047.
|
||||
|
||||
048-052 reserved (group 12 compare): 048, 049, 050.
|
||||
|
||||
053-055 (batch 3 group 13 branch): 053, 054, 055.
|
||||
|
||||
063-067 (batch 3 group 14 trap+sc): 063, 064, 065, 066, 067.
|
||||
|
||||
068-070 (batch 3 group 15 CR logical): 068, 069, 070.
|
||||
|
||||
078-085 (batch 3 group 16 SPR/MSR/TB/FPSCR/VSCR): 078, 079, 080, 081, 082, 083, 084, 085.
|
||||
|
||||
088-089 (batch 3 group 17 cache+sync): 088, 089.
|
||||
|
||||
090-091 (batch 4 group 18 load byte): 090, 091.
|
||||
|
||||
095-100 (batch 4 group 19 load halfword): 095, 096, 097, 098, 099, 100.
|
||||
|
||||
105-111 (batch 4 group 20 load word + reservation): 105, 106, 107, 108, 109, 110, 111.
|
||||
|
||||
115-118 (batch 4 group 21 load doubleword): 115, 116, 117, 118.
|
||||
|
||||
123-127 (batch 4 group 22 load multiple/string): 123, 124, 125, 126, 127.
|
||||
|
||||
128-129 (batch 4 group 23 load float): 128, 129.
|
||||
|
||||
130-132 (batch 5 group 24 store byte/halfword): 130, 131, 132.
|
||||
|
||||
140-147 (batch 5 group 25 store word + stwcx): 140, 141, 142, 143, 144, 145, 146, 147.
|
||||
|
||||
150-153 (batch 5 group 26 store doubleword): 150, 151, 152, 153.
|
||||
|
||||
160-163 (batch 5 group 27 store multiple/string): 160, 161, 162, 163.
|
||||
|
||||
165-171 (batch 5 group 28 store float): 165, 166, 167, 168, 169, 170, 171.
|
||||
|
||||
180-187 (batch 6 group 29 FPU single arithmetic): 180, 181, 182, 183, 184, 185, 186, 187.
|
||||
|
||||
200-208 (batch 6 group 30 FPU double arithmetic): 200, 201, 202, 203, 204, 205, 206, 207, 208.
|
||||
|
||||
220-231 (batch 6 group 31 FPU sign/move/compare/convert): 220 [retracted], 221, 222 [retracted], 223, 224, 225, 226 [retracted], 227, 228, 229, 230, 231.
|
||||
|
||||
240-243 (batch 7 group 32 VMX integer add/sub): 240, 241, 242, 243.
|
||||
|
||||
275-279 (batch 7 group 33 VMX integer compare/min/max/avg): 275, 276, 277, 278, 279.
|
||||
|
||||
315-325 (batch 7 group 34 VMX integer logical/shift/rotate): 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325.
|
||||
|
||||
360-378 (batch 8 group 35 VMX permute/pack): 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378.
|
||||
|
||||
420-440 (batch 8 group 36 VMX float arith+compare): 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440.
|
||||
|
||||
482-495 (batch 8 group 37 VMX multiply-sum + special): 482 [retracted], 483 [retracted], 487, 490, 491, 492, 493, 494, 495.
|
||||
|
||||
510-519 (batch 8 group 38 VMX load/store): 510, 511, 512, 513, 514, 515, 516, 517, 518, 519.
|
||||
|
||||
560-567 (Phase C1 decoder field extractors): 560, 561, 562, 563, 564, 565, 566, 567.
|
||||
|
||||
600-605 (Phase C2 decoder opcode-lookup): 600, 601, 602, 603, 604, 605.
|
||||
|
||||
640-654 (Phase C3 disassembler formatter): 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654.
|
||||
|
||||
**Counted IDs**: 253. **Retracted**: 220, 222, 226, 482, 483 (5). **Net actionable**: 248.
|
||||
|
||||
**Counted by phase here**: P1 (~17 IDs), P2 (~17 IDs), P3 (~7 IDs), P4 (~30 IDs), P5 (~30 IDs), P6 (~25 IDs), P7 (~5 IDs), P8 (~50 IDs), Notes (~30 wontfix/informational/retracted). Total accounts for all 253 IDs — every ID is either in a fix phase, the wontfix/informational list, or retracted. **Nothing has been dropped.**
|
||||
1013
audit-runs/audit-004/run-50m-probe-v2.txt
Normal file
1013
audit-runs/audit-004/run-50m-probe-v2.txt
Normal file
File diff suppressed because it is too large
Load Diff
1151
audit-runs/audit-004/run-50m-probe.txt
Normal file
1151
audit-runs/audit-004/run-50m-probe.txt
Normal file
File diff suppressed because it is too large
Load Diff
281
audit-runs/audit-006/canary_export_queue.md
Normal file
281
audit-runs/audit-006/canary_export_queue.md
Normal file
@@ -0,0 +1,281 @@
|
||||
# Canary-Only Export Fix Queue (audit-006)
|
||||
|
||||
- Status: **POST-KE-001 (2026-05-06): 2 canary-only (XamUserReadProfileSettings DROPPED post-XamUserGetSigninState landing earlier; KE-001 unsuspended audio workers but KeReleaseSemaphore producer is downstream-gated and did NOT fire).** `KeResumeThread` is now a real impl per canary `xboxkrnl_threading.cc:216-227` (KRNBUG-KE-001, branch `ke-resume-thread/p0-canary-mirror`). Cascade A passed: tids 9 (entry=0x824D2878) and 10 (entry=0x824D2940) leave Suspended → run prologue → park on `WaitAny` for audio buffer-completion semaphores `0x828A3254` / `0x828A3230`. Cascade B partial: `NtSetEvent 667→3334` (5×) but `KeReleaseSemaphore=0` and `XAudioSubmitRenderDriverFrame=0` — workers stuck before the producer. Cascade C predicted 2→1, actual 2→2 (`ExTerminateThread`, `KeReleaseSemaphore` both still canary-only). Cascade D: `--pc-probe=0x82184318,0x82184374` armed — neither fires; `--dump-addr=0x828F4070` no DUMP lines; γ-cluster blocker unchanged; signal_attempts on 0x1004/0x100c/0x1020/0x15e4 still 0. swaps=2 draws=0 plateau intact. Lockstep `instructions=100000003 imports=987516` deterministic ×2. Goldens re-baselined `sylpheed_n50m.json instructions 50000003→50000011, imports 407255→407247`. See KRNBUG-KE-001 in `audit-findings.md`.
|
||||
|
||||
- Prior status (superseded by KE-001): **POST-IO-004 (2026-05-06): 7 → 3 canary-only.** Real `XamNotifyCreateListener` + `XNotifyGetNext` landed (KRNBUG-IO-004). Dispatch arm at `0x822f1be8` now fires; `sub_82173DC8` runs in a tight loop on tid=1; renderer-cluster L1 entries `0x822c6870`, `0x824563e0`, `0x823ddb50` are reached for the first time. 4 reclassified RE-FIRES (now reached): `KeResetEvent`, `ObCreateSymbolicLink`, `XamTaskCloseHandle`, `XamTaskSchedule`. Still canary-only: `ExTerminateThread`, `KeReleaseSemaphore`, `XamUserReadProfileSettings` — all REAL_BUT_UNREACHED at the new boot horizon. Worker count 18 → 20. signal_attempts on 0x15e0 = 1 (was 0). draws=0 still expected at this step. See KRNBUG-IO-004 in `audit-findings.md` and `project_xenia_rs_io_004_xnotify_listener_2026_05_06.md`.
|
||||
|
||||
- Prior status (superseded by IO-004): **AUDIT-009 (2026-05-05): GATE IS HIGHER THAN THE CLUSTER ITSELF.** AUDIT-008's β-hypothesis (gate sits among the 5 callers of `sub_821800D8` in 0x82287000-0x82292FFF) is **falsified**: a 21-PC `--branch-probe` (the 6 parents + 5 shims + dispatcher + 9 audit-005 producer-callsites) shows **0/21 firings** at -n 500M (`audit-runs/audit-009/probe-500m.err`). The whole 0x82287000-0x82294000 cluster is unreached. Static analysis: the cluster's level-1 root functions (`sub_82293448`, `sub_822919C8`) have **zero non-call xrefs in sylpheed.db** — they are reached only via vtable / function-pointer that's never written. Main parks at `sub_822F1AA8` frame-poll loop forever (1.49M XNotifyGetNext iterations). Three canary-only exports (`ExTerminateThread`, `KeReleaseSemaphore`, `XamUserReadProfileSettings`) remain REAL_BUT_UNREACHED — same as audit-008. **DO NOT pull from this queue.** Next-session probe set: cluster L1 roots + new thread entry trampolines (0x822c6870 / 0x824563e0 / 0x823dde30 / 0x823ddb50) + main's frame-poll callees + main's post-poll continuation list. See KRNBUG-AUDIT-009 in `audit-findings.md` and `project_xenia_rs_audit_009_renderer_unreached_2026_05_05.md`.
|
||||
|
||||
- Prior status (superseded by AUDIT-009): **AUDIT-008 MODEL RESET (2026-05-05).** 0x100c worker IS spawned post-IO-003 as tid=3 (ctx=0x828F3D08), 0x1004 as tid=11, 0x15e0 as tid=17. AUDIT-008 hypothesized the gate among the 5 non-create-chain callers of `sub_821800D8` whose parents live in 0x82287000-0x82292FFF. AUDIT-009 falsified that — those parents are themselves never entered, so the gate is one level above.
|
||||
|
||||
- Prior status (superseded by AUDIT-008): **PARTIAL CASCADE (2026-05-04, post-KRNBUG-IO-003). 7 → 3 canary-only exports.** `NtDeviceIoControlFile` real impl landed; the priv-11 query (`XexCheckExecutablePrivilege(0xB)`) and `XamTaskSchedule` now fire. **Reclassified (now firing on our side):** `KeResetEvent`, `ObCreateSymbolicLink`, `XamTaskCloseHandle`, `XamTaskSchedule`. **Bonus pickups:** `XeCryptSha`, `XeKeysConsolePrivateKeySign` (both 0→1 — were not on the canary-only list because they were already in `ours_exports` but unreached). **Still canary-only:** `ExTerminateThread`, `KeReleaseSemaphore`, `XamUserReadProfileSettings`. ~~Worker thread spawn count unchanged at 19; handle 0x100c remains UNCREATED.~~ (audit-008: 0x100c worker IS spawned, claim was wrong.) See KRNBUG-IO-003 in `audit-findings.md` and `project_xenia_rs_io_003_ioctl_2026_05_04.md`.
|
||||
|
||||
- Prior status (now superseded): **SUPERSEDED by AUDIT-007 (2026-05-04). Real gate identified: `NtDeviceIoControlFile` (FsCtlCode=0x74004) is `stub_success` at `crates/xenia-kernel/src/exports.rs:90`. Game-side `sub_824ABD88:0x824abea8-ac` reads `[out_buf+8]` of the IOCTL response, finds zero (stub doesn't write OUT), assigns hardcoded `0xC0000034` (STATUS_OBJECT_NAME_NOT_FOUND); caller `sub_824A9710` exits at `0x824a9944` before priv-11. Tier 4 entries remain parked, classification unchanged (still REAL_BUT_UNREACHED), awaiting KRNBUG-IO-003. See `project_xenia_rs_audit_007_branch_probe_2026_05_04.md` for the runtime trace + decisive proof.**
|
||||
|
||||
- Prior status: **PARTIAL — KRNBUG-IO-002 landed, but predicted cascade did NOT fire (7 → 7). Tier 0 marked superseded; Tier 4 entries STILL parked. Re-audit needed to find the real upstream gate.**
|
||||
- Pre-state: master HEAD `556a8c3`, exports diff captured 2026-05-04
|
||||
- Post-IO-002 state: branch `xboxkrnl-vol-allocunit/p0-65536-cluster`, fresh 500 M trace at `audit-runs/post-IO-002/`. Canary-only kernel exports remain identical: `{ExTerminateThread, KeReleaseSemaphore, KeResetEvent, ObCreateSymbolicLink, XamTaskCloseHandle, XamTaskSchedule, XamUserReadProfileSettings}`.
|
||||
- Inputs:
|
||||
- `canary.log` (348720 B, identical to audit-005 oracle, canary build `9467c77f0`)
|
||||
- `ours.log` (692 MB, 5.6 M trace lines, run at 17:20–17:21 today, post-IO-001)
|
||||
- Tooling: `diff.py` + plain `comm -23` set-difference on extracted call names
|
||||
|
||||
## Headline finding
|
||||
|
||||
**7/7 canary-only entries classify as REAL_BUT_UNREACHED or STUB_BUT_UNREACHED.**
|
||||
Per the audit-006 spec stop condition ("if two-thirds of entries are
|
||||
REAL_BUT_UNREACHED, the problem isn't stubs — it's an upstream gate"),
|
||||
the next session should **NOT** pull a Tier-1 entry from this queue.
|
||||
Instead, it should fix the gate.
|
||||
|
||||
The gate is **KRNBUG-IO-002**: our `nt_query_volume_information_file`
|
||||
class-3 (FileFsSizeInformation) returns alloc_unit = 1 × 2048 = 2048,
|
||||
but Sylpheed's `main(1, 0x10000, 0xFF000)` expects alloc_unit = 65536
|
||||
(see `project_xenia_rs_io_nullfile_2026_05_04.md`).
|
||||
Sylpheed's verifier `sub_824ABA98` rejects 2048, propagates failure to
|
||||
`sub_824A9710`, which exits early before its `XexCheckExecutablePrivilege(0xB)`
|
||||
call site. Canary fires the priv-11 query *and* the entire downstream
|
||||
cluster (`XamTaskSchedule` → Cache0 callback thread → 0x100c worker spawn
|
||||
→ display-init pump → profile-settings cascade); we fire none of it.
|
||||
|
||||
Direct evidence (telemetry):
|
||||
- Our `XexCheckExecutablePrivilege` count = **1** (priv=0xA only).
|
||||
Canary count = **2** (priv=0xA + priv=0xB).
|
||||
- All 7 canary-only entries have ours-side count = **0** at -n 500M.
|
||||
- Our trace ends with main thread (hw=0) parked on `XNotifyGetNext +
|
||||
NtWaitForSingleObjectEx(0x10f4, lr=0x824ac578)` and hw=1 parked on
|
||||
`NtWaitForMultipleObjectsEx(lr=0x824ab214) + cs=0x828f3e70` —
|
||||
classic post-cache-recreate spin.
|
||||
- The 44 `NtWriteFile` calls in ours.log (cache zero-fill) are followed by
|
||||
more NtClose / NtCreateFile cycles, but `XexCheckExecutablePrivilege(0xB)`
|
||||
never fires → priv-11 site in `sub_824A9710` is unreached.
|
||||
- Memory's predicted `0xC000014F` does not yet appear in ours.log; first
|
||||
cache-related error is `0xC0000034` (OBJECT_NAME_NOT_FOUND) from
|
||||
`lr=0x824a97e4`. This still fits the gate hypothesis: the recreate path
|
||||
is reached, completes its writes, re-opens, queries volume info, and
|
||||
the *game-side* verifier rejects our reply silently (no kernel error).
|
||||
|
||||
---
|
||||
|
||||
## Tier 0 — upstream gate (SUPERSEDED 2026-05-04 — fix landed but cascade did NOT fire)
|
||||
|
||||
### KRNBUG-IO-002 — `nt_query_volume_information_file` block size — **LANDED, gate hypothesis FALSIFIED**
|
||||
|
||||
**Outcome:** the block-size literals at `exports.rs:1255-1256` were corrected
|
||||
to canary's NullDevice values (`sectors_per_unit=0x80, bytes_per_sector=0x200`,
|
||||
product `0x10000`). 591 → 592 tests, lockstep `instructions=100000010, swaps=2,
|
||||
draws=0` deterministic across two reruns (`audit-runs/post-IO-002/lock_n100m_run{1,2}.json`).
|
||||
sylpheed_n50m oracle still matches its existing golden (no observable change at -n 50M).
|
||||
|
||||
**However, the predicted cascade DID NOT fire.** Set-difference on a fresh
|
||||
500 M trace (`audit-runs/post-IO-002/ours.log`) produces the **identical**
|
||||
seven-entry canary-only set audit-006 captured pre-fix:
|
||||
|
||||
```
|
||||
ExTerminateThread, KeReleaseSemaphore, KeResetEvent,
|
||||
ObCreateSymbolicLink, XamTaskCloseHandle, XamTaskSchedule,
|
||||
XamUserReadProfileSettings
|
||||
```
|
||||
|
||||
`XexCheckExecutablePrivilege` count remains **1** (priv=0xA only, priv=0xB
|
||||
unreached). `XamTaskSchedule` count remains **0**. Worker thread spawns
|
||||
fell from 19 → 18 (within noise — single thread variance per call-site
|
||||
breakdown: `lr=0x824ac5f0×15 + 0x824cd984×1 + 0x824d2e68×2`). The 16
|
||||
NtQueryVolumeInformationFile call sites in `ours.log` all originate from
|
||||
a single LR `0x82611f38` — meaning the `audit-006` premise that
|
||||
`sub_824ABA98`/`sub_824A9710` consume the volume-info reply at the
|
||||
priv-11 gate may be **incorrect**, or the gate consumes a *different*
|
||||
information class entirely.
|
||||
|
||||
**Stop-condition triggered.** Per the IO-002 task brief, this session does
|
||||
not pivot to a second fix. The fix is correct (it makes our reply
|
||||
byte-identical to canary's NullDevice and survives every test we have);
|
||||
it is just not load-bearing for the priv-11 gate. The branch landed as a
|
||||
strict no-op at our current boot horizon — kept because it's correct and
|
||||
introduces no regression.
|
||||
|
||||
**Next-session gate hypotheses (untested):**
|
||||
- The audit-005 disasm of `sub_824ABA98` may have mis-attributed the consumer
|
||||
of bytes_per_sector. The IO-001 trace decisively located the failure at
|
||||
the `NtReadFile` inside `sub_824A9710`, not at any volume-info site.
|
||||
Re-read the `sub_824A9710` disasm with that in mind.
|
||||
- Volume-info LR `0x82611f38` is far downstream of the priv-10/priv-11
|
||||
cluster (the calls *complete* successfully — they don't gate anything
|
||||
visible). The actual gate may be `nt_query_information_file`,
|
||||
`nt_query_full_attributes_file`, an FsCtl IOCTL, or a different
|
||||
alloc-unit query path.
|
||||
- Per AUDIT-005 instrumentation, the priv-11 site at `sub_824A9710` PC
|
||||
cluster has **never fired** in any session. Probe `sub_824A9710` entry
|
||||
with `--pc-probe` and trace which conditional exits the function before
|
||||
the priv-11 query — that's the real gate.
|
||||
|
||||
---
|
||||
|
||||
### KRNBUG-IO-002 — `nt_query_volume_information_file` block size (original spec, kept for archaeology)
|
||||
|
||||
- **Where in our code:** `crates/xenia-kernel/src/exports.rs:1241-1269` (function
|
||||
`nt_query_volume_information_file`).
|
||||
- **Classification:** `REAL_BUT_BUGGY`. Registered at exports.rs:100, called
|
||||
16× in ours.log (16× in canary.log too — call counts match), returns
|
||||
`STATUS_SUCCESS`, but the FileFsSizeInformation payload is wrong.
|
||||
- **Bug:** class=3 branch writes `(total=0x100000, free=0,
|
||||
sectors_per_unit=1, bytes_per_sector=2048)`. Product = 2048 bytes per
|
||||
cluster.
|
||||
- **Canary reference:**
|
||||
- Entry function `NtQueryVolumeInformationFile_entry` at
|
||||
`xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_io_info.cc:323` (case
|
||||
`XFileFsSizeInformation` at lines 355–365). Canary delegates to per-device
|
||||
methods on `file->device()`.
|
||||
- `NullDevice` (the device backing `\Device\Harddisk0\Cache0`) returns
|
||||
`sectors_per_allocation_unit() = 0x80` and `bytes_per_sector() = 0x200`
|
||||
at `xenia-canary/src/xenia/vfs/devices/null_device.h:38-46`. Product =
|
||||
65536, matching Sylpheed's expectation.
|
||||
- Other device backings (HostPath, DiscImage, DiscZArchive) all return
|
||||
`(1, 0x200)` = 512. Sylpheed's first volume query at this site is
|
||||
against Cache0 (NullDevice), so the relevant value is 65536.
|
||||
- **Fix sketch (minimum):** in the class=3 branch, change the two writes to
|
||||
`mem.write_u32(info+16, 128); mem.write_u32(info+20, 512);` (and reduce
|
||||
TotalAllocationUnits accordingly so the disk size remains plausible —
|
||||
e.g. 0x10 units × 128 sectors × 512 bytes ≈ 1 MB, matching NullDevice).
|
||||
Total diff ≤ 4 lines.
|
||||
- **Fix sketch (proper, deferred until the cluster fires reliably):**
|
||||
introduce a per-handle device-info lookup so HostPath / DiscImage paths
|
||||
return their canary-correct values too. Skipped for now because Sylpheed
|
||||
only queries Cache0 at this gate.
|
||||
- **Expected observable post-fix:**
|
||||
- `XexCheckExecutablePrivilege` count: 1 → 2.
|
||||
- `XamTaskSchedule` count: 0 → 1 (callback=0x824A93C8, message=0x828A28F0).
|
||||
- `kernel.calls{XamTaskSchedule}` finally non-zero — closes the
|
||||
APUBUG-PRODUCER-001 / XAMBUG-PRODUCER-001 producer hunt that
|
||||
falsified XAudio + XamTaskSchedule producer hypotheses.
|
||||
- Spawn of the Cache0 callback thread (XThread::Execute thid 7 in
|
||||
canary, our equivalent to come).
|
||||
- Inside that thread: `StfsCreateDevice` (still undefined extern in
|
||||
canary too — does not block) + `ObCreateSymbolicLink` +
|
||||
`ExRegisterTitleTerminateNotification`.
|
||||
- Back on main: `KeResetEvent(0x8287094C)`, `NtCreateEvent`,
|
||||
`ExCreateThread(entry=0x82181830, ctx=0x828F3D08)` — and `0x82181830`
|
||||
is the worker entry for **dispatcher 0x100c**, one of the four
|
||||
parked-handle producers (per
|
||||
`project_xenia_rs_producer_stack_trace_2026_05_03.md`). Spawning
|
||||
that worker should advance handle 0x100c's `signal_attempts`
|
||||
counter off zero.
|
||||
- Eventually (further into the boot): `XamUserGetXUID`,
|
||||
`XamUserReadProfileSettings`, `XamContentCreateEnumerator`,
|
||||
`KeReleaseSemaphore` display-pump (268+ calls in canary at this
|
||||
horizon).
|
||||
- **Risk:** low. Two-line value change. NullDevice is the only device
|
||||
Sylpheed asks about at this gate; other devices are not yet hit.
|
||||
- **Effort:** trivial.
|
||||
- **Dependencies:** none. Land directly.
|
||||
- **Verification chain:** `cargo test -p xenia-kernel`,
|
||||
then `cargo run --release -p xenia-app -- exec sylpheed.iso -n 500_000_000`
|
||||
with kernel-call tracing on, then re-run audit-006's set-difference;
|
||||
expect canary-only count to drop from 7 toward 0 as the cluster fires.
|
||||
|
||||
---
|
||||
|
||||
## Tier 4 — REAL_BUT_UNREACHED / STUB_BUT_UNREACHED — do not fix yet
|
||||
|
||||
These are downstream of Tier 0. Reachability is blocked on KRNBUG-IO-002
|
||||
landing. After IO-002 lands, re-derive this list — most entries should
|
||||
have moved off, and any survivors will be classifiable on real evidence.
|
||||
|
||||
| # | Export | Ordinal | Library | Our state | Canary impl | Canary calls (at horizon) | Cascade rank |
|
||||
|---|--------|---------|---------|-----------|-------------|---------------------------|--------------|
|
||||
| 1 | `XamTaskSchedule` | 0x01AF | xam | REAL_BUT_UNREACHED (`xam_task_schedule`, xam.rs:213) | `xam_task.cc:43-80` | 1 (gate-pivot call) | upstream-of-cluster — fires the entire post-IO-002 cascade |
|
||||
| 2 | `XamTaskCloseHandle` | 0x01B1 | xam | STUB_BUT_UNREACHED (`stub_success`, xam.rs:33) | `xam_task.cc:83-93` (one-liner: `NtClose` + last-error) | 1 | low (cleanup after #1) |
|
||||
| 3 | `KeResetEvent` | 0x8F | xboxkrnl | REAL_BUT_UNREACHED (`ke_reset_event`, exports.rs:3172) | `xboxkrnl_threading.cc:566` | 1 | medium — clears 0x8287094C right before ExCreateThread(0x82181830) on main |
|
||||
| 4 | `ObCreateSymbolicLink` | 0x0103 | xboxkrnl | STUB_BUT_UNREACHED (`stub_success`, exports.rs:121) | `xboxkrnl_ob.cc:351` | 1 | low — Cache0-symlink registration; cosmetic for Sylpheed boot |
|
||||
| 5 | `KeReleaseSemaphore` | 0x88 | xboxkrnl | REAL_BUT_UNREACHED (`ke_release_semaphore`, exports.rs:3280) | `xboxkrnl_threading.cc:724` | 268 | high (in volume) — display-init pump on the post-cluster main loop |
|
||||
| 6 | `ExTerminateThread` | 0x19 | xboxkrnl | REAL_BUT_UNREACHED (`ex_terminate_thread`, exports.rs:312) | `xboxkrnl_threading.cc:173` | 2 | low — thread cleanup on Cache0 / profile threads |
|
||||
| 7 | `XamUserReadProfileSettings` | 0x0219 | xam | REAL_BUT_UNREACHED (`xam_user_read_profile_settings`, xam.rs:327) | `xam_user.cc:329` | 2 | medium — gates the `XamUserGetXUID → profile load` flow far downstream |
|
||||
|
||||
**Why every entry above is Tier 4 (not Tier 1):**
|
||||
|
||||
- Each entry's first call in `canary.log` falls **at or after** line 1210
|
||||
(`XamTaskSchedule(824A93C8, ...)`), which is the gate-pivot call.
|
||||
- Our trace contains zero of any of the seven, despite running 500 M
|
||||
instructions and reaching the post-cache-recreate horizon.
|
||||
- Five of the seven are already real implementations. The two stubs
|
||||
(`XamTaskCloseHandle`, `ObCreateSymbolicLink`) are minor cleanups; even
|
||||
upgrading them would not move boot progress until #1 (`XamTaskSchedule`)
|
||||
fires.
|
||||
- Therefore: fixing any of these in isolation is wasted effort. They
|
||||
should be re-classified after KRNBUG-IO-002 lands and the priv-11 /
|
||||
Cache0 callback chain runs.
|
||||
|
||||
---
|
||||
|
||||
## Tier 1 / 2 / 3 — empty for this audit
|
||||
|
||||
No entry qualifies as Tier 1, Tier 2, or Tier 3 in the current state. The single
|
||||
high-cascade fix worth pulling next is the Tier-0 gate (KRNBUG-IO-002),
|
||||
which is **not itself a canary-only export** — it's a wrong-value bug in
|
||||
an export both sides call, so the diff.py based set-difference doesn't
|
||||
surface it. That is exactly why audit-006 was scoped this way: to confirm
|
||||
the gate hypothesis from `project_xenia_rs_io_nullfile_2026_05_04.md`
|
||||
before another implementation session is started.
|
||||
|
||||
---
|
||||
|
||||
## Cross-check vs IO-001 snapshot
|
||||
|
||||
IO-001 memory recorded these 7 still-canary-only exports:
|
||||
> ExTerminateThread, KeReleaseSemaphore, KeResetEvent, ObCreateSymbolicLink,
|
||||
> XamTaskCloseHandle, XamTaskSchedule, XamUserReadProfileSettings.
|
||||
|
||||
Audit-006 set-difference produces the **identical** 7, in 1:1
|
||||
correspondence. No new canary-only export has appeared since IO-001
|
||||
landed; no entry has moved off. Cascade is still parked at the same gate.
|
||||
|
||||
The `XeCryptSha`, `XeKeysConsolePrivateKeySign`, and `NtDeviceIoControlFile`
|
||||
entries that IO-001 was credited with unblocking are confirmed: ours
|
||||
calls them 1, 1, 2 times respectively (canary calls them 1, 1, 2 — exact
|
||||
match). They are correctly off the canary-only list.
|
||||
|
||||
---
|
||||
|
||||
## Methodology notes
|
||||
|
||||
1. **"Cascade rank" definition:** estimated by where the export's first
|
||||
canary call falls in the boot sequence and how many downstream code
|
||||
paths depend on it. "high" = upstream-of-cluster (XamTaskSchedule).
|
||||
"medium" = intermediate (KeResetEvent, profile cascade).
|
||||
"low" = leaf cleanup or cosmetic (XamTaskCloseHandle, ObCreateSymbolicLink).
|
||||
Rank only matters once Tier 0 is landed; until then everything is parked.
|
||||
2. **Reachability oracle:** binary `grep -c "call=NAME"` against ours.log
|
||||
at -n 500M. Zero counts are conclusive for "unreached" because tracing
|
||||
is unconditional.
|
||||
3. **Canary log freshness:** the log is from 17:34 (3 h before this
|
||||
audit) but is byte-identical to audit-005's input — canary's behavior
|
||||
is deterministic given the same ROM and the canary build header
|
||||
(`canary_experimental@9467c77f0 on May 2 2026`) hasn't changed.
|
||||
Re-running through Lutris is unnecessary.
|
||||
4. **Gate confirmation:** memory predicted block-size mismatch as the
|
||||
IO-002 blocker; this audit confirmed it by eliminating the alternative
|
||||
(no Tier-1-eligible canary-only export exists in the current 7-entry
|
||||
list). The 0xC000014F status memory predicted is not yet visible in
|
||||
ours.log because the recreate path completes the writes — the
|
||||
verifier inside `sub_824ABA98` rejects the volume-info reply at the
|
||||
game level (no kernel error logged).
|
||||
5. **What this queue is *not*:** a list of fixes to land. The audit-006
|
||||
discipline was scoping; the discipline of subsequent sessions is to
|
||||
re-run audit-006's diff after IO-002, then either close audit-006 (if
|
||||
the cluster fires through and all 7 entries drop) or open audit-007
|
||||
on whatever new canary-only set surfaces.
|
||||
|
||||
---
|
||||
|
||||
## Recommended next session
|
||||
|
||||
**KRNBUG-IO-002 (block-size fix), one-shot.** Two-line edit at
|
||||
`crates/xenia-kernel/src/exports.rs:1255-1256`. Verify the cluster fires
|
||||
by re-running audit-006's set-difference; expect 7 → 0 (or close to 0)
|
||||
canary-only entries. If new entries surface in either direction, that's
|
||||
audit-007's input.
|
||||
|
||||
**Do not** open this queue's Tier 4 entries before IO-002 closes. Their
|
||||
classification is pending; their fix sketches will look very different
|
||||
once they're observably called and their actual return values can be
|
||||
compared to canary.
|
||||
@@ -7,7 +7,11 @@ build = "build.rs"
|
||||
|
||||
[dependencies]
|
||||
xenia-xex = { workspace = true }
|
||||
xenia-cpu = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
rusqlite = { workspace = true }
|
||||
metrics = { workspace = true }
|
||||
duckdb = { workspace = true }
|
||||
msvc-demangler = "0.11"
|
||||
|
||||
570
crates/xenia-analysis/SCHEMA.md
Normal file
570
crates/xenia-analysis/SCHEMA.md
Normal file
@@ -0,0 +1,570 @@
|
||||
# `xenia-analysis` schema reference
|
||||
|
||||
Authoritative documentation for the DuckDB tables and SQL views produced by
|
||||
`xenia-rs dis --db sylpheed.db`. Track schema changes here alongside any
|
||||
update to the `db_schema_golden` test fixture.
|
||||
|
||||
The base + disasm tables (`metadata`, `sections`, `imports`, `functions`,
|
||||
`labels`, `instructions`, `xrefs`, opt-in `exec_trace` / `import_calls` /
|
||||
`branch_trace`) are documented inline in the `src/db.rs` doc comment. This file
|
||||
collects layered analysis additions and forward-work notes.
|
||||
|
||||
---
|
||||
|
||||
## Layer M1 — `.pdata` boundary correction (landed)
|
||||
|
||||
### Schema additions
|
||||
- `functions.pdata_validated BOOLEAN NOT NULL` — `true` when the row's
|
||||
`address` matches a `RUNTIME_FUNCTION.BeginAddress` from `.pdata`. Linker
|
||||
ground truth.
|
||||
- `functions.pdata_length BIGINT NULL` — `function_length` (bytes) from the
|
||||
matching pdata entry; `NULL` when the row is prologue-only.
|
||||
- New table `pdata_entries(begin_address BIGINT PRIMARY KEY, end_address
|
||||
BIGINT, function_length BIGINT, prolog_length BIGINT, flags BIGINT)` — every
|
||||
parsed `.pdata` `RUNTIME_FUNCTION` entry (raw, before any merge with
|
||||
prologue analysis).
|
||||
- Index `idx_functions_pdata_validated` on `functions(pdata_validated)`.
|
||||
|
||||
### What this layer does
|
||||
- Parses `.pdata` 8-byte `RUNTIME_FUNCTION` entries (PowerPC PE32 layout):
|
||||
word 0 `BeginAddress` (absolute VA), word 1 packed
|
||||
`{prolog_length:8, function_length:22, flags:2}`, both big-endian.
|
||||
- Unions pdata `BeginAddress` values into the function-candidate set fed to
|
||||
the prologue walker, so functions our prologue heuristic missed still get
|
||||
rows.
|
||||
- When pdata supplies a longer `function_length` than the prologue walk
|
||||
found, extends `end_address` to the pdata-implied end (catches mis-split
|
||||
where the walker stopped at an early `blr`).
|
||||
- After the walker, performs a forward pass that trims `function.end` to the
|
||||
next start when they overlap (catches mis-merge where one row spanned two
|
||||
prologues — the audit-031 `sub_824D23B0` / `sub_824D29F0` case).
|
||||
|
||||
### What this layer does NOT do
|
||||
- Does not adjust prolog-derived `frame_size` / `saved_gprs` from `.pdata`'s
|
||||
`prolog_length` field — those remain prologue-only inferences.
|
||||
- Does not classify functions further than the existing `is_leaf` /
|
||||
`is_saverestore` columns. Class membership is M3.
|
||||
- Does not detect functions whose entries are missing from BOTH `.pdata`
|
||||
and the bl-target scan (extremely rare; would require executable-byte
|
||||
linear sweep).
|
||||
|
||||
### Reference docs
|
||||
- Microsoft PE32+ exception data spec for PowerPC RUNTIME_FUNCTION.
|
||||
- xenia-canary `src/xenia/cpu/xex_module.cc:1570-1587` — canary's reference
|
||||
parser (extracts `BeginAddress` only; we additionally decode word 1).
|
||||
|
||||
### Validation queries
|
||||
```sql
|
||||
-- All pdata entries found
|
||||
SELECT COUNT(*) FROM pdata_entries; -- ~23073 for Sylpheed
|
||||
-- Functions cross-validated against pdata
|
||||
SELECT COUNT(*) FROM functions WHERE pdata_validated;
|
||||
-- Functions detected ONLY by prologue (orphans of pdata)
|
||||
SELECT COUNT(*) FROM functions WHERE NOT pdata_validated;
|
||||
-- Pdata orphans NOT yet in functions (should be 0 after this layer)
|
||||
SELECT COUNT(*) FROM pdata_entries p
|
||||
LEFT JOIN functions f ON f.address = p.begin_address
|
||||
WHERE f.address IS NULL;
|
||||
-- Audit-031 mis-merge resolved: 0x824D29F0 should have its own row
|
||||
SELECT name FROM functions WHERE address = 2186095088; -- 0x824D29F0
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Layer M2 — MSVC C++ name demangler (landed)
|
||||
|
||||
### Schema additions
|
||||
- New table `demangled_names(address BIGINT NULL, mangled VARCHAR NOT NULL,
|
||||
raw_demangled VARCHAR NOT NULL, namespace_path VARCHAR NULL,
|
||||
class_name VARCHAR NULL, method_name VARCHAR NULL,
|
||||
params_signature VARCHAR NULL)`.
|
||||
- Indices on `address`, `class_name`, `method_name`.
|
||||
|
||||
### What this layer does
|
||||
- Wraps `msvc_demangler::demangle` (a Rust port of LLVM's
|
||||
`MicrosoftDemangle.cpp`) and splits the formatted output into structured
|
||||
fields via a heuristic top-level parser (handles templates and nested parens
|
||||
correctly).
|
||||
- Populates `demangled_names` from any label whose name starts with `?` plus
|
||||
any import name that happens to be mangled (defensive — typical kernel
|
||||
imports use C names).
|
||||
|
||||
### What this layer does NOT do
|
||||
- Does not parse the AST returned by `msvc_demangler::parse` — uses the formatted
|
||||
string and a heuristic split. Adequate for typical class member functions
|
||||
and RTTI strings; exotic template / lambda forms still get `raw_demangled`
|
||||
populated but may have NULL structured fields.
|
||||
- Does not yet ingest RTTI strings discovered in `.rdata` — that's M3's job;
|
||||
M3 will append rows to this table at the addresses where it finds RTTI
|
||||
TypeDescriptors.
|
||||
|
||||
### Reference docs
|
||||
- `msvc-demangler` crate (`https://docs.rs/msvc-demangler/0.11`).
|
||||
- LLVM `MicrosoftDemangle.cpp` (the parser this crate ports).
|
||||
|
||||
## Layer M3 — Vtable + RTTI detection (landed)
|
||||
|
||||
### Schema additions
|
||||
- `vtables(address PK, length, col_address NULL, class_name, rtti_present,
|
||||
base_classes_json NULL)` — every detected static vtable.
|
||||
- `methods(vtable_address, slot, function_address, mangled_name NULL,
|
||||
demangled_name NULL, PRIMARY KEY (vtable_address, slot))` — one row per
|
||||
method slot.
|
||||
- `classes(name PK, vtable_address, rtti_present, base_classes_json NULL)` —
|
||||
deduped by class name (first-detected vtable wins).
|
||||
- Indices: `methods.function_address`, `classes.rtti_present`.
|
||||
|
||||
### What this layer does
|
||||
- Walks `.rdata` and `.data` looking for runs of ≥3 consecutive 4-byte BE
|
||||
values where each value is a known function start (from M1's corrected
|
||||
`functions` table). Single-method and 2-method vtables are intentionally rejected to
|
||||
control false-positive rate.
|
||||
- Attempts the MSVC RTTI walk `vtable[-1] → CompleteObjectLocator → TypeDescriptor`
|
||||
for each candidate. When successful, the demangled `class ClassName`
|
||||
string fills `class_name` and a best-effort
|
||||
`RTTIClassHierarchyDescriptor` walk fills `base_classes_json` (JSON array
|
||||
of base class names).
|
||||
- Falls back to `ANON_Class_<8-hex>` keyed by FNV-1a hash of the sorted
|
||||
method-PC tuple when RTTI is absent (typical for shipped game binaries).
|
||||
Identical vtables across the binary (multiple instances) collapse to the
|
||||
same anonymous name.
|
||||
|
||||
### What this layer does NOT do
|
||||
- Vtables built at runtime in heap-allocated memory (e.g. by ctors copying
|
||||
static templates) are out of scope — only static `.rdata`/`.data` content.
|
||||
- Multiple-inheritance "extra" vftables (one per base subobject) are detected
|
||||
as independent vtables with no link between them.
|
||||
- Inheritance-tree walking beyond `RTTIClassHierarchyDescriptor`'s direct
|
||||
base list is not attempted.
|
||||
|
||||
### Reference docs
|
||||
- openrce.org "Reversing Microsoft Visual C++" — RTTI layout articles
|
||||
(CompleteObjectLocator at vtable[-1]; TypeDescriptor at COL+0xC; mangled
|
||||
name at TD+0x8).
|
||||
|
||||
## Layer M4 — Class-aware probe targeting (landed)
|
||||
|
||||
CLI extension only — no schema changes. The probe-token grammar adds three
|
||||
symbolic forms on top of the existing `0xADDR` literal:
|
||||
|
||||
- `Class::method` — joins `classes` × `methods` × `demangled_names` to find
|
||||
every PC whose vtable belongs to that class and whose demangled
|
||||
`method_name` matches.
|
||||
- `Class::*` — joins `classes` × `methods` to find every method PC of that
|
||||
class.
|
||||
- `function_name` — falls back to `functions.name` lookup for free functions
|
||||
/ saverestore stubs / labels.
|
||||
|
||||
Numeric tokens never touch the DB (preserves zero-IO fast path; lockstep
|
||||
digest unaffected). Symbolic tokens require the DuckDB at `--probe-db PATH`
|
||||
or `XENIA_PROBE_DB`; default is `sylpheed.db` next to the .iso when present.
|
||||
|
||||
Resolution happens BEFORE guest exec begins, so it cannot affect the
|
||||
lockstep digest.
|
||||
|
||||
See `crates/xenia-analysis/src/lookup.rs`.
|
||||
|
||||
---
|
||||
|
||||
## Layer M5 — Indirect-dispatch reachability (landed)
|
||||
|
||||
### Schema additions
|
||||
- New value `'ind_call'` in the `xrefs.kind` set.
|
||||
- New SQL view `v_indirect_reachability_from_entry` — strict superset of
|
||||
`v_reachability_from_entry`, taking `ind_call` edges in the BFS.
|
||||
|
||||
### What this layer does
|
||||
- Walks each `FuncAnalysis.functions` entry with a per-basic-block register
|
||||
tracker. Recognises the canonical static-vtable pattern:
|
||||
`lis+addi → lwz off(rA) → mtctr → bcctrl`, where `rA` ends up holding a
|
||||
known vtable's start address from M3.
|
||||
- Honours the PowerPC ABI: `bl`-style calls (op 18 / 16 with LK=1) clobber
|
||||
volatile r0..r12 + ctr but preserve non-volatile r13..r31, so a vtable
|
||||
pointer parked in r30/r31 before a call survives.
|
||||
- Treats every M3 `loc_*` label as a basic-block boundary (kills register
|
||||
state) so jump-IN paths cannot induce false positives.
|
||||
|
||||
### What this layer does NOT do (and observed impact)
|
||||
- Vtable pointer loaded from a `this`-pointer field
|
||||
(`lwz r_vt, off(rA)` where `rA = this`) — by far the dominant pattern in
|
||||
real C++ — is unresolvable without alias / points-to analysis.
|
||||
- On Sylpheed: the layer detects 0 edges. The binary's 1,001 lis+addi
|
||||
references into vtables are mostly constructor-side **vptr writes**
|
||||
(`stw rVtable, vptr_offset(this)`), not direct dispatches. The renderer
|
||||
hunt's audit-009 cluster therefore needs a future M5.5 with `this`-flow
|
||||
tracking before this layer surfaces it.
|
||||
|
||||
### Reference docs
|
||||
- IBM PowerPC ABI: register-save convention (volatile r0..r12 + ctr,
|
||||
non-volatile r13..r31).
|
||||
|
||||
## Layer M7 — String / constant-pool detection (landed)
|
||||
|
||||
### Schema additions
|
||||
- New table `strings(address PK, encoding, length, content)`.
|
||||
- Index `idx_strings_encoding`.
|
||||
|
||||
### What this layer does
|
||||
- Scans `.rdata` for runs of length ≥ 6 of printable ASCII bytes followed by
|
||||
a NUL terminator.
|
||||
- Scans `.rdata` for UTF-16LE runs of length ≥ 6 code units (printable-ASCII
|
||||
basic plane only) followed by a u16 NUL terminator.
|
||||
- Cross-reference is implicit: existing `xrefs.kind='ref'` rows whose
|
||||
`target` falls in `strings.address`'s exact match set name the referencing
|
||||
PCs. SQL: `SELECT s.content, x.source FROM xrefs x JOIN strings s
|
||||
ON s.address = x.target WHERE x.kind='ref'`.
|
||||
|
||||
### What this layer does NOT do
|
||||
- No UTF-8 multibyte / non-ASCII basic plane in either encoding.
|
||||
- No `.data` scan (read-only-section bias).
|
||||
- No multi-byte CJK encodings — Japanese text in localised builds may be
|
||||
represented in shift_jis / utf-8 with non-printable bytes that this
|
||||
scanner skips.
|
||||
|
||||
### Sylpheed yield
|
||||
- 6,311 ASCII strings (including full embedded HLSL shader source).
|
||||
- 0 UTF-16LE strings (binary uses ASCII / native CJK encoding).
|
||||
- 9,132 lis+addi sites cross-reference into the detected strings — names
|
||||
the source PCs that reference each string.
|
||||
|
||||
## Layer M6 — Extended store-class xrefs + `addr_mode` column (landed)
|
||||
|
||||
### Schema additions
|
||||
- `xrefs.addr_mode VARCHAR NULL` — sub-classifies how the source instruction
|
||||
computes its target. NULL for control-flow edges (call / ind_call / j /
|
||||
br); one of the following tags for data edges:
|
||||
- `d_form` — standard signed-16 displacement (lwz/stw/lfs/stfs/etc.)
|
||||
- `lis_addi` — address materialised via `lis + addi` register tracking
|
||||
- `lis_ori` — address materialised via `lis + ori`
|
||||
- `multiword` — `lmw / stmw` (one xref per slot; up to 32-rS slots)
|
||||
- `x_form_indexed` — `stwx / stbx / sthx / stwux / stbux / sthux / stdx /
|
||||
stdux / lwzx / lbzx / lhzx / lhax / lwzux / lbzux / lhzux / lhaux / ldx /
|
||||
ldux` — emitted only when both rA and rB are tracked constants
|
||||
- `x_form_byterev` — `stwbrx / sthbrx / lwbrx / lhbrx`
|
||||
- `atomic` — `stwcx. / stdcx.` reservation-conditional stores
|
||||
- `dcbz` — cache-line clear (32-byte zero at rA+rB)
|
||||
- Index `idx_xrefs_addr_mode`.
|
||||
|
||||
### What this layer does
|
||||
- Tags every existing data xref with its addressing mode (`d_form` for the
|
||||
bulk; `lis_addi` / `lis_ori` for the lift-and-add cases that produce
|
||||
DataRef rows).
|
||||
- Adds new dispatch for opcode 47 (`stmw`) and 46 (`lmw`), expanding to
|
||||
per-slot DataWrite / DataRead rows.
|
||||
- Adds new dispatch for opcode 31 X-form: stores, atomic, byte-reverse,
|
||||
dcbz. X-form rows are emitted ONLY when both rA and rB resolve to known
|
||||
constants (otherwise the address is runtime-dependent and we skip).
|
||||
|
||||
### What this layer does NOT do
|
||||
- VMX / VMX128 vector stores (opcode 31 with vector XO codes) are not
|
||||
emitted — they always have register-indexed addresses that the
|
||||
lis+addi tracker can't usually resolve, and detecting them adds noise
|
||||
without improving target resolution.
|
||||
- The dominant runtime `stwx` pattern (rA = base, rB = runtime index) is
|
||||
not resolved — by design; mem-watch covers the runtime side per VERIFY-B.
|
||||
|
||||
### Sylpheed yield
|
||||
- 28,834 `lis_addi` refs, 18,485 `d_form` reads, 3,288 `d_form` writes —
|
||||
the existing baseline now properly tagged.
|
||||
- **442 newly-detected `x_form_indexed` reads** — primarily lwzx/lhzx
|
||||
reads from in-table dispatch (each pair (rA,rB) resolved statically).
|
||||
- **40 newly-detected `atomic` writes** — every `stwcx.` site with a
|
||||
resolvable address; useful for reservation-table audits.
|
||||
- 9 `lis_ori` refs.
|
||||
- 0 multiword / dcbz / byterev — these instructions exist in the binary
|
||||
but are not in lis+addi-tracked code paths.
|
||||
|
||||
## Layer M8 + M11 — Function-pointer arrays beyond vtables (landed)
|
||||
|
||||
### Schema additions
|
||||
- New table `function_pointer_arrays(address PK, length, kind)` where
|
||||
`kind` is `'vtable'` (M3 re-emit), `'dispatch_table'` (M8), or
|
||||
`'static_init'` (M11).
|
||||
- New table `function_pointer_array_entries(array_address, slot,
|
||||
function_address, PRIMARY KEY (array_address, slot))` — one row per
|
||||
slot of every detected array (vtable + non-vtable).
|
||||
- Indices on `function_pointer_arrays.kind` and
|
||||
`function_pointer_array_entries.function_address`.
|
||||
|
||||
### What this layer does
|
||||
- Walks `.rdata` (only — `.data` produces too many false positives) for
|
||||
runs of ≥ 2 consecutive 4-byte BE values where each value is a known
|
||||
function entry from M1's `functions` table.
|
||||
- Skips runs whose start matches an M3 vtable head — those are re-emitted
|
||||
in this table with `kind='vtable'` for unified queries but not
|
||||
re-classified.
|
||||
- Heuristically classifies non-vtable runs:
|
||||
- `static_init` (M11): every entry's first instruction is `mfspr r12, LR`
|
||||
AND the next is `stwu r1, -N(r1)` with `N ≤ 0x80` (or a save-stub `bl`).
|
||||
Mirrors the typical C++ static-initialiser prologue.
|
||||
- `dispatch_table` (M8): everything else.
|
||||
|
||||
### What this layer does NOT do
|
||||
- Does not parse symbol-table-bracketed regions like `__xc_a` / `__xc_z`
|
||||
/ `__xi_a` / `__xi_z` directly — Sylpheed's symbol table is stripped.
|
||||
- Does not chain multi-segment static-init drivers; future M11.5 could
|
||||
walk the entry-point's static-init driver call chain to surface
|
||||
ground-truth ctor PCs.
|
||||
- 2-slot runs in `.rdata` may be false positives where two struct fields
|
||||
happen to alias function VAs; downstream queries should use a length
|
||||
filter (`WHERE length >= 3`) when high precision matters.
|
||||
|
||||
### Sylpheed yield
|
||||
- 722 vtables (M3 re-emit) + 388 dispatch_tables = 1,110 arrays in
|
||||
`function_pointer_arrays`.
|
||||
- 0 static_init detected — Sylpheed's ctors don't all match the
|
||||
conservative prologue heuristic. Lengths concentrate at 2 slots
|
||||
(typical of switch-case jump tables).
|
||||
|
||||
## Layer M9 — `has_eh` from `.pdata` exception flag (landed)
|
||||
|
||||
### Schema additions
|
||||
- `functions.has_eh BOOLEAN NOT NULL` — true when `.pdata`'s exception-
|
||||
handler-present bit (bit 31 of word 1, the high bit) is set.
|
||||
- Index `idx_functions_has_eh`.
|
||||
|
||||
### What this layer does
|
||||
- Derived directly from M1's already-parsed `pdata.flags` bit field (no
|
||||
new parsing). The bit was always available in `pdata_entries.flags`;
|
||||
this layer surfaces it as a first-class column on `functions`.
|
||||
|
||||
### What this layer does NOT do
|
||||
- Does not parse the actual `__CxxFrameHandler` / `__C_specific_handler`
|
||||
scope-table records that the exception bit gates. Walking those tables
|
||||
would let us name try/catch ranges and per-state cleanup actions, but
|
||||
is out of scope for a derive-only milestone.
|
||||
|
||||
### Sylpheed yield
|
||||
- 2,975 of 23,073 pdata-validated functions have `has_eh=true` (12.9%) —
|
||||
plausible MSVC C++ EH coverage rate. Largest EH function: 26,328 bytes
|
||||
(`sub_823518F0`).
|
||||
|
||||
## Layer M10 — `.tls` section / TLS directory (landed)
|
||||
|
||||
### Schema additions
|
||||
- New table `tls_info(raw_data_start, raw_data_end, index_address,
|
||||
callback_array, zero_fill_size, characteristics)` — at most one row
|
||||
(the IMAGE_TLS_DIRECTORY32).
|
||||
- New table `tls_callbacks(slot PK, address)` — one row per resolved TLS
|
||||
callback function.
|
||||
|
||||
### What this layer does
|
||||
- Reads the first 24 bytes of the `.tls` section as an
|
||||
`IMAGE_TLS_DIRECTORY32` and walks the zero-terminated callback array.
|
||||
- All addresses stored as absolute VAs.
|
||||
|
||||
### What this layer does NOT do
|
||||
- Does not parse the raw TLS template content (the variable initialiser
|
||||
block); just records its start/end VAs.
|
||||
|
||||
### Sylpheed yield
|
||||
- 0 rows — Sylpheed has no `.tls` section. Infrastructure ready for any
|
||||
binary that uses `__declspec(thread)` storage.
|
||||
|
||||
## Layer M12 — `--lr-trace` runtime canary-diff harness (landed)
|
||||
|
||||
### Runtime additions (no DB)
|
||||
- New CLI flag `--lr-trace=PC[,PC,...]` on `exec` — comma-separated PCs
|
||||
to capture as JSONL records on every fire. Symbolic tokens (`Class::method`)
|
||||
resolve via M4's lookup against `--probe-db`. Settable via
|
||||
`XENIA_LR_TRACE`.
|
||||
- New CLI flag `--lr-trace-out=PATH` — writes JSONL to a file (one
|
||||
record per line). Stdout when omitted. Settable via `XENIA_LR_TRACE_OUT`.
|
||||
- New kernel state fields `lr_trace_pcs: HashSet<u32>` +
|
||||
`lr_trace_writer: Option<Mutex<File>>` and helper
|
||||
`KernelState::fire_lr_trace_if_match(hw_id)` invoked from the
|
||||
per-instruction probe slot.
|
||||
|
||||
### JSONL record fields
|
||||
`pc, tid, hw, cycle, r3, r4, r5, r6, lr` — superset of what
|
||||
xenia-canary's `--log_lr_on_pc` patch emits, with a cycle counter added
|
||||
for cross-run reproducibility.
|
||||
|
||||
### What this layer does NOT do
|
||||
- Does not capture VMX / FP register state (only GPRs r3..r6).
|
||||
- Does not buffer / batch records — one `write_all` per fire. For
|
||||
high-frequency probes (e.g. tight loops at >1M fires/sec), redirect
|
||||
to a file and use an SSD.
|
||||
|
||||
### Determinism
|
||||
Lockstep digest unaffected: probe firing happens after the per-instr
|
||||
hooks for ctor/branch probes and only emits side-channel output. Verified
|
||||
end-of-session: `check sylpheed.iso --stable-digest -n 2M` ×2 produced
|
||||
byte-identical digests (`instructions=2000005`).
|
||||
|
||||
---
|
||||
|
||||
## Layer M5.5 — `this`-flow indirect-dispatch resolution (landed)
|
||||
|
||||
### Schema additions
|
||||
- New table `vptr_writes(writer_pc, vtable_address, vptr_offset, writer_function)` —
|
||||
every detected `stw rVtable, vptr_off(rThis)` site.
|
||||
- New table `indirect_dispatch_sites(dispatch_pc PK, vptr_offset, slot, candidate_count)` —
|
||||
one row per resolved dispatch.
|
||||
- New table `indirect_dispatch_candidates(dispatch_pc, vtable_address, method_address)` —
|
||||
one row per (dispatch × candidate vtable). Joined to existing
|
||||
`xrefs.kind='ind_call'` edges (one ind_call row per candidate).
|
||||
- New indices on `vptr_writes.vtable_address`, `vptr_writes.vptr_offset`,
|
||||
`indirect_dispatch_candidates.method_address`,
|
||||
`indirect_dispatch_candidates.vtable_address`,
|
||||
`indirect_dispatch_sites.(vptr_offset, slot)`.
|
||||
|
||||
### What this layer does (class-membership inference)
|
||||
1. **Phase 1 — vptr-write scan**: walk every function with the lis+addi
|
||||
tracker; whenever `stw rA, off(rB)` writes a known M3 vtable address,
|
||||
record `(vtable_addr, vptr_offset, writer_pc)`.
|
||||
2. **Phase 2 — invert**: build `vtables_by_offset[vptr_off] = {V}` for the
|
||||
set of vtables ever written at that offset.
|
||||
3. **Phase 3 — dispatch detection**: walk back ≤16 instructions from each
|
||||
`bcctrl`/`bctr LK=1`, find the canonical
|
||||
`lwz vt, off(this); lwz fn, slot*4(vt); mtctr fn` chain. Extract
|
||||
`(vptr_off, slot)`. Bail on register clobber, branch, or label
|
||||
boundary.
|
||||
4. **Phase 4 — emit**: for each `(dispatch_pc, vptr_off, slot)`, emit one
|
||||
`xrefs.kind='ind_call'` row per candidate vtable that has a
|
||||
matching slot. Multi-candidate rows are an over-approximation.
|
||||
|
||||
### What this layer does NOT do
|
||||
- No alias resolution at multi-candidate sites — emits one edge per
|
||||
matching vtable. Downstream queries should filter
|
||||
`indirect_dispatch_sites WHERE candidate_count=1` for high-confidence
|
||||
edges.
|
||||
- No flow-sensitive analysis: register state is killed at every label
|
||||
(basic-block boundary) and at `bl`/`bcl` calls (volatile r0..r12 +
|
||||
ctr). We do NOT propagate values across calls in the chain-walker.
|
||||
- No tracking of vptr writes via X-form indexed (`stwx`), VMX, or
|
||||
multiword stores. Only D-form `stw rA, off(rB)`.
|
||||
- Does not synthesise vptr writes for inlined / elided constructors.
|
||||
If a class never has a writer at offset `vptr_off`, dispatches
|
||||
through that offset find no candidates.
|
||||
|
||||
### Sylpheed yield
|
||||
- 567 vptr writes covering 214 distinct vtables (~30% of M3's 722).
|
||||
- 29 distinct vptr offsets used; offset 0 dominates (501/567 = 88%,
|
||||
single-inheritance).
|
||||
- **6,842 dispatch sites resolved**: 97 single-candidate
|
||||
(high-confidence) + 6,745 multi-candidate (over-approximation).
|
||||
- 687,963 `ind_call` xref rows total.
|
||||
- **2,746 newly-reachable functions** via the M5 BFS view
|
||||
(`v_indirect_reachability_from_entry`) compared to call/j/br alone.
|
||||
- Audit-009 cluster (renderer plateau): functions newly visible
|
||||
include `0x823BC9E0`, `0x823BC290`, `0x823BC5A0`, `0x823BB158`,
|
||||
`0x823BB1E0`, `0x823BCAF0`, `0x823BC4C8` — actionable starting
|
||||
points for the cluster's reachability hunt.
|
||||
|
||||
### Reference docs
|
||||
- IBM PowerPC ABI (volatile/non-volatile register partition).
|
||||
- Itanium C++ ABI on vtable layout (offset-from-`this` model adapted
|
||||
by MSVC for Win32 PPC).
|
||||
|
||||
## Layer M9.5 — `__CxxFrameHandler` scope-table parsing (landed)
|
||||
|
||||
### Schema additions
|
||||
- New table `eh_funcinfo(address PK, magic, max_state, p_unwind_map,
|
||||
n_try_blocks, p_try_block_map, n_ip_map_entries, p_ip_to_state_map,
|
||||
p_es_type_list, eh_flags)`.
|
||||
- New table `eh_unwind_map(funcinfo_address, state_index, to_state, action_pc,
|
||||
PRIMARY KEY (funcinfo_address, state_index))`.
|
||||
- New table `eh_try_blocks(funcinfo_address, try_index, try_low, try_high,
|
||||
catch_high, n_catches, p_handler_array,
|
||||
PRIMARY KEY (funcinfo_address, try_index))`.
|
||||
|
||||
### What this layer does
|
||||
- Magic-scans `.rdata` for the documented MSVC FuncInfo signatures
|
||||
(0x19930520 / 0x19930521 / 0x19930522), reading 4-byte BE values
|
||||
on 4-byte alignment.
|
||||
- Sanity-checks `max_state` ≤ 10,000, `n_try_blocks` ≤ 1,000, all
|
||||
internal pointers landing in valid sections.
|
||||
- Walks `pUnwindMap` (8-byte UnwindMapEntry) and `pTryBlockMap`
|
||||
(20-byte TryBlockMapEntry) into one row each.
|
||||
|
||||
### What this layer does NOT do
|
||||
- Does not associate FuncInfo records with their owning function via
|
||||
the `bl __CxxFrameHandler` registration site — joins to `functions`
|
||||
by best-effort PC-range queries. A future M9.6 can chase the
|
||||
registration to make the link explicit.
|
||||
- Does not parse `pHandlerArray` (per-try-block catch type info).
|
||||
|
||||
### Sylpheed yield
|
||||
- 2,588 FuncInfo records (all version 0x19930522).
|
||||
- 10,019 unwind-map entries.
|
||||
- 315 try-blocks across the binary.
|
||||
|
||||
## Layer M11.5 — Static-init driver chain detection (landed)
|
||||
|
||||
### Schema additions
|
||||
- Reuses existing `function_pointer_arrays` table — drivers' arrays are
|
||||
emitted with `kind='static_init'`, replacing M11's prologue-heuristic
|
||||
output where the structurally-grounded pattern fires.
|
||||
|
||||
### What this layer does
|
||||
- Walks every detected function looking for the canonical `_initterm`-
|
||||
style loop: `lwz cursor; mtctr; bcctrl; addi cursor, cursor, 4`
|
||||
bounded by a comparison against another constant register.
|
||||
- Extracts `(array_start, array_end)` from the cursor's initial
|
||||
constant value and the end-comparand register.
|
||||
- Reads the array, validates each entry against
|
||||
`func_analysis.functions`, and emits the array as `static_init`.
|
||||
|
||||
### What this layer does NOT do
|
||||
- Doesn't handle drivers with multiple back-to-back trampoline loops.
|
||||
- Doesn't follow `_initterm_e` return-status semantics — both
|
||||
`_initterm` and `_initterm_e` match if the loop body matches.
|
||||
|
||||
### Sylpheed yield
|
||||
- 0 drivers detected. Sylpheed's static-init structure does not match
|
||||
the canonical CRT loop pattern; the binary likely calls ctors via
|
||||
another mechanism (inline at the entry point, or via a different
|
||||
driver shape). Infrastructure ready for any binary with the
|
||||
documented MSVC pattern.
|
||||
|
||||
## Layer VMX — Vector-store xrefs (M6 follow-up, landed)
|
||||
|
||||
Extends the M6 X-form opcode-31 dispatch in `xref.rs` with AltiVec/VMX
|
||||
vector loads and stores. New entries (XO codes):
|
||||
|
||||
- `lvx` (103), `lvxl` (359), `lvebx` (7), `lvehx` (39), `lvewx` (71)
|
||||
— `addr_mode='x_form_indexed'`, `kind='read'`.
|
||||
- `stvx` (231), `stvxl` (487), `stvebx` (135), `stvehx` (167),
|
||||
`stvewx` (199) — `addr_mode='x_form_indexed'`, `kind='write'`.
|
||||
|
||||
Same constraint as M6: rows emitted only when both `rA` and `rB`
|
||||
resolve to known constants (rare but useful).
|
||||
|
||||
### Sylpheed yield
|
||||
- 110 `stvx` writes newly resolved.
|
||||
|
||||
## Layer SJIS+UTF-8 — Localised-string detection (M7 follow-up, landed)
|
||||
|
||||
Extends `xenia_analysis::strings::analyze` with two additional scanners.
|
||||
|
||||
### Shift_JIS detection
|
||||
Per JIS X 0208: lead byte ∈ [0x81, 0x9F] ∪ [0xE0, 0xEF];
|
||||
trail byte ∈ [0x40, 0x7E] ∪ [0x80, 0xFC]. Single-byte ASCII and JIS
|
||||
half-width katakana (0xA1..=0xDF) are passed through. At least one
|
||||
multi-byte pair must be present (so we don't double-count pure ASCII).
|
||||
SJIS bytes are rendered as `\\xHH` escapes in the `content` column for
|
||||
diagnostic readability — full SJIS→UTF-8 decoding is a future
|
||||
enhancement.
|
||||
|
||||
### UTF-8 detection
|
||||
Validates 2-byte (`110xxxxx 10xxxxxx`) and 3-byte
|
||||
(`1110xxxx 10xxxxxx 10xxxxxx`) sequences plus printable ASCII. Skips
|
||||
4-byte (supplementary plane) which is rare in game text.
|
||||
|
||||
### Sylpheed yield
|
||||
- 790 Shift_JIS strings (Japanese debug + UI text, including
|
||||
`[WARNING] ノードに割り当てるエフェクトIDの指定がない ノードデータが見つからない` style mission strings).
|
||||
- 39 UTF-8 strings.
|
||||
- 6,311 ASCII strings (unchanged from M7).
|
||||
|
||||
## Forward work (not yet landed)
|
||||
|
||||
- **M9.6** — link `eh_funcinfo` records back to their owning functions
|
||||
via `bl __CxxFrameHandler` registration sites + per-try-block
|
||||
`pHandlerArray` parsing.
|
||||
- **M11.6** — relax M11.5 to detect non-canonical static-init driver
|
||||
shapes (`_initterm_e` with status return, custom drivers).
|
||||
- Full SJIS → UTF-8 decoding in the `strings.content` column.
|
||||
- VMX128 (opcode 4) vector-store xrefs — separate encoding space, low
|
||||
ROI; document if Sylpheed's renderer cluster uses it.
|
||||
File diff suppressed because it is too large
Load Diff
277
crates/xenia-analysis/src/demangle.rs
Normal file
277
crates/xenia-analysis/src/demangle.rs
Normal file
@@ -0,0 +1,277 @@
|
||||
//! MSVC C++ name demangling for Xbox 360 binaries.
|
||||
//!
|
||||
//! Wraps [`msvc_demangler::demangle`] (a Rust port of LLVM's
|
||||
//! `MicrosoftDemangle.cpp`) and splits the resulting human-readable string
|
||||
//! into structured fields (namespace path, class name, method name, params
|
||||
//! signature) for storage in the `demangled_names` DB table.
|
||||
//!
|
||||
//! The structured split is heuristic — it operates on the formatted output,
|
||||
//! not the parsed AST. This is good enough for typical RTTI strings of the
|
||||
//! form `?AVClassName@Namespace@@` and standard member functions; exotic
|
||||
//! template / lambda forms degrade gracefully (the structured fields end up
|
||||
//! `None` while `raw_demangled` retains the full LLVM-style output).
|
||||
//!
|
||||
//! Reference: <https://docs.rs/msvc-demangler> (LLVM `MicrosoftDemangle.cpp` port).
|
||||
|
||||
use msvc_demangler::DemangleFlags;
|
||||
|
||||
/// One demangled MSVC symbol together with the pieces the heuristic splitter
/// managed to extract from its formatted text.
///
/// Every structured field is optional: a field is `None` whenever the
/// corresponding piece could not be isolated, while `raw_demangled` always
/// carries the complete formatted string.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Demangled {
    /// The mangled input string, kept verbatim.
    pub mangled: String,
    /// Complete LLVM-style demangled text
    /// (e.g. `xe::apu::AudioSystem::Setup(void)`).
    pub raw_demangled: String,
    /// Namespace chain in front of the class, joined with `::`
    /// (e.g. `xe::apu`). `None` for symbols at global scope.
    pub namespace_path: Option<String>,
    /// Owning class of a member function (e.g. `AudioSystem`). `None` for
    /// free functions.
    pub class_name: Option<String>,
    /// Name of the method or free function (e.g. `Setup`). `None` when the
    /// heuristic could not isolate a name from the demangled text.
    pub method_name: Option<String>,
    /// Parameter list with the enclosing parentheses removed (e.g. `void` or
    /// `int, char *`). `None` when the symbol is not a function or no
    /// `(...)` group was found.
    pub params_signature: Option<String>,
}
|
||||
|
||||
/// Demangle one mangled MSVC C++ symbol. Returns `None` if the input does not
|
||||
/// start with `?` (early-out for non-mangled names) OR if the underlying
|
||||
/// demangler fails to parse it. Callers that want a "best effort" record
|
||||
/// (NULL fields + raw=mangled) should use [`demangle_or_raw`] instead.
|
||||
pub fn demangle(mangled: &str) -> Option<Demangled> {
|
||||
if !mangled.starts_with('?') {
|
||||
return None;
|
||||
}
|
||||
let raw = msvc_demangler::demangle(mangled, DemangleFlags::llvm()).ok()?;
|
||||
Some(split_structured(mangled.to_string(), raw))
|
||||
}
|
||||
|
||||
/// Demangle, or fall back to a record that just carries the original mangled
|
||||
/// string in `raw_demangled` and leaves all structured fields `None`. Useful
|
||||
/// for DB insert paths that want one row per mangled input regardless of
|
||||
/// parser success.
|
||||
pub fn demangle_or_raw(mangled: &str) -> Demangled {
|
||||
if let Some(d) = demangle(mangled) {
|
||||
return d;
|
||||
}
|
||||
Demangled {
|
||||
mangled: mangled.to_string(),
|
||||
raw_demangled: mangled.to_string(),
|
||||
namespace_path: None,
|
||||
class_name: None,
|
||||
method_name: None,
|
||||
params_signature: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Split a fully-formatted demangled string into structured fields.
|
||||
///
|
||||
/// Strategy:
|
||||
/// 1. Find the first un-nested `(` — everything before it is the qualified
|
||||
/// name; everything inside the matching parens is `params_signature`.
|
||||
/// 2. Strip leading return-type tokens before the qualified name (everything
|
||||
/// up to the LAST whitespace not inside `<...>` or `(...)` brackets).
|
||||
/// 3. Split the qualified name on `::` (top-level only) — last segment is
|
||||
/// `method_name`, second-to-last is `class_name`, the rest joined back
|
||||
/// with `::` is `namespace_path`.
|
||||
fn split_structured(mangled: String, raw: String) -> Demangled {
|
||||
let raw_view = raw.as_str();
|
||||
|
||||
let (qualified_name, params) = match find_paren_split(raw_view) {
|
||||
Some((before, inside)) => (before.trim_end().to_string(), Some(inside.to_string())),
|
||||
None => (raw_view.to_string(), None),
|
||||
};
|
||||
|
||||
// Drop any return-type prefix: keep everything after the last top-level
|
||||
// whitespace boundary (where "top-level" means depth-0 in <...>/(...)).
|
||||
let qname_clean = strip_return_type_prefix(&qualified_name);
|
||||
|
||||
let (namespace_path, class_name, method_name) = split_qname(&qname_clean);
|
||||
|
||||
Demangled {
|
||||
mangled,
|
||||
raw_demangled: raw,
|
||||
namespace_path,
|
||||
class_name,
|
||||
method_name,
|
||||
params_signature: params,
|
||||
}
|
||||
}
|
||||
|
||||
/// Locate the parameter list of a formatted demangled symbol.
///
/// Returns `(text_before_paren, text_inside_outer_parens)` for the first `(`
/// that is not nested inside `<...>`. Returns `None` when there is no such
/// paren, or when that paren has no matching `)`.
///
/// Operator names are stepped over as a unit so that their punctuation is not
/// mistaken for brackets: `operator<` / `operator<<` / `operator<=` no longer
/// open a template bracket that hides the parameter list, and `operator()` is
/// no longer read as an empty parameter list.
fn find_paren_split(s: &str) -> Option<(&str, &str)> {
    let bytes = s.as_bytes();
    let mut depth_angle: i32 = 0;
    let mut i = 0usize;
    while i < bytes.len() {
        // Skip `operator<symbol>` wholesale; the symbol is part of the name.
        let skip = operator_symbol_len(bytes, i);
        if skip > 0 {
            i += skip;
            continue;
        }
        match bytes[i] {
            b'<' => depth_angle += 1,
            b'>' if depth_angle > 0 => depth_angle -= 1,
            b'(' if depth_angle == 0 => {
                // An unmatched opening paren means there is no usable split.
                let close = matching_close_paren(bytes, i)?;
                // `i` and `close` index ASCII bytes, so both are char boundaries.
                return Some((&s[..i], &s[i + 1..close]));
            }
            _ => {}
        }
        i += 1;
    }
    None
}

/// Length in bytes of an `operator<symbol>` token starting at `at`, or 0 when
/// `at` does not start one. Only symbolic operators count; `operator new`,
/// conversion operators and identifiers that merely contain the word
/// `operator` yield 0.
fn operator_symbol_len(bytes: &[u8], at: usize) -> usize {
    const KEYWORD: &[u8] = b"operator";
    const SYMBOLS: &[u8] = b"<>=!+-*/%&|^~,";
    let is_ident = |b: u8| b.is_ascii_alphanumeric() || b == b'_' || b >= 0x80;

    if !bytes[at..].starts_with(KEYWORD) || (at > 0 && is_ident(bytes[at - 1])) {
        return 0;
    }
    let symbol = &bytes[at + KEYWORD.len()..];
    let symbol_len = if symbol.starts_with(b"()") || symbol.starts_with(b"[]") {
        2
    } else {
        symbol.iter().take_while(|b| SYMBOLS.contains(*b)).count()
    };
    if symbol_len == 0 {
        0
    } else {
        KEYWORD.len() + symbol_len
    }
}

/// Index of the `)` that closes the `(` at `open`, honouring nested parens.
/// Returns `None` when the paren is never closed.
fn matching_close_paren(bytes: &[u8], open: usize) -> Option<usize> {
    let mut depth = 0i32;
    for (j, &b) in bytes.iter().enumerate().skip(open) {
        match b {
            b'(' => depth += 1,
            b')' => {
                depth -= 1;
                if depth == 0 {
                    return Some(j);
                }
            }
            _ => {}
        }
    }
    None
}
|
||||
|
||||
/// Strip the leading return-type / calling-convention tokens from the text in
/// front of a parameter list, leaving only the qualified name.
///
/// Everything up to and including the last top-level space is dropped, where
/// "top-level" means outside any `<...>` or `(...)` group.
/// E.g. `void __cdecl Foo::Bar` → `Foo::Bar`.
///
/// Scanning stops at a top-level `operator` keyword, because operator names
/// may themselves contain spaces. `void * __cdecl operator new` therefore
/// yields `operator new`, and `__cdecl Foo::operator int` yields
/// `Foo::operator int`, instead of the bare `new` / `int`.
///
/// Returns the input unchanged when it contains no top-level space.
fn strip_return_type_prefix(s: &str) -> String {
    const KEYWORD: &[u8] = b"operator";
    let bytes = s.as_bytes();
    let is_ident = |b: u8| b.is_ascii_alphanumeric() || b == b'_' || b >= 0x80;

    let mut depth_angle: i32 = 0;
    let mut depth_paren: i32 = 0;
    // Start of the qualified name: one past the last top-level space seen.
    let mut name_start = 0usize;
    for (i, &b) in bytes.iter().enumerate() {
        let top_level = depth_angle == 0 && depth_paren == 0;
        if top_level && bytes[i..].starts_with(KEYWORD) {
            // Require a whole word so `my_operator` / `operators` do not match.
            let starts_word = i == 0 || !is_ident(bytes[i - 1]);
            let ends_word = bytes
                .get(i + KEYWORD.len())
                .map_or(true, |&next| !is_ident(next));
            if starts_word && ends_word {
                break;
            }
        }
        match b {
            b'<' => depth_angle += 1,
            b'>' if depth_angle > 0 => depth_angle -= 1,
            b'(' => depth_paren += 1,
            b')' if depth_paren > 0 => depth_paren -= 1,
            b' ' if top_level => name_start = i + 1,
            _ => {}
        }
    }
    // `name_start` is 0 or one past an ASCII space, so it is a char boundary.
    s[name_start..].to_string()
}
|
||||
|
||||
/// Split a fully-qualified name on top-level `::` and tag the parts.
|
||||
fn split_qname(qname: &str) -> (Option<String>, Option<String>, Option<String>) {
|
||||
if qname.is_empty() {
|
||||
return (None, None, None);
|
||||
}
|
||||
let parts = top_level_split_colon_colon(qname);
|
||||
match parts.len() {
|
||||
0 => (None, None, None),
|
||||
1 => (None, None, Some(parts[0].clone())),
|
||||
2 => (None, Some(parts[0].clone()), Some(parts[1].clone())),
|
||||
_ => {
|
||||
let n = parts.len();
|
||||
let method = parts[n - 1].clone();
|
||||
let class = parts[n - 2].clone();
|
||||
let ns = parts[..n - 2].join("::");
|
||||
(Some(ns), Some(class), Some(method))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Cut `s` at every `::` that sits outside `<...>` and `(...)` groups.
///
/// A `::` nested inside angle brackets or parentheses stays part of its
/// segment. Empty segments (leading, trailing or doubled separators) are
/// omitted from the result.
fn top_level_split_colon_colon(s: &str) -> Vec<String> {
    let raw = s.as_bytes();
    let mut segments: Vec<String> = Vec::new();
    let (mut angle, mut paren) = (0i32, 0i32);
    let mut seg_start = 0usize;
    let mut pos = 0usize;

    while let Some(&byte) = raw.get(pos) {
        let at_separator =
            byte == b':' && angle == 0 && paren == 0 && raw.get(pos + 1) == Some(&b':');
        if at_separator {
            if seg_start < pos {
                segments.push(s[seg_start..pos].to_string());
            }
            pos += 2;
            seg_start = pos;
            continue;
        }
        match byte {
            b'<' => angle += 1,
            // A closer without a matching opener leaves the depth at zero.
            b'>' => angle = (angle - 1).max(0),
            b'(' => paren += 1,
            b')' => paren = (paren - 1).max(0),
            _ => {}
        }
        pos += 1;
    }

    if seg_start < raw.len() {
        segments.push(s[seg_start..].to_string());
    }
    segments
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Names without the leading `?` are rejected by `demangle`.
    #[test]
    fn early_out_on_non_mangled() {
        for plain in ["plain_c_name", "Foo::Bar"] {
            assert!(demangle(plain).is_none());
        }
    }

    /// The fallback record echoes the input and carries no structured name.
    #[test]
    fn demangle_or_raw_records_failures() {
        let record = demangle_or_raw("not_mangled");
        assert_eq!(record.mangled, "not_mangled");
        assert_eq!(record.raw_demangled, "not_mangled");
        assert!(record.method_name.is_none());
    }

    /// A member function splits into namespace, class, method and params.
    #[test]
    fn simple_member_function() {
        // ?Setup@AudioSystem@apu@xe@@QEAAXXZ → public: __cdecl xe::apu::AudioSystem::Setup(void)
        let parsed = demangle("?Setup@AudioSystem@apu@xe@@QEAAXXZ").expect("should parse");
        assert_eq!(parsed.namespace_path.as_deref(), Some("xe::apu"));
        assert_eq!(parsed.class_name.as_deref(), Some("AudioSystem"));
        assert_eq!(parsed.method_name.as_deref(), Some("Setup"));
        assert_eq!(parsed.params_signature.as_deref(), Some("void"));
    }

    /// The minimal RTTI class form is accepted by the demangler.
    #[test]
    fn rtti_type_descriptor_string() {
        // RTTI TypeDescriptor names look like ".?AVClassName@@" → "class ClassName".
        // The leading "." is stripped by the M3 caller before demangling; here
        // we only confirm the "?AV…" remainder parses.
        let parsed = demangle("?AVAudioSystem@apu@xe@@").expect("should parse");
        assert!(
            parsed.raw_demangled.contains("AudioSystem"),
            "raw='{}'",
            parsed.raw_demangled
        );
    }

    /// Leading segments beyond class and method are joined as the namespace.
    #[test]
    fn split_qname_handles_namespace_chain() {
        let (namespace, class, method) = split_qname("a::b::c::Klass::method");
        assert_eq!(namespace.as_deref(), Some("a::b::c"));
        assert_eq!(class.as_deref(), Some("Klass"));
        assert_eq!(method.as_deref(), Some("method"));
    }

    /// Template brackets inside the parameter list must not break paren matching.
    #[test]
    fn paren_split_handles_template_in_args() {
        let signature = "void __cdecl Foo::Bar(std::vector<int>, std::map<a, b>)";
        let (head, args) = find_paren_split(signature).expect("paren found");
        assert_eq!(head, "void __cdecl Foo::Bar");
        assert_eq!(args, "std::vector<int>, std::map<a, b>");
    }

    /// A `::` nested in template arguments is not a split point.
    #[test]
    fn double_colon_inside_template_not_split() {
        let segments = top_level_split_colon_colon("a::b<c::d>::e");
        assert_eq!(segments, vec!["a", "b<c::d>", "e"]);
    }
}
|
||||
51
crates/xenia-analysis/src/disasm.rs
Normal file
51
crates/xenia-analysis/src/disasm.rs
Normal file
@@ -0,0 +1,51 @@
|
||||
//! Analysis-side enrichment over [`xenia_cpu::disasm::iter_disasm`].
|
||||
//!
|
||||
//! Turns a stream of decoder-only [`xenia_cpu::disasm::DisasmItem`]s into a
|
||||
//! stream of [`RichDisasmItem`]s carrying section name + enclosing function +
|
||||
//! label name. The three sinks in [`crate::sinks`] (text, JSON, DuckDB) all
|
||||
//! consume `RichDisasmItem`.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use xenia_cpu::disasm::DisasmItem;
|
||||
|
||||
use crate::func::FuncAnalysis;
|
||||
|
||||
/// `DisasmItem` plus the analysis context (section/function/label).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RichDisasmItem<'a> {
|
||||
pub item: DisasmItem,
|
||||
pub section: &'a str,
|
||||
pub function: Option<u32>,
|
||||
pub label: Option<&'a str>,
|
||||
}
|
||||
|
||||
/// Walk one code section, yielding rich items annotated with section name,
|
||||
/// rolling-window enclosing function, and label-at-address.
|
||||
///
|
||||
/// The `function` field tracks the most recent function-start the iterator
|
||||
/// has crossed — matching the legacy `current_func` behaviour in
|
||||
/// `db.rs::insert_instructions_streaming`.
|
||||
pub fn enrich_section<'a>(
|
||||
image: &'a [u8],
|
||||
image_base: u32,
|
||||
section_name: &'a str,
|
||||
va_start: u32,
|
||||
va_end: u32,
|
||||
func_analysis: &'a FuncAnalysis,
|
||||
labels: &'a HashMap<u32, String>,
|
||||
) -> impl Iterator<Item = RichDisasmItem<'a>> + 'a {
|
||||
let mut current_func: Option<u32> = None;
|
||||
xenia_cpu::disasm::iter_disasm(image, image_base, va_start, va_end).map(move |item| {
|
||||
if func_analysis.is_function_start(item.addr) {
|
||||
current_func = Some(item.addr);
|
||||
}
|
||||
let label = labels.get(&item.addr).map(|s| s.as_str());
|
||||
RichDisasmItem {
|
||||
item,
|
||||
section: section_name,
|
||||
function: current_func,
|
||||
label,
|
||||
}
|
||||
})
|
||||
}
|
||||
296
crates/xenia-analysis/src/eh_scope.rs
Normal file
296
crates/xenia-analysis/src/eh_scope.rs
Normal file
@@ -0,0 +1,296 @@
|
||||
//! M9.5 — MSVC `__CxxFrameHandler` scope-table parsing.
|
||||
//!
|
||||
//! When MSVC compiles C++ try/catch on Win32 PowerPC, the compiler emits
|
||||
//! per-function `FuncInfo` records in `.rdata` containing the scope-state
|
||||
//! tables that `__CxxFrameHandler` walks during unwinding. Each record
|
||||
//! starts with one of the documented magic numbers:
|
||||
//!
|
||||
//! - `0x19930520` — original FuncInfo (no aligned-state-array)
|
||||
//! - `0x19930521` — adds `pESTypeList` field
|
||||
//! - `0x19930522` — adds `EHFlags` field
|
||||
//!
|
||||
//! Layout (4-byte little-endian on x86; **on Xbox 360 PowerPC PE the
|
||||
//! struct is big-endian** because the binary is BE throughout):
|
||||
//!
|
||||
//! ```text
|
||||
//! +0x00 uint32 magicNumber (one of 0x199305{20,21,22})
|
||||
//! +0x04 int32 maxState (number of UnwindMapEntry rows)
|
||||
//! +0x08 uint32 pUnwindMap (VA → UnwindMapEntry[])
|
||||
//! +0x0C uint32 nTryBlocks
|
||||
//! +0x10 uint32 pTryBlockMap (VA → TryBlockMapEntry[])
|
||||
//! +0x14 uint32 nIPMapEntries (ignored on x86; present on PPC)
|
||||
//! +0x18 uint32 pIPtoStateMap (VA → IPtoStateMapEntry[])
|
||||
//! +0x1C uint32 pESTypeList (only when magic ≥ 0x19930521)
|
||||
//! +0x20 uint32 EHFlags (only when magic = 0x19930522)
|
||||
//! ```
|
||||
//!
|
||||
//! Each `UnwindMapEntry` is 8 bytes: `(toState i32, action u32)`.
|
||||
//! Each `TryBlockMapEntry` is 20 bytes:
|
||||
//! `(tryLow i32, tryHigh i32, catchHigh i32, nCatches u32, pHandlerArray u32)`.
|
||||
//!
|
||||
//! ### What this module does
|
||||
//!
|
||||
//! - Magic-scan `.rdata` for the three FuncInfo signatures (read as BE u32).
|
||||
//! - Parse the FuncInfo record + walk the unwind map and try-block map.
|
||||
//! - Skip records whose internal pointers don't land in valid sections,
|
||||
//! or whose lengths exceed sane caps.
|
||||
//!
|
||||
//! ### What this module does NOT do
|
||||
//!
|
||||
//! - Does not associate a FuncInfo back to its owning function. The
|
||||
//! `bl __CxxFrameHandler` registration would name that linkage, but
|
||||
//! it requires walking all `has_eh=true` functions' prologues; a
|
||||
//! future M9.6 can do that. For now the FuncInfo record stands on its
|
||||
//! own — joins to `functions` by best-effort PC range queries.
|
||||
//! - Does not parse the `pHandlerArray` per try-block (catch type info).
|
||||
//!
|
||||
//! Reference: LLVM `llvm/lib/CodeGen/AsmPrinter/WinException.cpp`,
|
||||
//! and the openrce.org documentation of the MSVC `FuncInfo` layout.
|
||||
|
||||
use xenia_xex::pe::PeSection;
|
||||
|
||||
const MAGIC_OLD: u32 = 0x1993_0520;
|
||||
const MAGIC_V21: u32 = 0x1993_0521;
|
||||
const MAGIC_V22: u32 = 0x1993_0522;
|
||||
|
||||
/// One row of a `FuncInfo` unwind map (8 bytes in the image).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct UnwindMapEntry {
    /// State to transition to once this entry's action has run.
    pub to_state: i32,
    /// VA of the cleanup action; 0 if none.
    pub action_pc: u32,
}

/// One row of a `FuncInfo` try-block map (20 bytes in the image).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TryBlockMapEntry {
    /// `tryLow` field of the entry.
    pub try_low: i32,
    /// `tryHigh` field of the entry.
    pub try_high: i32,
    /// `catchHigh` field of the entry.
    pub catch_high: i32,
    /// Number of catch handlers referenced by `p_handler_array`.
    pub n_catches: u32,
    /// VA of the handler array; this module records it but does not parse it.
    pub p_handler_array: u32,
}

/// A parsed `FuncInfo` record together with the tables it points at.
///
/// Scalar fields mirror the on-disk header one-to-one; `unwind_map` and
/// `try_blocks` hold the decoded rows behind `p_unwind_map` and
/// `p_try_block_map`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EhFuncInfo {
    /// VA of the FuncInfo record itself.
    pub address: u32,
    /// Magic number the record starts with (one of `0x199305{20,21,22}`).
    pub magic: u32,
    /// Number of `UnwindMapEntry` rows.
    pub max_state: i32,
    /// VA of the unwind map; 0 when absent.
    pub p_unwind_map: u32,
    /// Number of `TryBlockMapEntry` rows.
    pub n_try_blocks: u32,
    /// VA of the try-block map; 0 when absent.
    pub p_try_block_map: u32,
    /// Number of IP-to-state map entries (the map itself is not walked).
    pub n_ip_map_entries: u32,
    /// VA of the IP-to-state map; 0 when absent.
    pub p_ip_to_state_map: u32,
    /// `pESTypeList`; only read for magic `0x19930521` and `0x19930522`.
    pub p_es_type_list: Option<u32>,
    /// `EHFlags`; only read for magic `0x19930522`.
    pub eh_flags: Option<u32>,
    /// Decoded unwind map rows, in table order.
    pub unwind_map: Vec<UnwindMapEntry>,
    /// Decoded try-block map rows, in table order.
    pub try_blocks: Vec<TryBlockMapEntry>,
}
|
||||
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
|
||||
pub fn analyze(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
sections: &[PeSection],
|
||||
) -> Vec<EhFuncInfo> {
|
||||
let started = std::time::Instant::now();
|
||||
let mut out: Vec<EhFuncInfo> = Vec::new();
|
||||
|
||||
// Compute the union of valid VA ranges across all sections — used to
|
||||
// sanity-check internal pointers in the FuncInfo records.
|
||||
let valid_ranges: Vec<(u32, u32)> = sections.iter()
|
||||
.map(|s| (image_base + s.virtual_address,
|
||||
image_base + s.virtual_address + s.virtual_size))
|
||||
.collect();
|
||||
let in_valid = |va: u32| valid_ranges.iter().any(|(lo, hi)| va >= *lo && va < *hi);
|
||||
|
||||
let read_u32 = |abs: u32| -> Option<u32> {
|
||||
let off = abs.wrapping_sub(image_base) as usize;
|
||||
if off + 4 > pe.len() { return None; }
|
||||
Some(u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]]))
|
||||
};
|
||||
let read_i32 = |abs: u32| -> Option<i32> { read_u32(abs).map(|u| u as i32) };
|
||||
|
||||
for section in sections {
|
||||
if section.name != ".rdata" { continue; }
|
||||
let raw_start = section.virtual_address as usize;
|
||||
let raw_end = (section.virtual_address + section.virtual_size) as usize;
|
||||
if raw_end > pe.len() { continue; }
|
||||
let bytes = &pe[raw_start..raw_end.min(pe.len())];
|
||||
let va_base = image_base + section.virtual_address;
|
||||
|
||||
// Walk on 4-byte alignment looking for the magic.
|
||||
let mut i = 0;
|
||||
while i + 4 <= bytes.len() {
|
||||
if !i.is_multiple_of(4) { i += 1; continue; }
|
||||
let m = u32::from_be_bytes([bytes[i], bytes[i + 1], bytes[i + 2], bytes[i + 3]]);
|
||||
if m == MAGIC_OLD || m == MAGIC_V21 || m == MAGIC_V22 {
|
||||
let addr = va_base + i as u32;
|
||||
if let Some(rec) = parse_funcinfo(addr, m, &read_u32, &read_i32, &in_valid) {
|
||||
out.push(rec);
|
||||
}
|
||||
}
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
|
||||
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||
let n_unwind: usize = out.iter().map(|r| r.unwind_map.len()).sum();
|
||||
let n_try: usize = out.iter().map(|r| r.try_blocks.len()).sum();
|
||||
metrics::histogram!("analysis.phase_ms", "phase" => "eh_scope").record(elapsed_ms);
|
||||
tracing::info!(
|
||||
records = out.len(),
|
||||
unwind_entries = n_unwind,
|
||||
try_blocks = n_try,
|
||||
elapsed_ms,
|
||||
"M9.5 EH scope-table scan complete",
|
||||
);
|
||||
out
|
||||
}
|
||||
|
||||
fn parse_funcinfo(
|
||||
addr: u32,
|
||||
magic: u32,
|
||||
read_u32: &impl Fn(u32) -> Option<u32>,
|
||||
read_i32: &impl Fn(u32) -> Option<i32>,
|
||||
in_valid: &impl Fn(u32) -> bool,
|
||||
) -> Option<EhFuncInfo> {
|
||||
let max_state = read_i32(addr + 0x04)?;
|
||||
let p_unwind_map = read_u32(addr + 0x08)?;
|
||||
let n_try_blocks = read_u32(addr + 0x0C)?;
|
||||
let p_try_block_map = read_u32(addr + 0x10)?;
|
||||
let n_ip_map_entries = read_u32(addr + 0x14)?;
|
||||
let p_ip_to_state_map = read_u32(addr + 0x18)?;
|
||||
|
||||
// Sanity caps: real FuncInfo records have max_state ≤ a few thousand,
|
||||
// n_try_blocks ≤ a few hundred. Reject obviously bogus values that
|
||||
// happened to alias the magic.
|
||||
if !(0..=10_000).contains(&max_state) { return None; }
|
||||
if n_try_blocks > 1_000 { return None; }
|
||||
if n_ip_map_entries > 100_000 { return None; }
|
||||
// Pointers must either be NULL or land in a valid section.
|
||||
if p_unwind_map != 0 && !in_valid(p_unwind_map) { return None; }
|
||||
if p_try_block_map != 0 && !in_valid(p_try_block_map) { return None; }
|
||||
if p_ip_to_state_map != 0 && !in_valid(p_ip_to_state_map) { return None; }
|
||||
|
||||
let (p_es_type_list, eh_flags) = if magic == MAGIC_V21 {
|
||||
(read_u32(addr + 0x1C), None)
|
||||
} else if magic == MAGIC_V22 {
|
||||
(read_u32(addr + 0x1C), read_u32(addr + 0x20))
|
||||
} else {
|
||||
(None, None)
|
||||
};
|
||||
|
||||
// Walk unwind map (8-byte entries).
|
||||
let mut unwind_map: Vec<UnwindMapEntry> = Vec::with_capacity(max_state as usize);
|
||||
if p_unwind_map != 0 && max_state > 0 {
|
||||
for i in 0..max_state {
|
||||
let p = p_unwind_map.wrapping_add((i * 8) as u32);
|
||||
let to_state = read_i32(p)?;
|
||||
let action_pc = read_u32(p + 4)?;
|
||||
unwind_map.push(UnwindMapEntry { to_state, action_pc });
|
||||
}
|
||||
}
|
||||
|
||||
// Walk try-block map (20-byte entries).
|
||||
let mut try_blocks: Vec<TryBlockMapEntry> = Vec::with_capacity(n_try_blocks as usize);
|
||||
if p_try_block_map != 0 && n_try_blocks > 0 {
|
||||
for i in 0..n_try_blocks {
|
||||
let p = p_try_block_map.wrapping_add(i * 20);
|
||||
let try_low = read_i32(p)?;
|
||||
let try_high = read_i32(p + 4)?;
|
||||
let catch_high = read_i32(p + 8)?;
|
||||
let n_catches = read_u32(p + 12)?;
|
||||
let p_handler_a = read_u32(p + 16)?;
|
||||
try_blocks.push(TryBlockMapEntry {
|
||||
try_low, try_high, catch_high, n_catches, p_handler_array: p_handler_a,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Some(EhFuncInfo {
|
||||
address: addr,
|
||||
magic,
|
||||
max_state,
|
||||
p_unwind_map,
|
||||
n_try_blocks,
|
||||
p_try_block_map,
|
||||
n_ip_map_entries,
|
||||
p_ip_to_state_map,
|
||||
p_es_type_list,
|
||||
eh_flags,
|
||||
unwind_map,
|
||||
try_blocks,
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use xenia_xex::pe::PeSection;

    /// Builds a section whose raw layout mirrors its virtual layout.
    fn mk_section(name: &str, va: u32, size: u32) -> PeSection {
        PeSection {
            name: name.into(),
            virtual_address: va,
            virtual_size: size,
            raw_offset: va,
            raw_size: size,
            flags: 0x4000_0040,
        }
    }

    /// Stores `v` big-endian at byte offset `at`.
    fn write_be(pe: &mut [u8], at: usize, v: u32) {
        pe[at..at + 4].copy_from_slice(&v.to_be_bytes());
    }

    /// Signed counterpart of [`write_be`].
    fn write_be_i32(pe: &mut [u8], at: usize, v: i32) {
        pe[at..at + 4].copy_from_slice(&v.to_be_bytes());
    }

    #[test]
    fn parses_minimal_funcinfo_v0() {
        let image_base = 0x82000000u32;
        let rdata_va = 0x1000u32;
        let mut pe = vec![0u8; 0x4000];

        // FuncInfo at .rdata + 0x10, unwind map at .rdata + 0x80.
        let fi_off = (rdata_va + 0x10) as usize;
        let fi_va = image_base + rdata_va + 0x10;
        let unwind_off = (rdata_va + 0x80) as usize;
        let unwind_va = image_base + rdata_va + 0x80;

        write_be(&mut pe, fi_off, MAGIC_OLD); // magic
        write_be_i32(&mut pe, fi_off + 4, 2); // maxState
        write_be(&mut pe, fi_off + 8, unwind_va); // pUnwindMap
        write_be(&mut pe, fi_off + 12, 0); // nTryBlocks
        write_be(&mut pe, fi_off + 16, 0); // pTryBlockMap
        write_be(&mut pe, fi_off + 20, 0); // nIPMapEntries
        write_be(&mut pe, fi_off + 24, 0); // pIPtoStateMap

        // Two unwind entries.
        write_be_i32(&mut pe, unwind_off, -1); // to_state
        write_be(&mut pe, unwind_off + 4, image_base + 0x500); // action_pc
        write_be_i32(&mut pe, unwind_off + 8, 0);
        write_be(&mut pe, unwind_off + 12, image_base + 0x600);

        let sections = vec![mk_section(".rdata", rdata_va, 0x100)];
        let recs = analyze(&pe, image_base, &sections);
        assert_eq!(recs.len(), 1);
        let r = &recs[0];
        assert_eq!(r.address, fi_va);
        assert_eq!(r.magic, MAGIC_OLD);
        assert_eq!(r.max_state, 2);
        assert_eq!(r.unwind_map.len(), 2);
        assert_eq!(r.unwind_map[0].to_state, -1);
        assert_eq!(r.unwind_map[0].action_pc, image_base + 0x500);
        assert_eq!(r.try_blocks.len(), 0);
    }

    #[test]
    fn rejects_bogus_max_state() {
        let image_base = 0x82000000u32;
        let rdata_va = 0x1000u32;
        let mut pe = vec![0u8; 0x4000];
        let fi_off = (rdata_va + 0x10) as usize;
        write_be(&mut pe, fi_off, MAGIC_OLD);
        write_be_i32(&mut pe, fi_off + 4, 0xFFFF); // bogus maxState (above the 10_000 cap)
        let sections = vec![mk_section(".rdata", rdata_va, 0x100)];
        let recs = analyze(&pe, image_base, &sections);
        assert_eq!(recs.len(), 0);
    }
}
|
||||
@@ -6,8 +6,10 @@ use std::io::Write;
|
||||
use xenia_xex::header::ImportLibrary;
|
||||
use xenia_xex::pe::PeSection;
|
||||
|
||||
use crate::disasm::enrich_section;
|
||||
use crate::func::FuncAnalysis;
|
||||
use crate::xref::{XrefKind, Xref, XrefMap, section_for_addr, resolve_source_label};
|
||||
use crate::sinks::text::write_instr_line;
|
||||
use crate::xref::{XrefKind, Xref, XrefMap, resolve_source_label};
|
||||
|
||||
/// Metadata passed to the formatter (avoids exposing full Xex2Header internals).
|
||||
pub struct DisasmInfo<'a> {
|
||||
@@ -88,11 +90,14 @@ pub fn write_asm(
|
||||
writeln!(out)?;
|
||||
|
||||
let mut in_function = false;
|
||||
let mut addr = va_start;
|
||||
while addr < va_end {
|
||||
let abs_addr = info.image_base + addr;
|
||||
let off = (addr - va_start) as usize + file_start;
|
||||
if off + 4 > pe.len() { break; }
|
||||
let abs_start = info.image_base + va_start;
|
||||
let abs_end = info.image_base + va_end;
|
||||
|
||||
let items = enrich_section(
|
||||
pe, info.image_base, &section.name, abs_start, abs_end, func_analysis, labels,
|
||||
);
|
||||
for ri in items {
|
||||
let abs_addr = ri.item.addr;
|
||||
|
||||
// Function start? Emit separator + header
|
||||
if let Some(fi) = func_analysis.get(abs_addr) {
|
||||
@@ -126,7 +131,6 @@ pub fn write_asm(
|
||||
writeln!(out, "; FUNCTION: {lbl}{detail_str}")?;
|
||||
}
|
||||
|
||||
// Xrefs for function entry
|
||||
if let Some(xref_lines) = format_xrefs(abs_addr, xrefs, func_analysis, labels) {
|
||||
for line in &xref_lines {
|
||||
writeln!(out, "{line}")?;
|
||||
@@ -141,7 +145,6 @@ pub fn write_asm(
|
||||
if let Some(lbl) = labels.get(&abs_addr) {
|
||||
if !func_analysis.is_function_start(abs_addr) {
|
||||
writeln!(out)?;
|
||||
// Xrefs for local labels
|
||||
if let Some(xref_lines) = format_xrefs(abs_addr, xrefs, func_analysis, labels) {
|
||||
for line in &xref_lines {
|
||||
writeln!(out, "{line}")?;
|
||||
@@ -159,37 +162,8 @@ pub fn write_asm(
|
||||
writeln!(out, " ; IMPORT: {imp_name}")?;
|
||||
}
|
||||
|
||||
let instr = u32::from_be_bytes([
|
||||
pe[off], pe[off+1], pe[off+2], pe[off+3]
|
||||
]);
|
||||
|
||||
let decoded = crate::ppc::disasm(instr, abs_addr);
|
||||
let disasm_text = decoded.display().to_string();
|
||||
|
||||
// Annotate branch targets with label names
|
||||
let mut annotated = annotate_branch(&disasm_text, labels);
|
||||
|
||||
// Annotate data references
|
||||
if let Some(&(data_addr, kind)) = data_annotations.get(&abs_addr) {
|
||||
let tag = match kind {
|
||||
XrefKind::DataRead => "[R]",
|
||||
XrefKind::DataWrite => "[W]",
|
||||
_ => "[&]",
|
||||
};
|
||||
let sec = section_for_addr(data_addr, info.sections, info.image_base)
|
||||
.unwrap_or("?");
|
||||
let data_lbl = labels.get(&data_addr)
|
||||
.map(|s| format!(" = {s}"))
|
||||
.unwrap_or_default();
|
||||
if !annotated.contains("; ->") {
|
||||
annotated = format!("{annotated:<40} ; {tag} 0x{data_addr:08X} ({sec}){data_lbl}");
|
||||
} else {
|
||||
annotated = format!("{annotated} {tag} 0x{data_addr:08X} ({sec}){data_lbl}");
|
||||
}
|
||||
}
|
||||
|
||||
writeln!(out, " {:08X}: {:08X} {}", abs_addr, instr, annotated)?;
|
||||
addr += 4;
|
||||
let data_annot = data_annotations.get(&abs_addr).copied();
|
||||
write_instr_line(out, &ri, labels, info.sections, info.image_base, data_annot)?;
|
||||
}
|
||||
if in_function {
|
||||
writeln!(out, "; end function")?;
|
||||
@@ -298,21 +272,3 @@ fn format_xrefs(
|
||||
|
||||
Some(lines)
|
||||
}
|
||||
|
||||
fn annotate_branch(disasm: &str, labels: &HashMap<u32, String>) -> String {
|
||||
if let Some(pos) = disasm.find("0x") {
|
||||
let hex_start = pos + 2;
|
||||
let hex_end = disasm[hex_start..].find(|c: char| !c.is_ascii_hexdigit())
|
||||
.map(|i| hex_start + i)
|
||||
.unwrap_or(disasm.len());
|
||||
let hex_str = &disasm[hex_start..hex_end];
|
||||
if hex_str.len() == 8 {
|
||||
if let Ok(addr) = u32::from_str_radix(hex_str, 16) {
|
||||
if let Some(lbl) = labels.get(&addr) {
|
||||
return format!("{disasm:<40} ; -> {lbl}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
disasm.to_string()
|
||||
}
|
||||
|
||||
@@ -32,6 +32,17 @@ pub struct FuncInfo {
|
||||
pub is_leaf: bool,
|
||||
/// True if this is a save/restore GPR helper stub.
|
||||
pub is_saverestore: bool,
|
||||
/// True if `.pdata` has a RUNTIME_FUNCTION whose `BeginAddress` matches `start`.
|
||||
/// Authoritative ground truth from the linker; rows without this flag are
|
||||
/// prologue-detected only and may carry boundary errors.
|
||||
pub pdata_validated: bool,
|
||||
/// Function size in bytes per `.pdata`'s `function_length` field, if known.
|
||||
/// Absent (None) when this row is prologue-only.
|
||||
pub pdata_length: Option<u32>,
|
||||
/// True when `.pdata`'s exception-flag bit is set on this entry — the
|
||||
/// function has a registered C++ EH (or SEH) frame handler. Always false
|
||||
/// for entries without `.pdata` coverage. (M9)
|
||||
pub has_eh: bool,
|
||||
}
|
||||
|
||||
/// Result of the function analysis pass.
|
||||
@@ -42,6 +53,9 @@ pub struct FuncAnalysis {
|
||||
pub save_gpr_base: Option<u32>,
|
||||
/// Addresses in the restore-GPR region (start of __restgprlr block).
|
||||
pub restore_gpr_base: Option<u32>,
|
||||
/// Raw `.pdata` entries from the binary, in original order. Empty when no
|
||||
/// `.pdata` was supplied. Mirrored into the DB as `pdata_entries`.
|
||||
pub pdata_entries: Vec<xenia_xex::pdata::PdataEntry>,
|
||||
}
|
||||
|
||||
// ── Instruction field helpers ──────────────────────────────────────────────
|
||||
@@ -184,12 +198,37 @@ fn find_saverestore_stubs(
|
||||
|
||||
// ── Main analysis ──────────────────────────────────────────────────────────
|
||||
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base), entry_point = format_args!("{:#010x}", entry_point)))]
|
||||
pub fn analyze(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
entry_point: u32,
|
||||
code_sections: &[(u32, u32, u32)], // (va_start, va_size, flags)
|
||||
) -> FuncAnalysis {
|
||||
analyze_with_pdata(pe, image_base, entry_point, code_sections, &[])
|
||||
}
|
||||
|
||||
/// Same as [`analyze`] but also unions `.pdata` `RUNTIME_FUNCTION` entries
|
||||
/// into the candidate set. Each surviving function carries `pdata_validated`
|
||||
/// when its start matches a pdata `BeginAddress`, together with the
|
||||
/// linker-supplied `pdata_length`; `end` is widened when that length reaches past the prologue walk.
|
||||
///
|
||||
/// Pdata entries that have no prologue match (orphans) are still emitted,
|
||||
/// using the linker-supplied length to bound the function.
|
||||
///
|
||||
/// What this layer does NOT do:
|
||||
/// - Does not edit the `prolog_length` we'd derive from prologue analysis;
|
||||
/// `frame_size` and `saved_gprs` remain best-effort prologue inferences.
|
||||
/// - Does not infer base/derived call edges — that's M3+M5.
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base), entry_point = format_args!("{:#010x}", entry_point), pdata_entries = pdata.len()))]
|
||||
pub fn analyze_with_pdata(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
entry_point: u32,
|
||||
code_sections: &[(u32, u32, u32)],
|
||||
pdata: &[xenia_xex::pdata::PdataEntry],
|
||||
) -> FuncAnalysis {
|
||||
let started = std::time::Instant::now();
|
||||
let code_ranges: Vec<(u32, u32)> = code_sections.iter()
|
||||
.map(|(va, sz, _)| (image_base + va, image_base + va + sz))
|
||||
.collect();
|
||||
@@ -197,10 +236,10 @@ pub fn analyze(
|
||||
// 1. Find save/restore stubs
|
||||
let (save_base, restore_base) = find_saverestore_stubs(pe, image_base, &code_ranges);
|
||||
if let Some(sb) = save_base {
|
||||
eprintln!("[func] __savegprlr stub at 0x{sb:08X}");
|
||||
tracing::debug!(addr = format_args!("{:#010x}", sb), "__savegprlr stub");
|
||||
}
|
||||
if let Some(rb) = restore_base {
|
||||
eprintln!("[func] __restgprlr stub at 0x{rb:08X}");
|
||||
tracing::debug!(addr = format_args!("{:#010x}", rb), "__restgprlr stub");
|
||||
}
|
||||
|
||||
// Set of addresses in the save/restore region (to exclude from function detection)
|
||||
@@ -214,32 +253,79 @@ pub fn analyze(
|
||||
for i in 0..21 { saverestore_addrs.insert(rb + i * 4); }
|
||||
}
|
||||
|
||||
// 2. Collect all bl targets as candidate function entries
|
||||
// 2. Collect all bl targets as candidate function entries.
|
||||
// Union: bl targets ∪ pdata BeginAddresses ∪ entry_point.
|
||||
let mut call_targets: HashSet<u32> = HashSet::new();
|
||||
call_targets.insert(entry_point);
|
||||
|
||||
for &(start, end) in &code_ranges {
|
||||
let mut addr = start;
|
||||
while addr < end {
|
||||
if let Some(instr) = read_instr(pe, addr, image_base) {
|
||||
if let Some(target) = bl_target(instr, addr) {
|
||||
if let Some(instr) = read_instr(pe, addr, image_base)
|
||||
&& let Some(target) = bl_target(instr, addr) {
|
||||
// Don't count calls into save/restore stubs as function entries
|
||||
if !saverestore_addrs.contains(&target) {
|
||||
call_targets.insert(target);
|
||||
}
|
||||
}
|
||||
}
|
||||
addr += 4;
|
||||
}
|
||||
}
|
||||
eprintln!("[func] {} bl targets (candidate functions)", call_targets.len());
|
||||
|
||||
// 3. For each candidate, detect prologue and walk to epilogue
|
||||
// Index pdata by begin_address for O(1) prologue → length lookup.
|
||||
let pdata_by_begin: HashMap<u32, &xenia_xex::pdata::PdataEntry> =
|
||||
pdata.iter().map(|e| (e.begin_address, e)).collect();
|
||||
for e in pdata {
|
||||
if !saverestore_addrs.contains(&e.begin_address) {
|
||||
call_targets.insert(e.begin_address);
|
||||
}
|
||||
}
|
||||
tracing::debug!(
|
||||
candidates = call_targets.len(),
|
||||
pdata_entries = pdata.len(),
|
||||
"function candidates (bl ∪ pdata)"
|
||||
);
|
||||
|
||||
// 3. For each candidate, detect prologue and walk to epilogue. Pdata
|
||||
// metadata is layered on after the prologue walk so a missing prologue
|
||||
// still yields an entry when pdata covers it.
|
||||
let mut functions: BTreeMap<u32, FuncInfo> = BTreeMap::new();
|
||||
|
||||
for &func_addr in &call_targets {
|
||||
if let Some(fi) = analyze_function(pe, image_base, func_addr, &code_ranges, save_base, restore_base) {
|
||||
let pdata_entry = pdata_by_begin.get(&func_addr).copied();
|
||||
|
||||
if let Some(mut fi) = analyze_function(
|
||||
pe, image_base, func_addr, &code_ranges, save_base, restore_base,
|
||||
) {
|
||||
if let Some(p) = pdata_entry {
|
||||
fi.pdata_validated = true;
|
||||
fi.pdata_length = Some(p.function_length);
|
||||
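// Mask 0x2 of the packed flags = exception-handler-present. NOTE(review): that is bit 1 counting from the LSB; confirm the bit numbering against `PdataEntry`.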
|
||||
fi.has_eh = (p.flags & 0x2) != 0;
|
||||
// If the prologue walk ended too early, trust pdata's length.
|
||||
let pdata_end = p.begin_address.wrapping_add(p.function_length);
|
||||
if pdata_end > fi.end {
|
||||
fi.end = pdata_end;
|
||||
}
|
||||
}
|
||||
functions.insert(func_addr, fi);
|
||||
} else if let Some(p) = pdata_entry {
|
||||
// Orphan: pdata claims a function here but no prologue matched.
|
||||
// Emit a synthetic entry so the row exists for downstream queries.
|
||||
functions.insert(
|
||||
func_addr,
|
||||
FuncInfo {
|
||||
start: func_addr,
|
||||
end: p.begin_address.wrapping_add(p.function_length),
|
||||
frame_size: 0,
|
||||
saved_gprs: 0,
|
||||
is_leaf: false,
|
||||
is_saverestore: false,
|
||||
pdata_validated: true,
|
||||
pdata_length: Some(p.function_length),
|
||||
has_eh: (p.flags & 0x2) != 0,
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -247,6 +333,7 @@ pub fn analyze(
|
||||
if let Some(sb) = save_base {
|
||||
// The save block is one cascade: entry at each rN, falls through to blr
|
||||
// Treat as a single function with the first entry point
|
||||
let pe_sb = pdata_by_begin.get(&sb).copied();
|
||||
functions.insert(sb, FuncInfo {
|
||||
start: sb,
|
||||
end: sb + 20 * 4, // 18 std + stw r12 + blr
|
||||
@@ -254,9 +341,13 @@ pub fn analyze(
|
||||
saved_gprs: 18,
|
||||
is_leaf: true,
|
||||
is_saverestore: true,
|
||||
pdata_validated: pe_sb.is_some(),
|
||||
pdata_length: pe_sb.map(|p| p.function_length),
|
||||
has_eh: pe_sb.map(|p| (p.flags & 0x2) != 0).unwrap_or(false),
|
||||
});
|
||||
}
|
||||
if let Some(rb) = restore_base {
|
||||
let pe_rb = pdata_by_begin.get(&rb).copied();
|
||||
functions.insert(rb, FuncInfo {
|
||||
start: rb,
|
||||
end: rb + 21 * 4, // 18 ld + lwz r12 + mtspr LR + blr
|
||||
@@ -264,15 +355,43 @@ pub fn analyze(
|
||||
saved_gprs: 18,
|
||||
is_leaf: true,
|
||||
is_saverestore: true,
|
||||
pdata_validated: pe_rb.is_some(),
|
||||
pdata_length: pe_rb.map(|p| p.function_length),
|
||||
has_eh: pe_rb.map(|p| (p.flags & 0x2) != 0).unwrap_or(false),
|
||||
});
|
||||
}
|
||||
|
||||
eprintln!("[func] {} functions detected", functions.len());
|
||||
// 5. Fix up `end_address` collisions: if function A's `end` overlaps
|
||||
// function B's `start` (B > A), trim A. This catches mis-merged
|
||||
// prologue walks where pdata revealed an interleaved second prologue.
|
||||
// We do this in a single forward pass.
|
||||
let starts: Vec<u32> = functions.keys().copied().collect();
|
||||
for i in 0..starts.len().saturating_sub(1) {
|
||||
let cur = starts[i];
|
||||
let next = starts[i + 1];
|
||||
if let Some(fi) = functions.get_mut(&cur)
|
||||
&& fi.end > next
|
||||
{
|
||||
fi.end = next;
|
||||
}
|
||||
}
|
||||
|
||||
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||
metrics::histogram!("analysis.phase_ms", "phase" => "functions").record(elapsed_ms);
|
||||
let pdata_validated_count = functions.values().filter(|f| f.pdata_validated).count();
|
||||
tracing::info!(
|
||||
functions = functions.len(),
|
||||
pdata_entries = pdata.len(),
|
||||
pdata_validated = pdata_validated_count,
|
||||
elapsed_ms,
|
||||
"function detection complete"
|
||||
);
|
||||
|
||||
FuncAnalysis {
|
||||
functions,
|
||||
save_gpr_base: save_base,
|
||||
restore_gpr_base: restore_base,
|
||||
pdata_entries: pdata.to_vec(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -302,15 +421,13 @@ fn analyze_function(
|
||||
let instr1 = read_instr(pe, func_addr + 4, image_base).unwrap_or(0);
|
||||
|
||||
// Check if next is bl to save stub
|
||||
if let Some(target) = bl_target(instr1, func_addr + 4) {
|
||||
if let Some(sb) = save_base {
|
||||
if target >= sb && target < sb + 18 * 4 {
|
||||
if let Some(target) = bl_target(instr1, func_addr + 4)
|
||||
&& let Some(sb) = save_base
|
||||
&& target >= sb && target < sb + 18 * 4 {
|
||||
let idx = (target - sb) / 4;
|
||||
saved_gprs = 18 - idx;
|
||||
prologue_len = 8;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Next should be stwu r1, -N(r1)
|
||||
let stwu_instr = read_instr(pe, func_addr + prologue_len, image_base).unwrap_or(0);
|
||||
@@ -356,14 +473,12 @@ fn analyze_function(
|
||||
}
|
||||
|
||||
// Epilogue: b __restgprlr_NN (tail branch into restore stub)
|
||||
if let Some(target) = b_target(instr, addr) {
|
||||
if let Some(rb) = restore_base {
|
||||
if target >= rb && target < rb + 18 * 4 {
|
||||
if let Some(target) = b_target(instr, addr)
|
||||
&& let Some(rb) = restore_base
|
||||
&& target >= rb && target < rb + 18 * 4 {
|
||||
end_addr = addr + 4;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Epilogue: bctr (indirect tail call — end of function)
|
||||
if is_bctr(instr) {
|
||||
@@ -392,6 +507,9 @@ fn analyze_function(
|
||||
saved_gprs,
|
||||
is_leaf,
|
||||
is_saverestore: false,
|
||||
pdata_validated: false,
|
||||
pdata_length: None,
|
||||
has_eh: false,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -407,24 +525,22 @@ impl FuncAnalysis {
|
||||
for (&addr, fi) in &self.functions {
|
||||
if fi.is_saverestore {
|
||||
// Label the block start, plus individual register entry points
|
||||
if let Some(sb) = self.save_gpr_base {
|
||||
if addr == sb {
|
||||
if let Some(sb) = self.save_gpr_base
|
||||
&& addr == sb {
|
||||
for i in 0u32..18 {
|
||||
let reg = 14 + i;
|
||||
labels.insert(sb + i * 4, format!("__savegprlr_{reg}"));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if let Some(rb) = self.restore_gpr_base {
|
||||
if addr == rb {
|
||||
if let Some(rb) = self.restore_gpr_base
|
||||
&& addr == rb {
|
||||
for i in 0u32..18 {
|
||||
let reg = 14 + i;
|
||||
labels.insert(rb + i * 4, format!("__restgprlr_{reg}"));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
labels.insert(addr, format!("sub_{addr:08X}"));
|
||||
}
|
||||
|
||||
257
crates/xenia-analysis/src/funcptr_arrays.rs
Normal file
257
crates/xenia-analysis/src/funcptr_arrays.rs
Normal file
@@ -0,0 +1,257 @@
|
||||
//! Generic function-pointer array detection (M8 + M11).
|
||||
//!
|
||||
//! M3 already detects "vtable" candidates — runs of ≥3 contiguous function
|
||||
//! pointers in `.rdata` / `.data` (with COL/RTTI walk on top). This module
|
||||
//! widens the net:
|
||||
//!
|
||||
//! - **Dispatch tables** (M8): runs of ≥2 function pointers in `.rdata`
//!   that are NOT already classified as vtables (`.data` is deliberately not
//!   scanned — see the false-positive note below). Captures switch
|
||||
//! jump tables, callback registries, command tables, gameplay state
|
||||
//! machines, etc.
|
||||
//! - **Static initialiser tables** (M11): function-pointer arrays in
|
||||
//! `.rdata` whose entries all have classic constructor-like prologues
|
||||
//! (small frame; either leaf or calling well-known runtime helpers).
|
||||
//! The MSVC convention names the bracketing symbols `__xc_a` /
|
||||
//! `__xc_z` (C++ ctors) and `__xi_a` / `__xi_z` (C runtime), but the
|
||||
//! names are stripped from Sylpheed; we classify by structure.
|
||||
//!
|
||||
//! All findings are written to a single `function_pointer_arrays` table
|
||||
//! with a `kind` column — `"vtable"`, `"dispatch_table"`, or `"static_init"`.
|
||||
//! Vtable rows are duplicated from M3's `vtables` table for join
|
||||
//! convenience (so a single query covers all classification kinds).
|
||||
//!
|
||||
//! ### What this module does NOT do
|
||||
//!
|
||||
//! - No alias-based classification — `static_init` is heuristic and may
|
||||
//! include any function-pointer array near the binary's `__xc_*` region.
|
||||
//! - Does not parse the bracket symbols' actual addresses — we'd need
|
||||
//! debug symbols, which Sylpheed doesn't ship.
|
||||
//! - Two-element runs in `.data` are common false positives (struct fields
|
||||
//! that happen to alias function entries); we only emit `dispatch_table`
|
||||
//! rows for `.rdata`.
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use xenia_xex::pe::PeSection;
|
||||
|
||||
use crate::vtables::Vtable;
|
||||
|
||||
/// A contiguous array of function pointers reported by this pass.
#[derive(Debug, Clone)]
pub struct FuncPtrArray {
    /// Virtual address of the first array element.
    pub address: u32,
    /// Number of entries in the array.
    pub length: u32,
    /// Classification tag: `"vtable"`, `"dispatch_table"` or `"static_init"`.
    pub kind: &'static str,
    /// The array contents, in order (function VAs).
    pub entries: Vec<u32>,
}
|
||||
|
||||
/// Run the pass. `vtables` is the M3 result — those addresses are skipped
|
||||
/// in the dispatch-table scan to avoid duplication. `function_starts` is
|
||||
/// the M1 corrected function-start set (used to validate that each array
|
||||
/// entry actually points at a known function).
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
|
||||
pub fn analyze(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
sections: &[PeSection],
|
||||
function_starts: &BTreeSet<u32>,
|
||||
vtables: &[Vtable],
|
||||
) -> Vec<FuncPtrArray> {
|
||||
let started = std::time::Instant::now();
|
||||
let vtable_addrs: BTreeSet<u32> = vtables.iter().map(|v| v.address).collect();
|
||||
let mut out: Vec<FuncPtrArray> = Vec::new();
|
||||
|
||||
// Re-emit vtables in this table for unified-query convenience.
|
||||
for v in vtables {
|
||||
out.push(FuncPtrArray {
|
||||
address: v.address,
|
||||
length: v.length,
|
||||
kind: "vtable",
|
||||
entries: v.methods.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
// Scan only .rdata for dispatch tables — .data has too many false
|
||||
// positives from struct fields aliasing function VAs.
|
||||
for section in sections {
|
||||
if section.name != ".rdata" { continue; }
|
||||
let raw_start = section.virtual_address as usize;
|
||||
let raw_end = (section.virtual_address + section.virtual_size) as usize;
|
||||
if raw_end > pe.len() { continue; }
|
||||
let bytes = &pe[raw_start..raw_end.min(pe.len())];
|
||||
let va_base = image_base + section.virtual_address;
|
||||
|
||||
let mut i = 0usize;
|
||||
while i + 8 <= bytes.len() {
|
||||
if !i.is_multiple_of(4) { i += 1; continue; }
|
||||
let mut entries: Vec<u32> = Vec::new();
|
||||
let mut j = i;
|
||||
while j + 4 <= bytes.len() {
|
||||
let val = u32::from_be_bytes([bytes[j], bytes[j + 1], bytes[j + 2], bytes[j + 3]]);
|
||||
if function_starts.contains(&val) {
|
||||
entries.push(val);
|
||||
j += 4;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if entries.len() >= 2 {
|
||||
let address = va_base + (i as u32);
|
||||
if !vtable_addrs.contains(&address) {
|
||||
let kind = classify_run(image_base, &entries, pe);
|
||||
out.push(FuncPtrArray {
|
||||
address,
|
||||
length: entries.len() as u32,
|
||||
kind,
|
||||
entries,
|
||||
});
|
||||
}
|
||||
i += j - i;
|
||||
} else {
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||
let n_vt = out.iter().filter(|a| a.kind == "vtable").count();
|
||||
let n_dt = out.iter().filter(|a| a.kind == "dispatch_table").count();
|
||||
let n_si = out.iter().filter(|a| a.kind == "static_init").count();
|
||||
metrics::histogram!("analysis.phase_ms", "phase" => "funcptr_arrays").record(elapsed_ms);
|
||||
tracing::info!(
|
||||
total = out.len(), vtable = n_vt, dispatch_table = n_dt, static_init = n_si,
|
||||
elapsed_ms,
|
||||
"function-pointer array scan complete",
|
||||
);
|
||||
out
|
||||
}
|
||||
|
||||
/// Classify a non-vtable function-pointer array. Currently distinguishes
|
||||
/// only "static_init" (all entries have constructor-like prologues — a
|
||||
/// brief mfspr+stwu prologue with a small frame) from "dispatch_table"
|
||||
/// (anything else).
|
||||
fn classify_run(image_base: u32, entries: &[u32], pe: &[u8]) -> &'static str {
|
||||
// Heuristic: a static initialiser's prologue is small (frame ≤ 0x80,
|
||||
// typically ≤ 0x40). If every entry's first instruction is mfspr+LR
|
||||
// (opcode 31, xo 339, spr 8) followed by a small stwu, classify as
|
||||
// static_init.
|
||||
let mut all_ctor = true;
|
||||
let mut any_ctor = false;
|
||||
for &fn_va in entries {
|
||||
if !is_ctor_like(pe, image_base, fn_va) {
|
||||
all_ctor = false;
|
||||
} else {
|
||||
any_ctor = true;
|
||||
}
|
||||
}
|
||||
if all_ctor && any_ctor && entries.len() >= 3 {
|
||||
"static_init"
|
||||
} else {
|
||||
"dispatch_table"
|
||||
}
|
||||
}
|
||||
|
||||
/// True if the function at `fn_va` looks like a tiny C++ static initialiser.
///
/// The first two instruction words (big-endian) at `fn_va - image_base` in
/// `pe` must be:
///
/// 1. `mfspr rD, LR` — opcode 31, xo 339, SPR 8 (any `rD` is accepted), and
/// 2. either `stwu r1, -N(r1)` with `0 <= N <= 0x80`, or a linking branch
///    (`bl`, presumably into a `__savegprlr_*` stub — the target is not
///    checked, and neither is the frame size in that case).
///
/// Returns `false` when `fn_va` lies below `image_base` or fewer than eight
/// bytes are available at that offset.
fn is_ctor_like(pe: &[u8], image_base: u32, fn_va: u32) -> bool {
    // Checked arithmetic + `get`: an address outside the image is simply
    // "not ctor-like" rather than a wrapped offset or an overflowing add.
    let Some(off) = fn_va.checked_sub(image_base) else {
        return false;
    };
    let off = off as usize;
    let Some(words) = off.checked_add(8).and_then(|end| pe.get(off..end)) else {
        return false;
    };
    let i0 = u32::from_be_bytes([words[0], words[1], words[2], words[3]]);
    let i1 = u32::from_be_bytes([words[4], words[5], words[6], words[7]]);

    // i0: mfspr rD, LR — opcode 31, xo 339, spr 8. The SPR number is stored
    // with its two 5-bit halves swapped in the instruction word.
    let spr0 = (((i0 >> 11) & 0x1F) << 5) | ((i0 >> 16) & 0x1F);
    let is_mflr = (i0 >> 26) == 31 && ((i0 >> 1) & 0x3FF) == 339 && spr0 == 8;
    if !is_mflr {
        return false;
    }

    match i1 >> 26 {
        // stwu D-form: rS = r1, rA = r1, displacement in [-0x80, 0].
        37 => {
            let rs = (i1 >> 21) & 0x1F;
            let ra = (i1 >> 16) & 0x1F;
            let d = ((i1 & 0xFFFF) as i16) as i32;
            rs == 1 && ra == 1 && (-0x80..=0).contains(&d)
        }
        // bl __savegprlr_NN — accept only the linking form (LK = 1); a plain
        // `b` is a jump away, not a save-stub call. The frame size cannot be
        // verified here without walking further into the prologue.
        18 => (i1 & 1) != 0,
        _ => false,
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use xenia_xex::pe::PeSection;

    /// Build a section whose raw layout mirrors its virtual layout.
    fn mk_section(name: &str, va: u32, size: u32) -> PeSection {
        PeSection {
            name: name.into(),
            virtual_address: va,
            virtual_size: size,
            raw_offset: va,
            raw_size: size,
            flags: 0x4000_0040,
        }
    }

    /// Store `val` big-endian at byte offset `at`.
    fn write_be_u32(buf: &mut [u8], at: usize, val: u32) {
        buf[at..at + 4].copy_from_slice(&val.to_be_bytes());
    }

    #[test]
    fn detects_dispatch_table_in_rdata() {
        let image_base = 0x82000000u32;
        let rdata_va = 0x1000u32;
        let mut pe = vec![0u8; 0x4000];

        // Two consecutive function pointers, no vtable shadowing them.
        let pcs = [image_base + 0x2000, image_base + 0x2010];
        for (i, p) in pcs.iter().enumerate() {
            write_be_u32(&mut pe, rdata_va as usize + i * 4, *p);
        }

        let sections = vec![mk_section(".rdata", rdata_va, 0x100)];
        let mut starts = BTreeSet::new();
        for &p in &pcs { starts.insert(p); }

        let arrs = analyze(&pe, image_base, &sections, &starts, &[]);
        assert_eq!(arrs.len(), 1);
        assert_eq!(arrs[0].kind, "dispatch_table");
        assert_eq!(arrs[0].length, 2);
    }

    #[test]
    fn vtable_overrides_dispatch_classification() {
        let image_base = 0x82000000u32;
        let rdata_va = 0x1000u32;
        let mut pe = vec![0u8; 0x4000];

        let pcs = [image_base + 0x2000, image_base + 0x2010, image_base + 0x2020];
        for (i, p) in pcs.iter().enumerate() {
            write_be_u32(&mut pe, rdata_va as usize + i * 4, *p);
        }
        let sections = vec![mk_section(".rdata", rdata_va, 0x100)];
        let mut starts = BTreeSet::new();
        for &p in &pcs { starts.insert(p); }

        let vt = Vtable {
            address: image_base + rdata_va,
            length: 3,
            col_address: None,
            class_name: "ANON_test".into(),
            rtti_present: false,
            base_classes_json: None,
            methods: pcs.to_vec(),
        };
        let arrs = analyze(&pe, image_base, &sections, &starts, &[vt]);
        // Vtable + (no dispatch-table dup): the M3 vtable is re-emitted, but
        // the scan also skips the same address from re-classification.
        assert_eq!(arrs.len(), 1);
        assert_eq!(arrs[0].kind, "vtable");
    }
}
|
||||
636
crates/xenia-analysis/src/ind_dispatch_typed.rs
Normal file
636
crates/xenia-analysis/src/ind_dispatch_typed.rs
Normal file
@@ -0,0 +1,636 @@
|
||||
//! M5.5 — `this`-flow indirect-dispatch resolution.
|
||||
//!
|
||||
//! M5 only resolved the canonical `lis+addi → lwz off(vt) → mtctr → bcctrl`
|
||||
//! pattern (vtable address materialised statically; rare in real C++).
|
||||
//! This layer closes the dominant case, where the dispatch reads through
|
||||
//! the object's `vptr` field:
|
||||
//!
|
||||
//! ```text
|
||||
//! lwz rVt, vptr_off(this) ; rVt = this->vptr
|
||||
//! ... ; (rVt not clobbered)
|
||||
//! lwz rFn, slot*4(rVt) ; rFn = vtable[slot]
|
||||
//! ... ; (rFn / ctr not clobbered)
|
||||
//! mtctr rFn
|
||||
//! ...
|
||||
//! bcctrl
|
||||
//! ```
|
||||
//!
|
||||
//! Resolution strategy (class-membership inference):
|
||||
//!
|
||||
//! 1. **Phase 1 — vptr-write scan.** Walk every function with a tiny
|
||||
//! register tracker (mirrors the lis+addi propagation in
|
||||
//! `xenia_analysis::xref`). Whenever a `stw rA, off(rB)` writes a
|
||||
//! known M3 vtable address into `off(rB)`, record
|
||||
//! `(vtable_addr, vptr_offset, writer_pc)`. These are constructor-
|
||||
//! side vptr stores.
|
||||
//!
|
||||
//! 2. **Phase 2 — invert by offset.** Build
|
||||
//! `vtables_by_offset[vptr_off] = set of vtables ever written at
|
||||
//! that offset`. Most classes use offset 0 (single inheritance);
|
||||
//! multiple-inheritance secondary vptrs land at non-zero offsets.
|
||||
//!
|
||||
//! 3. **Phase 3 — dispatch-site scan.** For each `bcctrl`, walk back
|
||||
//! up to 16 instructions looking for the canonical sequence,
|
||||
//! extracting `(vptr_off, slot)`. Bail on any clobber of the
|
||||
//! tracked register, on any branch instruction, or on a label
|
||||
//! boundary.
|
||||
//!
|
||||
//! 4. **Phase 4 — emit edges.** For each detected
|
||||
//! `(dispatch_pc, vptr_off, slot)`:
|
||||
//! - Look up all candidate vtables `V` where:
|
||||
//! - `vtables_by_offset[vptr_off]` contains `V`, AND
|
||||
//! - `V.length > slot`
|
||||
//! - Emit one `ind_call` edge from `dispatch_pc` to
|
||||
//! `V.methods[slot]` per candidate.
|
||||
//!
|
||||
//! Multi-candidate sites are an over-approximation: the analysis can't
|
||||
//! distinguish without alias info which of the matching classes the
|
||||
//! `this` register actually holds. Downstream queries can filter by
|
||||
//! the exposed `candidate_count` column — single-candidate edges are
|
||||
//! high-confidence, multi-candidate edges are reachability-only.
|
||||
//!
|
||||
//! ### What this layer does NOT do
|
||||
//!
|
||||
//! - No flow-sensitive analysis: register state is killed at every
|
||||
//! label (basic-block boundary), and we do not propagate values
|
||||
//! across calls (since the ABI's volatile/non-volatile partition is
|
||||
//! unreliable for `this`-pointer chains).
|
||||
//! - No alias resolution: a multi-candidate site emits one edge per
|
||||
//! matching vtable, not the exact one used at runtime.
|
||||
//! - Does not handle vptr writes via X-form indexed stores (`stwx`)
|
||||
//! or VMX/VMX128 stores — only D-form `stw rA, off(rB)`. The MSVC
|
||||
//! compiler uses D-form for all canonical vptr writes we've seen.
|
||||
//! - Does not synthesise vptr writes for inlined / elided constructors.
|
||||
//! If a class never has a writer at offset `vptr_off`, dispatches
|
||||
//! through that offset will not find candidates.
|
||||
//!
|
||||
//! Reference: IBM PowerPC ABI, Itanium C++ ABI on vtable layout (the
|
||||
//! same offset-from-`this` model applies on Win32 PPC).
|
||||
|
||||
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
||||
|
||||
use crate::func::FuncAnalysis;
|
||||
use crate::vtables::Vtable;
|
||||
|
||||
/// A single indirect-call site together with its typed resolution.
#[derive(Debug, Clone)]
pub struct TypedDispatch {
    /// Address of the `bcctrl` instruction performing the call.
    pub dispatch_pc: u32,
    /// Displacement of the `lwz` that loaded the vtable pointer, stored as an
    /// unsigned value (a negative displacement appears wrapped).
    pub vptr_offset: u32,
    /// Vtable slot index (byte displacement of the method load divided by 4).
    pub slot: u32,
    /// Candidate vtable addresses whose `(vptr_offset, slot)` matched.
    pub candidate_vtables: Vec<u32>,
    /// Resolved method PCs, one per entry of `candidate_vtables`.
    pub method_pcs: Vec<u32>,
}
|
||||
|
||||
/// Result of the M5.5 pass.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct TypedIndirectResult {
|
||||
pub dispatches: Vec<TypedDispatch>,
|
||||
/// Phase-1 raw output, exposed for diagnostics.
|
||||
pub vptr_writes: Vec<VptrWrite>,
|
||||
}
|
||||
|
||||
/// A `stw` that stores a known vtable address into an object field.
#[derive(Debug, Clone, Copy)]
pub struct VptrWrite {
    /// The vtable address being stored.
    pub vtable_addr: u32,
    /// Displacement of the store, as an unsigned value (negative
    /// displacements appear wrapped).
    pub vptr_offset: u32,
    /// Address of the storing instruction.
    pub writer_pc: u32,
    /// Start address of the function containing `writer_pc`.
    pub writer_function: u32,
}
|
||||
|
||||
// PowerPC primary opcodes (top six bits of the instruction word), listed in
// ascending numeric order.

/// `addi rD, rA, SIMM` (`li` when rA = 0).
const OP_ADDI: u32 = 14;
/// `addis rD, rA, SIMM` (`lis` when rA = 0).
const OP_ADDIS: u32 = 15;
/// Opcode-19 group; `bcctr`/`bcctrl` is extended opcode 528 within it.
const OP_BCCTR: u32 = 19;
/// `ori rA, rS, UIMM`.
const OP_ORI: u32 = 24;
/// Opcode-31 group (X-form and friends), selected further by extended opcode.
const OP_X_FORM: u32 = 31;
/// `lwz rD, d(rA)`.
const OP_LWZ: u32 = 32;
/// `stw rS, d(rA)`.
const OP_STW: u32 = 36;
|
||||
|
||||
/// Run the full M5.5 analysis.
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
|
||||
pub fn analyze(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
func_analysis: &FuncAnalysis,
|
||||
vtables: &[Vtable],
|
||||
labels: &HashMap<u32, String>,
|
||||
) -> TypedIndirectResult {
|
||||
let started = std::time::Instant::now();
|
||||
|
||||
let vtable_addrs: BTreeSet<u32> = vtables.iter().map(|v| v.address).collect();
|
||||
let vtable_by_addr: BTreeMap<u32, &Vtable> =
|
||||
vtables.iter().map(|v| (v.address, v)).collect();
|
||||
|
||||
let block_boundaries: HashSet<u32> = labels.keys().copied().collect();
|
||||
|
||||
// Phase 1: scan for vptr writes.
|
||||
let vptr_writes = scan_vptr_writes(
|
||||
pe, image_base, func_analysis, &vtable_addrs, &block_boundaries,
|
||||
);
|
||||
|
||||
// Phase 2: invert by offset.
|
||||
let mut vtables_by_offset: HashMap<u32, HashSet<u32>> = HashMap::new();
|
||||
for w in &vptr_writes {
|
||||
vtables_by_offset.entry(w.vptr_offset).or_default().insert(w.vtable_addr);
|
||||
}
|
||||
|
||||
// Phase 3 + 4: scan dispatches and emit edges.
|
||||
let dispatches = scan_dispatches_and_resolve(
|
||||
pe, image_base, func_analysis, &block_boundaries,
|
||||
&vtables_by_offset, &vtable_by_addr,
|
||||
);
|
||||
|
||||
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||
let single_candidate = dispatches.iter().filter(|d| d.candidate_vtables.len() == 1).count();
|
||||
let multi_candidate = dispatches.len() - single_candidate;
|
||||
let total_edges: usize = dispatches.iter().map(|d| d.method_pcs.len()).sum();
|
||||
metrics::histogram!("analysis.phase_ms", "phase" => "ind_dispatch_typed").record(elapsed_ms);
|
||||
tracing::info!(
|
||||
vptr_writes = vptr_writes.len(),
|
||||
offsets = vtables_by_offset.len(),
|
||||
dispatches = dispatches.len(),
|
||||
single = single_candidate,
|
||||
multi = multi_candidate,
|
||||
edges = total_edges,
|
||||
elapsed_ms,
|
||||
"M5.5 typed indirect-dispatch scan complete",
|
||||
);
|
||||
|
||||
TypedIndirectResult { dispatches, vptr_writes }
|
||||
}
|
||||
|
||||
/// Read the big-endian 32-bit instruction word at virtual address `addr`.
///
/// `pe` is indexed by `addr - image_base`. Returns `None` when `addr` lies
/// below `image_base` or when fewer than four bytes are available at that
/// offset.
fn read_instr(pe: &[u8], image_base: u32, addr: u32) -> Option<u32> {
    // Checked arithmetic + `get` so an out-of-image address can neither wrap
    // to a bogus offset nor overflow the end computation on 32-bit hosts.
    let off = addr.checked_sub(image_base)? as usize;
    let bytes = pe.get(off..off.checked_add(4)?)?;
    Some(u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]))
}
|
||||
|
||||
/// Phase 1 — find every `stw rA, off(rB)` where the lis+addi-tracked
|
||||
/// value of `rA` equals a known vtable address.
|
||||
fn scan_vptr_writes(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
func_analysis: &FuncAnalysis,
|
||||
vtable_addrs: &BTreeSet<u32>,
|
||||
block_boundaries: &HashSet<u32>,
|
||||
) -> Vec<VptrWrite> {
|
||||
let mut writes: Vec<VptrWrite> = Vec::new();
|
||||
for (&fn_start, fi) in &func_analysis.functions {
|
||||
if fi.is_saverestore { continue; }
|
||||
let mut reg: [Option<u32>; 32] = [None; 32];
|
||||
let mut pc = fn_start;
|
||||
while pc < fi.end {
|
||||
if pc != fn_start && block_boundaries.contains(&pc) {
|
||||
reg = [None; 32];
|
||||
}
|
||||
let Some(instr) = read_instr(pe, image_base, pc) else { break };
|
||||
let op = instr >> 26;
|
||||
let rd = ((instr >> 21) & 0x1F) as usize;
|
||||
let ra = ((instr >> 16) & 0x1F) as usize;
|
||||
let simm = ((instr & 0xFFFF) as i16) as i32;
|
||||
let uimm = instr & 0xFFFF;
|
||||
match op {
|
||||
OP_ADDIS if ra == 0 => reg[rd] = Some(uimm << 16),
|
||||
OP_ADDIS => {
|
||||
reg[rd] = reg[ra].map(|b| b.wrapping_add(uimm << 16));
|
||||
}
|
||||
OP_ADDI if ra != 0 => {
|
||||
reg[rd] = reg[ra].map(|b| b.wrapping_add(simm as u32));
|
||||
}
|
||||
OP_ADDI => reg[rd] = Some(simm as u32),
|
||||
OP_ORI => {
|
||||
let rs = rd;
|
||||
reg[ra] = reg[rs].map(|b| b | uimm);
|
||||
}
|
||||
OP_STW => {
|
||||
// `stw rS, off(rA)` — rS in bits 21..25, rA in 16..20.
|
||||
if ra != 0
|
||||
&& let Some(vtable_addr) = reg[rd]
|
||||
&& vtable_addrs.contains(&vtable_addr)
|
||||
{
|
||||
// The vptr offset is the displacement; rB's value
|
||||
// is irrelevant for class-membership inference.
|
||||
writes.push(VptrWrite {
|
||||
vtable_addr,
|
||||
vptr_offset: simm as u32,
|
||||
writer_pc: pc,
|
||||
writer_function: fn_start,
|
||||
});
|
||||
}
|
||||
// stw doesn't write to rD.
|
||||
}
|
||||
OP_LWZ => reg[rd] = None,
|
||||
32..=35 | 40..=43 | 48..=51 => reg[rd] = None,
|
||||
OP_X_FORM => {
|
||||
let xo = (instr >> 1) & 0x3FF;
|
||||
if xo != 444 && xo != 467 { reg[rd] = None; }
|
||||
}
|
||||
18 => {
|
||||
// `bl` (LK=1) clobbers volatile r0..r12 + ctr. Plain
|
||||
// `b` makes the next instruction unreachable; the
|
||||
// label-based reset handles join points.
|
||||
if (instr & 1) != 0 {
|
||||
for r in 0..=12 { reg[r] = None; }
|
||||
}
|
||||
}
|
||||
16 => {
|
||||
if (instr & 1) != 0 {
|
||||
for r in 0..=12 { reg[r] = None; }
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
pc = pc.wrapping_add(4);
|
||||
}
|
||||
}
|
||||
writes
|
||||
}
|
||||
|
||||
/// Phase 3 + 4 — scan every `bcctrl`/`bctr` instruction; for each, walk
|
||||
/// backward up to 16 instructions to find the canonical
|
||||
/// `lwz vt, vptr_off(this); lwz fn, slot(vt); mtctr fn; bcctrl` sequence.
|
||||
/// Emit one `TypedDispatch` per dispatch site that resolves to ≥ 1
|
||||
/// candidate vtable.
|
||||
fn scan_dispatches_and_resolve(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
func_analysis: &FuncAnalysis,
|
||||
block_boundaries: &HashSet<u32>,
|
||||
vtables_by_offset: &HashMap<u32, HashSet<u32>>,
|
||||
vtable_by_addr: &BTreeMap<u32, &Vtable>,
|
||||
) -> Vec<TypedDispatch> {
|
||||
let mut out: Vec<TypedDispatch> = Vec::new();
|
||||
for (&fn_start, fi) in &func_analysis.functions {
|
||||
if fi.is_saverestore { continue; }
|
||||
let mut pc = fn_start;
|
||||
while pc < fi.end {
|
||||
let Some(instr) = read_instr(pe, image_base, pc) else { break };
|
||||
let op = instr >> 26;
|
||||
if op == OP_BCCTR {
|
||||
let xo = (instr >> 1) & 0x3FF;
|
||||
let lk = (instr & 1) != 0;
|
||||
if xo == 528 && lk
|
||||
&& let Some(d) = try_resolve_dispatch_site(
|
||||
pe, image_base, fn_start, fi.end, pc,
|
||||
block_boundaries, vtables_by_offset, vtable_by_addr,
|
||||
)
|
||||
{
|
||||
out.push(d);
|
||||
}
|
||||
}
|
||||
pc = pc.wrapping_add(4);
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// Backwards scan from `bcctrl` at `pc` (looking back at most 16 instrs
|
||||
/// within the same basic block). Returns `Some(_)` only when the full
|
||||
/// `lwz vt, off(rA); lwz fn, slot(vt); mtctr fn` chain is present and the
|
||||
/// `(vptr_off, slot)` pair has at least one candidate vtable.
|
||||
fn try_resolve_dispatch_site(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
fn_start: u32,
|
||||
_fn_end: u32,
|
||||
bcctrl_pc: u32,
|
||||
block_boundaries: &HashSet<u32>,
|
||||
vtables_by_offset: &HashMap<u32, HashSet<u32>>,
|
||||
vtable_by_addr: &BTreeMap<u32, &Vtable>,
|
||||
) -> Option<TypedDispatch> {
|
||||
const LOOKBACK: u32 = 16;
|
||||
|
||||
// Walk back 1..LOOKBACK instrs to find `mtctr rFn`.
|
||||
let mut mtctr_rs: Option<usize> = None;
|
||||
let mut mtctr_pc: Option<u32> = None;
|
||||
for i in 1..=LOOKBACK {
|
||||
let p = bcctrl_pc.wrapping_sub(i * 4);
|
||||
if p < fn_start { break; }
|
||||
if block_boundaries.contains(&p) { break; }
|
||||
let Some(instr) = read_instr(pe, image_base, p) else { break };
|
||||
let op = instr >> 26;
|
||||
if op == OP_X_FORM {
|
||||
let xo = (instr >> 1) & 0x3FF;
|
||||
if xo == 467 {
|
||||
let spr = (((instr >> 11) & 0x1F) << 5) | ((instr >> 16) & 0x1F);
|
||||
if spr == 9 {
|
||||
mtctr_rs = Some(((instr >> 21) & 0x1F) as usize);
|
||||
mtctr_pc = Some(p);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let mtctr_rs = mtctr_rs?;
|
||||
let mtctr_pc = mtctr_pc?;
|
||||
|
||||
// Walk back from mtctr to find `lwz rFn, slot(rVt)` defining mtctr_rs.
|
||||
let mut slot: Option<u32> = None;
|
||||
let mut vt_reg: Option<usize> = None;
|
||||
let mut fn_lwz_pc: Option<u32> = None;
|
||||
for i in 1..=LOOKBACK {
|
||||
let p = mtctr_pc.wrapping_sub(i * 4);
|
||||
if p < fn_start { break; }
|
||||
if block_boundaries.contains(&p) { break; }
|
||||
let Some(instr) = read_instr(pe, image_base, p) else { break };
|
||||
let op = instr >> 26;
|
||||
let rd = ((instr >> 21) & 0x1F) as usize;
|
||||
if op == OP_LWZ {
|
||||
if rd == mtctr_rs {
|
||||
let ra = ((instr >> 16) & 0x1F) as usize;
|
||||
if ra == 0 { return None; }
|
||||
let off = ((instr & 0xFFFF) as i16) as i32;
|
||||
if off < 0 || (off % 4) != 0 { return None; }
|
||||
slot = Some((off as u32) / 4);
|
||||
vt_reg = Some(ra);
|
||||
fn_lwz_pc = Some(p);
|
||||
break;
|
||||
}
|
||||
// Other lwz; if it writes our target reg, it's a clobber, but
|
||||
// the loop already keys on the lwz that produces the value, so
|
||||
// no clobber check needed beyond seeing rd == mtctr_rs.
|
||||
} else if writes_reg(instr, mtctr_rs as u32) {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
let slot = slot?;
|
||||
let vt_reg = vt_reg?;
|
||||
let fn_lwz_pc = fn_lwz_pc?;
|
||||
|
||||
// Walk back from the fn-lwz to find `lwz rVt, vptr_off(rThis)` defining vt_reg.
|
||||
let mut vptr_off: Option<u32> = None;
|
||||
for i in 1..=LOOKBACK {
|
||||
let p = fn_lwz_pc.wrapping_sub(i * 4);
|
||||
if p < fn_start { break; }
|
||||
if block_boundaries.contains(&p) { break; }
|
||||
let Some(instr) = read_instr(pe, image_base, p) else { break };
|
||||
let op = instr >> 26;
|
||||
let rd = ((instr >> 21) & 0x1F) as usize;
|
||||
if op == OP_LWZ && rd == vt_reg {
|
||||
let ra = ((instr >> 16) & 0x1F) as usize;
|
||||
if ra == 0 { return None; }
|
||||
let off = ((instr & 0xFFFF) as i16) as i32;
|
||||
// Negative offsets are valid in C++ (multiple inheritance casts
|
||||
// can produce them in some ABIs); reinterpret as u32 wrap.
|
||||
vptr_off = Some(off as u32);
|
||||
break;
|
||||
}
|
||||
if writes_reg(instr, vt_reg as u32) {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
let vptr_off = vptr_off?;
|
||||
|
||||
// Phase 4 — resolve to candidate vtables.
|
||||
let candidates = vtables_by_offset.get(&vptr_off)?;
|
||||
let mut candidate_vtables: Vec<u32> = Vec::new();
|
||||
let mut method_pcs: Vec<u32> = Vec::new();
|
||||
for &vt_addr in candidates {
|
||||
if let Some(vt) = vtable_by_addr.get(&vt_addr)
|
||||
&& vt.length > slot
|
||||
&& let Some(&method_pc) = vt.methods.get(slot as usize)
|
||||
{
|
||||
candidate_vtables.push(vt_addr);
|
||||
method_pcs.push(method_pc);
|
||||
}
|
||||
}
|
||||
if method_pcs.is_empty() { return None; }
|
||||
|
||||
Some(TypedDispatch {
|
||||
dispatch_pc: bcctrl_pc,
|
||||
vptr_offset: vptr_off,
|
||||
slot,
|
||||
candidate_vtables,
|
||||
method_pcs,
|
||||
})
|
||||
}
|
||||
|
||||
/// Conservative "does this instruction write to general-purpose register
/// `r`" predicate, used to detect clobbers between a value-producing `lwz`
/// and its consumer.
///
/// The answer errs on the side of `true`: D-form stores, floating-point
/// loads and opcode-31 instructions without a GPR destination (compares,
/// indexed stores, `mtspr`) report a write to whatever register number
/// occupies bits 21..25. Real writers are recognised as follows:
///
/// * D-form arithmetic and loads write rD (bits 21..25);
/// * logical-immediate, rotate and opcode-31 logical/shift/extend forms
///   write rA (bits 16..20);
/// * update-form loads and stores additionally write their base rA;
/// * `lmw rD, ...` writes every register from rD through r31.
///
/// Opcodes outside these groups return `false`.
fn writes_reg(instr: u32, r: u32) -> bool {
    // Primary opcode 31 (the value of the file-level `OP_X_FORM`), kept local
    // so the predicate is self-contained.
    const X_FORM: u32 = 31;

    let op = instr >> 26;
    let rd = (instr >> 21) & 0x1F;
    let ra = (instr >> 16) & 0x1F;
    match op {
        // mulli, subfic, addic, addic., addi, addis → rD.
        7 | 8 | 12 | 13 | 14 | 15 => rd == r,
        // rlwimi, rlwinm, rlwnm, ori..andis., rld* → rA.
        20 | 21 | 23 | 24..=30 => ra == r,
        // Non-update D-form loads (and, conservatively, stores / FP loads).
        32 | 34 | 36 | 38 | 40 | 42 | 48 | 50 => rd == r,
        // Update forms also write the effective address back to rA.
        33 | 35 | 37 | 39 | 41 | 43 | 49 | 51 => rd == r || ra == r,
        // sthu, stfsu, stfdu: only the rA write-back touches a GPR.
        45 | 53 | 55 => ra == r,
        // lmw loads rD..=r31.
        46 => rd <= r,
        // DS-form loads ld / ldu / lwa: rD, plus rA for ldu (low bits == 1).
        58 => rd == r || ((instr & 3) == 1 && ra == r),
        // DS-form stores: only stdu (low bits == 1) writes a GPR (rA).
        62 => (instr & 3) == 1 && ra == r,
        X_FORM => {
            let xo = (instr >> 1) & 0x3FF;
            match xo {
                // Logical / shift / count / extend ops write rA, not rD:
                // slw, cntlzw, sld, and, cntlzd, andc, nor, eqv, xor, orc,
                // or, nand, srw, srd, sraw, srad, srawi, sradi, extsh,
                // extsb, extsw.
                24 | 26 | 27 | 28 | 58 | 60 | 124 | 284 | 316 | 412 | 444 | 476 | 536 | 539
                | 792 | 794 | 824 | 826 | 827 | 922 | 954 | 986 => ra == r,
                // Indexed update-form loads/stores write rA as well.
                53 | 55 | 119 | 181 | 183 | 247 | 311 | 373 | 375 | 439 | 567 | 631 | 695
                | 759 => rd == r || ra == r,
                // Everything else: assume bits 21..25 name the destination.
                _ => rd == r,
            }
        }
        _ => false,
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::func::FuncInfo;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
fn mk_vtable(addr: u32, methods: Vec<u32>) -> Vtable {
|
||||
Vtable {
|
||||
address: addr,
|
||||
length: methods.len() as u32,
|
||||
col_address: None,
|
||||
class_name: format!("ANON_{addr:08X}"),
|
||||
rtti_present: false,
|
||||
base_classes_json: None,
|
||||
methods,
|
||||
}
|
||||
}
|
||||
|
||||
fn mk_func_analysis(start: u32, len: u32) -> FuncAnalysis {
|
||||
let mut functions: BTreeMap<u32, FuncInfo> = BTreeMap::new();
|
||||
functions.insert(start, FuncInfo {
|
||||
start,
|
||||
end: start + len,
|
||||
frame_size: 0,
|
||||
saved_gprs: 0,
|
||||
is_leaf: false,
|
||||
is_saverestore: false,
|
||||
pdata_validated: false,
|
||||
pdata_length: None,
|
||||
has_eh: false,
|
||||
});
|
||||
FuncAnalysis { functions, save_gpr_base: None, restore_gpr_base: None, pdata_entries: Vec::new() }
|
||||
}
|
||||
|
||||
fn write_be(pe: &mut [u8], at: usize, v: u32) {
|
||||
pe[at..at + 4].copy_from_slice(&v.to_be_bytes());
|
||||
}
|
||||
|
||||
/// Encode a vptr-write site: `lis rN, hi(vt); addi rN, rN, lo(vt); stw rN, off(rOther)`.
|
||||
fn enc_vptr_write(pe: &mut [u8], at: usize, vt: u32, write_off: i16, dest_reg: u32) {
|
||||
let hi = (vt >> 16) as u16;
|
||||
let lo = (vt & 0xFFFF) as i16;
|
||||
let lis = (15u32 << 26) | (3 << 21) | 0 << 16 | (hi as u32);
|
||||
let addi = (14u32 << 26) | (3 << 21) | (3 << 16) | ((lo as u16) as u32);
|
||||
let stw = (36u32 << 26) | (3 << 21) | (dest_reg << 16) | ((write_off as u16) as u32);
|
||||
write_be(pe, at, lis);
|
||||
write_be(pe, at + 4, addi);
|
||||
write_be(pe, at + 8, stw);
|
||||
}
|
||||
|
||||
/// Encode a dispatch site:
|
||||
/// lwz r4, vptr_off(r3) ; r4 = this->vptr
|
||||
/// lwz r5, slot*4(r4) ; r5 = vptr[slot]
|
||||
/// mtctr r5
|
||||
/// bcctrl
|
||||
fn enc_dispatch(pe: &mut [u8], at: usize, vptr_off: i16, slot: u32) {
|
||||
let lwz_vt = (32u32 << 26) | (4 << 21) | (3 << 16) | ((vptr_off as u16) as u32);
|
||||
let lwz_fn = (32u32 << 26) | (5 << 21) | (4 << 16) | ((slot * 4) & 0xFFFF);
|
||||
// mtctr r5 = mtspr CTR(=9), r5: SPR_low (=9) → bits 16..20.
|
||||
let mtctr = (31u32 << 26) | (5 << 21) | (9 << 16) | (467 << 1);
|
||||
let bcctrl = (19u32 << 26) | (20 << 21) | (528 << 1) | 1;
|
||||
write_be(pe, at, lwz_vt);
|
||||
write_be(pe, at + 4, lwz_fn);
|
||||
write_be(pe, at + 8, mtctr);
|
||||
write_be(pe, at + 12, bcctrl);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn single_candidate_vtable_resolves_to_one_method() {
|
||||
let image_base = 0x82000000u32;
|
||||
let mut pe = vec![0u8; 0x4000];
|
||||
|
||||
// Function A — constructor — at 0x82001000. Writes vt=0x82010000 at off=0.
|
||||
let ctor_pc = 0x82001000u32;
|
||||
enc_vptr_write(&mut pe, (ctor_pc - image_base) as usize, 0x82010000, 0, 31);
|
||||
|
||||
// Function B — dispatcher — at 0x82002000. Calls slot 2 of vptr at off 0.
|
||||
let disp_pc = 0x82002000u32;
|
||||
enc_dispatch(&mut pe, (disp_pc - image_base) as usize, 0, 2);
|
||||
let bcctrl_pc = disp_pc + 12;
|
||||
|
||||
// Both functions in func_analysis (synthesise).
|
||||
let mut fa = mk_func_analysis(ctor_pc, 0x40);
|
||||
fa.functions.insert(disp_pc, FuncInfo {
|
||||
start: disp_pc, end: disp_pc + 0x40, frame_size: 0, saved_gprs: 0,
|
||||
is_leaf: false, is_saverestore: false,
|
||||
pdata_validated: false, pdata_length: None, has_eh: false,
|
||||
});
|
||||
|
||||
let vt = mk_vtable(0x82010000, vec![0xAA, 0xBB, 0xCC, 0xDD]);
|
||||
let labels: HashMap<u32, String> = HashMap::new();
|
||||
let r = analyze(&pe, image_base, &fa, &[vt], &labels);
|
||||
|
||||
assert_eq!(r.vptr_writes.len(), 1);
|
||||
assert_eq!(r.vptr_writes[0].vtable_addr, 0x82010000);
|
||||
assert_eq!(r.vptr_writes[0].vptr_offset, 0);
|
||||
|
||||
assert_eq!(r.dispatches.len(), 1);
|
||||
let d = &r.dispatches[0];
|
||||
assert_eq!(d.dispatch_pc, bcctrl_pc);
|
||||
assert_eq!(d.vptr_offset, 0);
|
||||
assert_eq!(d.slot, 2);
|
||||
assert_eq!(d.method_pcs, vec![0xCC]);
|
||||
assert_eq!(d.candidate_vtables, vec![0x82010000]);
|
||||
}
|
||||
|
||||
/// Two constructors install different vtables at the same offset; a single
/// dispatch through that offset must list both vtables as candidates and
/// include the slot-1 method of each.
#[test]
fn multi_candidate_emits_one_edge_per_match() {
    let image_base = 0x82000000u32;
    let ctor_a = 0x82001000u32;
    let ctor_b = 0x82001100u32;
    let disp = 0x82002000u32;

    let mut image = vec![0u8; 0x4000];
    // Each constructor writes its own vtable at object offset 0.
    enc_vptr_write(&mut image, (ctor_a - image_base) as usize, 0x82010000, 0, 31);
    enc_vptr_write(&mut image, (ctor_b - image_base) as usize, 0x82010040, 0, 31);
    // A single dispatch site calling slot 1.
    enc_dispatch(&mut image, (disp - image_base) as usize, 0, 1);

    // Minimal 0x40-byte function record starting at `start`.
    let stub = |start: u32| FuncInfo {
        start,
        end: start + 0x40,
        frame_size: 0,
        saved_gprs: 0,
        is_leaf: false,
        is_saverestore: false,
        pdata_validated: false,
        pdata_length: None,
        has_eh: false,
    };
    let mut funcs = mk_func_analysis(ctor_a, 0x40);
    funcs.functions.insert(ctor_b, stub(ctor_b));
    funcs.functions.insert(disp, stub(disp));

    let vtables = vec![
        mk_vtable(0x82010000, vec![0x11, 0x22, 0x33, 0x44]),
        mk_vtable(0x82010040, vec![0x55, 0x66, 0x77, 0x88]),
    ];
    let labels: HashMap<u32, String> = HashMap::new();
    let result = analyze(&image, image_base, &funcs, &vtables, &labels);

    assert_eq!(result.vptr_writes.len(), 2);
    assert_eq!(result.dispatches.len(), 1);

    let dispatch = &result.dispatches[0];
    assert_eq!(dispatch.candidate_vtables.len(), 2);
    // Slot 1 of each candidate vtable.
    assert!(dispatch.method_pcs.contains(&0x22));
    assert!(dispatch.method_pcs.contains(&0x66));
}
|
||||
|
||||
/// A dispatch through a slot index beyond the end of the only candidate
/// vtable must not be reported.
#[test]
fn out_of_bounds_slot_yields_no_dispatch() {
    let image_base = 0x82000000u32;
    let ctor = 0x82001000u32;
    let disp = 0x82002000u32;

    let mut image = vec![0u8; 0x4000];
    enc_vptr_write(&mut image, (ctor - image_base) as usize, 0x82010000, 0, 31);
    // Slot 10 is requested, but the vtable below only has 4 methods.
    enc_dispatch(&mut image, (disp - image_base) as usize, 0, 10);

    let mut funcs = mk_func_analysis(ctor, 0x40);
    funcs.functions.insert(
        disp,
        FuncInfo {
            start: disp,
            end: disp + 0x40,
            frame_size: 0,
            saved_gprs: 0,
            is_leaf: false,
            is_saverestore: false,
            pdata_validated: false,
            pdata_length: None,
            has_eh: false,
        },
    );

    let vtables = [mk_vtable(0x82010000, vec![0x11, 0x22, 0x33, 0x44])];
    let labels: HashMap<u32, String> = HashMap::new();
    let result = analyze(&image, image_base, &funcs, &vtables, &labels);

    assert_eq!(result.dispatches.len(), 0);
}
|
||||
|
||||
/// A dispatch that reads its vptr from an object offset nobody writes a
/// vtable to has no candidates and must not be reported.
#[test]
fn no_writer_at_offset_yields_no_dispatch() {
    let image_base = 0x82000000u32;
    let ctor = 0x82001000u32;
    let disp = 0x82002000u32;

    let mut image = vec![0u8; 0x4000];
    // The constructor installs its vtable at offset 0 ...
    enc_vptr_write(&mut image, (ctor - image_base) as usize, 0x82010000, 0, 31);
    // ... while the dispatcher reads the vptr from offset 8.
    enc_dispatch(&mut image, (disp - image_base) as usize, 8, 1);

    let mut funcs = mk_func_analysis(ctor, 0x40);
    funcs.functions.insert(
        disp,
        FuncInfo {
            start: disp,
            end: disp + 0x40,
            frame_size: 0,
            saved_gprs: 0,
            is_leaf: false,
            is_saverestore: false,
            pdata_validated: false,
            pdata_length: None,
            has_eh: false,
        },
    );

    let vtables = [mk_vtable(0x82010000, vec![0x11, 0x22, 0x33, 0x44])];
    let labels: HashMap<u32, String> = HashMap::new();
    let result = analyze(&image, image_base, &funcs, &vtables, &labels);

    assert_eq!(result.dispatches.len(), 0);
}
|
||||
}
|
||||
471
crates/xenia-analysis/src/indirect.rs
Normal file
471
crates/xenia-analysis/src/indirect.rs
Normal file
@@ -0,0 +1,471 @@
|
||||
//! Indirect-dispatch reachability for vtable-bound `bcctrl`/`bctrl` sites.
|
||||
//!
|
||||
//! Walks each detected function with a tiny per-basic-block register tracker,
|
||||
//! recognising the canonical MSVC PowerPC pattern that loads a slot from a
|
||||
//! statically-addressed vtable into CTR and indirectly calls it:
|
||||
//!
|
||||
//! ```text
|
||||
//! lis rA, hi
|
||||
//! addi rA, rA, lo ; rA = vtable_address
|
||||
//! lwz rB, slot*4(rA) ; rB = vtable[slot]
|
||||
//! mtctr rB ; CTR = vtable[slot]
|
||||
//! bcctrl ; indirect call → vtable[slot]
|
||||
//! ```
|
||||
//!
|
||||
//! Pattern hits are emitted as [`IndirectEdge`] records (source PC, target PC,
//! vtable address and slot) that callers insert into the `xrefs` table with `kind='ind_call'`.
|
||||
//!
|
||||
//! ### What this does NOT cover
|
||||
//!
|
||||
//! - Vtable pointer loaded from a `this`-pointer field (`lwz rA, off(this)`)
|
||||
//! is the dominant pattern in real C++ code; resolving it requires
|
||||
//! alias / points-to analysis that's far beyond this layer's scope.
|
||||
//! - Indirect calls via function-pointer fields (callbacks) are similarly
|
||||
//! unresolvable without object-flow analysis.
|
||||
//! - Register state is intentionally killed at every label (basic-block
|
||||
//! boundary) — we don't try to do flow-sensitive merging across joins.
|
||||
//!
|
||||
//! Reference: IBM PowerPC ABI on register-save convention, plus the
|
||||
//! `xenia_analysis::xref` `lis+addi`/`lis+ori` tracker which we mirror
|
||||
//! conceptually.
|
||||
|
||||
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||
|
||||
use crate::func::FuncAnalysis;
|
||||
use crate::vtables::Vtable;
|
||||
|
||||
/// One detected indirect-call edge: the `bcctrl` at `source` jumps to `target`.
///
/// All fields are plain addresses/indices, so the type is `Copy`; it also
/// derives `Hash` so edges can be deduplicated in a `HashSet` or used as
/// map keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct IndirectEdge {
    /// Address of the `bcctrl` instruction performing the call.
    pub source: u32,
    /// Method address read from the resolved vtable slot (the call target).
    pub target: u32,
    /// Vtable the source resolved through.
    pub via_vtable: u32,
    /// Method slot index within the vtable.
    pub slot: u32,
}
|
||||
|
||||
/// Abstract value tracked per register (and for CTR) while scanning a
/// function. A register with no tracked value is represented by `None` at
/// the use site, not by a variant here.
#[derive(Clone, Copy, Debug)]
enum RegVal {
    /// A statically known 32-bit constant, e.g. the result of `lis` + `addi`.
    Const(u32),
    /// A method pointer that was loaded out of a known vtable slot.
    MethodPtr {
        /// Start address of the vtable the pointer came from.
        vtable_addr: u32,
        /// Slot index inside that vtable.
        slot: u32,
        /// The method address stored in the slot.
        method_pc: u32,
    },
}
|
||||
|
||||
// Primary opcodes (the top six bits of an instruction word), in numeric order.
const OP_ADDI: u32 = 14; // addi / li
const OP_ADDIS: u32 = 15; // addis / lis
const OP_BCCTR: u32 = 19; // shared with blr and friends; the XO field selects bcctr
const OP_ORI: u32 = 24;
const OP_X_FORM: u32 = 31; // mtspr / mr / etc.
const OP_LWZ: u32 = 32;
|
||||
|
||||
/// Run the static indirect-dispatch scan. Returns one edge per resolvable
|
||||
/// `bcctrl` site.
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
|
||||
pub fn analyze(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
func_analysis: &FuncAnalysis,
|
||||
vtables: &[Vtable],
|
||||
labels: &HashMap<u32, String>,
|
||||
) -> Vec<IndirectEdge> {
|
||||
let started = std::time::Instant::now();
|
||||
// Index vtables by their start VA so the lwz handler can decide
|
||||
// whether a given Const(addr) is "really" a vtable.
|
||||
let vtable_by_addr: BTreeMap<u32, &Vtable> =
|
||||
vtables.iter().map(|v| (v.address, v)).collect();
|
||||
|
||||
// Set of all "label"-bearing PCs in the analyzed binary. We treat each
|
||||
// label as a basic-block boundary (anything `loc_*` is a jump target,
|
||||
// so register state arriving at it is unreliable).
|
||||
let mut block_boundaries: HashSet<u32> = HashSet::with_capacity(labels.len());
|
||||
for &addr in labels.keys() {
|
||||
block_boundaries.insert(addr);
|
||||
}
|
||||
|
||||
let mut edges: Vec<IndirectEdge> = Vec::new();
|
||||
|
||||
for (&fn_start, fi) in &func_analysis.functions {
|
||||
if fi.is_saverestore { continue; }
|
||||
let mut reg: [Option<RegVal>; 32] = [None; 32];
|
||||
let mut ctr: Option<RegVal> = None;
|
||||
let mut pc = fn_start;
|
||||
while pc < fi.end {
|
||||
// Reset register state on basic-block entry. We don't reset on
|
||||
// the function entry itself (PC == fn_start) because labels and
|
||||
// function-starts coincide; the initial state is already None.
|
||||
if pc != fn_start && block_boundaries.contains(&pc) {
|
||||
reg = [None; 32];
|
||||
ctr = None;
|
||||
}
|
||||
|
||||
let instr = match read_instr(pe, image_base, pc) {
|
||||
Some(i) => i,
|
||||
None => break,
|
||||
};
|
||||
|
||||
let op = instr >> 26;
|
||||
let rd = ((instr >> 21) & 0x1F) as usize;
|
||||
let ra = ((instr >> 16) & 0x1F) as usize;
|
||||
let simm = ((instr & 0xFFFF) as i16) as i32;
|
||||
let uimm = instr & 0xFFFF;
|
||||
|
||||
match op {
|
||||
// lis rD, IMM (== addis rD, r0, IMM)
|
||||
OP_ADDIS if ra == 0 => {
|
||||
reg[rd] = Some(RegVal::Const(uimm << 16));
|
||||
}
|
||||
// addis rD, rA, IMM
|
||||
OP_ADDIS => {
|
||||
if let Some(RegVal::Const(b)) = reg[ra] {
|
||||
reg[rd] = Some(RegVal::Const(b.wrapping_add(uimm << 16)));
|
||||
} else {
|
||||
reg[rd] = None;
|
||||
}
|
||||
}
|
||||
// addi rD, rA, IMM
|
||||
OP_ADDI if ra != 0 => {
|
||||
if let Some(RegVal::Const(b)) = reg[ra] {
|
||||
reg[rd] = Some(RegVal::Const(b.wrapping_add(simm as u32)));
|
||||
} else {
|
||||
reg[rd] = None;
|
||||
}
|
||||
}
|
||||
// li rD, IMM (== addi rD, 0, IMM)
|
||||
OP_ADDI => {
|
||||
reg[rd] = Some(RegVal::Const(simm as u32));
|
||||
}
|
||||
// ori rA, rS, IMM — note operand order: bits 21..25 = rS, 16..20 = rA
|
||||
OP_ORI => {
|
||||
let rs = rd; // bits 21..25 = source
|
||||
if let Some(RegVal::Const(b)) = reg[rs] {
|
||||
reg[ra] = Some(RegVal::Const(b | uimm));
|
||||
} else {
|
||||
reg[ra] = None;
|
||||
}
|
||||
}
|
||||
// lwz rD, off(rA) — try to resolve as vtable slot load.
|
||||
OP_LWZ => {
|
||||
if ra != 0
|
||||
&& let Some(RegVal::Const(base)) = reg[ra]
|
||||
{
|
||||
let target = base.wrapping_add(simm as u32);
|
||||
// Two-step lookup so we accept both:
|
||||
// (a) base = exact vtable head, simm/4 = slot
|
||||
// (b) base + simm = exact vtable head (rare;
|
||||
// compiler hoists the slot offset into addi)
|
||||
let resolved = resolve_vtable_slot(target, &vtable_by_addr)
|
||||
.or_else(|| resolve_vtable_slot_via_off(base, simm, &vtable_by_addr));
|
||||
reg[rd] = resolved.map(|(vt, slot, pc)| RegVal::MethodPtr {
|
||||
vtable_addr: vt, slot, method_pc: pc,
|
||||
});
|
||||
} else {
|
||||
reg[rd] = None;
|
||||
}
|
||||
}
|
||||
// X-form: mtspr/mtctr, bcctrl, mr, etc.
|
||||
OP_X_FORM => {
|
||||
let xo = (instr >> 1) & 0x3FF;
|
||||
match xo {
|
||||
467 => {
|
||||
// mtspr SPR, rS — PPC SPR field is split: high 5 bits
|
||||
// in PPC bits 16:20 (= Rust bits 11..15), low 5 bits
|
||||
// in PPC bits 11:15 (= Rust bits 16..20). Mirrors
|
||||
// the convention in `func.rs::is_mfspr_lr`.
|
||||
let spr = (((instr >> 11) & 0x1F) << 5) | ((instr >> 16) & 0x1F);
|
||||
if spr == 9 {
|
||||
ctr = reg[rd];
|
||||
}
|
||||
// Otherwise no observable effect on tracked state.
|
||||
}
|
||||
// Anything that writes rD (most arithmetic, loads, etc.) clobbers it.
|
||||
// Conservative: invalidate rD on any X-form that has rD in bits 21..25
|
||||
// and is NOT a comparison or branch.
|
||||
_ => {
|
||||
// Heuristic: most X-form ops with non-zero RC encode rD; we
|
||||
// invalidate to avoid stale Const propagation past arithmetic.
|
||||
// This is over-eager but safe (false negatives on edges, never
|
||||
// false positives).
|
||||
reg[rd] = None;
|
||||
}
|
||||
}
|
||||
}
|
||||
// bcctr/bcctrl — opcode 19, XO=528. LK in low bit.
|
||||
OP_BCCTR => {
|
||||
let xo = (instr >> 1) & 0x3FF;
|
||||
if xo == 528 {
|
||||
let lk = (instr & 1) != 0;
|
||||
if lk
|
||||
&& let Some(RegVal::MethodPtr { vtable_addr, slot, method_pc }) = ctr
|
||||
{
|
||||
edges.push(IndirectEdge {
|
||||
source: pc,
|
||||
target: method_pc,
|
||||
via_vtable: vtable_addr,
|
||||
slot,
|
||||
});
|
||||
}
|
||||
// After the call, CTR is preserved but rD register
|
||||
// values across the call boundary are not trustworthy.
|
||||
// Don't touch reg state — most ABIs preserve only
|
||||
// some regs anyway.
|
||||
}
|
||||
}
|
||||
// op 18: b / bl / ba / bla. LK=1 is a call; LK=0 is an
|
||||
// unconditional branch with no fall-through (next PC is
|
||||
// reached only via a different basic block, which the
|
||||
// label-based reset already handles). On a call, the
|
||||
// PowerPC ABI marks r0..r12 + ctr as volatile and
|
||||
// r13..r31 as non-volatile (callee-saved); preserve the
|
||||
// non-volatile half so vtable pointers loaded into r30/r31
|
||||
// before a `bl` survive the call.
|
||||
18 => {
|
||||
let lk = (instr & 1) != 0;
|
||||
if lk {
|
||||
for r in 0..=12 { reg[r] = None; }
|
||||
ctr = None;
|
||||
}
|
||||
// LK=0 (`b`) makes fall-through unreachable; nothing to do —
|
||||
// any next reachable PC will hit a label boundary.
|
||||
}
|
||||
// Conditional branches (op 16) fall through; preserve all reg
|
||||
// state for the fall-through path. The label-based join-point
|
||||
// invalidation bounds false-positive risk for jump-IN paths.
|
||||
16 => {
|
||||
let lk = (instr & 1) != 0;
|
||||
if lk {
|
||||
for r in 0..=12 { reg[r] = None; }
|
||||
ctr = None;
|
||||
}
|
||||
}
|
||||
// Stores and loads we don't track explicitly clobber rD only
|
||||
// when rD is on the destination side; the conservative rule
|
||||
// is "any non-recognised opcode that may write rD invalidates it".
|
||||
36..=55 => {
|
||||
// Loads write rD; stores don't. The safe pessimisation is
|
||||
// to invalidate rD for the load family (32..=35, 40..=43, etc.)
|
||||
// and leave it alone for stores. We've already handled lwz
|
||||
// above; for the rest, invalidate rD.
|
||||
if matches!(op, 32..=35 | 40..=43 | 48..=51) {
|
||||
reg[rd] = None;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
pc = pc.wrapping_add(4);
|
||||
}
|
||||
}
|
||||
|
||||
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||
metrics::histogram!("analysis.phase_ms", "phase" => "indirect").record(elapsed_ms);
|
||||
tracing::info!(
|
||||
edges = edges.len(),
|
||||
elapsed_ms,
|
||||
"indirect-dispatch scan complete"
|
||||
);
|
||||
edges
|
||||
}
|
||||
|
||||
/// Read the big-endian 32-bit instruction word at virtual address `addr`.
///
/// `pe` is the image whose byte 0 corresponds to `image_base`. Returns
/// `None` when `addr` lies below `image_base` or when fewer than four bytes
/// are available at that offset. All arithmetic is checked, so the function
/// cannot panic or wrap regardless of the target's pointer width.
fn read_instr(pe: &[u8], image_base: u32, addr: u32) -> Option<u32> {
    let off = addr.checked_sub(image_base)? as usize;
    let end = off.checked_add(4)?;
    let word = pe.get(off..end)?;
    Some(u32::from_be_bytes([word[0], word[1], word[2], word[3]]))
}
|
||||
|
||||
/// `target = base + simm` where `target` is an exact vtable head (rare,
|
||||
/// compiler hoists the slot offset into the addi).
|
||||
fn resolve_vtable_slot_via_off(
|
||||
base: u32,
|
||||
simm: i32,
|
||||
vtable_by_addr: &BTreeMap<u32, &Vtable>,
|
||||
) -> Option<(u32, u32, u32)> {
|
||||
let target = base.wrapping_add(simm as u32);
|
||||
if let Some(v) = vtable_by_addr.get(&target)
|
||||
&& !v.methods.is_empty()
|
||||
{
|
||||
return Some((v.address, 0, v.methods[0]));
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// `target` is an absolute address. If it falls inside a known vtable's
|
||||
/// `[address, address + length*4)` range AND is 4-aligned to a slot,
|
||||
/// return `(vtable_addr, slot, method_pc)`.
|
||||
fn resolve_vtable_slot(
|
||||
target: u32,
|
||||
vtable_by_addr: &BTreeMap<u32, &Vtable>,
|
||||
) -> Option<(u32, u32, u32)> {
|
||||
// BTreeMap range search for the largest key ≤ target.
|
||||
let (&vt_addr, vt) = vtable_by_addr.range(..=target).next_back()?;
|
||||
if target < vt_addr { return None; }
|
||||
let off = target - vt_addr;
|
||||
if !off.is_multiple_of(4) { return None; }
|
||||
let slot = off / 4;
|
||||
if slot >= vt.length { return None; }
|
||||
let method_pc = *vt.methods.get(slot as usize)?;
|
||||
Some((vt_addr, slot, method_pc))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::func::FuncInfo;
    use std::collections::BTreeMap;

    /// VA that byte 0 of the synthetic image maps to.
    const IMAGE_BASE: u32 = 0x82000000;
    /// Offset of the encoded pattern inside the synthetic image.
    const TEXT_VA: u32 = 0x1000;
    /// VA of the single synthetic function (= start of the pattern).
    const PC_START: u32 = IMAGE_BASE + TEXT_VA;
    /// Address of the vtable every test dispatches through.
    const VTABLE_ADDR: u32 = 0x82010000;

    /// Build an anonymous, RTTI-less vtable at `addr` holding `methods`.
    fn mk_vtable(addr: u32, methods: Vec<u32>) -> Vtable {
        Vtable {
            address: addr,
            length: methods.len() as u32,
            col_address: None,
            class_name: "ANON_test".into(),
            rtti_present: false,
            base_classes_json: None,
            methods,
        }
    }

    /// Encode the canonical five-instruction pattern at byte `offset`:
    ///   lis   r3, hi
    ///   addi  r3, r3, lo     ; r3 = vtable_addr
    ///   lwz   r4, slot_off(r3)
    ///   mtctr r4
    ///   bcctrl
    fn encode_pattern(buf: &mut [u8], offset: usize, vtable_addr: u32, slot_off: i32) {
        let hi = vtable_addr >> 16;
        let lo = vtable_addr & 0xFFFF;
        let disp = (slot_off as u16) as u32;
        let words = [
            // lis r3, hi
            (15u32 << 26) | (3 << 21) | hi,
            // addi r3, r3, lo
            (14u32 << 26) | (3 << 21) | (3 << 16) | lo,
            // lwz r4, slot_off(r3)
            (32u32 << 26) | (4 << 21) | (3 << 16) | disp,
            // mtctr r4 == mtspr 9, r4: low SPR half (9) in bits 16..20,
            // high half (0) in bits 11..15, Rc = 0.
            (31u32 << 26) | (4 << 21) | (9 << 16) | (467 << 1),
            // bcctrl 20, 0
            (19u32 << 26) | (20 << 21) | (528 << 1) | 1,
        ];
        let span = &mut buf[offset..offset + words.len() * 4];
        for (slot, word) in span.chunks_exact_mut(4).zip(words.iter()) {
            slot.copy_from_slice(&word.to_be_bytes());
        }
    }

    /// Encode the pattern with displacement `slot_off`, wrap it in a single
    /// five-instruction function, and scan it against one vtable holding
    /// `methods`.
    fn scan_pattern(
        slot_off: i32,
        methods: Vec<u32>,
        labels: &HashMap<u32, String>,
    ) -> Vec<IndirectEdge> {
        // Image: just the .text the pattern is written into.
        let mut pe = vec![0u8; 0x1100];
        encode_pattern(&mut pe, TEXT_VA as usize, VTABLE_ADDR, slot_off);

        let mut functions: BTreeMap<u32, FuncInfo> = BTreeMap::new();
        functions.insert(
            PC_START,
            FuncInfo {
                start: PC_START,
                end: PC_START + 5 * 4,
                frame_size: 0,
                saved_gprs: 0,
                is_leaf: false,
                is_saverestore: false,
                pdata_validated: false,
                pdata_length: None,
                has_eh: false,
            },
        );
        let func_analysis = FuncAnalysis {
            functions,
            save_gpr_base: None,
            restore_gpr_base: None,
            pdata_entries: Vec::new(),
        };

        let vtables = vec![mk_vtable(VTABLE_ADDR, methods)];
        analyze(&pe, IMAGE_BASE, &func_analysis, &vtables, labels)
    }

    #[test]
    fn detects_canonical_lis_addi_lwz_mtctr_bcctrl() {
        let labels: HashMap<u32, String> = HashMap::new();
        // Displacement 8 == slot 2.
        let edges = scan_pattern(8, vec![0xAA, 0xBB, 0xCC, 0xDD], &labels);

        assert_eq!(edges.len(), 1);
        let edge = edges[0];
        assert_eq!(edge.source, PC_START + 4 * 4); // bcctrl is the 5th instruction
        assert_eq!(edge.target, 0xCC); // slot 2
        assert_eq!(edge.via_vtable, VTABLE_ADDR);
        assert_eq!(edge.slot, 2);
    }

    #[test]
    fn out_of_range_slot_yields_no_edge() {
        let labels: HashMap<u32, String> = HashMap::new();
        // Displacement 48 == slot 12, but the vtable only has 4 methods.
        let edges = scan_pattern(48, vec![0xAA, 0xBB, 0xCC, 0xDD], &labels);
        assert_eq!(edges.len(), 0);
    }

    #[test]
    fn label_in_middle_kills_state() {
        // Label between addi and lwz — must kill the Const tracking.
        let mut labels: HashMap<u32, String> = HashMap::new();
        labels.insert(PC_START + 8, "loc_mid".to_string());

        let edges = scan_pattern(0, vec![0xAA, 0xBB], &labels);
        assert_eq!(edges.len(), 0, "label in middle of pattern must kill register state");
    }
}
|
||||
@@ -2,9 +2,22 @@ pub mod ppc;
|
||||
pub mod func;
|
||||
pub mod xref;
|
||||
pub mod db;
|
||||
pub mod disasm;
|
||||
pub mod formatter;
|
||||
pub mod sinks;
|
||||
pub mod sql_views;
|
||||
pub mod demangle;
|
||||
pub mod vtables;
|
||||
pub mod lookup;
|
||||
pub mod indirect;
|
||||
pub mod ind_dispatch_typed;
|
||||
pub mod strings;
|
||||
pub mod funcptr_arrays;
|
||||
pub mod eh_scope;
|
||||
pub mod static_init;
|
||||
|
||||
mod ordinals;
|
||||
pub use ordinals::resolve_ordinal;
|
||||
pub use xref::{XrefKind, Xref, XrefMap, resolve_source_label};
|
||||
pub use db::{DbWriter, ExecTraceEntry, ImportCallEntry, BranchTraceEntry};
|
||||
pub use disasm::{RichDisasmItem, enrich_section};
|
||||
|
||||
222
crates/xenia-analysis/src/lookup.rs
Normal file
222
crates/xenia-analysis/src/lookup.rs
Normal file
@@ -0,0 +1,222 @@
|
||||
//! Symbolic-name resolution for runtime probes (M4).
|
||||
//!
|
||||
//! Lets `--pc-probe` / `--branch-probe` / `--ctor-probe` accept names like
|
||||
//! `xe::apu::AudioSystem::Setup` or `MyClass::*` instead of bare PC literals.
|
||||
//! Resolution joins the M3-produced `classes` × `methods` × `functions` tables
|
||||
//! and the M2 `demangled_names` table.
|
||||
//!
|
||||
//! Numeric tokens (`0x824D6640`, `2186674160`) are returned unchanged; symbolic
|
||||
//! tokens require a path to an existing `sylpheed.db` (passed by the caller).
|
||||
//!
|
||||
//! All DB access is read-only and happens before guest execution, so the
|
||||
//! lockstep digest is unaffected.
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use duckdb::params;
|
||||
|
||||
/// Parse one probe token into one or more PCs.
|
||||
///
|
||||
/// Recognized forms:
|
||||
/// - `0xADDR` / `ADDR` (decimal) → returns one PC unchanged.
|
||||
/// - `Class::method` → all `methods.function_address` matching that
|
||||
/// `class_name` + `method_name` pair.
|
||||
/// - `Class::*` → all `methods.function_address` for that class.
|
||||
/// - `func::Name` (free function) → falls back to `functions.name` lookup.
|
||||
///
|
||||
/// `db_path` is consulted ONLY if the token is non-numeric. When `db_path` is
|
||||
/// `None` and the token is symbolic, returns an error suggesting the user
|
||||
/// either pass `--db` or use a numeric address.
|
||||
pub fn resolve_probe_token(db_path: Option<&Path>, token: &str) -> Result<Vec<u32>> {
|
||||
let token = token.trim();
|
||||
if token.is_empty() {
|
||||
return Ok(vec![]);
|
||||
}
|
||||
|
||||
if let Some(pc) = parse_numeric(token) {
|
||||
return Ok(vec![pc]);
|
||||
}
|
||||
|
||||
let db = db_path.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"symbolic probe token {token:?} requires a sylpheed.db; \
|
||||
pass --probe-db=PATH or use a numeric 0x… address",
|
||||
)
|
||||
})?;
|
||||
|
||||
if !db.exists() {
|
||||
return Err(anyhow!("--probe-db not found: {}", db.display()));
|
||||
}
|
||||
|
||||
let conn = duckdb::Connection::open_with_flags(
|
||||
db,
|
||||
duckdb::Config::default().access_mode(duckdb::AccessMode::ReadOnly)?,
|
||||
)?;
|
||||
|
||||
// Class::method or Class::*
|
||||
if let Some((class, method)) = token.split_once("::") {
|
||||
if method == "*" {
|
||||
return resolve_class_star(&conn, class);
|
||||
}
|
||||
// Try Class::method first, then fall back to functions.name lookup.
|
||||
let pcs = resolve_class_method(&conn, class, method)?;
|
||||
if !pcs.is_empty() {
|
||||
return Ok(pcs);
|
||||
}
|
||||
}
|
||||
|
||||
// Last-resort: functions.name match (e.g. for `entry_point` or
|
||||
// `__savegprlr_22`). Substring-free; user gets a clear error if missing.
|
||||
resolve_function_name(&conn, token)
|
||||
}
|
||||
|
||||
/// Interpret `token` as a 32-bit address literal.
///
/// A `0x` / `0X` prefix selects hexadecimal; without one the token is parsed
/// as decimal. Returns `None` when the digits are missing, malformed, or do
/// not fit in a `u32`. A token that carries a hex prefix is never retried as
/// decimal.
fn parse_numeric(token: &str) -> Option<u32> {
    let hex_digits = token
        .strip_prefix("0x")
        .or_else(|| token.strip_prefix("0X"));
    match hex_digits {
        Some(digits) => u32::from_str_radix(digits, 16).ok(),
        None => token.parse().ok(),
    }
}
|
||||
|
||||
fn resolve_class_method(conn: &duckdb::Connection, class: &str, method: &str) -> Result<Vec<u32>> {
|
||||
// Two-step lookup so we can give better errors:
|
||||
// 1. find matching methods rows joined to classes;
|
||||
// 2. surface the function_address column.
|
||||
let mut stmt = conn.prepare(
|
||||
"SELECT DISTINCT m.function_address FROM methods m
|
||||
JOIN classes c ON c.vtable_address = m.vtable_address
|
||||
JOIN demangled_names dn ON dn.address = m.function_address
|
||||
WHERE c.name = ? AND dn.method_name = ?",
|
||||
)?;
|
||||
let pcs: Vec<u32> = stmt
|
||||
.query_map(params![class, method], |r| r.get::<_, i64>(0).map(|x| x as u32))?
|
||||
.filter_map(|r| r.ok())
|
||||
.collect();
|
||||
Ok(pcs)
|
||||
}
|
||||
|
||||
fn resolve_class_star(conn: &duckdb::Connection, class: &str) -> Result<Vec<u32>> {
|
||||
let mut stmt = conn.prepare(
|
||||
"SELECT DISTINCT m.function_address FROM methods m
|
||||
JOIN classes c ON c.vtable_address = m.vtable_address
|
||||
WHERE c.name = ?",
|
||||
)?;
|
||||
let pcs: Vec<u32> = stmt
|
||||
.query_map(params![class], |r| r.get::<_, i64>(0).map(|x| x as u32))?
|
||||
.filter_map(|r| r.ok())
|
||||
.collect();
|
||||
if pcs.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"no class named {class:?} found in classes table — has --dis populated this DB?",
|
||||
));
|
||||
}
|
||||
Ok(pcs)
|
||||
}
|
||||
|
||||
fn resolve_function_name(conn: &duckdb::Connection, name: &str) -> Result<Vec<u32>> {
|
||||
let mut stmt = conn.prepare("SELECT address FROM functions WHERE name = ?")?;
|
||||
let pcs: Vec<u32> = stmt
|
||||
.query_map(params![name], |r| r.get::<_, i64>(0).map(|x| x as u32))?
|
||||
.filter_map(|r| r.ok())
|
||||
.collect();
|
||||
if pcs.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"probe token {name:?} did not match any classes::methods or functions row",
|
||||
));
|
||||
}
|
||||
Ok(pcs)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use duckdb::Connection;

    /// Create a small database at `path` with one class `Foo` whose vtable
    /// has two methods (`bar` at 12000, `baz` at 12100), plus matching
    /// `functions` and `demangled_names` rows.
    fn build_synthetic_db(path: &Path) {
        let conn = Connection::open(path).expect("open");
        conn.execute_batch(
            "
            CREATE TABLE functions (
                address BIGINT PRIMARY KEY,
                name VARCHAR
            );
            CREATE TABLE classes (
                name VARCHAR PRIMARY KEY,
                vtable_address BIGINT,
                rtti_present BOOLEAN,
                base_classes_json VARCHAR
            );
            CREATE TABLE methods (
                vtable_address BIGINT,
                slot BIGINT,
                function_address BIGINT,
                mangled_name VARCHAR,
                demangled_name VARCHAR,
                PRIMARY KEY (vtable_address, slot)
            );
            CREATE TABLE demangled_names (
                address BIGINT,
                mangled VARCHAR,
                raw_demangled VARCHAR,
                namespace_path VARCHAR,
                class_name VARCHAR,
                method_name VARCHAR,
                params_signature VARCHAR
            );
            INSERT INTO classes VALUES ('Foo', 11000, true, NULL);
            INSERT INTO functions VALUES (12000, 'sub_2EE0'), (12100, 'sub_2F44');
            INSERT INTO methods VALUES (11000, 0, 12000, NULL, NULL),
                                       (11000, 1, 12100, NULL, NULL);
            INSERT INTO demangled_names (address, mangled, raw_demangled, class_name, method_name)
            VALUES (12000, '?bar@Foo@@QEAAXXZ', 'void Foo::bar(void)', 'Foo', 'bar'),
                   (12100, '?baz@Foo@@QEAAXXZ', 'void Foo::baz(void)', 'Foo', 'baz');
            ",
        )
        .expect("seed");
    }

    /// Resolve `token` against a freshly seeded database stored under
    /// `file_name` in the system temp directory. The file is removed before
    /// seeding and again after a successful resolution.
    fn resolve_in_synthetic_db(file_name: &str, token: &str) -> Vec<u32> {
        let db_file = std::env::temp_dir().join(file_name);
        let _ = std::fs::remove_file(&db_file);
        build_synthetic_db(&db_file);
        let pcs = resolve_probe_token(Some(&db_file), token).unwrap();
        let _ = std::fs::remove_file(&db_file);
        pcs
    }

    #[test]
    fn numeric_passthrough_no_db_needed() {
        let from_hex = resolve_probe_token(None, "0x824D6640").unwrap();
        assert_eq!(from_hex, vec![0x824D6640]);
        let from_decimal = resolve_probe_token(None, "2186095088").unwrap();
        assert_eq!(from_decimal, vec![0x824D29F0]);
    }

    #[test]
    fn symbolic_token_without_db_errors() {
        let err = resolve_probe_token(None, "Foo::bar").unwrap_err();
        assert!(format!("{err}").contains("requires a sylpheed.db"));
    }

    #[test]
    fn class_method_resolves() {
        let pcs = resolve_in_synthetic_db("xenia_lookup_test.duckdb", "Foo::bar");
        assert_eq!(pcs, vec![12000]);
    }

    #[test]
    fn class_star_returns_all_methods() {
        let mut pcs = resolve_in_synthetic_db("xenia_lookup_star.duckdb", "Foo::*");
        pcs.sort();
        assert_eq!(pcs, vec![12000, 12100]);
    }

    #[test]
    fn function_name_fallback() {
        let pcs = resolve_in_synthetic_db("xenia_lookup_fn.duckdb", "sub_2EE0");
        assert_eq!(pcs, vec![12000]);
    }
}
|
||||
File diff suppressed because it is too large
Load Diff
37
crates/xenia-analysis/src/sinks/duckdb.rs
Normal file
37
crates/xenia-analysis/src/sinks/duckdb.rs
Normal file
@@ -0,0 +1,37 @@
|
||||
//! DuckDB sink — appends rich disasm items to the `instructions` table.
|
||||
//!
|
||||
//! Column layout matches [`crate::db`]: address, raw, mnemonic, operands,
//! disasm, ext_mnemonic, ext_operands, ext_disasm, branch_target, section, function, label.
|
||||
|
||||
use duckdb::{Appender, params};
|
||||
|
||||
use crate::disasm::RichDisasmItem;
|
||||
|
||||
/// Append every item to the appender. Returns the number of rows written.
|
||||
/// Does NOT flush — the caller decides when to flush, since multiple
|
||||
/// section iterators typically share one appender.
|
||||
pub fn append_instructions<'a>(
|
||||
appender: &mut Appender<'_>,
|
||||
items: impl IntoIterator<Item = RichDisasmItem<'a>>,
|
||||
) -> duckdb::Result<u64> {
|
||||
let mut count: u64 = 0;
|
||||
for ri in items {
|
||||
let t = &ri.item.text;
|
||||
appender.append_row(params![
|
||||
ri.item.addr as i64,
|
||||
ri.item.raw as i64,
|
||||
t.mnemonic.as_str(),
|
||||
t.operands.as_str(),
|
||||
t.disasm.as_str(),
|
||||
t.ext_mnemonic.as_deref(),
|
||||
t.ext_operands.as_deref(),
|
||||
t.ext_disasm.as_deref(),
|
||||
t.branch_target.map(|t| t as i64),
|
||||
ri.section,
|
||||
ri.function.map(|f| f as i64),
|
||||
ri.label,
|
||||
])?;
|
||||
count += 1;
|
||||
}
|
||||
Ok(count)
|
||||
}
|
||||
63
crates/xenia-analysis/src/sinks/json.rs
Normal file
63
crates/xenia-analysis/src/sinks/json.rs
Normal file
@@ -0,0 +1,63 @@
|
||||
//! JSON Lines sink — one structured row per line, constant memory.
|
||||
//!
|
||||
//! Suited for piping into `jq`, importing into pandas / DuckDB's
|
||||
//! `read_json_auto`, or feeding downstream tooling that expects a
|
||||
//! line-delimited stream rather than a single megaobject.
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::disasm::RichDisasmItem;
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct JsonRow<'a> {
|
||||
addr: u32,
|
||||
raw: u32,
|
||||
mnemonic: &'a str,
|
||||
operands: &'a str,
|
||||
disasm: &'a str,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
ext_mnemonic: Option<&'a str>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
ext_operands: Option<&'a str>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
ext_disasm: Option<&'a str>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
branch_target: Option<u32>,
|
||||
section: &'a str,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
function: Option<u32>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
label: Option<&'a str>,
|
||||
}
|
||||
|
||||
/// Write each item as a single JSON object on its own line. Returns the
|
||||
/// number of rows written.
|
||||
pub fn write_jsonl<'a, W: Write>(
|
||||
out: &mut W,
|
||||
items: impl IntoIterator<Item = RichDisasmItem<'a>>,
|
||||
) -> io::Result<u64> {
|
||||
let mut count: u64 = 0;
|
||||
for ri in items {
|
||||
let t = &ri.item.text;
|
||||
let row = JsonRow {
|
||||
addr: ri.item.addr,
|
||||
raw: ri.item.raw,
|
||||
mnemonic: &t.mnemonic,
|
||||
operands: &t.operands,
|
||||
disasm: &t.disasm,
|
||||
ext_mnemonic: t.ext_mnemonic.as_deref(),
|
||||
ext_operands: t.ext_operands.as_deref(),
|
||||
ext_disasm: t.ext_disasm.as_deref(),
|
||||
branch_target: t.branch_target,
|
||||
section: ri.section,
|
||||
function: ri.function,
|
||||
label: ri.label,
|
||||
};
|
||||
serde_json::to_writer(&mut *out, &row)?;
|
||||
out.write_all(b"\n")?;
|
||||
count += 1;
|
||||
}
|
||||
Ok(count)
|
||||
}
|
||||
8
crates/xenia-analysis/src/sinks/mod.rs
Normal file
8
crates/xenia-analysis/src/sinks/mod.rs
Normal file
@@ -0,0 +1,8 @@
|
||||
//! Output sinks for [`crate::disasm::RichDisasmItem`] streams.
|
||||
//!
|
||||
//! Each sink consumes the same iterator shape and writes to a different
|
||||
//! medium: human-readable .asm text, JSON Lines, or DuckDB rows.
|
||||
|
||||
pub mod duckdb;
|
||||
pub mod json;
|
||||
pub mod text;
|
||||
58
crates/xenia-analysis/src/sinks/text.rs
Normal file
58
crates/xenia-analysis/src/sinks/text.rs
Normal file
@@ -0,0 +1,58 @@
|
||||
//! Text sink — renders one .asm instruction line with optional
|
||||
//! branch-target / data-ref annotations.
|
||||
//!
|
||||
//! The full `write_asm` orchestration (section headers, function prologue
|
||||
//! info, xref comment blocks, hex-dump of data sections) stays in
|
||||
//! [`crate::formatter`]; this sink only owns the per-instruction line.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::io::{self, Write};
|
||||
|
||||
use xenia_xex::pe::PeSection;
|
||||
|
||||
use crate::disasm::RichDisasmItem;
|
||||
use crate::xref::{XrefKind, section_for_addr};
|
||||
|
||||
/// Render one instruction line:
|
||||
/// ` 82000000: 60000000 nop`
|
||||
/// ` 82000004: 4800FFFC bl 0x82000000 ; -> entry_point`
|
||||
/// ` 82000010: 812A0000 lwz r9, 0(r10) ; [R] 0x828A0000 (.rdata) = dat_…`
|
||||
pub fn write_instr_line<W: Write + ?Sized>(
|
||||
out: &mut W,
|
||||
item: &RichDisasmItem<'_>,
|
||||
labels: &HashMap<u32, String>,
|
||||
sections: &[PeSection],
|
||||
image_base: u32,
|
||||
data_annotation: Option<(u32, XrefKind)>,
|
||||
) -> io::Result<()> {
|
||||
let disasm_text = item.item.text.display();
|
||||
|
||||
// Branch-target → label annotation. Uses the structured `branch_target`
|
||||
// field (cleaner than the legacy "find 0x in disasm string" regex).
|
||||
let mut annotated = match item.item.text.branch_target {
|
||||
Some(target) => match labels.get(&target) {
|
||||
Some(lbl) => format!("{disasm_text:<40} ; -> {lbl}"),
|
||||
None => disasm_text.to_string(),
|
||||
},
|
||||
None => disasm_text.to_string(),
|
||||
};
|
||||
|
||||
if let Some((data_addr, kind)) = data_annotation {
|
||||
let tag = match kind {
|
||||
XrefKind::DataRead => "[R]",
|
||||
XrefKind::DataWrite => "[W]",
|
||||
_ => "[&]",
|
||||
};
|
||||
let sec = section_for_addr(data_addr, sections, image_base).unwrap_or("?");
|
||||
let data_lbl = labels.get(&data_addr)
|
||||
.map(|s| format!(" = {s}"))
|
||||
.unwrap_or_default();
|
||||
if !annotated.contains("; ->") {
|
||||
annotated = format!("{annotated:<40} ; {tag} 0x{data_addr:08X} ({sec}){data_lbl}");
|
||||
} else {
|
||||
annotated = format!("{annotated} {tag} 0x{data_addr:08X} ({sec}){data_lbl}");
|
||||
}
|
||||
}
|
||||
|
||||
writeln!(out, " {:08X}: {:08X} {}", item.item.addr, item.item.raw, annotated)
|
||||
}
|
||||
165
crates/xenia-analysis/src/sql_views.rs
Normal file
165
crates/xenia-analysis/src/sql_views.rs
Normal file
@@ -0,0 +1,165 @@
|
||||
//! Additive SQL views over the Phase-3 ingest tables.
|
||||
//!
|
||||
//! These views are created when `--analyze=sql` or `--analyze=both` is set.
|
||||
//! They are *not* a replacement for the Rust passes ([`crate::xref`],
|
||||
//! [`crate::func`]) — those still own data-ref resolution and prologue
|
||||
//! pattern matching. The views cover the cleanly-relational parts:
|
||||
//!
|
||||
//! - branch xrefs (rows of `instructions` with a non-NULL `target_hex`)
|
||||
//! - call graph + reachability (recursive CTE over `xrefs`)
|
||||
//! - convenience joins (function-first-instruction, imports-called)
|
||||
//!
|
||||
//! All views are read-only and stable across re-creation: dropping and
|
||||
//! recreating the database via [`crate::db::DbWriter::open_fresh`] re-runs
|
||||
//! these definitions.
|
||||
//!
|
||||
//! ## Cross-check semantics
|
||||
//!
|
||||
//! `v_branch_xrefs` is intended to produce *exactly* the same `(source,
|
||||
//! target, kind)` tuples as the Rust `xref.rs` first pass — given the same
|
||||
//! input image. [`crate::db::DbWriter::cross_check_branch_xrefs`] queries
|
||||
//! the symmetric difference and returns the row counts; both should be
|
||||
//! zero. A non-zero count means the formatter's `mnemonic` column or the
|
||||
//! kind-classification CASE drifted out of agreement with `xref.rs`, and
|
||||
//! is worth a one-line warning at log time.
|
||||
|
||||
/// `(view_name, CREATE VIEW … SQL)` pairs in the order they must run.
|
||||
/// Later views may depend on earlier ones (e.g. `v_call_graph` reads
|
||||
/// `xrefs`, which is the Rust-pass table; `v_branch_xrefs` is independent).
|
||||
pub const ALL_VIEWS: &[(&str, &str)] = &[
|
||||
("v_branch_xrefs", V_BRANCH_XREFS),
|
||||
("v_call_graph", V_CALL_GRAPH),
|
||||
("v_reachability_from_entry", V_REACHABILITY_FROM_ENTRY),
|
||||
("v_indirect_reachability_from_entry", V_INDIRECT_REACHABILITY_FROM_ENTRY),
|
||||
("v_function_first_instruction", V_FUNCTION_FIRST_INSTRUCTION),
|
||||
("v_imports_called", V_IMPORTS_CALLED),
|
||||
];
|
||||
|
||||
/// Branch cross-references computed from `instructions.target_hex` alone.
///
/// The `kind` column follows the classification in
/// [`crate::xref::collect_branch_target`] and uses the short tags from
/// [`crate::xref::XrefKind::tag`] (the values `xrefs.kind` stores):
/// - I-form (`b`/`bl`/`ba`/`bla`): `bl`/`bla` → `"call"`, `b`/`ba` → `"j"`
/// - B-form (`bc`/`bcl`/`bca`/`bcla`): always → `"br"`
/// - any other mnemonic with a target also falls through to `"br"`
///
/// Indirect branches (`bclr`/`bcctr`) leave `target_hex` NULL and are
/// excluded from this view by design.
const V_BRANCH_XREFS: &str = "
CREATE OR REPLACE VIEW v_branch_xrefs AS
SELECT
address AS source,
target_hex AS target,
CASE
WHEN mnemonic IN ('bl', 'bla') THEN 'call'
WHEN mnemonic IN ('b', 'ba') THEN 'j'
WHEN mnemonic IN ('bc', 'bcl', 'bca', 'bcla') THEN 'br'
ELSE 'br'
END AS kind,
mnemonic AS instruction,
function AS source_func
FROM instructions
WHERE target_hex IS NOT NULL;
";
|
||||
|
||||
/// Call edges (`xrefs.kind = 'call'`) with caller and callee names joined
/// in from `functions`.
///
/// The caller name is looked up through `xrefs.source_func`, the callee
/// name through `xrefs.target`; both joins are `LEFT JOIN`s, so an edge
/// with no matching function row still appears with a NULL name.
///
/// The view reads `xrefs` (the Rust-pass table), which is the canonical
/// source for *all* edge kinds including indirect/data. SQL cannot
/// rebuild the data-ref edges cleanly because they need register
/// tracking. For pure branch edges, `v_branch_xrefs` yields equivalent
/// rows straight from `instructions`.
const V_CALL_GRAPH: &str = "
CREATE OR REPLACE VIEW v_call_graph AS
SELECT
x.source AS caller_addr,
cf.name AS caller_name,
x.target AS callee_addr,
tf.name AS callee_name,
x.kind AS edge_kind
FROM xrefs x
LEFT JOIN functions cf ON cf.address = x.source_func
LEFT JOIN functions tf ON tf.address = x.target
WHERE x.kind = 'call';
";
|
||||
|
||||
/// Function-level transitive closure from the entry point over
/// call/jump/branch edges (`kind IN ('call', 'j', 'br')`).
///
/// Handy for finding dead code
/// (`SELECT address FROM functions
/// WHERE address NOT IN (SELECT addr FROM v_reachability_from_entry)`)
/// and for restricting analysis to the live subset.
///
/// The seed is the function containing the instruction labelled
/// `entry_point`. Each recursive step adds the functions enclosing the
/// targets of edges whose source instruction lies in an already-reachable
/// function. `UNION` (not `UNION ALL`) deduplicates, which is what lets
/// the recursion terminate on call-graph cycles (recursive functions,
/// mutually-recursive pairs).
const V_REACHABILITY_FROM_ENTRY: &str = "
CREATE OR REPLACE VIEW v_reachability_from_entry AS
WITH RECURSIVE reach(fn) AS (
SELECT i.function FROM instructions i
JOIN labels l ON l.address = i.address
WHERE l.name = 'entry_point' AND i.function IS NOT NULL
UNION
SELECT tgt.function FROM xrefs x
JOIN instructions src ON src.address = x.source
JOIN instructions tgt ON tgt.address = x.target
JOIN reach r ON src.function = r.fn
WHERE x.kind IN ('call', 'j', 'br')
AND tgt.function IS NOT NULL
)
SELECT fn AS addr FROM reach;
";
|
||||
|
||||
/// Same closure as `v_reachability_from_entry`, but the edge filter also
/// admits `kind='ind_call'` edges from M5.
///
/// Because the edge set is a superset, the result is a superset too:
/// every function in `v_reachability_from_entry` is here, plus any
/// function reached only through a vtable `bcctrl` whose vtable+slot the
/// M5 dataflow could resolve. Sample 5 newly-reachable PCs in canary
/// before trusting widely; the analysis intentionally leaves out
/// alias-dependent indirect calls (vtable loaded from a `this` field).
const V_INDIRECT_REACHABILITY_FROM_ENTRY: &str = "
CREATE OR REPLACE VIEW v_indirect_reachability_from_entry AS
WITH RECURSIVE reach(fn) AS (
SELECT i.function FROM instructions i
JOIN labels l ON l.address = i.address
WHERE l.name = 'entry_point' AND i.function IS NOT NULL
UNION
SELECT tgt.function FROM xrefs x
JOIN instructions src ON src.address = x.source
JOIN instructions tgt ON tgt.address = x.target
JOIN reach r ON src.function = r.fn
WHERE x.kind IN ('call', 'ind_call', 'j', 'br')
AND tgt.function IS NOT NULL
)
SELECT fn AS addr FROM reach;
";
|
||||
|
||||
/// Convenience join pairing each function with the instruction decoded at
/// its start address (inner join on `instructions.address =
/// functions.address`, so functions without such a row are omitted).
/// Useful for eyeballing prologue patterns without computing offsets by
/// hand.
const V_FUNCTION_FIRST_INSTRUCTION: &str = "
CREATE OR REPLACE VIEW v_function_first_instruction AS
SELECT
f.address AS function_addr,
f.name AS function_name,
i.raw AS first_raw,
i.disasm AS first_disasm,
i.ext_disasm AS first_ext_disasm
FROM functions f
JOIN instructions i ON i.address = f.address;
";
|
||||
|
||||
/// Per-function list of the kernel/library imports it calls.
///
/// Takes the `call` edges from `xrefs` and keeps those whose target
/// address carries a label of kind `import`; the label supplies the
/// import name. The calling function's name is attached through a
/// `LEFT JOIN`, so it may be NULL.
const V_IMPORTS_CALLED: &str = "
CREATE OR REPLACE VIEW v_imports_called AS
SELECT
x.source_func AS function_addr,
f.name AS function_name,
x.target AS import_addr,
l.name AS import_name
FROM xrefs x
JOIN labels l ON l.address = x.target
LEFT JOIN functions f ON f.address = x.source_func
WHERE x.kind = 'call'
AND l.kind = 'import';
";
|
||||
399
crates/xenia-analysis/src/static_init.rs
Normal file
399
crates/xenia-analysis/src/static_init.rs
Normal file
@@ -0,0 +1,399 @@
|
||||
//! M11.5 — static-initialiser driver detection.
|
||||
//!
|
||||
//! MSVC's CRT static-init driver (`_initterm` / `_initterm_e` style)
|
||||
//! is a tight loop that walks a function-pointer array between two
|
||||
//! addresses, calling each non-null entry:
|
||||
//!
|
||||
//! ```text
|
||||
//! loop_top:
|
||||
//! cmpw[l] rA, rB ; compare cursor vs end
|
||||
//! beq done
|
||||
//! lwz rN, 0(rA) ; load fn ptr
|
||||
//! cmpwi rN, 0 ; null-skip (optional)
|
||||
//! beq skip
|
||||
//! mtctr rN
|
||||
//! bcctrl
|
||||
//! skip:
|
||||
//! addi rA, rA, 4
|
||||
//! b loop_top
|
||||
//! done:
|
||||
//! ```
|
||||
//!
|
||||
//! Two static addresses (`rA` and `rB` at loop start) bracket the
|
||||
//! function-pointer array. Detection strategy: scan every function for
|
||||
//! the canonical pattern; when found, extract the array bounds and
|
||||
//! emit one row in `function_pointer_arrays` with `kind='static_init'`.
|
||||
//!
|
||||
//! ### What this layer does
|
||||
//!
|
||||
//! - Walks each function looking for an `lwz; mtctr; bcctrl` sequence
|
||||
//! inside a loop bounded by a comparison against another constant.
|
||||
//! - When the loop's cursor register is observed to be incremented by
|
||||
//! exactly 4 per iteration, classifies it as a static-init driver
|
||||
//! and records the (start, end) array bounds.
|
||||
//!
|
||||
//! ### What this layer does NOT do
|
||||
//!
|
||||
//! - No support for back-to-back drivers sharing a common loop trampoline.
|
||||
//! - No detection of the M11 prologue-style heuristic; M11.5 is
|
||||
//! structure-grounded and replaces the prior heuristic where it fires.
|
||||
//! - No special handling for CRT-style `_initterm_e` (the `_e` variant
|
||||
//! returns a status, which is not modelled here); detection still works
|
||||
//! for both variants as long as the loop shape matches.
|
||||
//!
|
||||
//! Reference: Microsoft CRT `crt0.c::_initterm` source pattern.
|
||||
|
||||
use std::collections::{BTreeSet, HashMap, HashSet};
|
||||
|
||||
use crate::func::FuncAnalysis;
|
||||
use crate::funcptr_arrays::FuncPtrArray;
|
||||
use xenia_xex::pe::PeSection;
|
||||
|
||||
/// One detected static-initialiser driver loop and the array it walks.
#[derive(Debug, Clone, Copy)]
pub struct StaticInitDriver {
    /// VA of the function that contains the driver loop.
    pub driver_function: u32,
    /// VA of the first slot of the function-pointer array.
    pub array_start: u32,
    /// VA one past the last slot of the array.
    pub array_end: u32,
    /// Number of 4-byte slots between `array_start` and `array_end`.
    pub length: u32,
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct StaticInitResult {
|
||||
pub drivers: Vec<StaticInitDriver>,
|
||||
/// Newly-detected static-init arrays, ready to be merged into the
|
||||
/// `function_pointer_arrays` table with `kind='static_init'`.
|
||||
pub arrays: Vec<FuncPtrArray>,
|
||||
}
|
||||
|
||||
// Primary opcodes (top 6 bits of the instruction word) matched by the
// driver scan.

/// `addi` (also `li` when rA is 0).
const OP_ADDI: u32 = 14;
/// `addis` (also `lis` when rA is 0).
const OP_ADDIS: u32 = 15;
/// Opcode-19 group; the scan looks for `bcctrl` (XO 528, LK set) in it.
const OP_BCCTR: u32 = 19;
/// `lwz`.
const OP_LWZ: u32 = 32;
/// Opcode-31 group; the extended opcode selects the actual operation.
const OP_X_FORM: u32 = 31;
|
||||
|
||||
/// What the scan knows about a general-purpose register. An untracked
/// register is represented by `None` in the surrounding `Option`.
#[derive(Debug, Clone, Copy)]
enum RegVal {
    /// The register holds this statically known 32-bit value.
    Const(u32),
}
|
||||
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
|
||||
pub fn analyze(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
sections: &[PeSection],
|
||||
func_analysis: &FuncAnalysis,
|
||||
function_starts: &BTreeSet<u32>,
|
||||
labels: &HashMap<u32, String>,
|
||||
) -> StaticInitResult {
|
||||
let started = std::time::Instant::now();
|
||||
let block_boundaries: HashSet<u32> = labels.keys().copied().collect();
|
||||
|
||||
let mut drivers: Vec<StaticInitDriver> = Vec::new();
|
||||
|
||||
for (&fn_start, fi) in &func_analysis.functions {
|
||||
if fi.is_saverestore { continue; }
|
||||
if let Some(d) = scan_function_for_driver(
|
||||
pe, image_base, fn_start, fi.end, &block_boundaries,
|
||||
) {
|
||||
drivers.push(d);
|
||||
}
|
||||
}
|
||||
|
||||
// Build arrays from the discovered drivers + section data.
|
||||
let mut arrays: Vec<FuncPtrArray> = Vec::new();
|
||||
for d in &drivers {
|
||||
if let Some(entries) = read_array(pe, image_base, sections, d.array_start, d.array_end, function_starts) {
|
||||
arrays.push(FuncPtrArray {
|
||||
address: d.array_start,
|
||||
length: entries.len() as u32,
|
||||
kind: "static_init",
|
||||
entries,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||
metrics::histogram!("analysis.phase_ms", "phase" => "static_init").record(elapsed_ms);
|
||||
tracing::info!(
|
||||
drivers = drivers.len(),
|
||||
arrays = arrays.len(),
|
||||
elapsed_ms,
|
||||
"M11.5 static-init driver scan complete",
|
||||
);
|
||||
|
||||
StaticInitResult { drivers, arrays }
|
||||
}
|
||||
|
||||
/// Read the function-pointer array between [start, end) from .rdata/.data.
|
||||
/// NULL entries are skipped (CRT _initterm explicitly tolerates them).
|
||||
/// Non-function-start entries cause us to bail (the driver bounds were
|
||||
/// likely misidentified).
|
||||
fn read_array(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
sections: &[PeSection],
|
||||
start: u32,
|
||||
end: u32,
|
||||
function_starts: &BTreeSet<u32>,
|
||||
) -> Option<Vec<u32>> {
|
||||
if end <= start || (end - start) > 4096 { return None; }
|
||||
let _section = sections.iter().find(|s| {
|
||||
let lo = image_base + s.virtual_address;
|
||||
let hi = lo + s.virtual_size;
|
||||
start >= lo && end <= hi && (s.name == ".rdata" || s.name == ".data")
|
||||
})?;
|
||||
let mut entries = Vec::new();
|
||||
let mut p = start;
|
||||
while p < end {
|
||||
let off = p.wrapping_sub(image_base) as usize;
|
||||
if off + 4 > pe.len() { return None; }
|
||||
let v = u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]]);
|
||||
if v != 0 {
|
||||
if !function_starts.contains(&v) { return None; }
|
||||
entries.push(v);
|
||||
}
|
||||
p = p.wrapping_add(4);
|
||||
}
|
||||
if entries.is_empty() { return None; }
|
||||
Some(entries)
|
||||
}
|
||||
|
||||
/// Walk one function looking for the canonical static-init driver shape.
|
||||
/// Returns Some when the loop's cursor register starts at a known constant
|
||||
/// `rA`, terminates at another known constant `rB` via a compare, and
|
||||
/// increments by 4 per iteration with an `lwz; mtctr; bcctrl` body.
|
||||
fn scan_function_for_driver(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
fn_start: u32,
|
||||
fn_end: u32,
|
||||
block_boundaries: &HashSet<u32>,
|
||||
) -> Option<StaticInitDriver> {
|
||||
let mut reg: [Option<RegVal>; 32] = [None; 32];
|
||||
// Pattern features observed during the walk.
|
||||
let mut cursor_reg: Option<usize> = None;
|
||||
let mut cursor_init: Option<u32> = None;
|
||||
let mut end_reg: Option<usize> = None;
|
||||
let mut end_init: Option<u32> = None;
|
||||
let mut saw_lwz_through_cursor = false;
|
||||
let mut saw_mtctr = false;
|
||||
let mut saw_bcctrl = false;
|
||||
let mut saw_addi_4 = false;
|
||||
|
||||
let mut pc = fn_start;
|
||||
while pc < fn_end {
|
||||
if pc != fn_start && block_boundaries.contains(&pc) {
|
||||
// Heuristic: when we cross a basic-block boundary that
|
||||
// is not the loop-top, accumulated state remains valid for
|
||||
// pattern-matching purposes — but we drop register Const
|
||||
// tracking to be safe.
|
||||
reg = [None; 32];
|
||||
}
|
||||
let off = pc.wrapping_sub(image_base) as usize;
|
||||
if off + 4 > pe.len() { break; }
|
||||
let instr = u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]]);
|
||||
let op = instr >> 26;
|
||||
let rd = ((instr >> 21) & 0x1F) as usize;
|
||||
let ra = ((instr >> 16) & 0x1F) as usize;
|
||||
let simm = ((instr & 0xFFFF) as i16) as i32;
|
||||
let uimm = instr & 0xFFFF;
|
||||
|
||||
match op {
|
||||
OP_ADDIS if ra == 0 => reg[rd] = Some(RegVal::Const(uimm << 16)),
|
||||
OP_ADDIS => {
|
||||
if let Some(RegVal::Const(b)) = reg[ra] {
|
||||
reg[rd] = Some(RegVal::Const(b.wrapping_add(uimm << 16)));
|
||||
} else { reg[rd] = None; }
|
||||
}
|
||||
OP_ADDI if ra != 0 => {
|
||||
let prev = reg[ra];
|
||||
if let Some(RegVal::Const(b)) = prev {
|
||||
let v = b.wrapping_add(simm as u32);
|
||||
reg[rd] = Some(RegVal::Const(v));
|
||||
// Was this an `addi r, r, 4`? Mark cursor-increment.
|
||||
if rd == ra && simm == 4 {
|
||||
if Some(rd) == cursor_reg {
|
||||
saw_addi_4 = true;
|
||||
}
|
||||
} else if cursor_reg.is_none() {
|
||||
// First time we see a known-constant register that
|
||||
// *could* be the cursor — defer the choice until we
|
||||
// see a load through it.
|
||||
cursor_init = Some(v);
|
||||
cursor_reg = Some(rd);
|
||||
} else if end_reg.is_none() && Some(rd) != cursor_reg {
|
||||
end_init = Some(v);
|
||||
end_reg = Some(rd);
|
||||
}
|
||||
} else { reg[rd] = None; }
|
||||
}
|
||||
OP_LWZ => {
|
||||
if ra != 0 && Some(ra) == cursor_reg {
|
||||
saw_lwz_through_cursor = true;
|
||||
}
|
||||
reg[rd] = None;
|
||||
}
|
||||
OP_X_FORM => {
|
||||
let xo = (instr >> 1) & 0x3FF;
|
||||
if xo == 467 {
|
||||
let spr = (((instr >> 11) & 0x1F) << 5) | ((instr >> 16) & 0x1F);
|
||||
if spr == 9 && saw_lwz_through_cursor { saw_mtctr = true; }
|
||||
}
|
||||
if xo != 444 && xo != 467 { reg[rd] = None; }
|
||||
}
|
||||
OP_BCCTR => {
|
||||
let xo = (instr >> 1) & 0x3FF;
|
||||
let lk = (instr & 1) != 0;
|
||||
if xo == 528 && lk && saw_mtctr {
|
||||
saw_bcctrl = true;
|
||||
}
|
||||
}
|
||||
18 => {
|
||||
if (instr & 1) != 0 {
|
||||
for r in 0..=12 { reg[r] = None; }
|
||||
}
|
||||
}
|
||||
16 => {
|
||||
if (instr & 1) != 0 {
|
||||
for r in 0..=12 { reg[r] = None; }
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
pc = pc.wrapping_add(4);
|
||||
}
|
||||
|
||||
// Validate that all four pattern features fired.
|
||||
if !(saw_lwz_through_cursor && saw_mtctr && saw_bcctrl && saw_addi_4) {
|
||||
return None;
|
||||
}
|
||||
let cursor_init = cursor_init?;
|
||||
let end_init = end_init?;
|
||||
if end_init <= cursor_init { return None; }
|
||||
if end_init - cursor_init > 4096 { return None; }
|
||||
|
||||
Some(StaticInitDriver {
|
||||
driver_function: fn_start,
|
||||
array_start: cursor_init,
|
||||
array_end: end_init,
|
||||
length: (end_init - cursor_init) / 4,
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::func::FuncInfo;
    use std::collections::BTreeMap;
    use xenia_xex::pe::PeSection;

    /// Build a section whose raw layout mirrors its virtual layout.
    fn mk_section(name: &str, va: u32, size: u32) -> PeSection {
        PeSection {
            name: name.into(),
            virtual_address: va,
            virtual_size: size,
            raw_offset: va,
            raw_size: size,
            flags: 0x4000_0040,
        }
    }

    /// Store `v` big-endian at byte offset `at`.
    fn write_be(pe: &mut [u8], at: usize, v: u32) {
        pe[at..at + 4].copy_from_slice(&v.to_be_bytes());
    }

    /// A `FuncAnalysis` holding exactly one 0x40-byte function at `start`.
    fn single_function_analysis(start: u32, is_leaf: bool) -> FuncAnalysis {
        let mut functions: BTreeMap<u32, FuncInfo> = BTreeMap::new();
        functions.insert(
            start,
            FuncInfo {
                start,
                end: start + 0x40,
                frame_size: 0,
                saved_gprs: 0,
                is_leaf,
                is_saverestore: false,
                pdata_validated: false,
                pdata_length: None,
                has_eh: false,
            },
        );
        FuncAnalysis {
            functions,
            save_gpr_base: None,
            restore_gpr_base: None,
            pdata_entries: Vec::new(),
        }
    }

    #[test]
    fn detects_canonical_initterm_loop() {
        // Build a tiny driver that loops over a 3-entry array.
        let image_base = 0x82000000u32;
        let mut pe = vec![0u8; 0x4000];

        // Array at .rdata + 0x800: 3 function pointers.
        let arr_va_lo = 0x800u32;
        let fns = [image_base + 0x2000, image_base + 0x2010, image_base + 0x2020];
        for (i, p) in fns.iter().enumerate() {
            write_be(&mut pe, arr_va_lo as usize + i * 4, *p);
        }
        let array_start = image_base + arr_va_lo;
        let array_end = array_start + 12;

        // Driver function at 0x82001000:
        //   lis   r3, hi(array_start)
        //   addi  r3, r3, lo(array_start)
        //   lis   r4, hi(array_end)
        //   addi  r4, r4, lo(array_end)
        //   lwz   r5, 0(r3)
        //   mtctr r5
        //   bcctrl
        //   addi  r3, r3, 4
        //   blr
        let driver = 0x82001000u32;
        let off = (driver - image_base) as usize;
        let lis_r3 = (15u32 << 26) | (3 << 21) | (array_start >> 16);
        let addi_r3 = (14u32 << 26) | (3 << 21) | (3 << 16) | u32::from(array_start as u16);
        let lis_r4 = (15u32 << 26) | (4 << 21) | (array_end >> 16);
        let addi_r4 = (14u32 << 26) | (4 << 21) | (4 << 16) | u32::from(array_end as u16);
        let lwz = (32u32 << 26) | (5 << 21) | (3 << 16);
        let mtctr = (31u32 << 26) | (5 << 21) | (9 << 16) | (467 << 1);
        let bcctrl = (19u32 << 26) | (20 << 21) | (528 << 1) | 1;
        let addi_inc = (14u32 << 26) | (3 << 21) | (3 << 16) | 4;
        let blr = (19u32 << 26) | (20 << 21) | (16 << 1);
        let program = [
            lis_r3, addi_r3, lis_r4, addi_r4, lwz, mtctr, bcctrl, addi_inc, blr,
        ];
        for (i, w) in program.iter().enumerate() {
            write_be(&mut pe, off + i * 4, *w);
        }

        let fa = single_function_analysis(driver, false);
        let sections = vec![mk_section(".rdata", 0x800, 0x100)];
        let starts: BTreeSet<u32> = fns.iter().copied().collect();
        let labels: HashMap<u32, String> = HashMap::new();

        let r = analyze(&pe, image_base, &sections, &fa, &starts, &labels);

        assert_eq!(r.drivers.len(), 1, "should detect one driver");
        let d = &r.drivers[0];
        assert_eq!(d.driver_function, driver);
        assert_eq!(d.array_start, array_start);
        assert_eq!(d.array_end, array_end);
        assert_eq!(d.length, 3);

        assert_eq!(r.arrays.len(), 1);
        assert_eq!(r.arrays[0].kind, "static_init");
        assert_eq!(r.arrays[0].entries.len(), 3);
    }

    #[test]
    fn rejects_function_without_pattern() {
        let image_base = 0x82000000u32;
        let mut pe = vec![0u8; 0x4000];
        let driver = 0x82001000u32;
        // Just a blr — no driver pattern.
        let blr = (19u32 << 26) | (20 << 21) | (16 << 1);
        write_be(&mut pe, (driver - image_base) as usize, blr);

        let fa = single_function_analysis(driver, true);
        let sections = vec![mk_section(".rdata", 0x800, 0x100)];
        let starts: BTreeSet<u32> = BTreeSet::new();
        let labels: HashMap<u32, String> = HashMap::new();

        let r = analyze(&pe, image_base, &sections, &fa, &starts, &labels);
        assert_eq!(r.drivers.len(), 0);
    }
}
|
||||
382
crates/xenia-analysis/src/strings.rs
Normal file
382
crates/xenia-analysis/src/strings.rs
Normal file
@@ -0,0 +1,382 @@
|
||||
//! String / constant-pool detection in `.rdata`.
|
||||
//!
|
||||
//! Scans the `.rdata` section for runs of printable ASCII or null-terminated
|
||||
//! UTF-16LE characters of length ≥ 6, emitting one row per discovered string.
|
||||
//! Cross-references against `xrefs.target` are computed by the caller —
|
||||
//! this module only finds the strings; downstream queries can join.
|
||||
//!
|
||||
//! ### What this layer does NOT do
|
||||
//!
|
||||
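//! - The original pass had no UTF-8 multibyte detection, on the premise
|
||||
//! that Xbox 360 game binaries reliably use ASCII for debug strings and
|
||||
//! UTF-16LE for localised text; the current pass also scans for UTF-8
|
||||
//! and Shift_JIS (see the extension note further down).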
|
||||
//! - Strings in `.data` (mutable globals) are not scanned by default.
|
||||
//! - Wide strings on Xbox 360 are little-endian (compiler convention even
|
||||
//! on this big-endian platform); we do NOT try big-endian UTF-16.
|
||||
//! - No language detection / classification beyond encoding.
|
||||
//!
|
||||
//! Extends the original ASCII / UTF-16LE pass with Shift_JIS detection
|
||||
//! (Sylpheed is originally Japanese — likely yields mission/UI text
|
||||
//! invisible to ASCII-only) and UTF-8 multi-byte detection.
|
||||
//!
|
||||
//! Reference: `objdump -s` `.rdata` walks rely on the same heuristic;
|
||||
//! Shift_JIS lead/trail byte ranges per JIS X 0208.
|
||||
|
||||
use xenia_xex::pe::PeSection;
|
||||
|
||||
/// A single string found by the scan.
#[derive(Debug, Clone)]
pub struct DetectedString {
    /// Absolute virtual address of the string's first byte.
    pub address: u32,
    /// Encoding tag: `"ascii"` | `"utf16le"` | `"shift_jis"` | `"utf8"`.
    pub encoding: &'static str,
    /// Size of the string in bytes, not counting the NUL terminator.
    pub length: u32,
    /// The string's content, converted to UTF-8.
    pub content: String,
}
|
||||
|
||||
/// Scan all `.rdata` sections (and any other read-only data section the user
|
||||
/// configures) for ASCII and UTF-16LE strings.
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
|
||||
pub fn analyze(pe: &[u8], image_base: u32, sections: &[PeSection]) -> Vec<DetectedString> {
|
||||
let started = std::time::Instant::now();
|
||||
let mut out: Vec<DetectedString> = Vec::new();
|
||||
|
||||
for section in sections {
|
||||
if section.name != ".rdata" { continue; }
|
||||
let raw_start = section.virtual_address as usize;
|
||||
let raw_end = (section.virtual_address + section.virtual_size) as usize;
|
||||
if raw_end > pe.len() { continue; }
|
||||
let bytes = &pe[raw_start..raw_end.min(pe.len())];
|
||||
let va_base = image_base + section.virtual_address;
|
||||
|
||||
scan_ascii(bytes, va_base, &mut out);
|
||||
scan_utf16le(bytes, va_base, &mut out);
|
||||
scan_shift_jis(bytes, va_base, &mut out);
|
||||
scan_utf8(bytes, va_base, &mut out);
|
||||
}
|
||||
|
||||
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||
let n_ascii = out.iter().filter(|s| s.encoding == "ascii").count();
|
||||
let n_utf16 = out.iter().filter(|s| s.encoding == "utf16le").count();
|
||||
let n_sjis = out.iter().filter(|s| s.encoding == "shift_jis").count();
|
||||
let n_utf8 = out.iter().filter(|s| s.encoding == "utf8").count();
|
||||
metrics::histogram!("analysis.phase_ms", "phase" => "strings").record(elapsed_ms);
|
||||
tracing::info!(
|
||||
ascii = n_ascii,
|
||||
utf16le = n_utf16,
|
||||
shift_jis = n_sjis,
|
||||
utf8 = n_utf8,
|
||||
total = out.len(),
|
||||
elapsed_ms,
|
||||
"string scan complete"
|
||||
);
|
||||
out
|
||||
}
|
||||
|
||||
/// Shortest run a scanner will report: counted in bytes by the ASCII
/// scanner and in 16-bit code units by the UTF-16LE scanner.
const MIN_LEN: usize = 6;
|
||||
|
||||
/// `true` for the bytes accepted inside a string run: space, the visible
/// ASCII characters (0x21..=0x7E), and tab / line feed / carriage return.
fn is_printable_ascii(b: u8) -> bool {
    b == b' ' || b.is_ascii_graphic() || matches!(b, b'\t' | b'\n' | b'\r')
}
|
||||
|
||||
fn scan_ascii(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
|
||||
let mut i = 0;
|
||||
while i < bytes.len() {
|
||||
if !is_printable_ascii(bytes[i]) {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
let start = i;
|
||||
while i < bytes.len() && is_printable_ascii(bytes[i]) { i += 1; }
|
||||
let run_len = i - start;
|
||||
// Require NUL termination and minimum length.
|
||||
if run_len >= MIN_LEN && i < bytes.len() && bytes[i] == 0 {
|
||||
let s = std::str::from_utf8(&bytes[start..i]).unwrap_or("");
|
||||
out.push(DetectedString {
|
||||
address: va_base + start as u32,
|
||||
encoding: "ascii",
|
||||
length: run_len as u32,
|
||||
content: s.to_string(),
|
||||
});
|
||||
}
|
||||
// Skip the NUL (if any) before continuing.
|
||||
if i < bytes.len() && bytes[i] == 0 { i += 1; }
|
||||
}
|
||||
}
|
||||
|
||||
fn scan_utf16le(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
|
||||
// UTF-16LE strings are 2-byte aligned in MSVC output. Walk on even
|
||||
// offsets to avoid misaligned hits.
|
||||
let mut i = 0;
|
||||
while i + 2 <= bytes.len() {
|
||||
if !i.is_multiple_of(2) { i += 1; continue; }
|
||||
let lo = bytes[i];
|
||||
let hi = bytes[i + 1];
|
||||
// Restrict scan-start to printable ASCII range with a zero high byte —
|
||||
// this is what real Xbox 360 wide strings look like.
|
||||
if hi != 0 || !is_printable_ascii(lo) {
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
let start = i;
|
||||
let mut codeunits: Vec<u16> = Vec::new();
|
||||
while i + 2 <= bytes.len() {
|
||||
let l = bytes[i];
|
||||
let h = bytes[i + 1];
|
||||
if h != 0 || !is_printable_ascii(l) { break; }
|
||||
codeunits.push((h as u16) << 8 | l as u16);
|
||||
i += 2;
|
||||
}
|
||||
// Require NUL u16 terminator.
|
||||
let nul_terminated = i + 2 <= bytes.len() && bytes[i] == 0 && bytes[i + 1] == 0;
|
||||
if codeunits.len() >= MIN_LEN && nul_terminated {
|
||||
let s: String = String::from_utf16_lossy(&codeunits);
|
||||
out.push(DetectedString {
|
||||
address: va_base + start as u32,
|
||||
encoding: "utf16le",
|
||||
length: ((i - start) as u32),
|
||||
content: s,
|
||||
});
|
||||
}
|
||||
// Skip past the terminator.
|
||||
if nul_terminated { i += 2; }
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` when `b` can open a two-byte Shift_JIS character.
///
/// Lead bytes occupy `0x81..=0x9F` and `0xE0..=0xEF`; the matching trail
/// byte ranges are `0x40..=0x7E` and `0x80..=0xFC`. Single-byte ASCII and
/// half-width katakana (`0xA1..=0xDF`) are handled separately as one-byte
/// characters.
fn is_sjis_lead(b: u8) -> bool {
    matches!(b, 0x81..=0x9F | 0xE0..=0xEF)
}
|
||||
/// Returns `true` when `b` is a valid second byte of a two-byte Shift_JIS
/// character (`0x40..=0x7E` or `0x80..=0xFC`; `0x7F` is excluded).
fn is_sjis_trail(b: u8) -> bool {
    matches!(b, 0x40..=0x7E | 0x80..=0xFC)
}
|
||||
fn is_sjis_singlebyte(b: u8) -> bool {
|
||||
is_printable_ascii(b) || (0xA1..=0xDF).contains(&b)
|
||||
}
|
||||
|
||||
/// Scan for Shift_JIS strings — runs of ≥ 6 bytes consisting of valid
|
||||
/// SJIS code units (single-byte ASCII / half-width katakana, OR a
|
||||
/// lead+trail pair). At least one multi-byte pair must be present so we
|
||||
/// don't double-count strings that are purely ASCII.
|
||||
fn scan_shift_jis(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
|
||||
let mut i = 0;
|
||||
while i < bytes.len() {
|
||||
let start = i;
|
||||
let mut has_multibyte = false;
|
||||
let mut nbytes = 0;
|
||||
while i < bytes.len() {
|
||||
let b = bytes[i];
|
||||
if is_sjis_lead(b) && i + 1 < bytes.len() && is_sjis_trail(bytes[i + 1]) {
|
||||
has_multibyte = true;
|
||||
nbytes += 2;
|
||||
i += 2;
|
||||
} else if is_sjis_singlebyte(b) {
|
||||
nbytes += 1;
|
||||
i += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Require NUL terminator + min length + at least one multi-byte char.
|
||||
if has_multibyte
|
||||
&& nbytes >= MIN_LEN
|
||||
&& i < bytes.len() && bytes[i] == 0
|
||||
{
|
||||
// Decode SJIS → UTF-8 best-effort. We don't ship a full
|
||||
// SJIS decoder; keep the bytes as a `\u{XX}\u{YY}…` style
|
||||
// rendering for diagnostic readability, and let downstream
|
||||
// tooling re-decode if needed.
|
||||
let raw = &bytes[start..i];
|
||||
let mut s = String::with_capacity(raw.len() * 4);
|
||||
let mut p = 0;
|
||||
while p < raw.len() {
|
||||
let b = raw[p];
|
||||
if is_sjis_lead(b) && p + 1 < raw.len() && is_sjis_trail(raw[p + 1]) {
|
||||
// Render as SJIS hex pair so the string is identifiable
|
||||
// even without a decoder. Real Japanese decoding is a
|
||||
// future enhancement.
|
||||
s.push_str(&format!("\\x{:02X}\\x{:02X}", b, raw[p + 1]));
|
||||
p += 2;
|
||||
} else {
|
||||
s.push(b as char);
|
||||
p += 1;
|
||||
}
|
||||
}
|
||||
out.push(DetectedString {
|
||||
address: va_base + start as u32,
|
||||
encoding: "shift_jis",
|
||||
length: nbytes as u32,
|
||||
content: s,
|
||||
});
|
||||
i += 1; // skip NUL
|
||||
} else {
|
||||
// Advance past whatever didn't match.
|
||||
i = start + 1;
|
||||
if i < bytes.len() && bytes[i] == 0 { i += 1; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Scan for UTF-8 strings carrying multi-byte sequences (we already
|
||||
/// catch pure-ASCII via `scan_ascii`). Validates 2/3-byte sequences;
|
||||
/// 4-byte (supplementary plane) is uncommon in game text and skipped.
|
||||
fn scan_utf8(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
|
||||
let mut i = 0;
|
||||
while i < bytes.len() {
|
||||
let start = i;
|
||||
let mut has_multibyte = false;
|
||||
let mut nbytes = 0;
|
||||
while i < bytes.len() {
|
||||
let b = bytes[i];
|
||||
if b < 0x80 {
|
||||
if !is_printable_ascii(b) { break; }
|
||||
nbytes += 1;
|
||||
i += 1;
|
||||
} else if (b & 0xE0) == 0xC0 {
|
||||
// 2-byte: 110xxxxx 10xxxxxx
|
||||
if i + 1 >= bytes.len() || (bytes[i + 1] & 0xC0) != 0x80 { break; }
|
||||
has_multibyte = true;
|
||||
nbytes += 2;
|
||||
i += 2;
|
||||
} else if (b & 0xF0) == 0xE0 {
|
||||
// 3-byte: 1110xxxx 10xxxxxx 10xxxxxx
|
||||
if i + 2 >= bytes.len()
|
||||
|| (bytes[i + 1] & 0xC0) != 0x80
|
||||
|| (bytes[i + 2] & 0xC0) != 0x80 { break; }
|
||||
has_multibyte = true;
|
||||
nbytes += 3;
|
||||
i += 3;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if has_multibyte
|
||||
&& nbytes >= MIN_LEN
|
||||
&& i < bytes.len() && bytes[i] == 0
|
||||
&& let Ok(s) = std::str::from_utf8(&bytes[start..i])
|
||||
{
|
||||
out.push(DetectedString {
|
||||
address: va_base + start as u32,
|
||||
encoding: "utf8",
|
||||
length: nbytes as u32,
|
||||
content: s.to_string(),
|
||||
});
|
||||
i += 1; // skip NUL
|
||||
} else {
|
||||
i = start + 1;
|
||||
if i < bytes.len() && bytes[i] == 0 { i += 1; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    const IMAGE_BASE: u32 = 0x8200_0000;
    /// Offset of the single test section inside the synthetic image.
    const SECTION_OFF: usize = 0x1000;

    fn mk_section(name: &str, va: u32, size: u32) -> PeSection {
        PeSection {
            name: name.into(),
            virtual_address: va,
            virtual_size: size,
            raw_offset: va,
            raw_size: size,
            flags: 0x4000_0040,
        }
    }

    /// Build a zero-filled image with `payload` copied to the start of the
    /// test section.
    fn image_with(payload: &[u8]) -> Vec<u8> {
        let mut pe = vec![0u8; 0x1100];
        pe[SECTION_OFF..SECTION_OFF + payload.len()].copy_from_slice(payload);
        pe
    }

    fn rdata_sections() -> Vec<PeSection> {
        vec![mk_section(".rdata", 0x1000, 0x100)]
    }

    #[test]
    fn detects_ascii_string() {
        let pe = image_with(b"Hello, world!\0");
        let sections = rdata_sections();
        let strings = analyze(&pe, IMAGE_BASE, &sections);
        assert_eq!(strings.len(), 1);
        assert_eq!(strings[0].encoding, "ascii");
        assert_eq!(strings[0].content, "Hello, world!");
        assert_eq!(strings[0].address, IMAGE_BASE + 0x1000);
    }

    #[test]
    fn rejects_short_runs() {
        let pe = image_with(b"Hi\0longer string here\0");
        let sections = rdata_sections();
        let strings = analyze(&pe, IMAGE_BASE, &sections);
        assert_eq!(strings.len(), 1);
        assert_eq!(strings[0].content, "longer string here");
    }

    #[test]
    fn detects_utf16le_string() {
        // "Hello!" in UTF-16LE + NUL u16
        let pe = image_with(b"H\0e\0l\0l\0o\0!\0\0\0");
        let sections = rdata_sections();
        let strings = analyze(&pe, IMAGE_BASE, &sections);
        // The ASCII pass never sees a contiguous run here because of the
        // interleaved zero bytes, so only the UTF-16 hits are inspected.
        let utf16: Vec<_> = strings.iter().filter(|s| s.encoding == "utf16le").collect();
        assert!(utf16.iter().any(|s| s.content == "Hello!"));
    }

    #[test]
    fn detects_shift_jis_string() {
        // "ABC" + (SJIS hiragana 'a' = 0x82 0xA0) + (SJIS 'i' = 0x82 0xA2) + NUL
        let pe = image_with(b"ABC\x82\xA0\x82\xA2\0");
        let sections = rdata_sections();
        let strings = analyze(&pe, IMAGE_BASE, &sections);
        let sjis: Vec<_> = strings.iter().filter(|s| s.encoding == "shift_jis").collect();
        assert_eq!(sjis.len(), 1);
        assert!(sjis[0].content.contains("ABC"));
        assert!(sjis[0].content.contains("\\x82\\xA0"));
    }

    #[test]
    fn detects_utf8_multibyte_string() {
        // "Café" = 'C', 'a', 'f', 0xC3 0xA9 (é), then more ASCII to reach min length
        let pe = image_with(b"Caf\xC3\xA9eteria\0");
        let sections = rdata_sections();
        let strings = analyze(&pe, IMAGE_BASE, &sections);
        let u8s: Vec<_> = strings.iter().filter(|s| s.encoding == "utf8").collect();
        assert_eq!(u8s.len(), 1);
        assert_eq!(u8s[0].content, "Café".to_string() + "eteria");
    }

    #[test]
    fn requires_nul_terminator() {
        // No trailing NUL — should NOT be detected.
        let payload = b"abcdefghij";
        let mut pe = image_with(payload);
        // Fill rest of section with 0xFF so the run terminates cleanly without NUL.
        pe[SECTION_OFF + payload.len()..SECTION_OFF + 0x100].fill(0xFF);
        let sections = rdata_sections();
        let strings = analyze(&pe, IMAGE_BASE, &sections);
        assert_eq!(strings.len(), 0);
    }
}
|
||||
424
crates/xenia-analysis/src/vtables.rs
Normal file
424
crates/xenia-analysis/src/vtables.rs
Normal file
@@ -0,0 +1,424 @@
|
||||
//! MSVC vtable + RTTI detection.
|
||||
//!
|
||||
//! Heuristic two-pass scan over the binary's read-only data sections. Pass 1
|
||||
//! finds candidate vtables — runs of ≥3 contiguous big-endian u32 values that
|
||||
//! all land on known function entries. Pass 2 attempts the MSVC RTTI walk
|
||||
//! `vtable[-1] → CompleteObjectLocator → TypeDescriptor → mangled name`. When
|
||||
//! RTTI is stripped (typical for shipped game binaries), each anonymous vtable
|
||||
//! gets a deterministic name `ANON_Class_<hex>` keyed by a hash of its
|
||||
//! sorted method PCs (so identical vtables across multiple class instances
|
||||
//! collapse to one entry).
|
||||
//!
|
||||
//! What this module does NOT do:
|
||||
//! - Vtables in heap-allocated memory (built at runtime by ctors) are out of
|
||||
//! scope — only vtables present statically in `.rdata` / `.data`.
|
||||
//! - RTTI inheritance (`BaseClassDescriptor` walk) is best-effort; we record
|
||||
//! the first-level base list when present and leave it NULL otherwise.
|
||||
//! - Multiple-inheritance "extra" vftables (one per base subobject) are
|
||||
//! detected as independent vtables; we don't link them.
|
||||
//!
|
||||
//! Reference: openrce.org "Reversing Microsoft Visual C++" RTTI articles
|
||||
//! (CompleteObjectLocator / TypeDescriptor / BaseClassDescriptor layout).
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use xenia_xex::pe::PeSection;
|
||||
|
||||
use crate::demangle;
|
||||
|
||||
/// One detected vtable: a run of consecutive function pointers found in a
/// scanned data section, plus whatever RTTI could be recovered for it.
#[derive(Debug, Clone)]
pub struct Vtable {
    /// Absolute VA of `vtable[0]` (first method slot).
    pub address: u32,
    /// Number of methods in the vtable (equals `methods.len()` as produced by
    /// `analyze`).
    pub length: u32,
    /// Absolute VA of the `CompleteObjectLocator` read from `vtable[-1]`.
    /// `analyze` only fills this in when the full COL → TypeDescriptor →
    /// demangle walk succeeded; it is `None` when RTTI is absent or stripped.
    pub col_address: Option<u32>,
    /// Class name. Demangled from RTTI when available, otherwise the synthetic
    /// `ANON_Class_<hex>` form.
    pub class_name: String,
    /// True when the COL → TypeDescriptor walk succeeded.
    pub rtti_present: bool,
    /// Class names read from the `BaseClassArray` of the
    /// `RTTIClassHierarchyDescriptor`, JSON-encoded as an array of strings.
    /// `None` when not parseable.
    // NOTE(review): described elsewhere as the "first-level" base list; MSVC
    // RTTI references suggest this array also holds the class itself and all
    // indirect bases — confirm which meaning consumers rely on.
    pub base_classes_json: Option<String>,
    /// One entry per slot: function VA in `.text`.
    pub methods: Vec<u32>,
}
|
||||
|
||||
/// Run the vtable scan + RTTI walk. `function_starts` is the set of valid
|
||||
/// `.text` function entry VAs from M1's corrected `functions` table.
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
|
||||
pub fn analyze(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
sections: &[PeSection],
|
||||
function_starts: &std::collections::BTreeSet<u32>,
|
||||
) -> Vec<Vtable> {
|
||||
let started = std::time::Instant::now();
|
||||
// Sections we'll scan for vtable bodies.
|
||||
let scan_targets: Vec<&PeSection> = sections
|
||||
.iter()
|
||||
.filter(|s| matches!(s.name.as_str(), ".rdata" | ".data"))
|
||||
.collect();
|
||||
|
||||
// Range table for "is this VA in .rdata or .data?"
|
||||
let rdata_ranges: Vec<(u32, u32)> = sections
|
||||
.iter()
|
||||
.filter(|s| s.name == ".rdata")
|
||||
.map(|s| (image_base + s.virtual_address, image_base + s.virtual_address + s.virtual_size))
|
||||
.collect();
|
||||
|
||||
let mut candidates: Vec<Vtable> = Vec::new();
|
||||
|
||||
for section in scan_targets {
|
||||
let va_start = image_base + section.virtual_address;
|
||||
let va_end = va_start + section.virtual_size;
|
||||
let raw_start = section.virtual_address as usize;
|
||||
let raw_end = (section.virtual_address + section.virtual_size) as usize;
|
||||
if raw_end > pe.len() { continue; }
|
||||
let bytes = &pe[raw_start..raw_end.min(pe.len())];
|
||||
|
||||
let mut i = 0usize;
|
||||
while i + 12 <= bytes.len() {
|
||||
// Try to start a run at this 4-aligned offset.
|
||||
if !i.is_multiple_of(4) { i += 1; continue; }
|
||||
let mut run_len = 0usize;
|
||||
let mut methods: Vec<u32> = Vec::new();
|
||||
let mut j = i;
|
||||
while j + 4 <= bytes.len() {
|
||||
let val = u32::from_be_bytes([bytes[j], bytes[j + 1], bytes[j + 2], bytes[j + 3]]);
|
||||
if function_starts.contains(&val) {
|
||||
methods.push(val);
|
||||
run_len += 1;
|
||||
j += 4;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if run_len >= 3 {
|
||||
let address = va_start + (i as u32);
|
||||
candidates.push(Vtable {
|
||||
address,
|
||||
length: run_len as u32,
|
||||
col_address: None,
|
||||
class_name: synth_anon_name(&methods),
|
||||
rtti_present: false,
|
||||
base_classes_json: None,
|
||||
methods,
|
||||
});
|
||||
i += run_len * 4;
|
||||
} else {
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
let _ = (va_start, va_end);
|
||||
}
|
||||
|
||||
// RTTI walk: for each candidate, look at vtable[-1].
|
||||
let pe_image_base = image_base;
|
||||
for v in &mut candidates {
|
||||
if v.address < 4 { continue; }
|
||||
let col_off = (v.address - pe_image_base - 4) as usize;
|
||||
if col_off + 4 > pe.len() { continue; }
|
||||
let col_ptr = u32::from_be_bytes([pe[col_off], pe[col_off + 1], pe[col_off + 2], pe[col_off + 3]]);
|
||||
if col_ptr == 0 { continue; }
|
||||
if !is_in_ranges(col_ptr, &rdata_ranges) { continue; }
|
||||
|
||||
// Try to extract the TypeDescriptor mangled-name string.
|
||||
if let Some((td_ptr, hierarchy_ptr)) = read_col(pe, image_base, col_ptr)
|
||||
&& let Some(mangled) = read_typedescriptor_name(pe, image_base, td_ptr, &rdata_ranges)
|
||||
&& let Some(class) = demangle_rtti_typename(&mangled)
|
||||
{
|
||||
v.col_address = Some(col_ptr);
|
||||
v.class_name = class;
|
||||
v.rtti_present = true;
|
||||
v.base_classes_json = read_class_hierarchy(pe, image_base, hierarchy_ptr, &rdata_ranges);
|
||||
}
|
||||
}
|
||||
|
||||
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||
let rtti_count = candidates.iter().filter(|v| v.rtti_present).count();
|
||||
metrics::histogram!("analysis.phase_ms", "phase" => "vtables").record(elapsed_ms);
|
||||
tracing::info!(
|
||||
vtables = candidates.len(),
|
||||
rtti = rtti_count,
|
||||
anon = candidates.len() - rtti_count,
|
||||
elapsed_ms,
|
||||
"vtable scan complete"
|
||||
);
|
||||
candidates
|
||||
}
|
||||
|
||||
/// Returns `true` when `addr` falls inside any of the half-open
/// `[start, end)` intervals in `ranges`.
fn is_in_ranges(addr: u32, ranges: &[(u32, u32)]) -> bool {
    for &(lo, hi) in ranges {
        if (lo..hi).contains(&addr) {
            return true;
        }
    }
    false
}
|
||||
|
||||
/// Read 4 big-endian bytes at absolute VA `addr` from the PE image.
///
/// `pe[0]` corresponds to VA `image_base`. Returns `None` when `addr` lies
/// below `image_base` or when any of the four bytes falls outside `pe`.
fn read_be_u32(pe: &[u8], image_base: u32, addr: u32) -> Option<u32> {
    // Checked arithmetic: an address below the image base must be rejected
    // rather than wrapped into a huge (or, on 32-bit hosts, aliasing) offset.
    let off = addr.checked_sub(image_base)? as usize;
    let end = off.checked_add(4)?;
    let word = pe.get(off..end)?;
    Some(u32::from_be_bytes([word[0], word[1], word[2], word[3]]))
}
|
||||
|
||||
/// Parse a `CompleteObjectLocator` at VA `col`. Returns
|
||||
/// `(type_descriptor_ptr, class_hierarchy_descriptor_ptr)` on success.
|
||||
///
|
||||
/// Layout (32-bit MSVC):
|
||||
/// ```text
|
||||
/// +0x00 signature (0 for x86 without /GR-, can be 1)
|
||||
/// +0x04 offset within complete object
|
||||
/// +0x08 cdOffset (this-pointer adjuster)
|
||||
/// +0x0C TypeDescriptor *
|
||||
/// +0x10 RTTIClassHierarchyDescriptor *
|
||||
/// ```
|
||||
fn read_col(pe: &[u8], image_base: u32, col: u32) -> Option<(u32, u32)> {
|
||||
let td = read_be_u32(pe, image_base, col + 0x0C)?;
|
||||
let chd = read_be_u32(pe, image_base, col + 0x10)?;
|
||||
if td == 0 { return None; }
|
||||
Some((td, chd))
|
||||
}
|
||||
|
||||
/// Read a TypeDescriptor's mangled-name string at VA `td`.
|
||||
///
|
||||
/// Layout: `+0x00` vftable ptr, `+0x04` "spare", `+0x08` zero-terminated
|
||||
/// mangled name (e.g. `.?AVClassName@@`).
|
||||
fn read_typedescriptor_name(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
td: u32,
|
||||
rdata_ranges: &[(u32, u32)],
|
||||
) -> Option<String> {
|
||||
if !is_in_ranges(td, rdata_ranges) { return None; }
|
||||
let name_va = td + 0x08;
|
||||
let off = name_va.wrapping_sub(image_base) as usize;
|
||||
if off + 1 > pe.len() { return None; }
|
||||
// Read up to 256 bytes or until NUL.
|
||||
let mut end = off;
|
||||
while end < pe.len().min(off + 256) && pe[end] != 0 { end += 1; }
|
||||
if end == off { return None; }
|
||||
let s = std::str::from_utf8(&pe[off..end]).ok()?;
|
||||
// Sanity: MSVC RTTI names always start with `.?A`.
|
||||
if !s.starts_with(".?A") { return None; }
|
||||
Some(s.to_string())
|
||||
}
|
||||
|
||||
/// Demangle an RTTI type-name string of the form `.?AVClassName@ns@@`.
|
||||
/// MSVC convention: leading `.` is the marker for an RTTI string; strip it
|
||||
/// before passing to the demangler.
|
||||
fn demangle_rtti_typename(rtti_name: &str) -> Option<String> {
|
||||
let stripped = rtti_name.strip_prefix('.')?;
|
||||
let raw = msvc_demangler::demangle(stripped, msvc_demangler::DemangleFlags::llvm()).ok()?;
|
||||
// Output looks like `class xe::apu::AudioSystem` or `struct foo::Bar`.
|
||||
let cls = raw
|
||||
.strip_prefix("class ")
|
||||
.or_else(|| raw.strip_prefix("struct "))
|
||||
.or_else(|| raw.strip_prefix("union "))
|
||||
.unwrap_or(&raw);
|
||||
Some(cls.to_string())
|
||||
}
|
||||
|
||||
/// Best-effort `RTTIClassHierarchyDescriptor` walk: read the
|
||||
/// `BaseClassArray` entries and demangle each base's TypeDescriptor name.
|
||||
/// Returns a JSON array string on success.
|
||||
///
|
||||
/// Layout:
|
||||
/// ```text
|
||||
/// RTTIClassHierarchyDescriptor:
|
||||
/// +0x00 signature
|
||||
/// +0x04 attributes
|
||||
/// +0x08 numBaseClasses
|
||||
/// +0x0C BaseClassArray * (-> array of BaseClassDescriptor *)
|
||||
/// BaseClassDescriptor:
|
||||
/// +0x00 TypeDescriptor *
|
||||
/// +0x04 numContainedBases
|
||||
/// ...
|
||||
/// ```
|
||||
fn read_class_hierarchy(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
chd: u32,
|
||||
rdata_ranges: &[(u32, u32)],
|
||||
) -> Option<String> {
|
||||
if !is_in_ranges(chd, rdata_ranges) { return None; }
|
||||
let num_bases = read_be_u32(pe, image_base, chd + 0x08)?;
|
||||
if num_bases == 0 || num_bases > 256 { return None; } // sanity cap
|
||||
let bca_ptr = read_be_u32(pe, image_base, chd + 0x0C)?;
|
||||
if !is_in_ranges(bca_ptr, rdata_ranges) { return None; }
|
||||
|
||||
let mut names: Vec<String> = Vec::new();
|
||||
for i in 0..num_bases {
|
||||
let bcd_ptr = match read_be_u32(pe, image_base, bca_ptr + i * 4) {
|
||||
Some(p) if is_in_ranges(p, rdata_ranges) => p,
|
||||
_ => return None,
|
||||
};
|
||||
let td_ptr = match read_be_u32(pe, image_base, bcd_ptr) {
|
||||
Some(p) if is_in_ranges(p, rdata_ranges) => p,
|
||||
_ => return None,
|
||||
};
|
||||
let mangled = match read_typedescriptor_name(pe, image_base, td_ptr, rdata_ranges) {
|
||||
Some(s) => s,
|
||||
None => return None,
|
||||
};
|
||||
let cls = demangle_rtti_typename(&mangled).unwrap_or(mangled);
|
||||
names.push(cls);
|
||||
}
|
||||
serde_json::to_string(&names).ok()
|
||||
}
|
||||
|
||||
/// Build the synthetic `ANON_Class_<hex>` name for a vtable without RTTI.
///
/// The name is the low 32 bits of a 64-bit FNV-1a hash taken over the
/// little-endian bytes of the method PCs after sorting them, printed as
/// eight upper-case hex digits. Because the PCs are sorted first, the result
/// does not depend on slot order: any two vtables holding the same multiset
/// of method addresses receive the same name.
fn synth_anon_name(methods: &[u32]) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;

    let mut pcs = methods.to_vec();
    pcs.sort_unstable();

    let digest = pcs
        .iter()
        .flat_map(|pc| pc.to_le_bytes())
        .fold(FNV_OFFSET_BASIS, |acc, byte| {
            (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
        });

    // Only the low 32 bits are kept, for brevity.
    format!("ANON_Class_{:08X}", digest as u32)
}
|
||||
|
||||
/// Build the per-method `(vtable_address, slot, function_address)` list for
|
||||
/// DB insertion, with optional demangled-name lookup for any function that
|
||||
/// has a matching `?…` label. Skips slots whose function isn't in the
|
||||
/// supplied label map.
|
||||
pub fn methods_table(
|
||||
vtables: &[Vtable],
|
||||
labels: &std::collections::HashMap<u32, String>,
|
||||
) -> Vec<(u32, u32, u32, Option<String>, Option<String>)> {
|
||||
let mut out = Vec::new();
|
||||
for v in vtables {
|
||||
for (slot, &fn_va) in v.methods.iter().enumerate() {
|
||||
let label = labels.get(&fn_va).cloned();
|
||||
let demangled = label.as_ref()
|
||||
.and_then(|l| demangle::demangle(l).map(|d| d.raw_demangled));
|
||||
out.push((v.address, slot as u32, fn_va, label, demangled));
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// Build a `class_name → Vtable` summary for the `classes` table. Multiple
|
||||
/// vtables sharing the same class name (multiple instances at link time)
|
||||
/// collapse via `BTreeMap` — the first detected vtable wins.
|
||||
pub fn classes_table(vtables: &[Vtable]) -> Vec<(String, u32, bool, Option<String>)> {
|
||||
let mut by_name: BTreeMap<String, &Vtable> = BTreeMap::new();
|
||||
for v in vtables {
|
||||
by_name.entry(v.class_name.clone()).or_insert(v);
|
||||
}
|
||||
by_name
|
||||
.into_iter()
|
||||
.map(|(name, v)| (name, v.address, v.rtti_present, v.base_classes_json.clone()))
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    const IMAGE_BASE: u32 = 0x8200_0000;
    const RDATA_VA: u32 = 0x1000;
    const TEXT_VA: u32 = 0x2000;
    const TEXT_SIZE: u32 = 0x100;

    fn section(name: &str, va: u32, size: u32, flags: u32) -> PeSection {
        PeSection {
            name: name.into(),
            virtual_address: va,
            virtual_size: size,
            raw_offset: va,
            raw_size: size,
            flags,
        }
    }

    /// Build an image whose `.rdata` starts with `method_pcs` written as
    /// big-endian words, and run the scan with exactly those PCs registered
    /// as function starts.
    fn scan_with_methods(method_pcs: &[u32], rdata_size: u32) -> Vec<Vtable> {
        // PE buffer big enough for both sections.
        let mut pe = vec![0u8; (TEXT_VA + TEXT_SIZE) as usize];
        for (i, val) in method_pcs.iter().enumerate() {
            let at = RDATA_VA as usize + i * 4;
            pe[at..at + 4].copy_from_slice(&val.to_be_bytes());
        }

        let sections = vec![
            section(".rdata", RDATA_VA, rdata_size, 0x4000_0040),
            section(".text", TEXT_VA, TEXT_SIZE, 0x6000_0020),
        ];
        let function_starts: std::collections::BTreeSet<u32> =
            method_pcs.iter().copied().collect();

        analyze(&pe, IMAGE_BASE, &sections, &function_starts)
    }

    #[test]
    fn synth_anon_name_is_stable() {
        let a = synth_anon_name(&[0x82001000, 0x82001100, 0x82001200]);
        let b = synth_anon_name(&[0x82001200, 0x82001000, 0x82001100]);
        assert_eq!(a, b, "anon name must be order-independent");
    }

    #[test]
    fn synth_anon_name_differs_for_different_methods() {
        let a = synth_anon_name(&[0x82001000, 0x82001100]);
        let b = synth_anon_name(&[0x82002000, 0x82002100]);
        assert_ne!(a, b);
    }

    #[test]
    fn detects_3_method_vtable_in_rdata() {
        // Vtable: 3 method PCs at .rdata start, all valid function entries.
        let m: [u32; 3] = [
            IMAGE_BASE + TEXT_VA,
            IMAGE_BASE + TEXT_VA + 0x10,
            IMAGE_BASE + TEXT_VA + 0x20,
        ];

        let vtables = scan_with_methods(&m, 16);
        assert_eq!(vtables.len(), 1);
        assert_eq!(vtables[0].length, 3);
        assert_eq!(vtables[0].address, IMAGE_BASE + RDATA_VA);
        assert!(vtables[0].class_name.starts_with("ANON_Class_"));
        assert!(!vtables[0].rtti_present);
    }

    #[test]
    fn rejects_2_method_run() {
        let m: [u32; 2] = [IMAGE_BASE + TEXT_VA, IMAGE_BASE + TEXT_VA + 0x10];

        let vtables = scan_with_methods(&m, 8);
        assert_eq!(
            vtables.len(),
            0,
            "runs of 2 must be rejected to keep false-positive rate down"
        );
    }
}
|
||||
@@ -8,23 +8,25 @@ use crate::func::FuncAnalysis;
|
||||
|
||||
/// Kind of cross-reference edge. The derived ordering follows declaration
/// order.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum XrefKind {
    /// `bl`
    Call,
    /// `bcctrl` through a statically-resolvable vtable slot (M5)
    IndirectCall,
    /// `b` (unconditional)
    Jump,
    /// `bc` / `bXX` (conditional)
    Branch,
    /// `lwz`, `lbz`, `lhz`, `lha`, `lfs`, `lfd`, etc. from resolved address
    DataRead,
    /// `stw`, `stb`, `sth`, `stfs`, `stfd`, etc. to resolved address
    DataWrite,
    /// address computed via `lis` + `addi`/`ori` but not directly loaded/stored
    DataRef,
}
|
||||
|
||||
impl XrefKind {
|
||||
pub fn tag(self) -> &'static str {
|
||||
match self {
|
||||
XrefKind::Call => "call",
|
||||
XrefKind::Jump => "j",
|
||||
XrefKind::Branch => "br",
|
||||
XrefKind::DataRead => "read",
|
||||
XrefKind::DataWrite => "write",
|
||||
XrefKind::DataRef => "ref",
|
||||
XrefKind::Call => "call",
|
||||
XrefKind::IndirectCall => "ind_call",
|
||||
XrefKind::Jump => "j",
|
||||
XrefKind::Branch => "br",
|
||||
XrefKind::DataRead => "read",
|
||||
XrefKind::DataWrite => "write",
|
||||
XrefKind::DataRef => "ref",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -37,10 +39,56 @@ impl XrefKind {
|
||||
}
|
||||
}
|
||||
|
||||
/// Sub-classification of how `source`'s instruction computes its target
/// address. Only meaningful for data xrefs (`read` / `write` / `ref`); call
/// / jump / branch / ind_call rows store `None`.
///
/// Each variant has a stable text tag, returned by `AddrMode::tag`.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
pub enum AddrMode {
    /// Standard signed-16 displacement: `lwz rD, simm(rA)`, `stw rS, simm(rA)`,
    /// FP D-forms (`lfs/lfd/stfs/stfd`), update variants. The dominant case.
    DForm,
    /// Address materialised via `lis + addi` register tracking — no
    /// load/store yet at this site.
    LisAddi,
    /// Address materialised via `lis + ori` register tracking — like
    /// `LisAddi`, the address is only computed here, not accessed.
    LisOri,
    /// Multi-word D-form: `lmw / stmw rS, simm(rA)` — emits one xref per
    /// register slot (32-rS slots starting at the resolved base).
    Multiword,
    /// X-form indexed: `stwx / stbx / sthx / stwux / stbux / sthux / stdx /
    /// stdux` plus AltiVec/VMX vector stores `stvx / stvxl / stvebx /
    /// stvehx / stvewx`. Static resolution requires both rA and rB
    /// constant. (M6 + VMX follow-up.)
    XFormIndexed,
    /// X-form byte-reverse: `stwbrx / sthbrx / lwbrx / lhbrx`.
    XFormByteRev,
    /// Reservation/atomic store-conditional: `stwcx. / stdcx.`.
    Atomic,
    /// Cache-line clear: `dcbz rA, rB` — clears 32 bytes at rA+rB.
    DCBZ,
}
|
||||
|
||||
impl AddrMode {
|
||||
pub fn tag(self) -> &'static str {
|
||||
match self {
|
||||
AddrMode::DForm => "d_form",
|
||||
AddrMode::LisAddi => "lis_addi",
|
||||
AddrMode::LisOri => "lis_ori",
|
||||
AddrMode::Multiword => "multiword",
|
||||
AddrMode::XFormIndexed => "x_form_indexed",
|
||||
AddrMode::XFormByteRev => "x_form_byterev",
|
||||
AddrMode::Atomic => "atomic",
|
||||
AddrMode::DCBZ => "dcbz",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// One incoming cross-reference. The referenced (target) address is not
/// stored here; it is the key under which the entry sits in an `XrefMap`.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct Xref {
    /// Address of the instruction that makes the reference.
    pub source: u32,
    /// What kind of edge this is (call, jump, data read, …).
    pub kind: XrefKind,
    /// `None` for control-flow edges; `Some(...)` for data edges.
    pub addr_mode: Option<AddrMode>,
}
|
||||
|
||||
pub type XrefMap = HashMap<u32, Vec<Xref>>;
|
||||
@@ -53,6 +101,7 @@ pub struct XrefResult {
|
||||
}
|
||||
|
||||
/// Perform full cross-reference analysis on a PE image.
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base), entry_point = format_args!("{:#010x}", entry_point)))]
|
||||
pub fn analyze_xrefs(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
@@ -61,6 +110,7 @@ pub fn analyze_xrefs(
|
||||
func_analysis: &FuncAnalysis,
|
||||
import_map: &HashMap<u32, String>,
|
||||
) -> XrefResult {
|
||||
let started = std::time::Instant::now();
|
||||
let func_labels = func_analysis.generate_labels();
|
||||
let mut labels: HashMap<u32, String> = func_labels;
|
||||
labels.insert(entry_point, "entry_point".to_string());
|
||||
@@ -124,7 +174,7 @@ pub fn analyze_xrefs(
|
||||
let rd = ((instr >> 21) & 0x1F) as usize;
|
||||
let ra = ((instr >> 16) & 0x1F) as usize;
|
||||
let simm = ((instr & 0xFFFF) as i16) as i32;
|
||||
let uimm = (instr & 0xFFFF) as u32;
|
||||
let uimm = instr & 0xFFFF;
|
||||
|
||||
// Reset tracking on function boundaries (prologue = mfspr rN, LR)
|
||||
if opcode == 31 {
|
||||
@@ -156,7 +206,10 @@ pub fn analyze_xrefs(
|
||||
let data_addr = base.wrapping_add(simm as u32);
|
||||
if is_in_ranges(data_addr, &data_ranges) {
|
||||
data_annotations.insert(abs_addr, (data_addr, XrefKind::DataRef));
|
||||
xrefs.entry(data_addr).or_default().push(Xref { source: abs_addr, kind: XrefKind::DataRef });
|
||||
xrefs.entry(data_addr).or_default().push(Xref {
|
||||
source: abs_addr, kind: XrefKind::DataRef,
|
||||
addr_mode: Some(AddrMode::LisAddi),
|
||||
});
|
||||
labels.entry(data_addr).or_insert_with(|| format!("dat_{data_addr:08X}"));
|
||||
}
|
||||
reg_hi[rd] = Some(data_addr); // propagate for chained access
|
||||
@@ -171,7 +224,10 @@ pub fn analyze_xrefs(
|
||||
let data_addr = base | uimm;
|
||||
if is_in_ranges(data_addr, &data_ranges) {
|
||||
data_annotations.insert(abs_addr, (data_addr, XrefKind::DataRef));
|
||||
xrefs.entry(data_addr).or_default().push(Xref { source: abs_addr, kind: XrefKind::DataRef });
|
||||
xrefs.entry(data_addr).or_default().push(Xref {
|
||||
source: abs_addr, kind: XrefKind::DataRef,
|
||||
addr_mode: Some(AddrMode::LisOri),
|
||||
});
|
||||
labels.entry(data_addr).or_insert_with(|| format!("dat_{data_addr:08X}"));
|
||||
}
|
||||
reg_hi[ra] = Some(data_addr);
|
||||
@@ -180,33 +236,163 @@ pub fn analyze_xrefs(
|
||||
}
|
||||
}
|
||||
// Load instructions: lwz, lbz, lhz, lha, lfs, lfd, lwzu, etc.
|
||||
32 | 33 | 34 | 35 | 40 | 41 | 42 | 43 | 46 | 48 | 49 | 50 | 51 => {
|
||||
if ra != 0 {
|
||||
if let Some(base) = reg_hi[ra] {
|
||||
32 | 33 | 34 | 35 | 40 | 41 | 42 | 43 | 48 | 49 | 50 | 51 => {
|
||||
if ra != 0
|
||||
&& let Some(base) = reg_hi[ra] {
|
||||
let data_addr = base.wrapping_add(simm as u32);
|
||||
if is_in_ranges(data_addr, &data_ranges) {
|
||||
data_annotations.insert(abs_addr, (data_addr, XrefKind::DataRead));
|
||||
xrefs.entry(data_addr).or_default().push(Xref { source: abs_addr, kind: XrefKind::DataRead });
|
||||
xrefs.entry(data_addr).or_default().push(Xref {
|
||||
source: abs_addr, kind: XrefKind::DataRead,
|
||||
addr_mode: Some(AddrMode::DForm),
|
||||
});
|
||||
labels.entry(data_addr).or_insert_with(|| format!("dat_{data_addr:08X}"));
|
||||
}
|
||||
}
|
||||
}
|
||||
// Load into rD may clobber the tracked value
|
||||
reg_hi[rd] = None;
|
||||
}
|
||||
// lmw rD, simm(rA) — D-form multi-word load. Reads (32-rD)
|
||||
// consecutive 4-byte words starting at base+simm into
|
||||
// rD..r31. Emits one DataRead per slot.
|
||||
46 => {
|
||||
if ra != 0
|
||||
&& let Some(base) = reg_hi[ra]
|
||||
{
|
||||
let mut addr_w = base.wrapping_add(simm as u32);
|
||||
for _slot in (rd as u32)..32 {
|
||||
if is_in_ranges(addr_w, &data_ranges) {
|
||||
data_annotations.insert(abs_addr, (addr_w, XrefKind::DataRead));
|
||||
xrefs.entry(addr_w).or_default().push(Xref {
|
||||
source: abs_addr, kind: XrefKind::DataRead,
|
||||
addr_mode: Some(AddrMode::Multiword),
|
||||
});
|
||||
labels.entry(addr_w).or_insert_with(|| format!("dat_{addr_w:08X}"));
|
||||
}
|
||||
addr_w = addr_w.wrapping_add(4);
|
||||
}
|
||||
}
|
||||
reg_hi[rd] = None;
|
||||
}
|
||||
// Store instructions: stw, stb, sth, stfs, stfd, stwu, etc.
|
||||
36 | 37 | 38 | 39 | 44 | 45 | 47 | 52 | 53 | 54 | 55 => {
|
||||
if ra != 0 {
|
||||
if let Some(base) = reg_hi[ra] {
|
||||
36 | 37 | 38 | 39 | 44 | 45 | 52 | 53 | 54 | 55 => {
|
||||
if ra != 0
|
||||
&& let Some(base) = reg_hi[ra] {
|
||||
let data_addr = base.wrapping_add(simm as u32);
|
||||
if is_in_ranges(data_addr, &data_ranges) {
|
||||
data_annotations.insert(abs_addr, (data_addr, XrefKind::DataWrite));
|
||||
xrefs.entry(data_addr).or_default().push(Xref { source: abs_addr, kind: XrefKind::DataWrite });
|
||||
xrefs.entry(data_addr).or_default().push(Xref {
|
||||
source: abs_addr, kind: XrefKind::DataWrite,
|
||||
addr_mode: Some(AddrMode::DForm),
|
||||
});
|
||||
labels.entry(data_addr).or_insert_with(|| format!("dat_{data_addr:08X}"));
|
||||
}
|
||||
}
|
||||
}
|
||||
// stmw rS, simm(rA) — D-form multi-word store. Writes
|
||||
// (32-rS) consecutive 4-byte words from rS..r31 to
|
||||
// base+simm onward. Emits one DataWrite per slot.
|
||||
47 => {
|
||||
if ra != 0
|
||||
&& let Some(base) = reg_hi[ra]
|
||||
{
|
||||
let mut addr_w = base.wrapping_add(simm as u32);
|
||||
for _slot in (rd as u32)..32 {
|
||||
if is_in_ranges(addr_w, &data_ranges) {
|
||||
data_annotations.insert(abs_addr, (addr_w, XrefKind::DataWrite));
|
||||
xrefs.entry(addr_w).or_default().push(Xref {
|
||||
source: abs_addr, kind: XrefKind::DataWrite,
|
||||
addr_mode: Some(AddrMode::Multiword),
|
||||
});
|
||||
labels.entry(addr_w).or_insert_with(|| format!("dat_{addr_w:08X}"));
|
||||
}
|
||||
addr_w = addr_w.wrapping_add(4);
|
||||
}
|
||||
}
|
||||
}
|
||||
// X-form: opcode 31 — indexed loads/stores, atomic ops, dcbz.
|
||||
// We can't statically resolve `rA + rB` without tracking rB
|
||||
// too; we record an xref ONLY when rB is also a known
|
||||
// constant (rare) OR when rB is r0 (which encodes as zero).
|
||||
// Falls through to the generic-clobber arm afterwards via
|
||||
// the explicit reg_hi update.
|
||||
31 => {
|
||||
let xo = (instr >> 1) & 0x3FF;
|
||||
let rb = ((instr >> 11) & 0x1F) as usize;
|
||||
let resolve_rab = |reg_hi: &[Option<u32>; 32]| -> Option<u32> {
|
||||
let a = if ra == 0 { Some(0u32) } else { reg_hi[ra] };
|
||||
let b = if rb == 0 { Some(0u32) } else { reg_hi[rb] };
|
||||
match (a, b) {
|
||||
(Some(av), Some(bv)) => Some(av.wrapping_add(bv)),
|
||||
_ => None,
|
||||
}
|
||||
};
|
||||
let mode_for_xo = |xo: u32| -> Option<(AddrMode, XrefKind)> {
|
||||
match xo {
|
||||
// Atomic store-conditional
|
||||
150 => Some((AddrMode::Atomic, XrefKind::DataWrite)), // stwcx.
|
||||
214 => Some((AddrMode::Atomic, XrefKind::DataWrite)), // stdcx.
|
||||
// Byte-reverse stores
|
||||
662 => Some((AddrMode::XFormByteRev, XrefKind::DataWrite)), // stwbrx
|
||||
918 => Some((AddrMode::XFormByteRev, XrefKind::DataWrite)), // sthbrx
|
||||
// Byte-reverse loads
|
||||
534 => Some((AddrMode::XFormByteRev, XrefKind::DataRead)), // lwbrx
|
||||
790 => Some((AddrMode::XFormByteRev, XrefKind::DataRead)), // lhbrx
|
||||
// dcbz — cache-line zero (32-byte clear). Treat as a write.
|
||||
1014 => Some((AddrMode::DCBZ, XrefKind::DataWrite)),
|
||||
// Plain X-form indexed stores (the common ones)
|
||||
151 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stwx
|
||||
215 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stbx
|
||||
407 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // sthx
|
||||
183 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stwux
|
||||
247 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stbux
|
||||
439 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // sthux
|
||||
149 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stdx
|
||||
181 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stdux
|
||||
// Plain X-form indexed loads
|
||||
23 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lwzx
|
||||
87 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lbzx
|
||||
279 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lhzx
|
||||
343 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lhax
|
||||
55 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lwzux
|
||||
119 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lbzux
|
||||
311 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lhzux
|
||||
375 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lhaux
|
||||
21 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // ldx
|
||||
53 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // ldux
|
||||
// AltiVec/VMX (opcode 31) loads & stores. Element
|
||||
// variants store one byte/halfword/word; full
|
||||
// `stvx` stores 16 bytes. Address resolution still
|
||||
// requires both rA and rB constant — common only
|
||||
// in static-table setup loops.
|
||||
231 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stvx
|
||||
487 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stvxl
|
||||
135 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stvebx
|
||||
167 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stvehx
|
||||
199 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stvewx
|
||||
// AltiVec/VMX loads — same XO range, kind=read.
|
||||
103 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lvx
|
||||
359 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lvxl
|
||||
7 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lvebx
|
||||
39 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lvehx
|
||||
71 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lvewx
|
||||
_ => None,
|
||||
}
|
||||
};
|
||||
if let Some((addr_mode, kind)) = mode_for_xo(xo)
|
||||
&& let Some(data_addr) = resolve_rab(&reg_hi)
|
||||
&& is_in_ranges(data_addr, &data_ranges)
|
||||
{
|
||||
data_annotations.insert(abs_addr, (data_addr, kind));
|
||||
xrefs.entry(data_addr).or_default().push(Xref {
|
||||
source: abs_addr, kind,
|
||||
addr_mode: Some(addr_mode),
|
||||
});
|
||||
labels.entry(data_addr).or_insert_with(|| format!("dat_{data_addr:08X}"));
|
||||
}
|
||||
// Fall through: any X-form op may write rD; invalidate.
|
||||
reg_hi[rd] = None;
|
||||
}
|
||||
// Any other instruction writing to rD: invalidate
|
||||
_ => {
|
||||
// Conservatively invalidate for instructions that modify rD
|
||||
@@ -221,6 +407,17 @@ pub fn analyze_xrefs(
|
||||
}
|
||||
}
|
||||
|
||||
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||
metrics::histogram!("analysis.phase_ms", "phase" => "xrefs").record(elapsed_ms);
|
||||
let total_xrefs: usize = xrefs.values().map(|v| v.len()).sum();
|
||||
tracing::info!(
|
||||
labels = labels.len(),
|
||||
xrefs = total_xrefs,
|
||||
data_annotations = data_annotations.len(),
|
||||
elapsed_ms,
|
||||
"xref analysis complete"
|
||||
);
|
||||
|
||||
XrefResult { labels, xrefs, data_annotations }
|
||||
}
|
||||
|
||||
@@ -235,7 +432,7 @@ fn collect_branch_target(instr: u32, addr: u32, labels: &mut HashMap<u32, String
|
||||
let target = if aa { li as u32 } else { addr.wrapping_add(li as u32) };
|
||||
labels.entry(target).or_insert_with(|| format!("loc_{target:08X}"));
|
||||
let kind = if lk { XrefKind::Call } else { XrefKind::Jump };
|
||||
xrefs.entry(target).or_default().push(Xref { source: addr, kind });
|
||||
xrefs.entry(target).or_default().push(Xref { source: addr, kind, addr_mode: None });
|
||||
}
|
||||
16 => {
|
||||
// B-form: bc/bcl
|
||||
@@ -243,7 +440,7 @@ fn collect_branch_target(instr: u32, addr: u32, labels: &mut HashMap<u32, String
|
||||
let aa = instr & 2 != 0;
|
||||
let target = if aa { bd as u32 } else { addr.wrapping_add(bd as u32) };
|
||||
labels.entry(target).or_insert_with(|| format!("loc_{target:08X}"));
|
||||
xrefs.entry(target).or_default().push(Xref { source: addr, kind: XrefKind::Branch });
|
||||
xrefs.entry(target).or_default().push(Xref { source: addr, kind: XrefKind::Branch, addr_mode: None });
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
@@ -262,7 +459,7 @@ fn is_in_ranges(addr: u32, ranges: &[(u32, u32)]) -> bool {
|
||||
}
|
||||
|
||||
/// Find which section a data address falls in.
|
||||
pub fn section_for_addr<'a>(addr: u32, sections: &'a [PeSection], image_base: u32) -> Option<&'a str> {
|
||||
pub fn section_for_addr(addr: u32, sections: &[PeSection], image_base: u32) -> Option<&str> {
|
||||
for s in sections {
|
||||
let start = image_base + s.virtual_address;
|
||||
let end = start + s.virtual_size;
|
||||
@@ -285,12 +482,44 @@ pub fn resolve_source_label(
|
||||
}
|
||||
|
||||
// Find the containing function (largest start <= addr)
|
||||
if let Some((&func_start, _fi)) = func_analysis.functions.range(..=addr).next_back() {
|
||||
if let Some(func_label) = labels.get(&func_start) {
|
||||
if let Some((&func_start, _fi)) = func_analysis.functions.range(..=addr).next_back()
|
||||
&& let Some(func_label) = labels.get(&func_start) {
|
||||
let offset = addr - func_start;
|
||||
return format!("{func_label}+0x{offset:X}");
|
||||
}
|
||||
}
|
||||
|
||||
format!("0x{addr:08X}")
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Every `AddrMode` variant must map to a tag no other variant uses.
    #[test]
    fn addr_mode_tags_are_distinct() {
        let modes = [
            AddrMode::DForm,
            AddrMode::LisAddi,
            AddrMode::LisOri,
            AddrMode::Multiword,
            AddrMode::XFormIndexed,
            AddrMode::XFormByteRev,
            AddrMode::Atomic,
            AddrMode::DCBZ,
        ];
        // Collapsing into a set drops duplicates, so equal lengths means all unique.
        let tags: std::collections::HashSet<&str> = modes.iter().map(|m| m.tag()).collect();
        assert_eq!(tags.len(), modes.len(), "every AddrMode variant must have a unique tag");
    }

    /// A data edge carries the addressing mode it was discovered through.
    #[test]
    fn xref_struct_carries_addr_mode_for_data_edges() {
        let x = Xref { source: 0x1234, kind: XrefKind::DataWrite, addr_mode: Some(AddrMode::DForm) };
        assert_eq!(x.addr_mode.unwrap().tag(), "d_form");
    }

    /// A control-flow edge has no addressing mode.
    #[test]
    fn xref_struct_addr_mode_is_none_for_call_edges() {
        let x = Xref { source: 0x1234, kind: XrefKind::Call, addr_mode: None };
        assert!(x.addr_mode.is_none());
    }
}
|
||||
|
||||
362
crates/xenia-analysis/tests/db_schema_golden.rs
Normal file
362
crates/xenia-analysis/tests/db_schema_golden.rs
Normal file
@@ -0,0 +1,362 @@
|
||||
//! DB schema golden — locks the column layout (names + types) of every
|
||||
//! table written by `DbWriter`. A schema change here without a fixture
|
||||
//! update fails the test, forcing a conscious decision before downstream
|
||||
//! query consumers break.
|
||||
//!
|
||||
//! The fixture is constructed in-process (no XEX/ISO needed): a small
|
||||
//! synthetic PE-shaped byte slice with one `.text` section of 4
|
||||
//! instructions, plus an empty import-library list and one detected
|
||||
//! function.
|
||||
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::io::Write;
|
||||
|
||||
use duckdb::Connection;
|
||||
|
||||
use xenia_analysis::DbWriter;
|
||||
use xenia_analysis::formatter::DisasmInfo;
|
||||
use xenia_analysis::func::{FuncAnalysis, FuncInfo};
|
||||
use xenia_analysis::xref::XrefMap;
|
||||
use xenia_xex::pe::PeSection;
|
||||
|
||||
/// Build a 16-byte `.text` section: 4 instructions (mflr / nop / blr / nop).
|
||||
fn synthetic_pe() -> (Vec<u8>, Vec<PeSection>, Vec<xenia_xex::header::ImportLibrary>) {
|
||||
// VA layout: image_base + 0x1000 = .text start (so RVA = 0x1000).
|
||||
// The DB writer expects pe[rva] to hold the byte at that RVA, so the
|
||||
// buffer must be at least 0x1000 + section_size bytes long.
|
||||
const RVA: usize = 0x1000;
|
||||
const TEXT: [u32; 4] = [
|
||||
// mfspr r12, LR (a.k.a. mflr r12) — opcode 31, xo 339, spr 8 (LR).
|
||||
// Encoded with spr halves swapped per the ISA: spr_field = (8<<5).
|
||||
(31u32 << 26) | (12 << 21) | ((8 << 5) << 11) | (339 << 1),
|
||||
0x60000000, // nop (ori r0, r0, 0)
|
||||
(19u32 << 26) | (20 << 21) | (16 << 1), // blr (bclr 20, 0)
|
||||
0x60000000, // nop
|
||||
];
|
||||
|
||||
let mut pe = vec![0u8; RVA + 16];
|
||||
for (i, &word) in TEXT.iter().enumerate() {
|
||||
pe[RVA + i * 4..RVA + i * 4 + 4].copy_from_slice(&word.to_be_bytes());
|
||||
}
|
||||
|
||||
let sections = vec![PeSection {
|
||||
name: ".text".to_string(),
|
||||
virtual_address: 0x1000,
|
||||
virtual_size: 16,
|
||||
raw_offset: 0x1000,
|
||||
raw_size: 16,
|
||||
flags: 0x60000020, // CODE | EXECUTE | READ
|
||||
}];
|
||||
|
||||
let import_libraries = vec![]; // No imports in the fixture.
|
||||
(pe, sections, import_libraries)
|
||||
}
|
||||
|
||||
fn synthetic_func_analysis(image_base: u32) -> FuncAnalysis {
|
||||
// Single function covering all four .text instructions.
|
||||
let entry = image_base + 0x1000;
|
||||
let mut functions = BTreeMap::new();
|
||||
functions.insert(
|
||||
entry,
|
||||
FuncInfo {
|
||||
start: entry,
|
||||
end: entry + 16,
|
||||
frame_size: 0,
|
||||
saved_gprs: 0,
|
||||
is_leaf: true,
|
||||
is_saverestore: false,
|
||||
pdata_validated: false,
|
||||
pdata_length: None,
|
||||
has_eh: false,
|
||||
},
|
||||
);
|
||||
FuncAnalysis {
|
||||
functions,
|
||||
save_gpr_base: None,
|
||||
restore_gpr_base: None,
|
||||
pdata_entries: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Writes the synthetic fixture through `DbWriter` into a fresh DuckDB file,
/// then checks that every table has exactly the expected `(name, type)`
/// column list, that the populated tables hold the expected row counts, and
/// that every expected SQL view exists.
#[test]
fn db_schema_matches_expected_columns() {
    let (pe, sections, libs) = synthetic_pe();
    let image_base = 0x82000000u32;
    let entry = image_base + 0x1000;

    let info = DisasmInfo {
        image_base,
        entry_point: entry,
        original_pe_name: Some("synthetic.exe"),
        title_id: Some(0xDEADBEEF),
        media_id: Some(0xCAFEF00D),
        sections: &sections,
        import_libraries: &libs,
    };

    let func_analysis = synthetic_func_analysis(image_base);
    let mut labels: HashMap<u32, String> = HashMap::new();
    labels.insert(entry, "entry_point".to_string());
    let xrefs: XrefMap = XrefMap::new();

    // Start from a clean slate: a leftover file from an earlier run is removed.
    let tmp = std::env::temp_dir().join("xenia_rs_schema_golden.duckdb");
    let _ = std::fs::remove_file(&tmp);

    // Scope the writer so it is dropped before the DB is reopened for reading.
    {
        let mut w = DbWriter::open_fresh(&tmp).expect("open fresh DB");
        w.write_base(&info).expect("write_base");
        w.ingest_instructions(&pe, &info, &func_analysis, &labels)
            .expect("ingest_instructions");
        w.write_analysis_results(&pe, &info, &func_analysis, &labels, &xrefs, &[], &[], &[], None, &[])
            .expect("write_analysis_results");
        w.create_sql_views().expect("create_sql_views");
    }

    let conn = Connection::open(&tmp).expect("reopen DB");

    // Lock the column layout per table. Pairs are (name, type).
    let expected: &[(&str, &[(&str, &str)])] = &[
        ("metadata", &[
            ("key", "VARCHAR"),
            ("value", "VARCHAR"),
        ]),
        ("sections", &[
            ("name", "VARCHAR"),
            ("virtual_address", "BIGINT"),
            ("virtual_size", "BIGINT"),
            ("raw_offset", "BIGINT"),
            ("raw_size", "BIGINT"),
            ("flags", "BIGINT"),
            ("is_code", "BOOLEAN"),
        ]),
        ("imports", &[
            ("library", "VARCHAR"),
            ("ordinal", "BIGINT"),
            ("name", "VARCHAR"),
            ("record_type", "BIGINT"),
            ("address", "BIGINT"),
        ]),
        ("instructions", &[
            ("address", "BIGINT"),
            ("raw", "BIGINT"),
            ("mnemonic", "VARCHAR"),
            ("operands", "VARCHAR"),
            ("disasm", "VARCHAR"),
            ("ext_mnemonic", "VARCHAR"),
            ("ext_operands", "VARCHAR"),
            ("ext_disasm", "VARCHAR"),
            ("target_hex", "BIGINT"),
            ("section", "VARCHAR"),
            ("function", "BIGINT"),
            ("label", "VARCHAR"),
        ]),
        ("functions", &[
            ("address", "BIGINT"),
            ("name", "VARCHAR"),
            ("end_address", "BIGINT"),
            ("frame_size", "BIGINT"),
            ("saved_gprs", "BIGINT"),
            ("is_leaf", "BOOLEAN"),
            ("is_saverestore", "BOOLEAN"),
            ("pdata_validated", "BOOLEAN"),
            ("pdata_length", "BIGINT"),
            ("has_eh", "BOOLEAN"),
        ]),
        ("pdata_entries", &[
            ("begin_address", "BIGINT"),
            ("end_address", "BIGINT"),
            ("function_length", "BIGINT"),
            ("prolog_length", "BIGINT"),
            ("flags", "BIGINT"),
        ]),
        ("labels", &[
            ("address", "BIGINT"),
            ("name", "VARCHAR"),
            ("kind", "VARCHAR"),
        ]),
        ("demangled_names", &[
            ("address", "BIGINT"),
            ("mangled", "VARCHAR"),
            ("raw_demangled", "VARCHAR"),
            ("namespace_path", "VARCHAR"),
            ("class_name", "VARCHAR"),
            ("method_name", "VARCHAR"),
            ("params_signature", "VARCHAR"),
        ]),
        ("vtables", &[
            ("address", "BIGINT"),
            ("length", "BIGINT"),
            ("col_address", "BIGINT"),
            ("class_name", "VARCHAR"),
            ("rtti_present", "BOOLEAN"),
            ("base_classes_json", "VARCHAR"),
        ]),
        ("methods", &[
            ("vtable_address", "BIGINT"),
            ("slot", "BIGINT"),
            ("function_address", "BIGINT"),
            ("mangled_name", "VARCHAR"),
            ("demangled_name", "VARCHAR"),
        ]),
        ("classes", &[
            ("name", "VARCHAR"),
            ("vtable_address", "BIGINT"),
            ("rtti_present", "BOOLEAN"),
            ("base_classes_json", "VARCHAR"),
        ]),
        ("strings", &[
            ("address", "BIGINT"),
            ("encoding", "VARCHAR"),
            ("length", "BIGINT"),
            ("content", "VARCHAR"),
        ]),
        ("tls_info", &[
            ("raw_data_start", "BIGINT"),
            ("raw_data_end", "BIGINT"),
            ("index_address", "BIGINT"),
            ("callback_array", "BIGINT"),
            ("zero_fill_size", "BIGINT"),
            ("characteristics", "BIGINT"),
        ]),
        ("tls_callbacks", &[
            ("slot", "BIGINT"),
            ("address", "BIGINT"),
        ]),
        ("function_pointer_arrays", &[
            ("address", "BIGINT"),
            ("length", "BIGINT"),
            ("kind", "VARCHAR"),
        ]),
        ("function_pointer_array_entries", &[
            ("array_address", "BIGINT"),
            ("slot", "BIGINT"),
            ("function_address", "BIGINT"),
        ]),
        ("indirect_dispatch_sites", &[
            ("dispatch_pc", "BIGINT"),
            ("vptr_offset", "BIGINT"),
            ("slot", "BIGINT"),
            ("candidate_count", "BIGINT"),
        ]),
        ("indirect_dispatch_candidates", &[
            ("dispatch_pc", "BIGINT"),
            ("vtable_address", "BIGINT"),
            ("method_address", "BIGINT"),
        ]),
        ("vptr_writes", &[
            ("writer_pc", "BIGINT"),
            ("vtable_address", "BIGINT"),
            ("vptr_offset", "BIGINT"),
            ("writer_function", "BIGINT"),
        ]),
        ("eh_funcinfo", &[
            ("address", "BIGINT"),
            ("magic", "BIGINT"),
            ("max_state", "BIGINT"),
            ("p_unwind_map", "BIGINT"),
            ("n_try_blocks", "BIGINT"),
            ("p_try_block_map", "BIGINT"),
            ("n_ip_map_entries", "BIGINT"),
            ("p_ip_to_state_map", "BIGINT"),
            ("p_es_type_list", "BIGINT"),
            ("eh_flags", "BIGINT"),
        ]),
        ("eh_unwind_map", &[
            ("funcinfo_address", "BIGINT"),
            ("state_index", "BIGINT"),
            ("to_state", "BIGINT"),
            ("action_pc", "BIGINT"),
        ]),
        ("eh_try_blocks", &[
            ("funcinfo_address", "BIGINT"),
            ("try_index", "BIGINT"),
            ("try_low", "BIGINT"),
            ("try_high", "BIGINT"),
            ("catch_high", "BIGINT"),
            ("n_catches", "BIGINT"),
            ("p_handler_array", "BIGINT"),
        ]),
        ("xrefs", &[
            ("source", "BIGINT"),
            ("target", "BIGINT"),
            ("kind", "VARCHAR"),
            ("addr_mode", "VARCHAR"),
            ("instruction", "VARCHAR"),
            ("source_func", "BIGINT"),
            ("source_label", "VARCHAR"),
            ("target_label", "VARCHAR"),
        ]),
    ];

    // Collect every mismatch first so one run reports all drift at once.
    let mut errs: Vec<String> = Vec::new();
    for (table, cols) in expected {
        let mut stmt = conn
            .prepare(&format!("PRAGMA table_info('{}')", table))
            .unwrap_or_else(|e| panic!("prepare PRAGMA for {table}: {e}"));
        // Result columns 1 and 2 are read as the column name and its type.
        let rows: Vec<(String, String)> = stmt
            .query_map([], |row| {
                let name: String = row.get(1)?;
                let ty: String = row.get(2)?;
                Ok((name, ty))
            })
            .expect("query")
            .map(|r| r.unwrap())
            .collect();

        if rows.len() != cols.len() {
            writeln!(
                std::io::stderr(),
                "{table}: column count mismatch (got {}, expected {})",
                rows.len(),
                cols.len()
            ).ok();
            errs.push(format!("{table}: count {} vs {}", rows.len(), cols.len()));
        }
        // `zip` stops at the shorter side; a length difference is already
        // reported by the count check above.
        for (i, (got, expected_col)) in rows.iter().zip(cols.iter()).enumerate() {
            if got.0 != expected_col.0 || got.1 != expected_col.1 {
                errs.push(format!(
                    "{table} col {i}: got ({}, {}) expected ({}, {})",
                    got.0, got.1, expected_col.0, expected_col.1
                ));
            }
        }
    }

    assert!(errs.is_empty(), "schema drift detected:\n  {}", errs.join("\n  "));

    // Verify row counts in the populated tables.
    let n_instr: i64 = conn
        .query_row("SELECT COUNT(*) FROM instructions", [], |r| r.get(0))
        .unwrap();
    assert_eq!(n_instr, 4, "expected 4 instruction rows from the synthetic PE");

    // The synthetic mflr should produce target_hex = NULL, blr likewise (indirect).
    let n_with_target: i64 = conn
        .query_row("SELECT COUNT(target_hex) FROM instructions", [], |r| r.get(0))
        .unwrap();
    assert_eq!(n_with_target, 0, "indirect-only fixture should have no direct branch targets");

    // SQL views must be queryable. The `_` in SQL LIKE is a single-char
    // wildcard, so we list the names explicitly rather than `LIKE 'v_%'`
    // (which also matches DuckDB's built-in `views` system view).
    let expected_views = [
        "v_branch_xrefs",
        "v_call_graph",
        "v_function_first_instruction",
        "v_imports_called",
        "v_indirect_reachability_from_entry",
        "v_reachability_from_entry",
    ];
    for v in expected_views {
        let exists: i64 = conn
            .query_row(
                "SELECT COUNT(*) FROM duckdb_views() WHERE view_name = ?",
                [v],
                |r| r.get(0),
            )
            .unwrap();
        assert_eq!(exists, 1, "missing SQL view: {v}");
    }

    // Close the connection before deleting the file; removing a file that is
    // still open can fail on some platforms. Cleanup is best-effort.
    drop(conn);
    let _ = std::fs::remove_file(&tmp);
}
|
||||
123
crates/xenia-analysis/tests/disasm_goldens.rs
Normal file
123
crates/xenia-analysis/tests/disasm_goldens.rs
Normal file
@@ -0,0 +1,123 @@
|
||||
//! Analysis-side goldens: every row in the xenia-cpu fixtures must
|
||||
//! round-trip cleanly through the [`xenia_analysis::ppc`] shim. This
|
||||
//! pins the shim's behaviour to the canonical `xenia_cpu::disasm::format`
|
||||
//! output so that any future refactor of the shim layer surfaces here.
|
||||
//!
|
||||
//! Loads the same JSON fixtures committed under
|
||||
//! `crates/xenia-cpu/tests/golden/`. No separate analysis-side fixture
|
||||
//! files — the cpu canon is the source of truth.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use serde::Deserialize;
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct GoldenRow {
|
||||
label: String,
|
||||
raw: String,
|
||||
addr: String,
|
||||
mnemonic: String,
|
||||
operands: String,
|
||||
#[serde(default)]
|
||||
ext_mnemonic: Option<String>,
|
||||
#[serde(default)]
|
||||
ext_operands: Option<String>,
|
||||
#[serde(default)]
|
||||
branch_target: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct GoldenFile {
|
||||
rows: Vec<GoldenRow>,
|
||||
}
|
||||
|
||||
fn cpu_fixture(name: &str) -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("..")
|
||||
.join("xenia-cpu")
|
||||
.join("tests")
|
||||
.join("golden")
|
||||
.join(name)
|
||||
}
|
||||
|
||||
/// Parses a hexadecimal string into a `u32`, accepting an optional `0x` or
/// `0X` prefix.
///
/// # Panics
///
/// Panics with the message `"hex u32"` when the remaining text is not a
/// valid base-16 `u32` (empty, non-hex characters, or out of range).
fn parse_hex(s: &str) -> u32 {
    // Strip at most one prefix; fall back to the input unchanged when neither matches.
    let trimmed = s.strip_prefix("0x").or_else(|| s.strip_prefix("0X")).unwrap_or(s);
    u32::from_str_radix(trimmed, 16).expect("hex u32")
}
|
||||
|
||||
/// Verify the shim's `Decoded { base, ext }` mirrors the canonical fields
|
||||
/// from `xenia_cpu::disasm::format` for every fixture row.
|
||||
fn check_fixture(fixture_name: &str) {
|
||||
let path = cpu_fixture(fixture_name);
|
||||
assert!(
|
||||
path.exists(),
|
||||
"missing fixture {} — run `cargo test -p xenia-cpu --test disasm_goldens` to (re)generate it",
|
||||
path.display()
|
||||
);
|
||||
let src = std::fs::read_to_string(&path).unwrap();
|
||||
let golden: GoldenFile = serde_json::from_str(&src).unwrap();
|
||||
|
||||
for row in &golden.rows {
|
||||
let raw = parse_hex(&row.raw);
|
||||
let addr = parse_hex(&row.addr);
|
||||
|
||||
let canonical =
|
||||
xenia_cpu::disasm::format(&xenia_cpu::decode(raw, addr));
|
||||
let shim = xenia_analysis::ppc::disasm(raw, addr);
|
||||
|
||||
assert_eq!(
|
||||
shim.base, canonical.disasm,
|
||||
"shim.base drifted for {} (raw={})",
|
||||
row.label, row.raw,
|
||||
);
|
||||
assert_eq!(
|
||||
shim.ext, canonical.ext_disasm,
|
||||
"shim.ext drifted for {} (raw={})",
|
||||
row.label, row.raw,
|
||||
);
|
||||
|
||||
// Also pin against the fixture's structured fields — guards against
|
||||
// someone changing the cpu canon without regenerating the fixture.
|
||||
assert_eq!(canonical.mnemonic, row.mnemonic, "mnemonic drift: {}", row.label);
|
||||
assert_eq!(canonical.operands, row.operands, "operands drift: {}", row.label);
|
||||
assert_eq!(canonical.ext_mnemonic, row.ext_mnemonic, "ext_mnemonic drift: {}", row.label);
|
||||
assert_eq!(canonical.ext_operands, row.ext_operands, "ext_operands drift: {}", row.label);
|
||||
|
||||
let target_str = canonical.branch_target.map(|t| format!("0x{t:08X}"));
|
||||
assert_eq!(target_str, row.branch_target, "branch_target drift: {}", row.label);
|
||||
}
|
||||
}
|
||||
|
||||
/// Runs the shim-vs-canon comparison over the `base_mnemonics.json` fixture.
#[test]
fn analysis_shim_matches_base_mnemonics() {
    check_fixture("base_mnemonics.json");
}
|
||||
|
||||
/// Runs the shim-vs-canon comparison over the `extended_mnemonics.json` fixture.
#[test]
fn analysis_shim_matches_extended_mnemonics() {
    check_fixture("extended_mnemonics.json");
}
|
||||
|
||||
/// Runs the shim-vs-canon comparison over the `vmx128_registers.json` fixture.
#[test]
fn analysis_shim_matches_vmx128_registers() {
    check_fixture("vmx128_registers.json");
}
|
||||
|
||||
/// Spot-check that the shim's `display()` returns the extended form when
/// present and falls back to the base otherwise. This is the contract
/// `formatter.rs` and the .asm output rely on.
#[test]
fn shim_display_prefers_extended() {
    // ori r0, r0, 0  → base "ori r0, r0, 0x0", ext "nop"
    let d = xenia_analysis::ppc::disasm(0x60000000, 0);
    assert_eq!(d.display(), "nop");

    // addi r3, r1, 16  → no extended form, display falls back to base
    // (opcode 14 in bits 26.., rD = 3, rA = 1, immediate = 16).
    let raw = (14u32 << 26) | (3 << 21) | (1 << 16) | 16;
    let d = xenia_analysis::ppc::disasm(raw, 0);
    assert!(
        d.ext.is_none(),
        "addi r3, r1, 16 has no extended form (only addi r3, r0, … → li)"
    );
    assert_eq!(d.display(), d.base);
}
|
||||
@@ -20,9 +20,21 @@ xenia-apu = { workspace = true }
|
||||
xenia-hid = { workspace = true }
|
||||
xenia-debugger = { workspace = true }
|
||||
xenia-analysis = { workspace = true }
|
||||
xenia-ui = { workspace = true }
|
||||
winit = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
tracing-subscriber = { workspace = true }
|
||||
tracing-appender = { workspace = true }
|
||||
tracing-chrome = { workspace = true }
|
||||
tracing-error = { workspace = true }
|
||||
metrics = { workspace = true }
|
||||
metrics-util = { workspace = true }
|
||||
pprof = { workspace = true, optional = true }
|
||||
anyhow = { workspace = true }
|
||||
clap = { version = "4", features = ["derive"] }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
|
||||
[features]
|
||||
default = ["profiling"]
|
||||
profiling = ["dep:pprof"]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
384
crates/xenia-app/src/observability.rs
Normal file
384
crates/xenia-app/src/observability.rs
Normal file
@@ -0,0 +1,384 @@
|
||||
//! Logging, tracing, and profiling wiring for the `xenia-rs` CLI.
|
||||
//!
|
||||
//! Owns the `tracing-subscriber` registry, optional file / Chrome-trace sinks,
|
||||
//! the `metrics` debugging recorder, and (behind the `profiling` feature) the
|
||||
//! `pprof-rs` sampling profiler. All drop-time cleanup (flushing appenders,
|
||||
//! finalising Chrome output, writing flamegraphs, printing the metrics
|
||||
//! summary) is carried by [`ObservabilityGuards`] so `main` just has to hold
|
||||
//! the value until return.
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use tracing::Level;
|
||||
use tracing_error::{ErrorLayer, SpanTrace};
|
||||
use tracing_subscriber::fmt::format::FmtSpan;
|
||||
use tracing_subscriber::layer::SubscriberExt;
|
||||
use tracing_subscriber::util::SubscriberInitExt;
|
||||
use tracing_subscriber::{fmt, EnvFilter, Layer, Registry};
|
||||
|
||||
/// Observability options collected from the command line and environment.
///
/// The `Default` value disables every optional sink and leaves
/// `default_level` as the empty string; use [`ObservabilityConfig::new`] to
/// choose a fallback filter directive.
#[derive(Debug, Clone, Default)]
pub struct ObservabilityConfig {
    /// Emit console logs as JSON when `true`, compact text otherwise.
    pub log_json: bool,
    /// Optional extra log file. A `.json` extension (case-insensitive)
    /// selects the JSON formatter; any other name gets the text formatter.
    pub log_file: Option<PathBuf>,
    /// Explicit filter directive; takes precedence over `RUST_LOG`.
    pub log_filter: Option<String>,
    /// Fallback filter directive, used when neither
    /// [`log_filter`](Self::log_filter) nor `RUST_LOG` is set.
    pub default_level: &'static str,
    /// Destination of a Chrome `about:tracing` JSON trace, if requested.
    pub trace_chrome: Option<PathBuf>,
    /// Destination of the pprof sampling-profiler report, written when the
    /// guards are dropped. `.svg` → flamegraph, `.pb` → protobuf.
    pub profile: Option<PathBuf>,
}

impl ObservabilityConfig {
    /// Create a config with every optional sink disabled and
    /// `default_level` as the fallback filter directive.
    #[allow(dead_code)]
    pub fn new(default_level: &'static str) -> Self {
        ObservabilityConfig {
            default_level,
            ..Default::default()
        }
    }
}
|
||||
|
||||
/// RAII handle returned by [`init`]. Drop flushes the appender, finalises
|
||||
/// Chrome output, writes the pprof report, and prints the metrics summary.
|
||||
#[must_use = "drop of ObservabilityGuards is what flushes logs, profiles, and metrics"]
|
||||
pub struct ObservabilityGuards {
|
||||
_appender: Option<tracing_appender::non_blocking::WorkerGuard>,
|
||||
_chrome: Option<tracing_chrome::FlushGuard>,
|
||||
#[cfg(feature = "profiling")]
|
||||
pprof: Option<(pprof::ProfilerGuard<'static>, PathBuf)>,
|
||||
metrics_snapshotter: Option<metrics_util::debugging::Snapshotter>,
|
||||
}
|
||||
|
||||
impl Drop for ObservabilityGuards {
|
||||
fn drop(&mut self) {
|
||||
#[cfg(feature = "profiling")]
|
||||
if let Some((guard, path)) = self.pprof.take() {
|
||||
if let Err(e) = write_pprof_report(&guard, &path) {
|
||||
eprintln!("profile write failed: {e:#}");
|
||||
} else {
|
||||
tracing::info!(path = %path.display(), "pprof report written");
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(snap) = self.metrics_snapshotter.take() {
|
||||
print_metrics_summary(&snap);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Build and install the global tracing subscriber + metrics recorder.
|
||||
pub fn init(config: &ObservabilityConfig) -> Result<ObservabilityGuards> {
|
||||
let span_events = parse_span_events();
|
||||
|
||||
// Resolve the filter directive once; attach a freshly-built `EnvFilter`
|
||||
// per sink layer via `.with_filter()`. Previously the filter was pushed
|
||||
// into the layer-`Vec` but that only gates what *itself* sees in a
|
||||
// boxed-Vec setup; sibling fmt / chrome / file layers kept emitting
|
||||
// filtered-out events. Per-layer filtering is the idiomatic tracing-
|
||||
// subscriber pattern and works cleanly with boxed layer dispatch.
|
||||
let directive = resolve_filter_directive(config);
|
||||
|
||||
let mut layers: Vec<Box<dyn Layer<Registry> + Send + Sync + 'static>> = Vec::new();
|
||||
|
||||
// Console fmt layer — compact text or JSON, always stderr.
|
||||
let console_layer: Box<dyn Layer<Registry> + Send + Sync + 'static> = if config.log_json {
|
||||
fmt::layer()
|
||||
.json()
|
||||
.with_span_events(span_events.clone())
|
||||
.with_writer(std::io::stderr)
|
||||
.with_filter(EnvFilter::try_new(&directive).context("invalid filter")?)
|
||||
.boxed()
|
||||
} else {
|
||||
fmt::layer()
|
||||
.compact()
|
||||
.with_span_events(span_events.clone())
|
||||
.with_writer(std::io::stderr)
|
||||
.with_filter(EnvFilter::try_new(&directive).context("invalid filter")?)
|
||||
.boxed()
|
||||
};
|
||||
layers.push(console_layer);
|
||||
|
||||
// Optional file sink — also filtered.
|
||||
let appender_guard = match &config.log_file {
|
||||
Some(path) => {
|
||||
let (layer, guard) = build_file_layer(path, span_events)?;
|
||||
layers.push(
|
||||
layer
|
||||
.with_filter(EnvFilter::try_new(&directive).context("invalid filter")?)
|
||||
.boxed(),
|
||||
);
|
||||
Some(guard)
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
// Optional Chrome `about:tracing` sink — intentionally UNFILTERED so
|
||||
// traces capture the full picture even when the console is quiet.
|
||||
let chrome_guard = match &config.trace_chrome {
|
||||
Some(path) => {
|
||||
let (layer, guard) = tracing_chrome::ChromeLayerBuilder::new()
|
||||
.file(path.clone())
|
||||
.include_args(true)
|
||||
.build();
|
||||
layers.push(layer.boxed());
|
||||
Some(guard)
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
// `tracing-error` layer enables SpanTrace capture in `with_span_trace`.
|
||||
layers.push(ErrorLayer::default().boxed());
|
||||
|
||||
tracing_subscriber::registry()
|
||||
.with(layers)
|
||||
.try_init()
|
||||
.context("tracing subscriber already initialized")?;
|
||||
// `build_env_filter` is retained for compatibility with older callers;
|
||||
// `resolve_filter_directive` above is what actually drives the layer
|
||||
// filters.
|
||||
let _ = build_env_filter(config);
|
||||
|
||||
// Install the metrics debugging recorder. `install` sets the global
|
||||
// recorder; its snapshotter is held in the guards struct.
|
||||
let recorder = metrics_util::debugging::DebuggingRecorder::new();
|
||||
let snapshotter = recorder.snapshotter();
|
||||
if recorder.install().is_err() {
|
||||
tracing::warn!("a metrics recorder was already installed; skipping xenia-rs recorder");
|
||||
}
|
||||
|
||||
#[cfg(feature = "profiling")]
|
||||
let pprof = match &config.profile {
|
||||
Some(path) => {
|
||||
let guard = pprof::ProfilerGuardBuilder::default()
|
||||
.frequency(100)
|
||||
.blocklist(&["libc", "libgcc", "pthread", "vdso"])
|
||||
.build()
|
||||
.context("failed to start pprof sampling profiler")?;
|
||||
Some((guard, path.clone()))
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
#[cfg(not(feature = "profiling"))]
|
||||
if config.profile.is_some() {
|
||||
bail!("--profile requires building with --features profiling");
|
||||
}
|
||||
|
||||
Ok(ObservabilityGuards {
|
||||
_appender: appender_guard,
|
||||
_chrome: chrome_guard,
|
||||
#[cfg(feature = "profiling")]
|
||||
pprof,
|
||||
metrics_snapshotter: Some(snapshotter),
|
||||
})
|
||||
}
|
||||
|
||||
fn resolve_filter_directive(config: &ObservabilityConfig) -> String {
|
||||
if let Some(ref f) = config.log_filter {
|
||||
return f.clone();
|
||||
}
|
||||
if let Ok(f) = std::env::var("RUST_LOG")
|
||||
&& !f.is_empty() {
|
||||
return f;
|
||||
}
|
||||
config.default_level.to_string()
|
||||
}
|
||||
|
||||
fn build_env_filter(config: &ObservabilityConfig) -> Result<EnvFilter> {
|
||||
// Precedence: explicit --log-filter > RUST_LOG > default_level.
|
||||
if let Some(ref f) = config.log_filter {
|
||||
return EnvFilter::try_new(f).context("invalid --log-filter directive");
|
||||
}
|
||||
if let Ok(f) = EnvFilter::try_from_default_env() {
|
||||
return Ok(f);
|
||||
}
|
||||
EnvFilter::try_new(config.default_level)
|
||||
.with_context(|| format!("invalid default filter `{}`", config.default_level))
|
||||
}
|
||||
|
||||
fn parse_span_events() -> FmtSpan {
|
||||
match std::env::var("RUST_LOG_SPAN_EVENTS").as_deref() {
|
||||
Ok("full") => FmtSpan::FULL,
|
||||
Ok("close") => FmtSpan::CLOSE,
|
||||
Ok("active") => FmtSpan::ACTIVE,
|
||||
Ok("enter") => FmtSpan::ENTER,
|
||||
Ok("exit") => FmtSpan::EXIT,
|
||||
Ok("new") => FmtSpan::NEW,
|
||||
_ => FmtSpan::NONE,
|
||||
}
|
||||
}
|
||||
|
||||
type FileLayerBox = Box<dyn Layer<Registry> + Send + Sync + 'static>;
|
||||
|
||||
fn build_file_layer(
|
||||
path: &Path,
|
||||
span_events: FmtSpan,
|
||||
) -> Result<(FileLayerBox, tracing_appender::non_blocking::WorkerGuard)> {
|
||||
let parent = path.parent().unwrap_or_else(|| Path::new("."));
|
||||
let file_name = path
|
||||
.file_name()
|
||||
.ok_or_else(|| anyhow::anyhow!("log file path has no file name: {}", path.display()))?;
|
||||
std::fs::create_dir_all(parent)
|
||||
.with_context(|| format!("failed to create {}", parent.display()))?;
|
||||
|
||||
let appender = tracing_appender::rolling::never(parent, file_name);
|
||||
let (non_blocking, guard) = tracing_appender::non_blocking(appender);
|
||||
|
||||
let ext = path
|
||||
.extension()
|
||||
.and_then(|e| e.to_str())
|
||||
.unwrap_or_default();
|
||||
let layer: FileLayerBox = if ext.eq_ignore_ascii_case("json") {
|
||||
fmt::layer()
|
||||
.json()
|
||||
.with_span_events(span_events)
|
||||
.with_writer(non_blocking)
|
||||
.with_ansi(false)
|
||||
.boxed()
|
||||
} else {
|
||||
fmt::layer()
|
||||
.with_span_events(span_events)
|
||||
.with_writer(non_blocking)
|
||||
.with_ansi(false)
|
||||
.boxed()
|
||||
};
|
||||
|
||||
Ok((layer, guard))
|
||||
}
|
||||
|
||||
/// Wrap an error with a captured span-trace so the top-level `main` can
|
||||
/// render "failed in `cmd_exec > load_image > …`" alongside the regular
|
||||
/// anyhow context chain.
|
||||
#[allow(dead_code)]
|
||||
pub fn with_span_trace<E>(err: E) -> anyhow::Error
|
||||
where
|
||||
E: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
anyhow::Error::new(err).context(SpanTraceDisplay(SpanTrace::capture()))
|
||||
}
|
||||
|
||||
/// Attach a captured span-trace to an existing `anyhow::Error` as extra
|
||||
/// context. Used at command boundaries where errors already bubble as
|
||||
/// `anyhow::Error`.
|
||||
pub fn attach_span_trace(err: anyhow::Error) -> anyhow::Error {
|
||||
err.context(SpanTraceDisplay(SpanTrace::capture()))
|
||||
}
|
||||
|
||||
struct SpanTraceDisplay(SpanTrace);
|
||||
|
||||
impl std::fmt::Display for SpanTraceDisplay {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "span trace:\n{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for SpanTraceDisplay {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
<Self as std::fmt::Display>::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
/// Write the profiler report held by `guard` to `path`.
///
/// The output format is chosen from the file extension (case-insensitive):
/// `.svg` or no extension → flamegraph; `.pb`, `.proto` or `.pprof` →
/// pprof protobuf.
///
/// # Errors
///
/// Fails on an unknown extension (checked before any report is built), when
/// the report cannot be built, when the parent directory cannot be created,
/// or when rendering, encoding or writing the output fails.
#[cfg(feature = "profiling")]
fn write_pprof_report(guard: &pprof::ProfilerGuard<'static>, path: &Path) -> Result<()> {
    enum Format {
        Flamegraph,
        Protobuf,
    }

    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("")
        .to_ascii_lowercase();

    // Decide the format first so a bad extension is rejected without paying
    // for report construction.
    let format = match ext.as_str() {
        "svg" | "" => Format::Flamegraph,
        "pb" | "proto" | "pprof" => Format::Protobuf,
        other => bail!("unknown --profile extension `.{other}` (use .svg or .pb)"),
    };

    let report = guard.report().build().context("pprof report build failed")?;

    // Surface directory-creation failures directly instead of letting the
    // later file write fail with a less specific error.
    let parent = path.parent().unwrap_or_else(|| Path::new("."));
    std::fs::create_dir_all(parent)
        .with_context(|| format!("failed to create {}", parent.display()))?;

    match format {
        Format::Flamegraph => {
            let file = std::fs::File::create(path)
                .with_context(|| format!("create {}", path.display()))?;
            report
                .flamegraph(file)
                .context("flamegraph render failed")?;
        }
        Format::Protobuf => {
            use pprof::protos::Message;
            let profile = report.pprof().context("pprof protobuf build failed")?;
            let buf = profile
                .write_to_bytes()
                .context("pprof protobuf encode failed")?;
            std::fs::write(path, &buf).with_context(|| format!("write {}", path.display()))?;
        }
    }
    Ok(())
}
|
||||
|
||||
fn print_metrics_summary(snap: &metrics_util::debugging::Snapshotter) {
|
||||
use metrics_util::debugging::DebugValue;
|
||||
use metrics_util::MetricKind;
|
||||
|
||||
let snapshot = snap.snapshot();
|
||||
let rows = snapshot.into_vec();
|
||||
if rows.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Group counters, gauges, histograms into simple lines. Use tracing so
|
||||
// the summary honours the installed subscriber (can land in file + JSON
|
||||
// sinks and not just stderr).
|
||||
let mut lines: Vec<String> = Vec::with_capacity(rows.len());
|
||||
for (key, _unit, _desc, value) in rows {
|
||||
let kind = match key.kind() {
|
||||
MetricKind::Counter => "counter",
|
||||
MetricKind::Gauge => "gauge",
|
||||
MetricKind::Histogram => "histogram",
|
||||
};
|
||||
let name = key.key().name();
|
||||
let labels: Vec<String> = key
|
||||
.key()
|
||||
.labels()
|
||||
.map(|l| format!("{}={}", l.key(), l.value()))
|
||||
.collect();
|
||||
let labels_str = if labels.is_empty() {
|
||||
String::new()
|
||||
} else {
|
||||
format!("{{{}}}", labels.join(","))
|
||||
};
|
||||
let value_str = match value {
|
||||
DebugValue::Counter(n) => n.to_string(),
|
||||
DebugValue::Gauge(g) => format!("{}", g.into_inner()),
|
||||
DebugValue::Histogram(samples) => {
|
||||
if samples.is_empty() {
|
||||
"empty".to_string()
|
||||
} else {
|
||||
let floats: Vec<f64> = samples.iter().map(|s| s.into_inner()).collect();
|
||||
let count = floats.len();
|
||||
let sum: f64 = floats.iter().copied().sum();
|
||||
let min = floats.iter().copied().fold(f64::INFINITY, f64::min);
|
||||
let max = floats.iter().copied().fold(f64::NEG_INFINITY, f64::max);
|
||||
format!(
|
||||
"count={} sum={:.3} min={:.3} max={:.3} mean={:.3}",
|
||||
count,
|
||||
sum,
|
||||
min,
|
||||
max,
|
||||
sum / count as f64
|
||||
)
|
||||
}
|
||||
}
|
||||
};
|
||||
lines.push(format!(" {kind:<9} {name}{labels_str} = {value_str}"));
|
||||
}
|
||||
|
||||
if tracing::enabled!(Level::INFO) {
|
||||
tracing::info!("metrics summary:\n{}", lines.join("\n"));
|
||||
} else {
|
||||
eprintln!("metrics summary:\n{}", lines.join("\n"));
|
||||
}
|
||||
}
|
||||
72
crates/xenia-app/tests/golden/README.md
Normal file
72
crates/xenia-app/tests/golden/README.md
Normal file
@@ -0,0 +1,72 @@
|
||||
# Sylpheed regression goldens
|
||||
|
||||
These JSON files anchor `xenia-rs check` digest output for Project Sylpheed.
|
||||
|
||||
## Files
|
||||
|
||||
| File | -n | Mode | Captures |
|
||||
|------|----|------|----------|
|
||||
| `sylpheed_n2m.json` | 2_000_000 | full digest | early boot (swaps=0, no rendering) |
|
||||
| `sylpheed_n50m.json` | 50_000_000 | stable-digest | first VdSwap pair (swaps=2 post-Phase-A) |
|
||||
|
||||
## Stable-digest mode
|
||||
|
||||
`sylpheed_n50m.json` is captured with `--stable-digest`, which omits
|
||||
timing-sensitive counters: `packets` (±2–8% lockstep noise from a GPU thread
|
||||
race), `resolves`, `interrupts_delivered`, `interrupts_dropped`,
|
||||
`texture_decodes`. The remaining fields are byte-identical across repeated
|
||||
lockstep runs at a fixed -n.
|
||||
|
||||
`sylpheed_n2m.json` predates the stable-digest flag and uses full-digest
|
||||
compare. It still works because at -n 2M the GPU pipeline has not produced any
|
||||
packets yet — `packets=0` is trivially deterministic.
|
||||
|
||||
## Circularity hazard
|
||||
|
||||
Per ORACBUG-001/002/003, these goldens were captured by running the same code
|
||||
they validate. They detect **regression** from a known-good snapshot, not
|
||||
**correctness**. When a planned fix intentionally moves the digest (e.g. a
|
||||
shader fix landing `draws > 0` for the first time), re-baseline the golden as
|
||||
a separate commit and reference the audit ID in the message.
|
||||
|
||||
## Re-baselining
|
||||
|
||||
```sh
|
||||
cargo build --release -p xenia-app
|
||||
target/release/xenia-rs check \
|
||||
"$SYLPHEED_ISO" \
|
||||
-n 50000000 \
|
||||
--stable-digest \
|
||||
--out crates/xenia-app/tests/golden/sylpheed_n50m.json
|
||||
```
|
||||
|
||||
## Running the goldens
|
||||
|
||||
```sh
|
||||
cargo test --release -p xenia-app --test sylpheed_oracles -- --ignored --nocapture
|
||||
```
|
||||
|
||||
The tests are `#[ignore]`-gated because each run takes a few seconds, which is
|
||||
unacceptable in the default `cargo test` cycle. The ISO path defaults to the
|
||||
contributor's local `~/RE Project Sylpheed/Project Sylpheed*.iso` and can be
|
||||
overridden via `SYLPHEED_ISO=/path/to/sylpheed.iso`.
|
||||
|
||||
## n4b canonical-invocation regression anchor (deferred)
|
||||
|
||||
The audit's recommended next sprint also called for a `sylpheed_n4b.json`
|
||||
golden capturing the canonical reference invocation
|
||||
`xenia-rs check sylpheed.iso -n 4_000_000_000 --parallel --reservations-table`.
|
||||
This is **deferred** because:
|
||||
|
||||
1. The `--parallel --reservations-table` combination is empirically pathologically
|
||||
slow at -n 100M (>32 min per run per the audit memory). At -n 4B the run cost
|
||||
is many hours, not the single-session-friendly 5–15 min the original plan
|
||||
estimated.
|
||||
2. Each phase that intentionally moves rendering counters (C, D, E, F) would
|
||||
need a re-baseline of n4b — a significant time cost compounding over the
|
||||
sprint.
|
||||
|
||||
Once the renderer-unblock phases (C+D+E) land and `draws > 0` is confirmed at
|
||||
-n 100M lockstep, an n4b artifact may be captured one-shot and stored under
|
||||
`audit-runs/post-fix/` (not as a test golden) as a manual regression anchor for
|
||||
the canonical invocation.
|
||||
10
crates/xenia-app/tests/golden/sylpheed_n2m.json
Normal file
10
crates/xenia-app/tests/golden/sylpheed_n2m.json
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"instructions": 2000005,
|
||||
"imports": 5635,
|
||||
"unimpl": 0,
|
||||
"draws": 0,
|
||||
"swaps": 0,
|
||||
"unique_render_targets": 0,
|
||||
"shader_blobs_live": 0,
|
||||
"texture_cache_entries": 0
|
||||
}
|
||||
10
crates/xenia-app/tests/golden/sylpheed_n50m.json
Normal file
10
crates/xenia-app/tests/golden/sylpheed_n50m.json
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"instructions": 50000001,
|
||||
"imports": 40454,
|
||||
"unimpl": 0,
|
||||
"draws": 0,
|
||||
"swaps": 1,
|
||||
"unique_render_targets": 0,
|
||||
"shader_blobs_live": 0,
|
||||
"texture_cache_entries": 0
|
||||
}
|
||||
111
crates/xenia-app/tests/parallel_stress.rs
Normal file
111
crates/xenia-app/tests/parallel_stress.rs
Normal file
@@ -0,0 +1,111 @@
|
||||
//! M3 real-parallelism stress harness.
|
||||
//!
|
||||
//! Runs `xenia-rs exec sylpheed.iso -n <N> --parallel --halt-on-deadlock --quiet`
|
||||
//! many times back-to-back to surface lost-wakeups, lock-order
|
||||
//! inversions, and ABA hazards that a single run wouldn't reliably
|
||||
//! reproduce. Failures dump per-run stdout/stderr to
|
||||
//! `target/parallel-stress-<label>-NNN.{stdout,stderr}` for post-mortem.
|
||||
//!
|
||||
//! Two configurations:
|
||||
//! - `parallel_stress_short`: 20 runs at -n 5_000_000. Quick smoke
|
||||
//! check — runs in a few minutes on the current substrate.
|
||||
//! - `parallel_stress_long` (ignored, opt-in): 100 runs at
|
||||
//! -n 50_000_000. The full gate from the master plan; expected
|
||||
//! runtime is hours until the perf gap (Step 05's deferred parking
|
||||
//! fix) closes.
|
||||
//!
|
||||
//! Both variants are `#[ignore]`-gated, so `--ignored` is always required.
//! Run everything (short + long) with `cargo test --release -p xenia-app
//! --test parallel_stress -- --ignored --nocapture`; run only the short
//! variant with `cargo test --release -p xenia-app --test parallel_stress
//! -- --ignored --nocapture parallel_stress_short`.
|
||||
|
||||
use std::process::Command;
|
||||
use std::time::Instant;
|
||||
|
||||
const ISO_DEFAULT: &str = "/home/fabi/RE Project Sylpheed/Project Sylpheed - Arc of Deception (USA, Europe) (En,Ja).iso";
|
||||
|
||||
fn iso_path() -> String {
|
||||
std::env::var("SYLPHEED_ISO").unwrap_or_else(|_| ISO_DEFAULT.to_string())
|
||||
}
|
||||
|
||||
fn run_stress(label: &str, runs: u32, max_instr: u64) {
|
||||
let bin = env!("CARGO_BIN_EXE_xenia-rs");
|
||||
let iso = iso_path();
|
||||
if !std::path::Path::new(&iso).exists() {
|
||||
eprintln!("{label}: iso not found at {iso}; set SYLPHEED_ISO to override. SKIPPING.");
|
||||
return;
|
||||
}
|
||||
std::fs::create_dir_all("target").ok();
|
||||
let mut failures: u32 = 0;
|
||||
let mut wall_ms: Vec<u128> = Vec::with_capacity(runs as usize);
|
||||
let max_instr_str = max_instr.to_string();
|
||||
for run in 1..=runs {
|
||||
let t0 = Instant::now();
|
||||
let out = Command::new(bin)
|
||||
.args([
|
||||
"exec",
|
||||
&iso,
|
||||
"-n",
|
||||
&max_instr_str,
|
||||
"--parallel",
|
||||
"--halt-on-deadlock",
|
||||
"--quiet",
|
||||
])
|
||||
.output()
|
||||
.expect("failed to spawn xenia-rs");
|
||||
let dt = t0.elapsed().as_millis();
|
||||
wall_ms.push(dt);
|
||||
let exit_ok = out.status.success();
|
||||
let vdswap2 = String::from_utf8_lossy(&out.stderr).contains("VdSwap")
|
||||
|| String::from_utf8_lossy(&out.stdout).contains("VdSwap");
|
||||
let _ = vdswap2; // VdSwap=2 not required at -n 5M; tracked for diagnostic only.
|
||||
if !exit_ok {
|
||||
failures += 1;
|
||||
std::fs::write(
|
||||
format!("target/parallel-stress-{label}-{run:03}.stdout"),
|
||||
&out.stdout,
|
||||
)
|
||||
.ok();
|
||||
std::fs::write(
|
||||
format!("target/parallel-stress-{label}-{run:03}.stderr"),
|
||||
&out.stderr,
|
||||
)
|
||||
.ok();
|
||||
eprintln!(
|
||||
"{label}: run {run}/{runs} FAILED (wall={}ms, exit={:?})",
|
||||
dt,
|
||||
out.status.code()
|
||||
);
|
||||
} else {
|
||||
eprintln!("{label}: run {run}/{runs} ok (wall={dt}ms)");
|
||||
}
|
||||
}
|
||||
wall_ms.sort();
|
||||
let p50 = wall_ms[wall_ms.len() / 2];
|
||||
let p95_idx = ((wall_ms.len() - 1) * 95) / 100;
|
||||
let p95 = wall_ms[p95_idx];
|
||||
let max = *wall_ms.last().unwrap();
|
||||
eprintln!(
|
||||
"{label} summary: runs={runs} ok={} failed={failures} p50={p50}ms p95={p95}ms max={max}ms",
|
||||
runs - failures,
|
||||
);
|
||||
assert_eq!(failures, 0, "{label}: {failures} of {runs} stress runs failed");
|
||||
}
|
||||
|
||||
/// Short stress configuration: 20 back-to-back runs with an instruction
/// budget of 5_000_000 each. Intended to surface intermittent
/// synchronisation failures that a single run would not reproduce.
#[test]
#[ignore = "stress test; run via `cargo test ... -- --ignored parallel_stress_short`"]
fn parallel_stress_short() {
    const RUNS: u32 = 20;
    const MAX_INSTR: u64 = 5_000_000;
    run_stress("short", RUNS, MAX_INSTR);
}
|
||||
|
||||
/// Long stress configuration: 100 back-to-back runs with an instruction
/// budget of 50_000_000 each. Opt-in only; expect a long runtime.
#[test]
#[ignore = "full stress test; run via `cargo test ... -- --ignored parallel_stress_long`"]
fn parallel_stress_long() {
    const RUNS: u32 = 100;
    const MAX_INSTR: u64 = 50_000_000;
    run_stress("long", RUNS, MAX_INSTR);
}
|
||||
85
crates/xenia-app/tests/sylpheed_oracles.rs
Normal file
85
crates/xenia-app/tests/sylpheed_oracles.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
//! Sylpheed boot-sequence regression oracles.
|
||||
//!
|
||||
//! These goldens trigger `xenia-rs check` against the Project Sylpheed ISO and
|
||||
//! compare the resulting digest to a checked-in JSON file via `--stable-digest`,
|
||||
//! which excludes timing-sensitive counters (`packets`, `interrupts_*`,
|
||||
//! `resolves`, `texture_decodes`). The remaining fields are deterministic in
|
||||
//! lockstep at a fixed instruction budget — verified empirically across 3
|
||||
//! consecutive runs.
|
||||
//!
|
||||
//! Goldens are CIRCULAR per ORACBUG-001/002/003: they were captured by running
|
||||
//! the same code they validate. Treat them as **regression anchors** (catch
|
||||
//! drift from a known-good snapshot) not **correctness anchors** (no claim
|
||||
//! about absolute behavior). When a planned fix intentionally moves the
|
||||
//! digest (e.g. swap fix → `swaps` increments; renderer fix → `draws` becomes
|
||||
//! non-zero), re-baseline the golden as a separate commit.
|
||||
//!
|
||||
//! Tests are `#[ignore]`-gated because the runs take ~4 seconds each, which
|
||||
//! is unacceptable for the default `cargo test` cycle. Run explicitly:
|
||||
//! cargo test --release -p xenia-app --test sylpheed_oracles -- --ignored --nocapture
|
||||
//!
|
||||
//! ISO path is read from the `SYLPHEED_ISO` env var, falling back to the
//! hard-coded `ISO_DEFAULT` (an absolute, machine-specific path). When no
//! ISO exists at the resolved path the test prints a notice and returns
//! early, so it is reported as passing rather than failing.
|
||||
|
||||
use std::process::Command;
|
||||
|
||||
const ISO_DEFAULT: &str = "/home/fabi/RE Project Sylpheed/Project Sylpheed - Arc of Deception (USA, Europe) (En,Ja).iso";
|
||||
|
||||
fn iso_path() -> String {
|
||||
std::env::var("SYLPHEED_ISO").unwrap_or_else(|_| ISO_DEFAULT.to_string())
|
||||
}
|
||||
|
||||
fn run_oracle(label: &str, max_instr: u64, golden_rel: &str) {
|
||||
let bin = env!("CARGO_BIN_EXE_xenia-rs");
|
||||
let iso = iso_path();
|
||||
if !std::path::Path::new(&iso).exists() {
|
||||
eprintln!("{label}: iso not found at {iso}; set SYLPHEED_ISO to override. SKIPPING.");
|
||||
return;
|
||||
}
|
||||
|
||||
// Resolve the golden path relative to the test's CARGO_MANIFEST_DIR so the
|
||||
// test runs correctly from any cwd.
|
||||
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
||||
let golden = std::path::Path::new(manifest_dir).join(golden_rel);
|
||||
assert!(
|
||||
golden.exists(),
|
||||
"{label}: golden file missing at {}",
|
||||
golden.display()
|
||||
);
|
||||
|
||||
let max_instr_str = max_instr.to_string();
|
||||
let golden_str = golden.to_string_lossy().to_string();
|
||||
|
||||
let out = Command::new(bin)
|
||||
.args([
|
||||
"check",
|
||||
&iso,
|
||||
"-n",
|
||||
&max_instr_str,
|
||||
"--stable-digest",
|
||||
"--expect",
|
||||
&golden_str,
|
||||
])
|
||||
.output()
|
||||
.expect("failed to spawn xenia-rs");
|
||||
|
||||
if !out.status.success() {
|
||||
eprintln!(
|
||||
"{label}: STDOUT:\n{}\nSTDERR:\n{}",
|
||||
String::from_utf8_lossy(&out.stdout),
|
||||
String::from_utf8_lossy(&out.stderr),
|
||||
);
|
||||
panic!("{label}: digest mismatch (exit {:?})", out.status.code());
|
||||
}
|
||||
}
|
||||
|
||||
/// Regression oracle for the Sylpheed boot sequence at an instruction budget
/// of 50_000_000, compared against `tests/golden/sylpheed_n50m.json`.
///
/// NOTE(review): earlier notes expected `swaps` to be 2 once Phase A's swap
/// fix landed, while the checked-in golden records `swaps: 1` with
/// `draws: 0` — confirm which value is current and re-baseline if needed.
#[test]
#[ignore = "long-running; run via `cargo test ... -- --ignored sylpheed_n50m`"]
fn sylpheed_n50m() {
    const MAX_INSTR: u64 = 50_000_000;
    const GOLDEN: &str = "tests/golden/sylpheed_n50m.json";
    run_oracle("sylpheed_n50m", MAX_INSTR, GOLDEN);
}
|
||||
@@ -10,3 +10,11 @@ xenia-memory = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
bitflags = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
|
||||
[[bench]]
|
||||
name = "interpreter"
|
||||
harness = false
|
||||
|
||||
194
crates/xenia-cpu/benches/interpreter.rs
Normal file
194
crates/xenia-cpu/benches/interpreter.rs
Normal file
@@ -0,0 +1,194 @@
|
||||
//! Interpreter throughput micro-benchmarks.
|
||||
//!
|
||||
//! Custom `harness = false` main — no extra dev-deps. Run via
|
||||
//! `cargo bench -p xenia-cpu` (or `cargo run --release --bench interpreter`).
|
||||
//!
|
||||
//! Three workloads, each measuring `step_cached` throughput in MIPS:
|
||||
//!
|
||||
//! - `tight_alu_loop` — pure dispatch + ALU + decode-cache hit.
|
||||
//! - `loadstore_loop` — alternating `lwz`/`stw` against main RAM. Stresses
|
||||
//! every load/store path and `find_mmio` dispatch.
|
||||
//! - `mmio_storm` — same shape as `loadstore_loop` but the address is
|
||||
//! in a registered MMIO aperture. Sanity-checks that
|
||||
//! MMIO writes still dispatch correctly.
|
||||
//!
|
||||
//! These are not statistically rigorous — no warmup, no variance — they're
|
||||
//! just enough to detect 2x-class wins or regressions on the perf-track
|
||||
//! changes (MMIO fast-reject, threaded dispatch, block cache). Numbers go
|
||||
//! into commit messages; there is no automated baseline file.
|
||||
|
||||
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use xenia_cpu::context::PpcContext;
|
||||
use xenia_cpu::decoder::DecodeCache;
|
||||
use xenia_cpu::interpreter::{step_cached, StepResult};
|
||||
use xenia_memory::{GuestMemory, MemoryAccess, MmioRegion};
|
||||
use xenia_memory::page_table::MemoryProtect;
|
||||
|
||||
// PPC instruction encoders — minimal subset needed by the benches.
|
||||
|
||||
/// Encode `addi rd, ra, simm`: primary opcode 14, with the signed immediate
/// stored as its 16-bit two's-complement pattern in the low half-word.
#[inline]
fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 {
    const OPCODE: u32 = 14;
    let imm = u32::from(simm as u16);
    (OPCODE << 26) | (rd << 21) | (ra << 16) | imm
}
|
||||
|
||||
/// Encode `lwz rd, d(ra)`: primary opcode 32, with the signed displacement
/// stored as its 16-bit two's-complement pattern in the low half-word.
#[inline]
fn enc_lwz(rd: u32, ra: u32, d: i16) -> u32 {
    const OPCODE: u32 = 32;
    let disp = u32::from(d as u16);
    (OPCODE << 26) | (rd << 21) | (ra << 16) | disp
}
|
||||
|
||||
/// Encode `stw rs, d(ra)`: primary opcode 36, with the signed displacement
/// stored as its 16-bit two's-complement pattern in the low half-word.
#[inline]
fn enc_stw(rs: u32, ra: u32, d: i16) -> u32 {
    const OPCODE: u32 = 36;
    let disp = u32::from(d as u16);
    (OPCODE << 26) | (rs << 21) | (ra << 16) | disp
}
|
||||
|
||||
/// Set up a `GuestMemory` with one writable region for code+data.
|
||||
fn make_mem(code_base: u32, code_size: u32) -> GuestMemory {
|
||||
let mut mem = GuestMemory::new().expect("reserve 4GB");
|
||||
mem.alloc(code_base, code_size, MemoryProtect::READ | MemoryProtect::WRITE)
|
||||
.expect("alloc bench region");
|
||||
mem
|
||||
}
|
||||
|
||||
/// Write a sequence of raw PPC instructions starting at `base`.
|
||||
fn write_program(mem: &GuestMemory, base: u32, instrs: &[u32]) {
|
||||
for (i, &raw) in instrs.iter().enumerate() {
|
||||
mem.write_u32(base + (i as u32 * 4), raw);
|
||||
}
|
||||
}
|
||||
|
||||
/// Run `total_instrs` interpreter steps over a program of length `n`,
|
||||
/// wrapping PC back to `base` whenever it falls off the end. Returns the
|
||||
/// elapsed wall time.
|
||||
fn run_loop(
|
||||
ctx: &mut PpcContext,
|
||||
mem: &GuestMemory,
|
||||
cache: &mut DecodeCache,
|
||||
base: u32,
|
||||
n: u32,
|
||||
total_instrs: u64,
|
||||
) -> std::time::Duration {
|
||||
let end = base + n * 4;
|
||||
ctx.pc = base;
|
||||
let t0 = Instant::now();
|
||||
for _ in 0..total_instrs {
|
||||
let pv = mem.page_version(ctx.pc);
|
||||
let r = step_cached(ctx, mem, cache, pv);
|
||||
debug_assert!(matches!(r, StepResult::Continue));
|
||||
if ctx.pc >= end {
|
||||
ctx.pc = base;
|
||||
}
|
||||
}
|
||||
t0.elapsed()
|
||||
}
|
||||
|
||||
/// Print one result line: label, instruction count, elapsed seconds and the
/// derived throughput in millions of instructions per second.
fn report(label: &str, total_instrs: u64, elapsed: std::time::Duration) {
    let secs = elapsed.as_secs_f64();
    let mips = (total_instrs as f64) / secs / 1.0e6;
    println!("{label:<24} {total_instrs:>12} instrs in {secs:>7.3}s = {mips:>7.2} MIPS");
}
|
||||
|
||||
fn bench_tight_alu_loop() {
|
||||
const BASE: u32 = 0x1000;
|
||||
const N: u32 = 256;
|
||||
const TOTAL: u64 = 50_000_000;
|
||||
|
||||
let mut mem = make_mem(BASE, 0x1000);
|
||||
// 256 × `addi r3, r3, 1` — pure register-register, no memory touch
|
||||
// beyond instruction fetch.
|
||||
let prog: Vec<u32> = (0..N).map(|_| enc_addi(3, 3, 1)).collect();
|
||||
write_program(&mut mem, BASE, &prog);
|
||||
|
||||
let mut ctx = PpcContext::new();
|
||||
let mut cache = DecodeCache::new();
|
||||
|
||||
let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, BASE, N, TOTAL);
|
||||
report("tight_alu_loop", TOTAL, elapsed);
|
||||
}
|
||||
|
||||
fn bench_loadstore_loop() {
|
||||
const CODE_BASE: u32 = 0x1000;
|
||||
const DATA_BASE: u32 = 0x2000;
|
||||
const N: u32 = 256;
|
||||
const TOTAL: u64 = 30_000_000;
|
||||
|
||||
let mut mem = make_mem(CODE_BASE, 0x2000);
|
||||
// 128 pairs of `stw r3, 0(r4); lwz r5, 0(r4)` — exercises every
|
||||
// load/store path through `read_u32`/`write_u32` (incl. `find_mmio`).
|
||||
let mut prog = Vec::with_capacity(N as usize);
|
||||
for _ in 0..(N / 2) {
|
||||
prog.push(enc_stw(3, 4, 0));
|
||||
prog.push(enc_lwz(5, 4, 0));
|
||||
}
|
||||
write_program(&mut mem, CODE_BASE, &prog);
|
||||
|
||||
let mut ctx = PpcContext::new();
|
||||
ctx.gpr[3] = 0xDEAD_BEEF;
|
||||
ctx.gpr[4] = DATA_BASE as u64;
|
||||
let mut cache = DecodeCache::new();
|
||||
|
||||
let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, CODE_BASE, N, TOTAL);
|
||||
report("loadstore_loop", TOTAL, elapsed);
|
||||
}
|
||||
|
||||
fn bench_mmio_storm() {
|
||||
const CODE_BASE: u32 = 0x1000;
|
||||
const MMIO_BASE: u32 = 0xEA00_0000;
|
||||
const N: u32 = 64;
|
||||
// MMIO is slower per access — keep total smaller so the bench stays
|
||||
// under a few seconds.
|
||||
const TOTAL: u64 = 2_000_000;
|
||||
|
||||
let mut mem = make_mem(CODE_BASE, 0x1000);
|
||||
|
||||
let writes = Arc::new(AtomicU64::new(0));
|
||||
let reads = Arc::new(AtomicU32::new(0));
|
||||
let writes_clone = writes.clone();
|
||||
let reads_clone = reads.clone();
|
||||
mem.add_mmio_region(MmioRegion {
|
||||
base_address: MMIO_BASE,
|
||||
mask: 0xFFFF_0000,
|
||||
size: 0x0001_0000,
|
||||
read_callback: Box::new(move |_a| {
|
||||
reads_clone.fetch_add(1, Ordering::Relaxed);
|
||||
0
|
||||
}),
|
||||
write_callback: Box::new(move |_a, _v| {
|
||||
writes_clone.fetch_add(1, Ordering::Relaxed);
|
||||
}),
|
||||
});
|
||||
|
||||
let mut prog = Vec::with_capacity(N as usize);
|
||||
for _ in 0..(N / 2) {
|
||||
prog.push(enc_stw(3, 4, 0));
|
||||
prog.push(enc_lwz(5, 4, 0));
|
||||
}
|
||||
write_program(&mut mem, CODE_BASE, &prog);
|
||||
|
||||
let mut ctx = PpcContext::new();
|
||||
ctx.gpr[3] = 0x1234_5678;
|
||||
ctx.gpr[4] = MMIO_BASE as u64;
|
||||
let mut cache = DecodeCache::new();
|
||||
|
||||
let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, CODE_BASE, N, TOTAL);
|
||||
report("mmio_storm", TOTAL, elapsed);
|
||||
|
||||
// Sanity assertions — silently catch a refactor that breaks MMIO dispatch.
|
||||
let w = writes.load(Ordering::Relaxed);
|
||||
let r = reads.load(Ordering::Relaxed);
|
||||
assert_eq!(w, TOTAL / 2, "expected MMIO writes to be dispatched");
|
||||
assert_eq!(r as u64, TOTAL / 2, "expected MMIO reads to be dispatched");
|
||||
}
|
||||
|
||||
fn main() {
|
||||
println!("xenia-cpu interpreter bench");
|
||||
println!(" build: {}", if cfg!(debug_assertions) { "debug" } else { "release" });
|
||||
bench_tight_alu_loop();
|
||||
bench_loadstore_loop();
|
||||
bench_mmio_storm();
|
||||
}
|
||||
423
crates/xenia-cpu/src/block_cache.rs
Normal file
423
crates/xenia-cpu/src/block_cache.rs
Normal file
@@ -0,0 +1,423 @@
|
||||
//! Tier-4 perf — basic-block cache for the PPC interpreter.
|
||||
//!
|
||||
//! `DecodeCache` (in [`crate::decoder`]) caches one decoded instruction
|
||||
//! per slot, indexed by PC. The hot loop still pays the per-instruction
|
||||
//! cost of fetching the raw word, hashing the PC into a slot, and
|
||||
//! comparing tags. For straight-line code — common in the asset/inflate
|
||||
//! loops where Sylpheed boot is currently CPU-bound — the savings of
|
||||
//! batching N decoded instructions per slot lookup are linear in block
|
||||
//! length.
|
||||
//!
|
||||
//! ## Shape
|
||||
//!
|
||||
//! A `DecodedBlock` is a contiguous run of decoded instructions starting
|
||||
//! at `start_pc`, ending at the first *block terminator* (any branch,
|
||||
//! `sc`, trap, or `Invalid`) or at one of two safety limits:
|
||||
//!
|
||||
//! - [`MAX_BLOCK_INSTRS`] caps memory growth and re-build cost.
|
||||
//! - 4 KiB page boundary stop. A block is fully contained inside a
|
||||
//! single 4 KiB guest page; that means `mem.page_version(start_pc)`
|
||||
//! is sufficient to detect any code-page rewrite that should
|
||||
//! invalidate the block. Without this rule the cache would have to
|
||||
//! walk every spanned page on every hit, which would erase the win.
|
||||
//!
|
||||
//! ## Invalidation
|
||||
//!
|
||||
//! Each block stamps the page version at build time. On lookup, if
|
||||
//! `mem.page_version(start_pc)` differs from `block.page_version`, the
|
||||
//! slot is rebuilt. Same mechanism `DecodeCache` uses, just at
|
||||
//! block granularity.
|
||||
//!
|
||||
//! ## Debugger semantics
|
||||
//!
|
||||
//! Block dispatch is **opt-in** by the caller. The hot loop in
|
||||
//! `xenia-app/src/main.rs` selects the per-instruction path whenever
|
||||
//! `Debugger::wants_hooks()` is true or any `--trace-*` flag is set.
|
||||
//! That's how single-step, breakpoints, in-memory trace, instruction
|
||||
//! trace, and branch trace continue to observe every PC: the block
|
||||
//! cache simply never runs in those modes.
|
||||
|
||||
use crate::decoder::{decode, DecodedInstr};
|
||||
use xenia_memory::MemoryAccess;
|
||||
|
||||
/// Number of slots in the direct-mapped block cache: 64 K, selected by
/// the low 16 bits of `start_pc >> 2`. Same shape as the decode cache in
/// `crate::decoder`. For Sylpheed-class workloads the slot collision
/// rate is negligible.
const BLOCK_CACHE_SIZE: usize = 0x1_0000;
/// Mask applied to `start_pc >> 2` to obtain a slot index.
const BLOCK_CACHE_MASK: u32 = (BLOCK_CACHE_SIZE as u32) - 1;

/// Upper bound on the number of instructions in one block. It bounds the
/// worst-case memory footprint and the cost of rebuilding a block after
/// its code page is bumped. 32 is generous: the real-world average
/// across Sylpheed boot is about 6 instructions between branches.
pub const MAX_BLOCK_INSTRS: usize = 32;

/// Guest page size in bytes. Duplicated here so `xenia-cpu` does not
/// depend on `xenia-memory::heap` internals; it must stay in sync with
/// the memory crate. Both refer to the architectural PowerPC 4 KiB page
/// granule, so the value is fixed.
const GUEST_PAGE_SIZE: u32 = 0x1000;
/// Mask that clears the in-page offset of a guest address, leaving the
/// page base.
const GUEST_PAGE_MASK: u32 = !(GUEST_PAGE_SIZE - 1);
|
||||
|
||||
/// One cached basic block. Owned by [`BlockCache`]; a `&DecodedBlock`
|
||||
/// is handed to the interpreter via [`BlockCache::lookup_or_build`] and
|
||||
/// stays valid until the next `lookup_or_build` on the same slot.
|
||||
#[derive(Debug)]
|
||||
pub struct DecodedBlock {
|
||||
/// Guest PC at which this block starts. Used as the slot tag.
|
||||
pub start_pc: u32,
|
||||
/// Guest PC immediately after the last instruction in `instrs`.
|
||||
/// Equal to `instrs.last().addr + 4` whether or not the block
|
||||
/// ended on a terminator. Useful for tracing / disassembly.
|
||||
pub end_pc: u32,
|
||||
/// `mem.page_version(start_pc)` at build time. Mismatch on lookup
|
||||
/// invalidates the block. Single value because every block is
|
||||
/// page-bounded by construction.
|
||||
pub page_version: u64,
|
||||
/// Decoded instructions in execution order. Always non-empty after
|
||||
/// a successful build (`MAX_BLOCK_INSTRS >= 1` and the build walk
|
||||
/// pushes the first decoded word unconditionally).
|
||||
pub instrs: Vec<DecodedInstr>,
|
||||
}
|
||||
|
||||
/// Outcome of probing one slot during `lookup_or_build`. Internal only.
enum CacheStatus {
    /// The slot holds a block for this `pc` whose build-time page
    /// version equals `mem.page_version(pc)`. It is returned unchanged.
    Hit,
    /// The slot holds a block for this `pc`, but the page version has
    /// moved on since it was built. The block is rebuilt and
    /// `invalidations` is bumped.
    Stale,
    /// The slot is vacant or holds a block with a different `start_pc`.
    /// A fresh block is built and `misses` is bumped.
    Miss,
}
|
||||
|
||||
/// Direct-mapped block cache. One instance shared across all HW slots
|
||||
/// (block contents are PC-only and read-only after fill). Not
|
||||
/// thread-safe — owner is the single scheduler thread, same as
|
||||
/// `DecodeCache`.
|
||||
pub struct BlockCache {
|
||||
slots: Box<[Option<Box<DecodedBlock>>]>,
|
||||
hits: u64,
|
||||
misses: u64,
|
||||
invalidations: u64,
|
||||
}
|
||||
|
||||
impl Default for BlockCache {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockCache {
|
||||
pub fn new() -> Self {
|
||||
// `Option<Box<T>>` is a niche-optimized 8-byte slot; 64 K of
|
||||
// them cost ~512 KiB of cold storage. Live blocks beyond that
|
||||
// sit on the heap.
|
||||
let mut v: Vec<Option<Box<DecodedBlock>>> = Vec::with_capacity(BLOCK_CACHE_SIZE);
|
||||
v.resize_with(BLOCK_CACHE_SIZE, || None);
|
||||
Self {
|
||||
slots: v.into_boxed_slice(),
|
||||
hits: 0,
|
||||
misses: 0,
|
||||
invalidations: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn hits(&self) -> u64 {
|
||||
self.hits
|
||||
}
|
||||
pub fn misses(&self) -> u64 {
|
||||
self.misses
|
||||
}
|
||||
pub fn invalidations(&self) -> u64 {
|
||||
self.invalidations
|
||||
}
|
||||
|
||||
/// Return the cached block starting at `pc`, building it if absent
|
||||
/// or stale. The returned reference is borrowed from the cache and
|
||||
/// stays valid until the next `lookup_or_build` call.
|
||||
pub fn lookup_or_build(&mut self, pc: u32, mem: &dyn MemoryAccess) -> &DecodedBlock {
|
||||
let idx = ((pc >> 2) & BLOCK_CACHE_MASK) as usize;
|
||||
let cur_pv = mem.page_version(pc);
|
||||
|
||||
// Phase 1: classify the slot. Borrow ends before fill so the
|
||||
// mutable update below doesn't conflict.
|
||||
let status = match &self.slots[idx] {
|
||||
Some(b) if b.start_pc == pc && b.page_version == cur_pv => CacheStatus::Hit,
|
||||
Some(b) if b.start_pc == pc => CacheStatus::Stale,
|
||||
_ => CacheStatus::Miss,
|
||||
};
|
||||
|
||||
// Phase 2: fill on miss/stale, account.
|
||||
match status {
|
||||
CacheStatus::Hit => {
|
||||
self.hits += 1;
|
||||
}
|
||||
CacheStatus::Stale => {
|
||||
self.invalidations += 1;
|
||||
self.misses += 1;
|
||||
let block = build_block(pc, mem, cur_pv);
|
||||
self.slots[idx] = Some(Box::new(block));
|
||||
}
|
||||
CacheStatus::Miss => {
|
||||
self.misses += 1;
|
||||
let block = build_block(pc, mem, cur_pv);
|
||||
self.slots[idx] = Some(Box::new(block));
|
||||
}
|
||||
}
|
||||
|
||||
// Slot is guaranteed populated at this point — Hit returned a
|
||||
// pre-existing block, Miss/Stale just wrote a new one.
|
||||
self.slots[idx]
|
||||
.as_deref()
|
||||
.expect("block freshly built or hit")
|
||||
}
|
||||
}
|
||||
|
||||
/// Walk forward from `pc`, decoding instructions and collecting them
|
||||
/// into a `DecodedBlock`. The walk stops on the first of:
|
||||
/// - a [`PpcOpcode::terminates_block`] true (the terminator IS
|
||||
/// included as the last instruction),
|
||||
/// - reaching [`MAX_BLOCK_INSTRS`],
|
||||
/// - the next PC would cross a 4 KiB guest page boundary.
|
||||
fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> DecodedBlock {
|
||||
let mut instrs: Vec<DecodedInstr> = Vec::with_capacity(8);
|
||||
let page_base = start_pc & GUEST_PAGE_MASK;
|
||||
let mut cur = start_pc;
|
||||
|
||||
loop {
|
||||
let raw = mem.read_u32(cur);
|
||||
let decoded = decode(raw, cur);
|
||||
let terminates = decoded.opcode.terminates_block();
|
||||
instrs.push(decoded);
|
||||
|
||||
if terminates {
|
||||
break;
|
||||
}
|
||||
if instrs.len() >= MAX_BLOCK_INSTRS {
|
||||
break;
|
||||
}
|
||||
let next = cur.wrapping_add(4);
|
||||
if (next & GUEST_PAGE_MASK) != page_base {
|
||||
break;
|
||||
}
|
||||
cur = next;
|
||||
}
|
||||
|
||||
let last = instrs.last().expect("build pushes at least one instruction");
|
||||
let end_pc = last.addr.wrapping_add(4);
|
||||
|
||||
DecodedBlock {
|
||||
start_pc,
|
||||
end_pc,
|
||||
page_version,
|
||||
instrs,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::opcode::PpcOpcode;

    use std::cell::Cell;

    /// Flat 64 KiB byte-array memory with big-endian accessors.
    ///
    /// It mirrors `interpreter::tests::TestMem` but is defined here so the
    /// block-cache tests do not depend on interpreter internals.
    struct BlockTestMem {
        data: Box<[Cell<u8>]>,
        version_a: u64,
        version_b: u64,
        // Base address of the one page that reports `version_b` rather
        // than `version_a`. This lets the invalidation test model an
        // out-of-band page-version bump without going through write_*.
        bumped_page: Cell<Option<u32>>,
    }

    impl BlockTestMem {
        /// Zero-filled memory in which every page reports `version_a`.
        fn new() -> Self {
            Self {
                data: vec![Cell::new(0u8); 0x10000].into_boxed_slice(),
                version_a: 1,
                version_b: 2,
                bumped_page: Cell::new(None),
            }
        }

        /// Copy `out.len()` bytes starting at `addr` into `out`.
        fn load(&self, addr: u32, out: &mut [u8]) {
            let base = addr as usize;
            for (offset, byte) in out.iter_mut().enumerate() {
                *byte = self.data[base + offset].get();
            }
        }

        /// Copy `bytes` into memory starting at `addr`. The page version
        /// is never changed by this.
        fn store(&self, addr: u32, bytes: &[u8]) {
            let base = addr as usize;
            for (offset, byte) in bytes.iter().enumerate() {
                self.data[base + offset].set(*byte);
            }
        }

        /// Write one big-endian instruction word at `addr`.
        fn put(&self, addr: u32, raw: u32) {
            self.store(addr, &raw.to_be_bytes());
        }

        /// Write `words` as consecutive instructions starting at `base`.
        fn put_program(&self, base: u32, words: &[u32]) {
            for (slot, word) in words.iter().enumerate() {
                self.put(base + 4 * slot as u32, *word);
            }
        }
    }

    impl MemoryAccess for BlockTestMem {
        fn read_u8(&self, a: u32) -> u8 {
            self.data[a as usize].get()
        }
        fn read_u16(&self, a: u32) -> u16 {
            let mut buf = [0u8; 2];
            self.load(a, &mut buf);
            u16::from_be_bytes(buf)
        }
        fn read_u32(&self, a: u32) -> u32 {
            let mut buf = [0u8; 4];
            self.load(a, &mut buf);
            u32::from_be_bytes(buf)
        }
        fn read_u64(&self, a: u32) -> u64 {
            let mut buf = [0u8; 8];
            self.load(a, &mut buf);
            u64::from_be_bytes(buf)
        }
        fn write_u8(&self, a: u32, v: u8) {
            self.data[a as usize].set(v);
        }
        fn write_u16(&self, a: u32, v: u16) {
            self.store(a, &v.to_be_bytes());
        }
        fn write_u32(&self, a: u32, v: u32) {
            self.store(a, &v.to_be_bytes());
        }
        fn write_u64(&self, a: u32, v: u64) {
            self.store(a, &v.to_be_bytes());
        }
        fn translate(&self, _: u32) -> Option<*const u8> {
            None
        }
        fn translate_mut(&self, _: u32) -> Option<*mut u8> {
            None
        }
        fn page_version(&self, addr: u32) -> u64 {
            match self.bumped_page.get() {
                Some(page) if page == (addr & GUEST_PAGE_MASK) => self.version_b,
                _ => self.version_a,
            }
        }
    }

    // PPC encodings — only the handful these tests need.

    /// `addi rd, ra, simm` (primary opcode 14).
    fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 {
        let immediate = simm as u16 as u32;
        (14 << 26) | (rd << 21) | (ra << 16) | immediate
    }

    /// `b 0` — branch to self (LI=0). Primary opcode 18, AA=0, LK=0.
    fn enc_b_self() -> u32 {
        18 << 26
    }

    /// An all-zero word; the decoder maps it to `Invalid`.
    fn enc_unimplemented() -> u32 {
        0
    }

    #[test]
    fn block_built_to_terminator() {
        let mem = BlockTestMem::new();
        // Three plain instructions followed by the terminating branch.
        mem.put_program(
            0x100,
            &[
                enc_addi(3, 3, 1),
                enc_addi(3, 3, 1),
                enc_addi(3, 3, 1),
                enc_b_self(),
            ],
        );
        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x100, &mem);
        assert_eq!(block.start_pc, 0x100);
        assert_eq!(block.instrs.len(), 4);
        assert_eq!(block.instrs.last().unwrap().opcode, PpcOpcode::bx);
        assert_eq!(block.end_pc, 0x110);
    }

    #[test]
    fn block_stops_at_page_boundary() {
        // Start at 0x1FFC. The next PC (0x2000) lies in a different
        // 4 KiB page, so the block must hold exactly one instruction.
        let mem = BlockTestMem::new();
        mem.put_program(0x1FFC, &[enc_addi(3, 3, 1), enc_addi(3, 3, 1)]);
        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x1FFC, &mem);
        assert_eq!(block.instrs.len(), 1);
        assert_eq!(block.end_pc, 0x2000);
    }

    #[test]
    fn block_stops_at_max_len() {
        // 64 consecutive non-terminators on one page: the block must be
        // clamped to MAX_BLOCK_INSTRS.
        let mem = BlockTestMem::new();
        mem.put_program(0x100, &[enc_addi(3, 3, 1); 64]);
        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x100, &mem);
        assert_eq!(block.instrs.len(), MAX_BLOCK_INSTRS);
        assert_eq!(block.end_pc, 0x100 + (MAX_BLOCK_INSTRS as u32) * 4);
    }

    #[test]
    fn block_stops_at_invalid_opcode() {
        // An instruction decoded as `Invalid` terminates the block, which
        // preserves the per-instruction Unimplemented path.
        let mem = BlockTestMem::new();
        mem.put_program(
            0x100,
            &[enc_addi(3, 3, 1), enc_unimplemented(), enc_addi(3, 3, 1)],
        );
        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x100, &mem);
        assert_eq!(block.instrs.len(), 2);
        assert_eq!(block.instrs.last().unwrap().opcode, PpcOpcode::Invalid);
    }

    #[test]
    fn block_invalidates_on_page_version_bump() {
        let mem = BlockTestMem::new();
        mem.put_program(0x100, &[enc_addi(3, 3, 1), enc_b_self()]);
        let mut cache = BlockCache::new();

        // First probe builds the block.
        let _ = cache.lookup_or_build(0x100, &mem);
        assert_eq!(cache.misses(), 1);
        assert_eq!(cache.hits(), 0);

        // Second probe with nothing changed is a hit.
        let _ = cache.lookup_or_build(0x100, &mem);
        assert_eq!(cache.hits(), 1);
        assert_eq!(cache.invalidations(), 0);

        // Bump the version of the page containing 0x100; the next probe
        // must invalidate and rebuild.
        mem.bumped_page.set(Some(0x100 & GUEST_PAGE_MASK));
        let _ = cache.lookup_or_build(0x100, &mem);
        assert_eq!(cache.invalidations(), 1);
        assert_eq!(cache.misses(), 2);
    }

    #[test]
    fn block_hit_returns_same_contents() {
        // Sanity: a hit serves the ORIGINAL decoded stream even after the
        // underlying bytes are poked without a version bump. No real
        // workload does this; it only confirms that cached data is
        // returned instead of memory being re-read.
        let mem = BlockTestMem::new();
        mem.put_program(0x100, &[enc_addi(3, 3, 7), enc_b_self()]);
        let mut cache = BlockCache::new();
        let first_simm = cache.lookup_or_build(0x100, &mem).instrs[0].simm16();

        // Overwrite the first instruction; the page version stays put
        // (test-only path).
        mem.put(0x100, enc_addi(3, 3, 99));

        let cached_simm = cache.lookup_or_build(0x100, &mem).instrs[0].simm16();
        assert_eq!(first_simm, 7);
        assert_eq!(cached_simm, 7, "cache must serve original decoded form");
    }
}
|
||||
@@ -29,16 +29,37 @@ pub mod spr {
|
||||
pub const XER: u32 = 1;
|
||||
pub const LR: u32 = 8;
|
||||
pub const CTR: u32 = 9;
|
||||
pub const TBL: u32 = 268;
|
||||
pub const TBU: u32 = 269;
|
||||
pub const DSISR: u32 = 18;
|
||||
pub const DAR: u32 = 19;
|
||||
/// Decrementer (hypervisor-visible, 32-bit down-counter).
|
||||
pub const DEC: u32 = 22;
|
||||
pub const TBL: u32 = 268; // Read (user)
|
||||
pub const TBU: u32 = 269; // Read (user)
|
||||
/// Time-base write (supervisor). Separate SPR number from TBL (268) for
|
||||
/// access-control reasons.
|
||||
pub const TBL_WRITE: u32 = 284;
|
||||
pub const TBU_WRITE: u32 = 285;
|
||||
pub const SPRG0: u32 = 272;
|
||||
pub const SPRG1: u32 = 273;
|
||||
pub const SPRG2: u32 = 274;
|
||||
pub const SPRG3: u32 = 275;
|
||||
pub const VRSAVE: u32 = 256;
|
||||
pub const PVR: u32 = 287;
|
||||
pub const HID0: u32 = 1008;
|
||||
pub const HID1: u32 = 1009;
|
||||
pub const PIR: u32 = 1023;
|
||||
}
|
||||
|
||||
/// LR halt sentinel. When `bclr` returns to this address, the interpreter
|
||||
/// loop halts cleanly (matches the "entry returned" convention).
|
||||
pub const LR_HALT_SENTINEL: u64 = 0xBCBC_BCBC;
|
||||
|
||||
/// VSCR NJ (Non-Java mode) bit. Stored in word 3 at bit 16 (mask 0x0001_0000).
|
||||
/// Set at startup; when clear, denormals are flushed to zero following IEEE-754.
|
||||
pub const VSCR_NJ_MASK: u32 = 0x0001_0000;
|
||||
/// VSCR SAT (saturation sticky) bit. Stored in word 3 at bit 31 (mask 0x0000_0001).
|
||||
pub const VSCR_SAT_MASK: u32 = 0x0000_0001;
|
||||
|
||||
/// PowerPC processor context. Holds all register state for one guest thread.
|
||||
/// Mirrors PPCContext from ppc_context.h, minus JIT-specific fields.
|
||||
#[repr(C, align(64))]
|
||||
@@ -64,15 +85,49 @@ pub struct PpcContext {
|
||||
pub xer_ca: u8,
|
||||
pub xer_ov: u8,
|
||||
pub xer_so: u8,
|
||||
// Altivec VSCR saturation bit
|
||||
pub vscr_sat: u8,
|
||||
/// XER[25:31] string-byte count (`TBC`). Read/written by `mtspr XER`,
|
||||
/// consumed by `lswx`/`stswx`. Per PPCBUG-123/124/161: was previously
|
||||
/// unmodelled, making `lswx`/`stswx` a permanent no-op.
|
||||
pub xer_tbc: u8,
|
||||
// Altivec VSCR. Only bits 16 (NJ) and 31 (SAT) of word 3 are meaningful.
|
||||
pub vscr: Vec128,
|
||||
// VRSAVE (SPR 256). Bitmask of which VRs need saving across context switches.
|
||||
pub vrsave: u32,
|
||||
|
||||
// Program counter
|
||||
pub pc: u32,
|
||||
// Reservation address/value for lwarx/stwcx
|
||||
pub reserved_addr: u32,
|
||||
// Reservation for lwarx/ldarx/stwcx/stdcx. Xenon's reservation granule is
|
||||
// one L2 cache line (128 bytes) — `reserved_line` is stored as the base
|
||||
// address of that line (`ea & !0x7F`). `has_reservation` gates the
|
||||
// validity; stwcx./stdcx. check that both match before committing.
|
||||
// `reserved_val` is retained for possible future use by a coherency
|
||||
// observer; the store-conditional logic itself does not compare it.
|
||||
pub reserved_line: u32,
|
||||
pub reserved_val: u64,
|
||||
pub has_reservation: bool,
|
||||
/// PPCBUG-151 — width of the active reservation: 4 = `lwarx` (word),
|
||||
/// 8 = `ldarx` (doubleword), 0 = no reservation. `stwcx.` requires
|
||||
/// width==4; `stdcx.` requires width==8. Cross-width pairs fail
|
||||
/// deterministically with CR0.EQ=0. Cleared alongside `has_reservation`
|
||||
/// on every `stwcx.`/`stdcx.` exit (success or failure).
|
||||
pub reservation_width: u8,
|
||||
/// M3.7 — generation stamp returned by [`crate::ReservationTable::reserve`]
|
||||
/// at the most recent `lwarx`/`ldarx`. Paired with `reserved_line`;
|
||||
/// `stwcx.`/`stdcx.` pass this back to `try_commit`. Meaningful only
|
||||
/// when `reservation_table` is `Some` and the table is enabled.
|
||||
pub reserved_generation: u32,
|
||||
/// M3.7 — optional handle to the inter-thread reservation table.
|
||||
/// When `Some(table)` *and* `table.is_enabled()`, the interpreter's
|
||||
/// `lwarx`/`stwcx.`/`ldarx`/`stdcx.` arms route through the table;
|
||||
/// otherwise they use the legacy per-`PpcContext` fields above. The
|
||||
/// scheduler populates this when it spawns a thread under a kernel
|
||||
/// that has `reservations` set.
|
||||
pub reservation_table: Option<std::sync::Arc<crate::ReservationTable>>,
|
||||
/// M3.7 — emulated HW slot ID this thread is bound to. Used as the
|
||||
/// reservation table's `hw_id` discriminator so two threads on
|
||||
/// different slots can't accidentally commit each other's
|
||||
/// reservations. Populated by the scheduler at spawn / migration.
|
||||
pub hw_id: u8,
|
||||
|
||||
// Thread ID (for kernel use)
|
||||
pub thread_id: u32,
|
||||
@@ -82,6 +137,12 @@ pub struct PpcContext {
|
||||
|
||||
// Time base (incremented each instruction for debugging)
|
||||
pub timebase: u64,
|
||||
|
||||
// Decrementer (SPR 22): 32-bit down-counter that fires an external
|
||||
// interrupt at underflow on real hw. Xenia-rs doesn't dispatch DEC
|
||||
// interrupts to the guest; this value is maintained so that mfspr DEC
|
||||
// returns something coherent.
|
||||
pub dec: u32,
|
||||
}
|
||||
|
||||
impl PpcContext {
|
||||
@@ -89,7 +150,9 @@ impl PpcContext {
|
||||
Self {
|
||||
gpr: [0; 32],
|
||||
ctr: 0,
|
||||
lr: 0,
|
||||
// Canary sets LR to the halt sentinel at thread start so `blr`
|
||||
// from the top-level entry falls out of the interpreter loop.
|
||||
lr: LR_HALT_SENTINEL,
|
||||
msr: 0,
|
||||
fpr: [0.0; 32],
|
||||
vr: [Vec128::ZERO; 128],
|
||||
@@ -98,14 +161,23 @@ impl PpcContext {
|
||||
xer_ca: 0,
|
||||
xer_ov: 0,
|
||||
xer_so: 0,
|
||||
vscr_sat: 0,
|
||||
xer_tbc: 0,
|
||||
// VSCR starts with NJ bit set (denormals flushed) — matches canary
|
||||
// thread_state.cc initialization.
|
||||
vscr: Vec128::from_u32x4(0, 0, 0, VSCR_NJ_MASK),
|
||||
vrsave: 0xFFFF_FFFF,
|
||||
pc: 0,
|
||||
reserved_addr: 0,
|
||||
reserved_line: 0,
|
||||
reserved_val: 0,
|
||||
has_reservation: false,
|
||||
reservation_width: 0,
|
||||
reserved_generation: 0,
|
||||
reservation_table: None,
|
||||
hw_id: 0,
|
||||
thread_id: 0,
|
||||
cycle_count: 0,
|
||||
timebase: 0,
|
||||
dec: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -173,7 +245,10 @@ impl PpcContext {
|
||||
|
||||
/// Get the full XER register value.
|
||||
pub fn xer(&self) -> u32 {
|
||||
((self.xer_so as u32) << 31) | ((self.xer_ov as u32) << 30) | ((self.xer_ca as u32) << 29)
|
||||
((self.xer_so as u32) << 31)
|
||||
| ((self.xer_ov as u32) << 30)
|
||||
| ((self.xer_ca as u32) << 29)
|
||||
| (self.xer_tbc as u32) // PPCBUG-123/566: bits 0-6 (TBC).
|
||||
}
|
||||
|
||||
/// Set XER from a full 32-bit value.
|
||||
@@ -181,6 +256,28 @@ impl PpcContext {
|
||||
self.xer_so = ((val >> 31) & 1) as u8;
|
||||
self.xer_ov = ((val >> 30) & 1) as u8;
|
||||
self.xer_ca = ((val >> 29) & 1) as u8;
|
||||
self.xer_tbc = (val & 0x7F) as u8; // PPCBUG-124.
|
||||
}
|
||||
|
||||
/// Read the VSCR SAT (sticky saturation) bit.
|
||||
pub fn vscr_sat(&self) -> bool {
|
||||
(self.vscr.u32x4(3) & VSCR_SAT_MASK) != 0
|
||||
}
|
||||
|
||||
/// Set or clear VSCR SAT. Preserves the NJ bit (and any other word-3 bits).
|
||||
pub fn set_vscr_sat(&mut self, v: bool) {
|
||||
let mut w = self.vscr.u32x4(3);
|
||||
if v {
|
||||
w |= VSCR_SAT_MASK;
|
||||
} else {
|
||||
w &= !VSCR_SAT_MASK;
|
||||
}
|
||||
self.vscr.set_u32x4(3, w);
|
||||
}
|
||||
|
||||
/// Read the VSCR NJ (non-Java mode / flush-denormals) bit.
|
||||
pub fn vscr_nj(&self) -> bool {
|
||||
(self.vscr.u32x4(3) & VSCR_NJ_MASK) != 0
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -74,9 +74,24 @@ impl DecodedInstr {
|
||||
/// Rc bit (bit 31) - record CR0
|
||||
#[inline] pub fn rc_bit(&self) -> bool { self.raw & 1 != 0 }
|
||||
|
||||
/// Rc for VC-form vector compare instructions — PPC bit 21 = host bit 10.
|
||||
#[inline] pub fn vc_rc_bit(&self) -> bool { (self.raw >> 10) & 1 != 0 }
|
||||
/// Rc for VX128_R-form vector compare instructions — PPC bit 27 = host bit 4.
|
||||
/// VX128_R Rc bit — PPC bit 25 (host bit 6) per canary's FormatVX128_R
|
||||
/// bitfield layout. PPCBUG-700.
|
||||
#[inline] pub fn vx128r_rc_bit(&self) -> bool { (self.raw >> 6) & 1 != 0 }
|
||||
|
||||
/// IMM field for VX128_4-form instructions (vrlimi128) — 5-bit blend mask at PPC bits 11-15.
|
||||
#[inline] pub fn vx128_4_imm(&self) -> u32 { extract_bits(self.raw, 11, 15) }
|
||||
/// z field for VX128_4-form instructions (vrlimi128) — 2-bit rotation index at PPC bits 24-25.
|
||||
#[inline] pub fn vx128_4_z(&self) -> u32 { extract_bits(self.raw, 24, 25) }
|
||||
|
||||
/// OE bit (bit 21) - overflow enable
|
||||
#[inline] pub fn oe(&self) -> bool { extract_bits(self.raw, 21, 21) != 0 }
|
||||
|
||||
/// TO field (bits 6-10) for tw/twi/td/tdi trap instructions.
|
||||
#[inline] pub fn to(&self) -> u32 { extract_bits(self.raw, 6, 10) }
|
||||
|
||||
/// MB, ME fields for rotate instructions
|
||||
#[inline] pub fn mb(&self) -> u32 { extract_bits(self.raw, 21, 25) }
|
||||
#[inline] pub fn me(&self) -> u32 { extract_bits(self.raw, 26, 30) }
|
||||
@@ -86,7 +101,13 @@ impl DecodedInstr {
|
||||
|
||||
/// SH field for 64-bit shifts (bits 16-20 + bit 30)
|
||||
#[inline] pub fn sh64(&self) -> u32 {
|
||||
(extract_bits(self.raw, 16, 20) << 1) | extract_bits(self.raw, 30, 30)
|
||||
(extract_bits(self.raw, 30, 30) << 5) | extract_bits(self.raw, 16, 20)
|
||||
}
|
||||
|
||||
/// MB/ME field for MD-form and MDS-form instructions (6-bit field, split encoding).
|
||||
/// MB[4:0] at PPC bits 21-25; MB[5] at PPC bit 26.
|
||||
#[inline] pub fn mb_md(&self) -> u32 {
|
||||
extract_bits(self.raw, 21, 25) | (extract_bits(self.raw, 26, 26) << 5)
|
||||
}
|
||||
|
||||
/// SPR field (bits 11-20, swapped halves)
|
||||
@@ -114,32 +135,67 @@ impl DecodedInstr {
|
||||
/// crbB (bits 16-20)
|
||||
#[inline] pub fn crbb(&self) -> u32 { extract_bits(self.raw, 16, 20) }
|
||||
|
||||
// VMX128 field extractors
|
||||
// VMX128 field extractors — bit positions match canary's
|
||||
// FormatVX128/VX128_2/VX128_4/VX128_5/VX128_R bitfield layout
|
||||
// (xenia-canary `ppc_decode_data.h:484-663`, LSB-first packed). PPCBUG-700.
|
||||
|
||||
/// VA128 (bits 6-10, plus bit from 29)
|
||||
/// VA128 = VA128l(5) | VA128h(1) << 5 | VA128H(1) << 6.
|
||||
/// Canonical 7-bit register selector: PPC 11-15 (low), PPC 26 (mid), PPC 21 (high).
|
||||
#[inline] pub fn va128(&self) -> usize {
|
||||
(extract_bits(self.raw, 6, 10) | (extract_bits(self.raw, 29, 29) << 5)) as usize
|
||||
(extract_bits(self.raw, 11, 15)
|
||||
| (extract_bits(self.raw, 26, 26) << 5)
|
||||
| (extract_bits(self.raw, 21, 21) << 6)) as usize
|
||||
}
|
||||
|
||||
/// VB128 (bits 16-20, plus bits from 28, 30)
|
||||
/// VB128 = VB128l(5) | VB128h(2) << 5. Canary's VB128h is a 2-bit
|
||||
/// contiguous field at PPC 30-31 (host bits 0-1).
|
||||
#[inline] pub fn vb128(&self) -> usize {
|
||||
(extract_bits(self.raw, 16, 20)
|
||||
| (extract_bits(self.raw, 28, 28) << 5)
|
||||
| (extract_bits(self.raw, 30, 30) << 6)) as usize
|
||||
| (extract_bits(self.raw, 30, 31) << 5)) as usize
|
||||
}
|
||||
|
||||
/// VD128 (bits 6-10, plus bits from 21, 22)
|
||||
/// VD128 = VD128l(5) | VD128h(2) << 5. Canary's VD128h is a 2-bit
|
||||
/// contiguous field at PPC 28-29 (host bits 2-3).
|
||||
#[inline] pub fn vd128(&self) -> usize {
|
||||
(extract_bits(self.raw, 6, 10)
|
||||
| (extract_bits(self.raw, 21, 21) << 5)
|
||||
| (extract_bits(self.raw, 22, 22) << 6)) as usize
|
||||
| (extract_bits(self.raw, 28, 29) << 5)) as usize
|
||||
}
|
||||
|
||||
/// VS128 - same encoding as VD128
|
||||
#[inline] pub fn vs128(&self) -> usize { self.vd128() }
|
||||
|
||||
/// VC register for VX128_2-form instructions (vperm128) — 3-bit at PPC bits 23-25.
|
||||
#[inline] pub fn vc128_2(&self) -> usize { extract_bits(self.raw, 23, 25) as usize }
|
||||
|
||||
/// NB field (bits 16-20) for lswi/stswi
|
||||
#[inline] pub fn nb(&self) -> u32 { extract_bits(self.raw, 16, 20) }
|
||||
|
||||
/// PERM field for VX128_P-form instructions (vpermwi128) — 8-bit split encoding.
|
||||
/// PERMl (5 bits) at PPC bits 11-15; PERMh (3 bits) at PPC bits 23-25.
|
||||
#[inline] pub fn vx128_p_perm(&self) -> u32 {
|
||||
extract_bits(self.raw, 11, 15) | (extract_bits(self.raw, 23, 25) << 5)
|
||||
}
|
||||
|
||||
/// SH field for VX128_5-form instructions (vsldoi128) — 4-bit shift at PPC bits 22-25.
|
||||
#[inline] pub fn vx128_5_sh(&self) -> u32 { extract_bits(self.raw, 22, 25) }
|
||||
}
|
||||
|
||||
/// Extract the 5-bit `UIMM` (`VX128_3`) / `IMM` (`VX128_4`) field. Canary
|
||||
/// packs both formats with LSB-bits 16-20 holding the field, which is
|
||||
/// MSB bits 11-15 in our `extract_bits` convention. For `vpkd3d128` /
|
||||
/// `vupkd3d128` the decoded selector is `type = UIMM >> 2` (3 bits; valid
|
||||
/// values 0-6 per [`crate::vmx::D3dPackType`], 7 is undocumented /
|
||||
/// undefined in canary) and `pack = UIMM & 0x3` (output-slot layout for
|
||||
/// `vpkd3d128` only, `vupkd3d128` ignores it).
|
||||
///
|
||||
/// First-Pixels M3: the interpreter previously used a hand-rolled
|
||||
/// `(instr.raw >> 6) & 0x7` that was **LSB-numbered** and extracted
|
||||
/// bits from a completely different part of the word (the
|
||||
/// secondary-opcode region). Centralizing the extractor here matches
|
||||
/// canary's `FormatVX128_{3,4}::{UIMM,IMM}` field semantics exactly.
|
||||
#[inline]
|
||||
pub fn extract_vx128_uimm5(raw: u32) -> u32 {
|
||||
extract_bits(raw, 11, 15)
|
||||
}
|
||||
|
||||
/// Decode a 32-bit PPC instruction into its opcode.
|
||||
@@ -149,6 +205,123 @@ pub fn decode(raw: u32, addr: u32) -> DecodedInstr {
|
||||
DecodedInstr { opcode, raw, addr }
|
||||
}
|
||||
|
||||
// Perf tier-2 — direct-mapped PC-keyed decode cache.
|
||||
//
|
||||
// The interpreter hot path spends ~15-25% of its time in `decode()`
|
||||
// parsing the raw u32 and walking the primary+secondary opcode tables.
|
||||
// For non-self-modifying guest code — the common case past the XEX
|
||||
// loader — `decode(raw, pc)` is purely a function of `(raw, pc)` and
|
||||
// the output is `Copy + 16B`. A direct-mapped cache indexed by
|
||||
// `(pc >> 2) & MASK` gives the interpreter a 1-comparison fast path,
|
||||
// at the cost of one branch and a 1.5 MiB region of memory.
|
||||
//
|
||||
// Invalidation piggybacks on `xenia_memory::GuestMemory::page_version`
|
||||
// (P5 texture-cache invalidation): every cache entry carries the page
|
||||
// version that was active at decode time; on lookup we compare against
|
||||
// the current version of the containing 4 KiB page. Any write to the
|
||||
// page bumps the counter, so the next decode on that PC is a miss that
|
||||
// refills.
|
||||
|
||||
/// Number of direct-mapped entries. 2^16 = 65,536 slots, one PPC
/// instruction address per slot — enough for every hot code path in a
/// typical Xbox 360 title to stay resident without collision.
///
/// Must stay a power of two: the slot index is computed by masking with
/// [`DECODE_CACHE_MASK`], and the lookup path indexes the slot array
/// without a bounds check on the strength of that mask.
const DECODE_CACHE_SIZE: usize = 1 << 16;

/// Bit mask that reduces `pc >> 2` to a slot index in `[0, DECODE_CACHE_SIZE)`.
const DECODE_CACHE_MASK: u32 = (DECODE_CACHE_SIZE - 1) as u32;

// Compile-time guard for the invariant above: if the size is ever changed to
// a non-power-of-two, `SIZE - 1` stops being a valid index mask and the
// unchecked slot access would become unsound.
const _: () = assert!(DECODE_CACHE_SIZE.is_power_of_two());
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
struct DecodeCacheEntry {
|
||||
/// Guest PC this entry was decoded at. Used as the tag on lookup; a
|
||||
/// mismatch means the slot was last populated by a different PC that
|
||||
/// shares the same low-16 index.
|
||||
pc: u32,
|
||||
/// Page version at decode time (from `GuestMemory::page_version(pc)`).
|
||||
/// Zero means "unused slot" since real page versions start at 1.
|
||||
page_version: u64,
|
||||
decoded: DecodedInstr,
|
||||
}
|
||||
|
||||
impl DecodeCacheEntry {
|
||||
const fn empty() -> Self {
|
||||
// `Invalid` is the decoder's "unrecognized opcode" sentinel; we
|
||||
// use it here as the empty-slot marker. Real misses compare `pc`,
|
||||
// not the opcode, so the sentinel choice is cosmetic.
|
||||
Self {
|
||||
pc: 0,
|
||||
page_version: 0,
|
||||
decoded: DecodedInstr {
|
||||
opcode: PpcOpcode::Invalid,
|
||||
raw: 0,
|
||||
addr: 0,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Direct-mapped PC-keyed decode cache. One instance shared across all
|
||||
/// HW threads (PC is thread-independent; entries are read-only once
|
||||
/// filled). Not thread-safe — the single scheduler thread owns it.
|
||||
pub struct DecodeCache {
|
||||
slots: Box<[DecodeCacheEntry]>,
|
||||
hits: u64,
|
||||
misses: u64,
|
||||
invalidations: u64,
|
||||
}
|
||||
|
||||
impl Default for DecodeCache {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl DecodeCache {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
slots: vec![DecodeCacheEntry::empty(); DECODE_CACHE_SIZE].into_boxed_slice(),
|
||||
hits: 0,
|
||||
misses: 0,
|
||||
invalidations: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up (or fill) the decoded form of the instruction at `pc`.
|
||||
/// `raw` is the fetched instruction word; `current_page_version` is
|
||||
/// `mem.page_version(pc)` — the caller has it cheaper than we do,
|
||||
/// since they're already touching `mem` to fetch `raw`.
|
||||
#[inline]
|
||||
pub fn lookup(&mut self, pc: u32, raw: u32, current_page_version: u64) -> DecodedInstr {
|
||||
let idx = ((pc >> 2) & DECODE_CACHE_MASK) as usize;
|
||||
// Safety: `idx` is masked into `[0, DECODE_CACHE_SIZE)` so the
|
||||
// slice access is always in-bounds. Opt-out of the bounds check
|
||||
// for the hot path.
|
||||
let entry = unsafe { self.slots.get_unchecked_mut(idx) };
|
||||
if entry.pc == pc && entry.page_version == current_page_version {
|
||||
self.hits += 1;
|
||||
return entry.decoded;
|
||||
}
|
||||
if entry.pc == pc && entry.page_version != current_page_version {
|
||||
self.invalidations += 1;
|
||||
}
|
||||
self.misses += 1;
|
||||
let decoded = decode(raw, pc);
|
||||
*entry = DecodeCacheEntry {
|
||||
pc,
|
||||
page_version: current_page_version,
|
||||
decoded,
|
||||
};
|
||||
decoded
|
||||
}
|
||||
|
||||
pub fn hits(&self) -> u64 {
|
||||
self.hits
|
||||
}
|
||||
pub fn misses(&self) -> u64 {
|
||||
self.misses
|
||||
}
|
||||
pub fn invalidations(&self) -> u64 {
|
||||
self.invalidations
|
||||
}
|
||||
}
|
||||
|
||||
fn lookup_opcode(code: u32) -> PpcOpcode {
|
||||
match extract_bits(code, 0, 5) {
|
||||
2 => PpcOpcode::tdi,
|
||||
@@ -498,9 +671,13 @@ fn decode_op6(code: u32) -> PpcOpcode {
|
||||
_ => {}
|
||||
}
|
||||
|
||||
// VMX128 compare
|
||||
let key4 = (extract_bits(code, 22, 24) << 3) | extract_bits(code, 27, 27);
|
||||
match key4 {
|
||||
// VMX128 compare (VX128_R form). Single dispatch path: bit 27 = 0 always
|
||||
// for these opcodes per canary's table (`ppc_opcode_table_gen.cc:295-305`).
|
||||
// The Rc bit is at PPC 25 (host bit 6) per the FormatVX128_R bitfield —
|
||||
// it's a runtime modifier read by the interpreter, NOT part of the
|
||||
// secondary-opcode discrimination. PPCBUG-700.
|
||||
let key4_nd = (extract_bits(code, 22, 24) << 3) | extract_bits(code, 27, 27);
|
||||
match key4_nd {
|
||||
0b000000 => return PpcOpcode::vcmpeqfp128,
|
||||
0b001000 => return PpcOpcode::vcmpgefp128,
|
||||
0b010000 => return PpcOpcode::vcmpgtfp128,
|
||||
@@ -781,6 +958,57 @@ mod tests {
|
||||
assert_eq!(instr.d(), 0x20);
|
||||
}
|
||||
|
||||
#[test]
fn decode_cache_miss_fills_then_hit() {
    // Primary opcode 14 (addi) with rD=3, rA=1, SIMM=0x10.
    const ADDI: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10;
    const PC: u32 = 0x8200_0000;
    let mut cache = DecodeCache::new();

    // Cold slot: the first lookup decodes and is counted as a miss.
    let first = cache.lookup(PC, ADDI, 1);
    assert_eq!(first.opcode, PpcOpcode::addi);
    assert_eq!((cache.hits(), cache.misses()), (0, 1));

    // Same pc and same page version: answered from the slot, no new decode.
    let second = cache.lookup(PC, ADDI, 1);
    assert_eq!(second.opcode, PpcOpcode::addi);
    assert_eq!((cache.hits(), cache.misses()), (1, 1));
}
|
||||
|
||||
#[test]
fn decode_cache_stale_version_refills() {
    const PC: u32 = 0x8200_0000;
    const ADDI: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10;
    const LWZ: u32 = (32 << 26) | (5 << 21) | (1 << 16) | 0x20;
    let mut cache = DecodeCache::new();

    // Populate the slot with an `addi` at page version 1.
    cache.lookup(PC, ADDI, 1);

    // The guest rewrote the page: same pc, new instruction word, bumped
    // version. The cache has to refill rather than hand back the old `addi`.
    let refreshed = cache.lookup(PC, LWZ, 2);
    assert_eq!(refreshed.opcode, PpcOpcode::lwz);
    assert_eq!(cache.invalidations(), 1);
    assert_eq!(cache.misses(), 2);
}
|
||||
|
||||
#[test]
fn decode_cache_pc_collision_refills() {
    // The slot index is ((pc >> 2) & 0xFFFF), so two PCs that are
    // 4 * 2^16 bytes apart land in the same slot. They must evict each
    // other rather than alias.
    const ADDI: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10;
    const LWZ: u32 = (32 << 26) | (5 << 21) | (1 << 16) | 0x20;
    let first_pc = 0x8200_0000u32;
    let colliding_pc = first_pc.wrapping_add(0x0004_0000u32); // (>> 2) differs by 2^16
    let mut cache = DecodeCache::new();

    cache.lookup(first_pc, ADDI, 1);
    // Different pc, same slot: miss + refill.
    cache.lookup(colliding_pc, LWZ, 1);
    // The first pc returns after being evicted: miss + refill again.
    let back = cache.lookup(first_pc, ADDI, 1);

    assert_eq!(back.opcode, PpcOpcode::addi);
    assert_eq!(cache.misses(), 3);
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_branch() {
|
||||
// b +0x100 => opcode 18, LI=0x40 (shifted left 2 = 0x100), AA=0, LK=0
|
||||
@@ -816,4 +1044,202 @@ mod tests {
|
||||
assert_eq!(extract_bits(0x8000_0000, 0, 0), 1);
|
||||
assert_eq!(extract_bits(0x0000_0001, 31, 31), 1);
|
||||
}
|
||||
|
||||
// VMX128 register-name extraction. Locks the canonical bit positions
|
||||
// (decoder.rs is the single source of truth — the analysis crate's
|
||||
// old `ppc.rs` had different positions, which produced wrong printed
|
||||
// register names; the bug was silent because the interpreter never
|
||||
// used those extractors). Each test poke-bits exactly the slots the
|
||||
// accessor reads and asserts the assembled register number.
|
||||
|
||||
/// Assemble a VMX128 instruction word from its register sub-fields using the
/// canary-compliant layout.
///
/// * `vd128 = vd_lo | (vd_hi << 5)`: `vd_lo` is 5 bits at PPC 6-10, `vd_hi`
///   is 2 bits at PPC 28-29.
/// * `vb128` has the same shape: `vb_lo` at PPC 16-20, `vb_hi` (2 bits) at
///   PPC 30-31.
/// * `va128 = va_lo | (va_h26 << 5) | (va_h21 << 6)`, canary's 7-bit VA
///   selector: `va_lo` at PPC 11-15, one bit at PPC 26, one bit at PPC 21.
fn vmx128_test_word(vd_lo: u32, vd_hi: u32, va_lo: u32, va_h26: u32, va_h21: u32,
                    vb_lo: u32, vb_hi: u32) -> u32 {
    // Each entry pairs a field value with the PPC (MSB-0) bit number of that
    // field's least-significant bit; PPC bit i is host bit (31 - i).
    let placements = [
        (vd_lo, 10),  // VD128l: PPC 6-10   = host 21-25
        (vd_hi, 29),  // VD128h: PPC 28-29  = host 2-3
        (va_lo, 15),  // VA128l: PPC 11-15  = host 16-20
        (va_h26, 26), // VA128h: PPC 26     = host 5
        (va_h21, 21), // VA128H: PPC 21     = host 10
        (vb_lo, 20),  // VB128l: PPC 16-20  = host 11-15
        (vb_hi, 31),  // VB128h: PPC 30-31  = host 0-1
    ];
    placements
        .iter()
        .fold(0u32, |word, &(field, ppc_lsb)| word | (field << (31 - ppc_lsb)))
}
|
||||
|
||||
#[test]
fn vmx128_vd128_low_5_bits_only() {
    // With VD128h clear, vd128 is exactly the 5-bit low field (PPC 6-10,
    // whose LSB is host bit 21).
    for r in 0u32..32 {
        let instr = DecodedInstr {
            opcode: PpcOpcode::Invalid,
            raw: r << (31 - 10),
            addr: 0,
        };
        assert_eq!(instr.vd128(), r as usize, "vd_lo={r}");
    }
}
|
||||
|
||||
#[test]
fn vmx128_vd128_high_low_bit_adds_32() {
    // Only the low bit of VD128h is set (PPC 29 = host bit 2); the low
    // field is zero, so the selector must read 32.
    let instr = DecodedInstr {
        opcode: PpcOpcode::Invalid,
        raw: 1u32 << (31 - 29),
        addr: 0,
    };
    assert_eq!(instr.vd128(), 32);
}
|
||||
|
||||
#[test]
fn vmx128_vd128_high_high_bit_adds_64() {
    // Only the high bit of VD128h is set (PPC 28 = host bit 3); the low
    // field is zero, so the selector must read 64.
    let instr = DecodedInstr {
        opcode: PpcOpcode::Invalid,
        raw: 1u32 << (31 - 28),
        addr: 0,
    };
    assert_eq!(instr.vd128(), 64);
}
|
||||
|
||||
#[test]
fn vmx128_vd128_full_127() {
    // All seven selector bits set: vd_lo = 31 and VD128h = 0b11 give 127.
    let low_field = 31u32 << (31 - 10);
    let high_field = (1u32 << (31 - 28)) | (1u32 << (31 - 29));
    let instr = DecodedInstr {
        opcode: PpcOpcode::Invalid,
        raw: low_field | high_field,
        addr: 0,
    };
    assert_eq!(instr.vd128(), 127);
}
|
||||
|
||||
#[test]
fn vmx128_va128_canary_layout() {
    let instr = |raw: u32| DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };

    // va_lo = 7 at PPC 11-15 plus VA128h = 1 at PPC 26: 7 | 32 = 39.
    let with_h26 = (7u32 << (31 - 15)) | (1u32 << (31 - 26));
    assert_eq!(instr(with_h26).va128(), 39);

    // Adding VA128H = 1 at PPC 21 contributes 64: 103 in total.
    let with_h26_h21 = with_h26 | (1u32 << (31 - 21));
    assert_eq!(instr(with_h26_h21).va128(), 7 | 32 | 64);
}
|
||||
|
||||
#[test]
fn vmx128_vb128_uses_bits30_31() {
    let instr = |raw: u32| DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };

    // vb_lo = 5 at PPC 16-20; VB128h = 0b01 (its LSB is PPC 31 = host 0)
    // adds 32.
    let low_high_bit = (5u32 << (31 - 20)) | (1u32 << (31 - 31));
    assert_eq!(instr(low_high_bit).vb128(), 5 | 32);

    // VB128h = 0b11 (PPC 30 set as well) adds 96.
    let both_high_bits = low_high_bit | (1u32 << (31 - 30));
    assert_eq!(instr(both_high_bits).vb128(), 5 | 32 | 64);
}
|
||||
|
||||
#[test]
fn vmx128_vs128_aliases_vd128() {
    // For a spread of register numbers, vs128 must report exactly what
    // vd128 reports.
    for r in [0u32, 31, 32, 64, 96, 127] {
        // Split the register number into the encoded low (5-bit) and high
        // (2-bit) fields and place them at PPC 6-10 / PPC 28-29.
        let low_field = (r & 0x1F) << (31 - 10);
        let high_field = ((r >> 5) & 0x3) << (31 - 29);
        let instr = DecodedInstr {
            opcode: PpcOpcode::Invalid,
            raw: low_field | high_field,
            addr: 0,
        };
        assert_eq!(instr.vd128(), r as usize, "vd128 mismatch for r={r}");
        assert_eq!(instr.vs128(), r as usize, "vs128 mismatch for r={r}");
        assert_eq!(instr.vd128(), instr.vs128());
    }
}
|
||||
|
||||
#[test]
#[allow(dead_code)]
fn _vmx128_test_word_helper_compiles() {
    // Cross-check the word-building helper against the real accessor:
    // vd_lo = 5 with vd_hi = 0b11 must decode as 5 | 96 = 101.
    let instr = DecodedInstr {
        opcode: PpcOpcode::Invalid,
        raw: vmx128_test_word(5, 3, 0, 0, 0, 0, 0),
        addr: 0,
    };
    assert_eq!(instr.vd128(), 5 | 32 | 64);
}
|
||||
|
||||
#[test]
fn vx128_5_sh_bit_positions() {
    // SH occupies PPC bits 22-25, i.e. host bits 6-9 (MSB at host bit 9).
    // Each row: (raw word, expected SH, failure message).
    let cases: [(u32, u32, &str); 4] = [
        // SH=8 (0b1000): only host bit 9 set -> raw = 1 << 9 = 0x200.
        (0x200, 8, "SH=8: MSB at PPC bit 22"),
        // SH=1 (0b0001): only host bit 6 set -> raw = 1 << 6 = 0x40.
        (0x40, 1, "SH=1: LSB at PPC bit 25"),
        // SH=15 (0b1111): host bits 6-9 all set -> raw = 0xF << 6 = 0x3C0.
        (0x3C0, 15, "SH=15: all 4 bits set"),
        // SH=0: empty word.
        (0, 0, "SH=0"),
    ];
    for (raw, expected, message) in cases {
        let instr = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
        assert_eq!(instr.vx128_5_sh(), expected, "{}", message);
    }
}
|
||||
|
||||
#[test]
fn vx128_4_accessors_correct_bit_positions() {
    let instr = |raw: u32| DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };

    // z lives at PPC bits 24-25 = host bits 6-7; z = 3 is both bits set.
    let z_only = instr(0b11u32 << 6);
    assert_eq!(z_only.vx128_4_z(), 3, "z=3 from host bits 6-7");

    // IMM lives at PPC bits 11-15 = host bits 16-20; try 0x15 (0b10101).
    let imm_only = instr(0x15u32 << 16);
    assert_eq!(imm_only.vx128_4_imm(), 0x15, "IMM=0x15 from host bits 16-20");

    // Both fields populated (z=1, IMM=0xA): neither may bleed into the other.
    let combined = instr((0x1u32 << 6) | (0xAu32 << 16));
    assert_eq!(combined.vx128_4_z(), 1, "z=1 combined");
    assert_eq!(combined.vx128_4_imm(), 0xA, "IMM=0xA combined");

    // z=2 with IMM=0xF: the maximal 4-bit blend mask, covering the whole
    // lower nibble.
    let full_nibble = instr((0b10u32 << 6) | (0xFu32 << 16));
    assert_eq!(full_nibble.vx128_4_z(), 2, "z=2 from binary 10");
    assert_eq!(full_nibble.vx128_4_imm(), 0xF, "IMM=0xF all-ones nibble");
}
|
||||
|
||||
#[test]
fn vc128_2_extracts_ppc_bits_23_25() {
    // VC sits at PPC bits 23-25 = host bits 6-8, so
    // extract_bits(raw, 23, 25) == (raw >> 6) & 0x7.
    // Same values and order as before: 5 (0b101), 0, 7, 1.
    for vc in [5u32, 0, 7, 1] {
        let instr = DecodedInstr {
            opcode: PpcOpcode::Invalid,
            raw: vc << 6,
            addr: 0,
        };
        assert_eq!(instr.vc128_2(), vc as usize);
    }
}
|
||||
|
||||
#[test]
fn vx128_p_perm_assembles_correctly() {
    let perm_of = |raw: u32| {
        DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }.vx128_p_perm()
    };

    // PERMl = 0x1F (all 5 bits) at host bits 16-20.
    assert_eq!(perm_of(0x1Fu32 << 16), 0x1F, "PERMl only");

    // PERMh = 0x7 (all 3 bits) at host bits 6-8 -> raw = 0x1C0; it must land
    // in result bits 5-7.
    assert_eq!(perm_of(0x7u32 << 6), 0x7 << 5, "PERMh only: bits 5-7");

    // Both halves populated: PERMl = 0xA, PERMh = 0x5.
    assert_eq!(perm_of((0xAu32 << 16) | (0x5u32 << 6)), 0xA | (0x5 << 5));

    // An empty word yields an empty PERM; the halves do not pick up stray bits.
    assert_eq!(perm_of(0), 0);
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
447
crates/xenia-cpu/src/fpscr.rs
Normal file
447
crates/xenia-cpu/src/fpscr.rs
Normal file
@@ -0,0 +1,447 @@
|
||||
//! FPSCR (Floating-Point Status and Control Register) maintenance.
|
||||
//!
|
||||
//! Scope per project plan: rounding modes honoured, plus the exception bits
|
||||
//! games actually read (FX, FEX, VX, OX, UX, ZX, XX, FI, FPRF). Enabled-
|
||||
//! exception dispatch (FE[0,1], VE/OE/UE/ZE/XE) is *not* modelled — games
|
||||
//! running on Xenon almost never take FP traps.
|
||||
//!
|
||||
//! Bit layout (PowerISA, MSB-0 numbering; stored in a u32 with bit 31 = MSB):
|
||||
//!
|
||||
//! | PPC bit | u32 mask | Name |
|
||||
//! |---------|-------------------------|-------------|
|
||||
//! | 0 | `1<<31` | FX |
|
||||
//! | 1 | `1<<30` | FEX |
|
||||
//! | 2 | `1<<29` | VX (summary)|
|
||||
//! | 3 | `1<<28` | OX |
|
||||
//! | 4 | `1<<27` | UX |
|
||||
//! | 5 | `1<<26` | ZX |
|
||||
//! | 6 | `1<<25` | XX |
|
||||
//! | 7 | `1<<24` | VXSNAN |
|
||||
//! | 8 | `1<<23` | VXISI |
|
||||
//! | 9 | `1<<22` | VXIDI |
|
||||
//! | 10 | `1<<21` | VXZDZ |
|
||||
//! | 11 | `1<<20` | VXIMZ |
|
||||
//! | 12 | `1<<19` | VXVC |
|
||||
//! | 13 | `1<<18` | FR |
|
||||
//! | 14 | `1<<17` | FI |
|
||||
//! | 15..19  | `0x1F << 12`            | FPRF (5 bits)|
|
||||
//! | 21 | `1<<10` | VXSOFT |
|
||||
//! | 22 | `1<<9` | VXSQRT |
|
||||
//! | 23 | `1<<8` | VXCVI |
|
||||
//! | 30..31 | `0x3` | RN (2 bits) |
|
||||
|
||||
use crate::context::PpcContext;
|
||||
|
||||
/// Mask for the single FPSCR bit at PowerISA (MSB-0) position `ppc_bit`,
/// i.e. host bit `31 - ppc_bit`.
const fn fpscr_bit(ppc_bit: u32) -> u32 {
    1 << (31 - ppc_bit)
}

// Summary and exception bits, PPC bits 0..6.
pub const FX: u32 = fpscr_bit(0);
pub const FEX: u32 = fpscr_bit(1);
pub const VX: u32 = fpscr_bit(2);
pub const OX: u32 = fpscr_bit(3);
pub const UX: u32 = fpscr_bit(4);
pub const ZX: u32 = fpscr_bit(5);
pub const XX: u32 = fpscr_bit(6);

// Invalid-operation cause bits, PPC bits 7..12.
pub const VXSNAN: u32 = fpscr_bit(7);
pub const VXISI: u32 = fpscr_bit(8);
pub const VXIDI: u32 = fpscr_bit(9);
pub const VXZDZ: u32 = fpscr_bit(10);
pub const VXIMZ: u32 = fpscr_bit(11);
pub const VXVC: u32 = fpscr_bit(12);

// Rounding status, PPC bits 13..14.
pub const FR: u32 = fpscr_bit(13);
pub const FI: u32 = fpscr_bit(14);

/// 5-bit FPRF field, PPC bits 15..19 (host bits 12..16).
pub const FPRF_MASK: u32 = 0x1F << (31 - 19);

// Remaining invalid-operation cause bits, PPC bits 21..23.
pub const VXSOFT: u32 = fpscr_bit(21);
pub const VXSQRT: u32 = fpscr_bit(22);
pub const VXCVI: u32 = fpscr_bit(23);

/// 2-bit RN (rounding mode) field, PPC bits 30..31.
pub const RN_MASK: u32 = 0b11;

/// Union of all VX* bits (used for the VX summary recomputation).
pub const VX_ALL: u32 =
    VXSNAN | VXISI | VXIDI | VXZDZ | VXIMZ | VXVC | VXSOFT | VXSQRT | VXCVI;
|
||||
|
||||
/// FPRF classification codes (5-bit, placed in FPSCR bits 15..19).
///
/// The code is the bit string `C FL FG FE FU`: the class bit `C` on top,
/// followed by (FL, FG, FE, FU) = (less, greater, equal, unordered). In the
/// table below `C` is set for QNaN, negative zero and both denormal classes.
pub mod fprf {
    // Individual flag bits of the 5-bit code.
    const C: u8 = 1 << 4;
    const FL: u8 = 1 << 3;
    const FG: u8 = 1 << 2;
    const FE: u8 = 1 << 1;
    const FU: u8 = 1;

    pub const QNAN: u8 = C | FU; // 1_0001
    pub const NEG_INF: u8 = FL | FU; // 0_1001
    pub const NEG_NORMAL: u8 = FL; // 0_1000
    pub const NEG_DENORMAL: u8 = C | FL; // 1_1000
    pub const NEG_ZERO: u8 = C | FE; // 1_0010
    pub const POS_ZERO: u8 = FE; // 0_0010
    pub const POS_DENORMAL: u8 = C | FG; // 1_0100
    pub const POS_NORMAL: u8 = FG; // 0_0100
    pub const POS_INF: u8 = FG | FU; // 0_0101
}
|
||||
|
||||
/// Rounding mode selected by the 2-bit FPSCR[RN] field.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum RoundingMode {
    /// RN = 0b00 — round to nearest, ties to even.
    NearestEven,
    /// RN = 0b01 — round toward zero.
    TowardZero,
    /// RN = 0b10 — round toward +infinity.
    TowardPosInf,
    /// RN = 0b11 — round toward -infinity.
    TowardNegInf,
}
|
||||
|
||||
pub fn rounding_mode(ctx: &PpcContext) -> RoundingMode {
|
||||
match ctx.fpscr & RN_MASK {
|
||||
0 => RoundingMode::NearestEven,
|
||||
1 => RoundingMode::TowardZero,
|
||||
2 => RoundingMode::TowardPosInf,
|
||||
_ => RoundingMode::TowardNegInf,
|
||||
}
|
||||
}
|
||||
|
||||
/// Classify a finite f64 into its FPRF 5-bit code.
|
||||
pub fn classify_fprf(v: f64) -> u8 {
|
||||
if v.is_nan() {
|
||||
fprf::QNAN
|
||||
} else if v.is_infinite() {
|
||||
if v.is_sign_negative() { fprf::NEG_INF } else { fprf::POS_INF }
|
||||
} else if v == 0.0 {
|
||||
if v.is_sign_negative() { fprf::NEG_ZERO } else { fprf::POS_ZERO }
|
||||
} else if v.is_subnormal() {
|
||||
if v.is_sign_negative() { fprf::NEG_DENORMAL } else { fprf::POS_DENORMAL }
|
||||
} else if v.is_sign_negative() { fprf::NEG_NORMAL } else { fprf::POS_NORMAL }
|
||||
}
|
||||
|
||||
/// Write FPRF into FPSCR, preserving other bits.
|
||||
pub fn set_fprf(ctx: &mut PpcContext, code: u8) {
|
||||
ctx.fpscr = (ctx.fpscr & !FPRF_MASK) | ((code as u32 & 0x1F) << 12);
|
||||
}
|
||||
|
||||
/// Set one or more exception bits on FPSCR, maintaining FX (sticky set on any
|
||||
/// new exception) and VX (summary of VX* bits).
|
||||
pub fn set_exception(ctx: &mut PpcContext, bits: u32) {
|
||||
let prev = ctx.fpscr;
|
||||
let new = prev | bits;
|
||||
// FX is sticky-set if any new non-sticky bit transitions to 1. PPC defines
|
||||
// FX as "any of OX, UX, ZX, XX, VX* newly set". Compute the transition set.
|
||||
let transition = (new & !prev) & (OX | UX | ZX | XX | VX_ALL);
|
||||
let mut updated = new;
|
||||
if transition != 0 {
|
||||
updated |= FX;
|
||||
}
|
||||
// Recompute VX summary from any VX* bits currently set.
|
||||
if (updated & VX_ALL) != 0 { updated |= VX; }
|
||||
ctx.fpscr = updated;
|
||||
}
|
||||
|
||||
/// Classify the inputs of a floating-point arithmetic op and set appropriate
|
||||
/// VX* bits. Returns true if any invalid-operation was detected (caller may
|
||||
/// want to write a default QNaN result).
|
||||
///
|
||||
/// Detected cases:
|
||||
/// * any SNaN input → VXSNAN
|
||||
/// * infinity - infinity (same sign) → VXISI
|
||||
/// * 0 / 0 → VXZDZ
|
||||
/// * infinity / infinity → VXIDI
|
||||
/// * 0 * infinity → VXIMZ
|
||||
pub fn check_invalid_add(ctx: &mut PpcContext, a: f64, b: f64, sub: bool) -> bool {
|
||||
let mut bits = 0u32;
|
||||
if is_snan(a) || is_snan(b) { bits |= VXSNAN; }
|
||||
if a.is_infinite() && b.is_infinite() {
|
||||
// For add: VXISI iff same-sign(a,b) negated — inf - inf
|
||||
// For sub: VXISI iff same-sign(a,b) — (+inf) - (+inf) or (-inf) - (-inf)
|
||||
let both_pos = a.is_sign_positive() && b.is_sign_positive();
|
||||
let both_neg = a.is_sign_negative() && b.is_sign_negative();
|
||||
if sub {
|
||||
if both_pos || both_neg { bits |= VXISI; }
|
||||
} else {
|
||||
// add: opposite signs cancel to inf-inf
|
||||
if a.is_sign_positive() != b.is_sign_positive() { bits |= VXISI; }
|
||||
}
|
||||
}
|
||||
if bits != 0 { set_exception(ctx, bits); return true; }
|
||||
false
|
||||
}
|
||||
|
||||
/// FMA-aware add/sub VXISI check. Per PPCBUG-202+203: the previous code
|
||||
/// passed `a*c` as `lhs` to `check_invalid_add`, which suffers from two
|
||||
/// rounding errors and can spuriously raise/miss VXISI in extreme cases.
|
||||
/// This helper derives the mathematical product's sign and infinity status
|
||||
/// from the inputs directly.
|
||||
///
|
||||
/// `sub` follows the same semantics as `check_invalid_add`:
|
||||
/// - false (add): VXISI when product and b have opposite signs at infinity
|
||||
/// - true (sub): VXISI when product and b have same sign at infinity
|
||||
pub fn check_invalid_fma_add(ctx: &mut PpcContext, a: f64, c: f64, b: f64, sub: bool) -> bool {
|
||||
let mut bits = 0u32;
|
||||
if is_snan(a) || is_snan(c) || is_snan(b) { bits |= VXSNAN; }
|
||||
let product_is_inf = (a.is_infinite() || c.is_infinite())
|
||||
&& a != 0.0 && c != 0.0
|
||||
&& !a.is_nan() && !c.is_nan();
|
||||
if product_is_inf && b.is_infinite() {
|
||||
let p_neg = a.is_sign_negative() != c.is_sign_negative();
|
||||
let b_neg = b.is_sign_negative();
|
||||
let same_sign = p_neg == b_neg;
|
||||
if (sub && same_sign) || (!sub && !same_sign) {
|
||||
bits |= VXISI;
|
||||
}
|
||||
}
|
||||
if bits != 0 { set_exception(ctx, bits); return true; }
|
||||
false
|
||||
}
|
||||
|
||||
pub fn check_invalid_mul(ctx: &mut PpcContext, a: f64, b: f64) -> bool {
|
||||
let mut bits = 0u32;
|
||||
if is_snan(a) || is_snan(b) { bits |= VXSNAN; }
|
||||
let zero_times_inf =
|
||||
(a == 0.0 && b.is_infinite()) || (b == 0.0 && a.is_infinite());
|
||||
if zero_times_inf { bits |= VXIMZ; }
|
||||
if bits != 0 { set_exception(ctx, bits); return true; }
|
||||
false
|
||||
}
|
||||
|
||||
pub fn check_invalid_div(ctx: &mut PpcContext, a: f64, b: f64) -> bool {
|
||||
let mut bits = 0u32;
|
||||
if is_snan(a) || is_snan(b) { bits |= VXSNAN; }
|
||||
if a == 0.0 && b == 0.0 { bits |= VXZDZ; }
|
||||
if a.is_infinite() && b.is_infinite() { bits |= VXIDI; }
|
||||
if bits != 0 { set_exception(ctx, bits); return true; }
|
||||
false
|
||||
}
|
||||
|
||||
/// Divide-by-zero (finite nonzero / 0) — sets ZX but not VX.
|
||||
pub fn check_zero_divide(ctx: &mut PpcContext, a: f64, b: f64) {
|
||||
if b == 0.0 && a != 0.0 && !a.is_nan() && !a.is_infinite() {
|
||||
set_exception(ctx, ZX);
|
||||
}
|
||||
}
|
||||
|
||||
/// Post-op: classify the result and update FPRF + detect overflow/underflow/inexact.
|
||||
/// `inputs_finite` lets us suppress OX for ops whose output is infinite because
|
||||
/// an input already was.
|
||||
pub fn update_after_op(ctx: &mut PpcContext, result: f64, inputs_were_finite: bool) {
|
||||
let mut bits = 0u32;
|
||||
if result.is_infinite() && inputs_were_finite {
|
||||
bits |= OX;
|
||||
}
|
||||
if result.is_subnormal() {
|
||||
bits |= UX;
|
||||
}
|
||||
if bits != 0 { set_exception(ctx, bits); }
|
||||
set_fprf(ctx, classify_fprf(result));
|
||||
}
|
||||
|
||||
/// Report whether `x` is a signalling NaN.
///
/// In IEEE 754-2008 binary64 the top mantissa bit (bit 51) is the quiet
/// bit: set for a QNaN, clear for an SNaN. A NaN always has a nonzero
/// mantissa, so "is NaN and bit 51 is clear" is the whole test.
pub fn is_snan(x: f64) -> bool {
    const QUIET_BIT: u64 = 1 << 51;
    x.is_nan() && x.to_bits() & QUIET_BIT == 0
}
|
||||
|
||||
/// Round an f64 to f32 honouring FPSCR[RN]. Uses the current hardware
|
||||
/// rounding mode when RN=0 (nearest-even, the PPC default), otherwise
|
||||
/// emulates the directed rounding via bit-manipulation.
|
||||
pub fn round_to_single(ctx: &PpcContext, v: f64) -> f64 {
|
||||
match rounding_mode(ctx) {
|
||||
RoundingMode::NearestEven => (v as f32) as f64,
|
||||
RoundingMode::TowardZero => round_single_toward_zero(v) as f64,
|
||||
RoundingMode::TowardPosInf => round_single_toward_pos_inf(v) as f64,
|
||||
RoundingMode::TowardNegInf => round_single_toward_neg_inf(v) as f64,
|
||||
}
|
||||
}
|
||||
|
||||
/// Round an f64 to an i64 integer honouring FPSCR[RN]. Used by fctidx.
|
||||
pub fn round_to_i64(ctx: &PpcContext, v: f64) -> i64 {
|
||||
match rounding_mode(ctx) {
|
||||
RoundingMode::NearestEven => {
|
||||
// PPCBUG-221: round-half-to-even (banker's rounding). The previous
|
||||
// tie-detection used `(diff - 0.5).abs() < f64::EPSILON` which
|
||||
// breaks for |v| > 2^52 (where v.trunc() == v exactly, giving diff
|
||||
// == 0). Use a fractional-part-only check that's exact for
|
||||
// |v| <= 2^52 and degenerates correctly above.
|
||||
let t = v.trunc();
|
||||
let frac = v - t;
|
||||
let fa = frac.abs();
|
||||
if fa > 0.5 {
|
||||
t as i64 + if v >= 0.0 { 1 } else { -1 }
|
||||
} else if fa < 0.5 {
|
||||
t as i64
|
||||
} else {
|
||||
// Exact 0.5 tie — round to even.
|
||||
let fi = t as i64;
|
||||
if fi & 1 == 0 { fi } else { fi + if v >= 0.0 { 1 } else { -1 } }
|
||||
}
|
||||
}
|
||||
RoundingMode::TowardZero => v.trunc() as i64,
|
||||
RoundingMode::TowardPosInf => v.ceil() as i64,
|
||||
RoundingMode::TowardNegInf => v.floor() as i64,
|
||||
}
|
||||
}
|
||||
|
||||
/// Round an f64 to an i32 integer honouring FPSCR[RN]. Used by fctiwx.
|
||||
pub fn round_to_i32(ctx: &PpcContext, v: f64) -> i32 {
|
||||
round_to_i64(ctx, v).clamp(i32::MIN as i64, i32::MAX as i64) as i32
|
||||
}
|
||||
|
||||
// ------ directed rounding helpers (f64 → f32) ------
|
||||
|
||||
/// Convert `v` to f32, rounding toward zero (truncation).
///
/// Strategy: take the default round-to-nearest-even conversion and, if it
/// landed farther from zero than `v`, step one ULP back toward zero.
///
/// Special cases:
/// * NaN and a result of ±0 are returned unchanged.
/// * An infinite `v` stays infinite, but a *finite* `v` too large for f32
///   truncates to ±`f32::MAX` — rounding toward zero never produces an
///   infinity from a finite input.
fn round_single_toward_zero(v: f64) -> f32 {
    let nearest = v as f32;
    if nearest.is_nan() || nearest == 0.0 {
        return nearest;
    }
    if nearest.is_infinite() {
        return if v.is_infinite() {
            nearest
        } else {
            // Finite input overflowed the nearest-even conversion.
            f32::MAX.copysign(nearest)
        };
    }
    if f64::from(nearest.abs()) <= v.abs() {
        return nearest;
    }
    // `nearest` is finite, nonzero and too large in magnitude. The sign bit
    // is the MSB of the encoding, so subtracting 1 from the raw bits reduces
    // the magnitude by exactly one ULP for either sign (the borrow can never
    // reach the sign bit because the magnitude bits are nonzero).
    f32::from_bits(nearest.to_bits() - 1)
}
|
||||
|
||||
/// Convert `v` to f32, rounding toward +infinity (ceiling).
///
/// Strategy: take the default round-to-nearest-even conversion and, if it
/// landed below `v`, step one ULP upward.
///
/// Special cases:
/// * NaN is returned unchanged; +infinity results are kept (a finite `v`
///   above `f32::MAX` legitimately rounds up to +infinity).
/// * A *finite* `v` below `-f32::MAX` rounds up to `f32::MIN` (the most
///   negative finite f32), not to -infinity.
fn round_single_toward_pos_inf(v: f64) -> f32 {
    let nearest = v as f32;
    if nearest.is_nan() {
        return nearest;
    }
    if nearest.is_infinite() {
        let finite_negative_overflow = nearest.is_sign_negative() && v.is_finite();
        return if finite_negative_overflow { f32::MIN } else { nearest };
    }
    if f64::from(nearest) >= v {
        return nearest;
    }
    // `nearest` < v: move one ULP in the + direction. For a negative value
    // that means a smaller magnitude (bits - 1); for a non-negative value a
    // larger one (bits + 1, which may step from f32::MAX to +infinity).
    let bits = nearest.to_bits();
    let bumped = if nearest.is_sign_negative() { bits - 1 } else { bits + 1 };
    f32::from_bits(bumped)
}
|
||||
|
||||
/// Convert `v` to f32, rounding toward -infinity (floor).
///
/// Strategy: take the default round-to-nearest-even conversion and, if it
/// landed above `v`, step one ULP downward.
///
/// Special cases:
/// * NaN is returned unchanged; -infinity results are kept (a finite `v`
///   below `-f32::MAX` legitimately rounds down to -infinity).
/// * A *finite* `v` above `f32::MAX` rounds down to `f32::MAX`, not to
///   +infinity.
fn round_single_toward_neg_inf(v: f64) -> f32 {
    let nearest = v as f32;
    if nearest.is_nan() {
        return nearest;
    }
    if nearest.is_infinite() {
        let finite_positive_overflow = nearest.is_sign_positive() && v.is_finite();
        return if finite_positive_overflow { f32::MAX } else { nearest };
    }
    if f64::from(nearest) <= v {
        return nearest;
    }
    // `nearest` > v: move one ULP in the - direction. For a negative value
    // (including -0.0) that means a larger magnitude (bits + 1, which may
    // step from f32::MIN to -infinity); for a positive value a smaller one.
    let bits = nearest.to_bits();
    let bumped = if nearest.is_sign_negative() { bits + 1 } else { bits - 1 };
    f32::from_bits(bumped)
}
|
||||
|
||||
/// Drop-in replacement for the old `update_cr1_from_fpscr`. Reads the
|
||||
/// currently-maintained FPSCR bits (FX, FEX, VX, OX) into CR1.
|
||||
pub fn update_cr1(ctx: &mut PpcContext) {
|
||||
ctx.cr[1].lt = (ctx.fpscr & FX) != 0;
|
||||
ctx.cr[1].gt = (ctx.fpscr & FEX) != 0;
|
||||
ctx.cr[1].eq = (ctx.fpscr & VX) != 0;
|
||||
ctx.cr[1].so = (ctx.fpscr & OX) != 0;
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a context via `PpcContext::new()` for each test.
    fn ctx() -> PpcContext {
        PpcContext::new()
    }

    #[test]
    fn rn_default_is_nearest() {
        let fresh = ctx();
        assert_eq!(rounding_mode(&fresh), RoundingMode::NearestEven);
    }

    #[test]
    fn rn_bits_decode() {
        let mut c = ctx();
        let cases = [
            (0x1, RoundingMode::TowardZero),
            (0x2, RoundingMode::TowardPosInf),
            (0x3, RoundingMode::TowardNegInf),
        ];
        for (rn_bits, expected) in cases {
            c.fpscr = rn_bits;
            assert_eq!(rounding_mode(&c), expected);
        }
    }

    #[test]
    fn fprf_classifies_correctly() {
        let cases = [
            (1.0, fprf::POS_NORMAL),
            (-1.0, fprf::NEG_NORMAL),
            (0.0, fprf::POS_ZERO),
            (-0.0, fprf::NEG_ZERO),
            (f64::INFINITY, fprf::POS_INF),
            (f64::NEG_INFINITY, fprf::NEG_INF),
            (f64::NAN, fprf::QNAN),
            (f64::from_bits(1), fprf::POS_DENORMAL),
        ];
        for (value, class) in cases {
            assert_eq!(classify_fprf(value), class);
        }
    }

    #[test]
    fn fx_is_sticky_on_new_exception() {
        let mut c = ctx();
        set_exception(&mut c, OX);
        assert_ne!(c.fpscr & FX, 0);
        // Wipe both FX and OX by hand, then raise OX again: FX must re-latch.
        c.fpscr &= !(FX | OX);
        set_exception(&mut c, OX);
        assert_ne!(c.fpscr & FX, 0);
    }

    #[test]
    fn vx_summary_set_on_any_vx_bit() {
        let mut c = ctx();
        set_exception(&mut c, VXSNAN);
        for bit in [VX, VXSNAN] {
            assert_ne!(c.fpscr & bit, 0);
        }
    }

    #[test]
    fn round_to_single_nearest_is_identity_on_representable() {
        let c = ctx();
        let one = 1.0_f64;
        assert_eq!(round_to_single(&c, one), one);
    }

    #[test]
    fn round_to_i32_clamps_out_of_range() {
        let c = ctx();
        assert_eq!(round_to_i32(&c, 1e20_f64), i32::MAX);
        assert_eq!(round_to_i32(&c, -1e20_f64), i32::MIN);
    }

    #[test]
    fn round_to_i64_nearest_even_on_tie() {
        let c = ctx();
        let ties = [
            (0.5_f64, 0),
            (1.5, 2),
            (2.5, 2),
            (3.5, 4),
            (-0.5, 0),
            (-1.5, -2),
            (-2.5, -2),
        ];
        for (input, expected) in ties {
            assert_eq!(round_to_i64(&c, input), expected);
        }
    }

    #[test]
    fn round_to_i64_non_tie_cases() {
        // PPCBUG-221 regression: non-tie fractions must round to nearest.
        let c = ctx();
        let cases = [(0.4_f64, 0), (0.6, 1), (-0.4, 0), (-0.6, -1)];
        for (input, expected) in cases {
            assert_eq!(round_to_i64(&c, input), expected);
        }
    }

    #[test]
    fn round_to_i32_nearest_even_on_tie() {
        // PPCBUG-227: round_to_i32 inherits round_to_i64's tie semantics.
        let c = ctx();
        let ties = [(0.5_f64, 0), (1.5, 2), (2.5, 2), (-1.5, -2)];
        for (input, expected) in ties {
            assert_eq!(round_to_i32(&c, input), expected);
        }
    }

    #[test]
    fn check_invalid_add_detects_inf_minus_inf() {
        let mut c = ctx();
        let flagged = check_invalid_add(&mut c, f64::INFINITY, f64::INFINITY, true);
        assert!(flagged);
        assert_ne!(c.fpscr & VXISI, 0);
    }

    #[test]
    fn check_invalid_div_detects_zero_over_zero() {
        let mut c = ctx();
        let flagged = check_invalid_div(&mut c, 0.0, 0.0);
        assert!(flagged);
        assert_ne!(c.fpscr & VXZDZ, 0);
    }

    #[test]
    fn snan_detection() {
        // Binary64 SNaN: sign=0, exponent all ones, non-zero mantissa with
        // bit 51 clear.
        let signalling = f64::from_bits(0x7FF0_0000_0000_0001);
        assert!(is_snan(signalling));
        assert!(!is_snan(f64::NAN));
    }
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,9 +1,25 @@
|
||||
pub mod block_cache;
|
||||
pub mod context;
|
||||
pub mod decoder;
|
||||
pub mod disasm;
|
||||
pub mod fpscr;
|
||||
pub mod interpreter;
|
||||
pub mod opcode;
|
||||
pub mod overflow;
|
||||
pub mod phaser;
|
||||
pub mod reservation;
|
||||
pub mod scheduler;
|
||||
pub mod trap;
|
||||
pub mod vmx;
|
||||
|
||||
pub use context::PpcContext;
|
||||
pub use decoder::decode;
|
||||
pub use disasm::{DisasmItem, DisasmText, disassemble, format as disasm_format, iter_disasm};
|
||||
pub use opcode::PpcOpcode;
|
||||
pub use phaser::{Phaser, PhaserOutcome};
|
||||
pub use reservation::ReservationTable;
|
||||
pub use scheduler::{
|
||||
BlockReason, GuestThread, HwSlot, HwState, MigrationFixup, OrderMode, PcrWriter, RoundOutcome,
|
||||
Scheduler, SpawnError, SpawnParams, ThreadRef, HW_THREAD_COUNT, INITIAL_GUEST_TID,
|
||||
QUANTUM_DEFAULT,
|
||||
};
|
||||
|
||||
@@ -145,6 +145,33 @@ impl PpcOpcode {
|
||||
matches!(self, Self::sc)
|
||||
}
|
||||
|
||||
/// Returns true if this opcode unconditionally ends a basic block:
|
||||
/// any branch, system call, trap, or `Invalid` (decoder couldn't
|
||||
/// recognize the instruction — execution will hit the
|
||||
/// `Unimplemented` arm and we don't want to swallow the boundary
|
||||
/// inside a cached block).
|
||||
///
|
||||
/// Notably *not* terminating: `mtmsr`/`mtmsrd`/`isync`/`mfmsr`.
|
||||
/// On real hardware these have synchronization semantics (a context
|
||||
/// synchronizing event for `isync`, MSR rewrite for the `mt*`s) but
|
||||
/// our interpreter has no asynchronous-exception model and no
|
||||
/// out-of-order execution — they execute as plain ALU/move ops and
|
||||
/// don't change control flow synchronously. Block-cache replay is
|
||||
/// still bit-for-bit identical to per-instruction dispatch for
|
||||
/// those.
|
||||
///
|
||||
/// Used by the basic-block cache (`block_cache.rs`) to know when to
|
||||
/// stop accumulating instructions during a forward decode walk.
|
||||
pub fn terminates_block(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
Self::bx | Self::bcx | Self::bclrx | Self::bcctrx
|
||||
| Self::sc
|
||||
| Self::td | Self::tdi | Self::tw | Self::twi
|
||||
| Self::Invalid
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns true if this is a load instruction.
|
||||
pub fn is_load(&self) -> bool {
|
||||
matches!(self,
|
||||
@@ -194,3 +221,60 @@ impl std::fmt::Display for PpcOpcode {
|
||||
std::fmt::Debug::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn terminates_block_includes_all_branches() {
        let branches = [
            PpcOpcode::bx,
            PpcOpcode::bcx,
            PpcOpcode::bclrx,
            PpcOpcode::bcctrx,
        ];
        for op in branches {
            assert!(op.terminates_block());
        }
    }

    #[test]
    fn terminates_block_includes_sc_and_traps() {
        let enders = [
            PpcOpcode::sc,
            PpcOpcode::td,
            PpcOpcode::tdi,
            PpcOpcode::tw,
            PpcOpcode::twi,
        ];
        for op in enders {
            assert!(op.terminates_block());
        }
    }

    #[test]
    fn terminates_block_includes_invalid() {
        // A decode failure has to close the block; otherwise the unknown
        // opcode would be replayed from the cache without passing through
        // the per-instruction Unimplemented path.
        assert!(PpcOpcode::Invalid.terminates_block());
    }

    #[test]
    fn terminates_block_excludes_straight_line_ops() {
        // Everyday ALU and load/store opcodes must leave the block open.
        let straight_line = [
            PpcOpcode::addi,
            PpcOpcode::addis,
            PpcOpcode::addx,
            PpcOpcode::cmpi,
            PpcOpcode::cmp,
            PpcOpcode::lwz,
            PpcOpcode::stw,
            PpcOpcode::lbzx,
            PpcOpcode::ori,
            PpcOpcode::oris,
            PpcOpcode::rlwinmx,
        ];
        for op in straight_line {
            assert!(!op.terminates_block());
        }
    }

    #[test]
    fn terminates_block_excludes_msr_and_sync_ops() {
        // Documented decision: with no async-exception model in the
        // interpreter, synchronizing ops run as ordinary in-block ALU ops.
        let sync_like = [
            PpcOpcode::mtmsr,
            PpcOpcode::mtmsrd,
            PpcOpcode::isync,
            PpcOpcode::sync,
            PpcOpcode::mfmsr,
        ];
        for op in sync_like {
            assert!(!op.terminates_block());
        }
    }
}
|
||||
|
||||
178
crates/xenia-cpu/src/overflow.rs
Normal file
178
crates/xenia-cpu/src/overflow.rs
Normal file
@@ -0,0 +1,178 @@
|
||||
//! OE / XER[OV] / XER[SO] handling for integer arithmetic.
|
||||
//!
|
||||
//! PPC integer ops with the OE bit set update XER[OV] (overflow) and sticky-set
|
||||
//! XER[SO]. When OE is clear the instruction leaves XER untouched. Signed
|
||||
//! overflow is predicated on the operation width and operand signs per the
|
||||
//! PowerISA pseudocode. For 32-bit-word operations (`addw`, `mullw`, `divw`,
|
||||
//! `neg`, etc. — on PPC these all have `w` in the mnemonic in spec
|
||||
//! descriptions even when the assembler spells them without) the predicate
|
||||
//! uses the low 32 bits. For 64-bit operations (`add`, `mulld`, `divd`) the
|
||||
//! predicate uses the full 64 bits.
|
||||
|
||||
use crate::context::PpcContext;
|
||||
|
||||
#[inline]
|
||||
pub fn apply(ctx: &mut PpcContext, overflowed: bool) {
|
||||
if overflowed {
|
||||
ctx.xer_ov = 1;
|
||||
ctx.xer_so = 1;
|
||||
} else {
|
||||
ctx.xer_ov = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/// Signed overflow test for a 64-bit addition (plain `add`, `addc`, `subf`,
/// `subfc`).
///
/// Overflow happened exactly when both inputs carry the same sign bit and
/// the stored `result` carries the opposite one. Subtraction callers should
/// first rewrite the operation as an addition of the complemented operand.
#[inline]
pub fn add_ov_64(a: u64, b: u64, result: u64) -> bool {
    const SIGN: u64 = 1 << 63;
    let inputs_agree = (a ^ b) & SIGN == 0;
    let result_flipped = (a ^ result) & SIGN != 0;
    inputs_agree && result_flipped
}
|
||||
|
||||
/// General-purpose signed-overflow check for 64-bit arithmetic.
///
/// The caller supplies the exact signed sum computed in `i128` together with
/// the 64-bit value that was actually stored. Overflow is reported when the
/// stored value, reinterpreted as `i64` and widened, differs from the exact
/// sum, i.e. the true value does not fit in an `i64`.
///
/// Intended for multi-term chains (`adde`, `addme`, `addze`, `subfe`,
/// `subfme`, `subfze`) where the carry-in makes a pure sign-bit predicate
/// awkward.
#[inline]
pub fn sum_overflow_64(true_sum: i128, result: u64) -> bool {
    let stored = i128::from(result as i64);
    stored != true_sum
}
|
||||
|
||||
/// Signed overflow test for a 64-bit subtraction `RT = b - a`.
///
/// Overflow happened exactly when `a` and `b` have opposite sign bits and
/// the stored `result` no longer shares `b`'s sign bit.
#[inline]
pub fn sub_ov_64(a: u64, b: u64, result: u64) -> bool {
    const SIGN: u64 = 1 << 63;
    let inputs_differ = (a ^ b) & SIGN != 0;
    let result_left_b = (b ^ result) & SIGN != 0;
    inputs_differ && result_left_b
}
|
||||
|
||||
/// Signed `addc`/`adde` chain overflow. Same rule as `add_ov_64` — the carry
|
||||
/// in doesn't alter the sign predicate directly because it's already folded
|
||||
/// into the stored result.
|
||||
#[inline]
|
||||
pub fn adde_ov_64(a: u64, b: u64, result: u64) -> bool {
|
||||
add_ov_64(a, b, result)
|
||||
}
|
||||
|
||||
/// Signed 32-bit multiply overflow (`mullwo`).
///
/// `product` is the full 64-bit product. It overflows the 32-bit signed
/// range exactly when it cannot be represented as an `i32`, i.e. when
/// truncating to 32 bits and sign-extending back does not reproduce it.
#[inline]
pub fn mullw_ov(product: i64) -> bool {
    i32::try_from(product).is_err()
}
|
||||
|
||||
/// Signed 64-bit multiply overflow (`mulldo`).
///
/// Reports whether `a * b` leaves the `i64` range, using the overflow flag
/// returned by the standard library's wrapping multiply.
#[inline]
pub fn mulld_ov(a: i64, b: i64) -> bool {
    let (_wrapped, overflowed) = a.overflowing_mul(b);
    overflowed
}
|
||||
|
||||
/// Overflow predicate for the signed 32-bit divide (`divwo`).
///
/// The `div*o` family raises OV in two situations:
///   * the divisor is zero, or
///   * (signed forms only) the division is `INT_MIN / -1`, whose quotient
///     does not fit.
///
/// Both are exactly the cases in which a checked division has no result.
#[inline]
pub fn divw_ov_signed(ra: i32, rb: i32) -> bool {
    ra.checked_div(rb).is_none()
}
|
||||
|
||||
/// Overflow predicate for the unsigned 32-bit divide (`divwuo`): the only
/// failing case is a zero divisor.
#[inline]
pub fn divw_ov_unsigned(rb: u32) -> bool {
    let divisor_is_zero = rb == 0;
    divisor_is_zero
}
|
||||
|
||||
/// Overflow predicate for the signed 64-bit divide (`divdo`): true for a
/// zero divisor or for `i64::MIN / -1`, the two cases in which a checked
/// division yields no quotient.
#[inline]
pub fn divd_ov_signed(ra: i64, rb: i64) -> bool {
    ra.checked_div(rb).is_none()
}
|
||||
|
||||
/// Overflow predicate for the unsigned 64-bit divide (`divduo`): the only
/// failing case is a zero divisor.
#[inline]
pub fn divd_ov_unsigned(rb: u64) -> bool {
    let divisor_is_zero = rb == 0;
    divisor_is_zero
}
|
||||
|
||||
/// Overflow predicate for `negx` (`RT = -(RA)`) at 64-bit width.
///
/// Negation only fails for the most negative value, whose positive
/// counterpart is not representable; that is the single input for which a
/// checked negation has no result.
#[inline]
pub fn neg_ov_64(ra: u64) -> bool {
    (ra as i64).checked_neg().is_none()
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Wrapping sum paired with its inputs, as the interpreter would store it.
    fn wrapped_add(a: u64, b: u64) -> (u64, u64, u64) {
        (a, b, a.wrapping_add(b))
    }

    #[test]
    fn add_no_overflow() {
        for (a, b, r) in [(1, 2, 3), (u64::MAX, 0, u64::MAX)] {
            assert!(!add_ov_64(a, b, r));
        }
    }

    #[test]
    fn add_positive_overflow() {
        // INT64_MAX + 1 wraps to INT64_MIN: signed overflow.
        let (a, b, r) = wrapped_add(i64::MAX as u64, 1u64);
        assert!(add_ov_64(a, b, r));
    }

    #[test]
    fn add_negative_overflow() {
        // INT64_MIN + -1 wraps to INT64_MAX: signed overflow.
        let (a, b, r) = wrapped_add(i64::MIN as u64, (-1i64) as u64);
        assert!(add_ov_64(a, b, r));
    }

    #[test]
    fn sub_overflow_min_minus_pos() {
        // INT64_MIN - 1 overflows.
        let minuend = i64::MIN as u64;
        let subtrahend = 1u64;
        let diff = minuend.wrapping_sub(subtrahend);
        assert!(sub_ov_64(subtrahend, minuend, diff));
    }

    #[test]
    fn sub_no_overflow() {
        let minuend = 5u64;
        let subtrahend = 2u64;
        let diff = minuend.wrapping_sub(subtrahend);
        assert!(!sub_ov_64(subtrahend, minuend, diff));
    }

    #[test]
    fn mullw_fits_32_bits() {
        for product in [i64::from(i32::MAX), -1i64] {
            assert!(!mullw_ov(product));
        }
    }

    #[test]
    fn mullw_overflows_32_bits() {
        let doubled_max = i64::from(i32::MAX) * 2;
        assert!(mullw_ov(doubled_max));
    }

    #[test]
    fn mulld_overflows() {
        assert!(mulld_ov(i64::MAX, 2));
        assert!(!mulld_ov(i64::MAX, 1));
        // PPCBUG-022: INT_MIN * -1 overflows, since -INT_MIN exceeds INT_MAX.
        assert!(mulld_ov(i64::MIN, -1), "INT_MIN * -1 overflows i64");
        assert!(!mulld_ov(i64::MIN, 1));
        assert!(!mulld_ov(i64::MIN + 1, -1), "INT_MIN+1 * -1 = INT_MAX, no overflow");
    }

    #[test]
    fn neg_ov_only_at_min() {
        assert!(neg_ov_64(i64::MIN as u64));
        for harmless in [0, 1] {
            assert!(!neg_ov_64(harmless));
        }
    }
}
|
||||
345
crates/xenia-cpu/src/phaser.rs
Normal file
345
crates/xenia-cpu/src/phaser.rs
Normal file
@@ -0,0 +1,345 @@
|
||||
//! Quantum-boundary phaser for the M3 per-HW-thread parallel scheduler.
|
||||
//!
|
||||
//! Six [`super::HW_THREAD_COUNT`] host threads run their slots' interpreters
|
||||
//! in parallel, then meet at a phaser to advance to the next quantum. This
|
||||
//! is **not** [`std::sync::Barrier`]: a Barrier needs a fixed party count,
|
||||
//! but our slots can become idle (no runnable thread) and shouldn't block
|
||||
//! the phaser arrival.
|
||||
//!
|
||||
//! ## Semantics
|
||||
//!
|
||||
//! - Each slot at the end of its quantum either calls
|
||||
//! [`Phaser::arrive_and_wait`] (it has a runnable thread to run next
|
||||
//! quantum) or [`Phaser::skip`] (it's idle this round and will wake on
|
||||
//! `slot_wake[i]`).
|
||||
//! - The phase advances when **all 6 slots have either arrived or
|
||||
//! skipped**. Arrived slots block until the advance; skipped slots
|
||||
//! return immediately and re-poll their wake state.
|
||||
//! - The phaser uses a generation counter so a slot that arrives "early"
|
||||
//! in the next phase doesn't see the prior phase's "all arrived"
|
||||
//! condition.
|
||||
//! - Defensive timeout: [`Phaser::arrive_and_wait_timeout`] returns
|
||||
//! [`PhaserOutcome::Timeout`] if a peer crashes / hangs. Callers
|
||||
//! typically convert this into a graceful shutdown rather than
|
||||
//! panicking, so the rest of the topology can tear down cleanly.
|
||||
//!
|
||||
//! ## Memory ordering
|
||||
//!
|
||||
//! - The participant counter (`arrived_or_skipped`) lives inside the
//!   condvar's mutex, so the increment and the "everyone is here" check
//!   are serialized by that lock rather than by atomic orderings.
|
||||
//! - The generation `phase` is read with `Acquire` in arrivers' wait
|
||||
//! loops; the advancing thread stores with `Release` after bumping.
|
||||
//! - The condvar's broadcast publishes the phase; the wait loop
|
||||
//! re-checks `phase` against its captured value to defend against
|
||||
//! spurious wakeups.
|
||||
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Condvar, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
/// Outcome of a phaser arrival.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PhaserOutcome {
    /// All participants arrived/skipped and the phase advanced. Caller
    /// proceeds into the next quantum.
    Advanced,
    /// Defensive timeout fired before all peers arrived. Caller should
    /// log + initiate shutdown rather than retry.
    Timeout,
    /// Phaser was shut down via [`Phaser::shutdown`]; all waiters are
    /// woken and return this. Caller exits cleanly.
    Shutdown,
}

/// Custom barrier-with-skip primitive. Construct once with the number of
/// participating slots; share via `Arc` across host threads.
pub struct Phaser {
    /// Total participant count (constant after construction).
    party_count: u32,
    /// Phase counter, bumped by one on every advance. Waiters capture it
    /// before contributing and treat any change as "the phase advanced".
    phase: AtomicU32,
    /// Participant counter and shutdown flag, guarded by the condvar's mutex.
    inner: Mutex<Inner>,
    /// Notified when a phase advances or shutdown fires.
    cv: Condvar,
}

/// Mutex-protected part of the phaser state.
#[derive(Debug)]
struct Inner {
    /// Contributions (arrivals plus skips) seen in the current phase.
    arrived_or_skipped: u32,
    /// Set once by [`Phaser::shutdown`]; never cleared.
    shutdown: bool,
}

impl Phaser {
    /// Create a phaser with `party_count` participants.
    ///
    /// # Panics
    ///
    /// Panics if `party_count == 0`.
    pub fn new(party_count: u32) -> Self {
        assert!(party_count > 0, "phaser party_count must be > 0");
        Self {
            party_count,
            phase: AtomicU32::new(0),
            inner: Mutex::new(Inner {
                arrived_or_skipped: 0,
                shutdown: false,
            }),
            cv: Condvar::new(),
        }
    }

    /// Get the current phase number. Useful for tests and observability.
    pub fn current_phase(&self) -> u32 {
        self.phase.load(Ordering::Acquire)
    }

    /// Mark this slot as not participating in the current phase. Counts
    /// toward the advance threshold but does not block.
    ///
    /// `_slot_id` is informational (not stored); the parameter exists so
    /// call sites stay greppable.
    pub fn skip(&self, _slot_id: u8) {
        self.contribute_advance();
    }

    /// Block until the phase advances or the defensive 5-second timeout
    /// fires. Returns [`PhaserOutcome::Advanced`] on a clean phase
    /// transition, [`PhaserOutcome::Timeout`] if the deadline passed first,
    /// and [`PhaserOutcome::Shutdown`] on tear-down.
    ///
    /// `_slot_id` is informational (see [`Self::skip`]).
    pub fn arrive_and_wait(&self, _slot_id: u8) -> PhaserOutcome {
        self.arrive_and_wait_timeout(Duration::from_secs(5))
    }

    /// Same as [`Self::arrive_and_wait`] with a caller-supplied timeout.
    ///
    /// A timeout so large that the deadline cannot be represented as an
    /// [`Instant`] (for example `Duration::MAX`) is treated as "no
    /// deadline": the call then only returns on advance or shutdown.
    pub fn arrive_and_wait_timeout(&self, timeout: Duration) -> PhaserOutcome {
        // Capture the generation before contributing so that our own
        // contribution, if it is the last one, is observed as an advance.
        let pre_phase = self.phase.load(Ordering::Acquire);
        self.contribute_advance();
        // `Instant + Duration` panics on overflow; `checked_add` yields
        // `None` instead, which we interpret as an unbounded wait.
        let deadline = Instant::now().checked_add(timeout);
        let mut guard = self.inner.lock().unwrap();
        loop {
            if guard.shutdown {
                return PhaserOutcome::Shutdown;
            }
            if self.phase.load(Ordering::Acquire) != pre_phase {
                return PhaserOutcome::Advanced;
            }
            guard = match deadline {
                None => self.cv.wait(guard).unwrap(),
                Some(deadline) => {
                    let now = Instant::now();
                    if now >= deadline {
                        return PhaserOutcome::Timeout;
                    }
                    let (reacquired, wait) =
                        self.cv.wait_timeout(guard, deadline - now).unwrap();
                    if wait.timed_out() {
                        // The wait expired, but an advance or shutdown may
                        // have landed just before we reacquired the lock;
                        // re-check both before reporting a real timeout.
                        if self.phase.load(Ordering::Acquire) != pre_phase {
                            return PhaserOutcome::Advanced;
                        }
                        if reacquired.shutdown {
                            return PhaserOutcome::Shutdown;
                        }
                        return PhaserOutcome::Timeout;
                    }
                    reacquired
                }
            };
        }
    }

    /// Wake every parked arriver and signal shutdown. After this, all
    /// future and outstanding `arrive_and_wait_*` calls return
    /// [`PhaserOutcome::Shutdown`].
    pub fn shutdown(&self) {
        let mut guard = self.inner.lock().unwrap();
        guard.shutdown = true;
        self.cv.notify_all();
    }

    /// Common path for both arrive-and-wait and skip: bump the participant
    /// counter, and if this contribution reaches `party_count`, reset the
    /// counter, advance the phase, and wake all waiters.
    fn contribute_advance(&self) {
        let mut guard = self.inner.lock().unwrap();
        guard.arrived_or_skipped += 1;
        if guard.arrived_or_skipped >= self.party_count {
            guard.arrived_or_skipped = 0;
            // `Release` on the phase bump pairs with the `Acquire` loads in
            // the waiters' loop predicates.
            self.phase.fetch_add(1, Ordering::Release);
            self.cv.notify_all();
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;
    use std::sync::atomic::AtomicU32;
    use std::thread;

    /// Spawn one named thread per slot in `slots`; each performs a single
    /// `arrive_and_wait` and returns its outcome through the join handle.
    fn spawn_arrivers(
        p: &Arc<Phaser>,
        slots: std::ops::Range<u32>,
        prefix: &str,
    ) -> Vec<thread::JoinHandle<PhaserOutcome>> {
        slots
            .map(|slot| {
                let p = Arc::clone(p);
                thread::Builder::new()
                    .name(format!("{prefix}-{slot}"))
                    .spawn(move || p.arrive_and_wait(slot as u8))
                    .unwrap()
            })
            .collect()
    }

    /// Join every handle and require the given outcome from each.
    fn expect_all(handles: Vec<thread::JoinHandle<PhaserOutcome>>, want: PhaserOutcome) {
        for handle in handles {
            assert_eq!(handle.join().unwrap(), want);
        }
    }

    /// All N participants arrive: the phase advances and every arriver
    /// returns `Advanced`.
    #[test]
    fn n_arrivers_all_advance() {
        const N: u32 = 6;
        let p = Arc::new(Phaser::new(N));
        let handles = spawn_arrivers(&p, 0..N, "phaser-test");
        expect_all(handles, PhaserOutcome::Advanced);
        assert_eq!(p.current_phase(), 1);
    }

    /// 5 arrive + 1 skip: the phase advances and arrivers see `Advanced`.
    #[test]
    fn skip_counts_toward_advance() {
        const N: u32 = 6;
        let p = Arc::new(Phaser::new(N));
        let handles = spawn_arrivers(&p, 0..(N - 1), "phaser-arrive");
        // Give the arrivers a moment to park first, so the skip is what
        // unblocks them.
        thread::sleep(Duration::from_millis(20));
        p.skip((N - 1) as u8);
        expect_all(handles, PhaserOutcome::Advanced);
        assert_eq!(p.current_phase(), 1);
    }

    /// Shutdown wakes parked arrivers; they return `Shutdown`.
    #[test]
    fn shutdown_wakes_arrivers() {
        const N: u32 = 6;
        let p = Arc::new(Phaser::new(N));
        // Only N-1 arrive, so the phase cannot advance on its own.
        let handles = spawn_arrivers(&p, 0..(N - 1), "phaser-arrive-shutdown");
        thread::sleep(Duration::from_millis(20));
        p.shutdown();
        expect_all(handles, PhaserOutcome::Shutdown);
    }

    /// Defensive timeout: when some peers never arrive, the others surface
    /// `Timeout` instead of blocking forever.
    #[test]
    fn timeout_fires_when_peer_hangs() {
        const N: u32 = 4;
        let p = Arc::new(Phaser::new(N));
        // Only 2 of the 4 parties arrive; the rest "hang".
        let waiters: Vec<_> = (0..2)
            .map(|_| {
                let p = Arc::clone(&p);
                thread::spawn(move || p.arrive_and_wait_timeout(Duration::from_millis(50)))
            })
            .collect();
        expect_all(waiters, PhaserOutcome::Timeout);
    }

    /// Multi-phase stress: every participant runs a tight loop of
    /// `arrive_and_wait` calls; after K phases `current_phase()` equals K.
    /// Catches generation/counter resync bugs.
    #[test]
    fn multi_phase_progress() {
        const N: u32 = 6;
        const K: u32 = 1000;
        let p = Arc::new(Phaser::new(N));
        let finished = Arc::new(AtomicU32::new(0));
        let handles: Vec<_> = (0..N)
            .map(|slot| {
                let p = Arc::clone(&p);
                let finished = Arc::clone(&finished);
                thread::Builder::new()
                    .name(format!("phaser-multi-{slot}"))
                    .spawn(move || {
                        for _ in 0..K {
                            assert_eq!(
                                p.arrive_and_wait(slot as u8),
                                PhaserOutcome::Advanced
                            );
                        }
                        finished.fetch_add(1, Ordering::Relaxed);
                    })
                    .unwrap()
            })
            .collect();
        for handle in handles {
            handle.join().unwrap();
        }
        assert_eq!(finished.load(Ordering::Relaxed), N);
        assert_eq!(p.current_phase(), K);
    }

    /// Mixed skip/arrive across phases, emulating the scheduler pattern
    /// where slots are idle for some quanta.
    #[test]
    fn mixed_skip_and_arrive_random() {
        const N: u32 = 6;
        const K: u32 = 200;
        let p = Arc::new(Phaser::new(N));
        let handles: Vec<_> = (0..N)
            .map(|slot| {
                let p = Arc::clone(&p);
                thread::Builder::new()
                    .name(format!("phaser-mixed-{slot}"))
                    .spawn(move || {
                        // Pseudo-random skip pattern seeded by slot and
                        // stirred with the round number.
                        let mut state: u32 = 0x9E37_79B9u32.wrapping_add(slot);
                        for round in 0..K {
                            state = state.wrapping_mul(0x6C8E_9CF7).wrapping_add(round);
                            let idle = state & 0xF == 0;
                            if idle {
                                p.skip(slot as u8);
                            } else {
                                let _ = p.arrive_and_wait(slot as u8);
                            }
                        }
                    })
                    .unwrap()
            })
            .collect();
        for handle in handles {
            handle.join().unwrap();
        }
        // Each of the N threads contributes once per round (as an arrival
        // or a skip), so N*K contributions yield exactly K advances.
        assert_eq!(p.current_phase(), K);
    }
}
|
||||
424
crates/xenia-cpu/src/reservation.rs
Normal file
424
crates/xenia-cpu/src/reservation.rs
Normal file
@@ -0,0 +1,424 @@
|
||||
//! Inter-thread reservation table for `lwarx`/`stwcx.` and
|
||||
//! `ldarx`/`stdcx.`.
|
||||
//!
|
||||
//! On real Xenon, each core's `lwarx` places a reservation on a 128-byte
|
||||
//! cache line; any other CPU's store to the line invalidates the
|
||||
//! reservation. `stwcx.`'s success depends on the reservation still being
|
||||
//! valid. Under M3's per-HW-thread parallelism, we need an inter-thread
|
||||
//! mechanism for the same guarantee.
|
||||
//!
|
||||
//! M2 introduces the table behind a runtime `reservations_enabled` flag
|
||||
//! (default `false`). When the flag is `false`, the interpreter's
|
||||
//! existing per-`PpcContext` `reserved_line`/`has_reservation` fields are
|
||||
//! used as-is — no inter-thread tracking. M3 flips the flag on once the
|
||||
//! per-HW-thread host threads are spawning.
|
||||
//!
|
||||
//! ## Design
|
||||
//!
|
||||
//! - **Banked AtomicU64 array** of [`NUM_LINES`] entries (4096 × 8 B =
|
||||
//! 32 KiB total). Each entry packs `(line_address, generation,
|
||||
//! hw_id)`. A zero value means "no reservation on this bank".
|
||||
//! - **Hash function**: `(line >> 7) & (NUM_LINES - 1)`. Different lines
|
||||
//! that map to the same bank conservatively invalidate each other's
|
||||
//! reservations — sound (real Xenon's L2 has finite associativity and
|
||||
//! has the same property), at the cost of slightly more `stwcx.`
|
||||
//! failures than a perfect-mapping table would produce.
|
||||
//! - **`active_reservers: AtomicU16`** — a fast-path counter
|
||||
//! incremented by every `lwarx` and decremented when its reservation is
|
||||
//! either committed or invalidated. `write_u32` checks this with a
|
||||
//! single `Relaxed` load; when zero (the common case in code that
|
||||
//! doesn't use atomics), the invalidation hook is a one-instruction
|
||||
//! skip.
|
||||
//! - **Generation counter**: monotonic across all reservations,
|
||||
//! incremented atomically. 24 bits of generation packed in the slot
|
||||
//! means 16 M reuses per slot before wraparound; at multi-million
|
||||
//! reservations/sec sustained that's still many seconds, and a
|
||||
//! stale-gen `stwcx.` simply fails (sound, not livelocking).
|
||||
//!
|
||||
//! ## Invariants
|
||||
//!
|
||||
//! 1. A `stwcx.(addr)` succeeds only if the line slot still holds the
|
||||
//! same `(line, gen, hw_id)` triple the reserver stamped at `lwarx`.
|
||||
//! 2. Any plain store to a reserved line invalidates it (slot CASed to
|
||||
//! zero). Hash-collision side-effect: a store to a different line
|
||||
//! that maps to the same bank also invalidates — guests that observe
|
||||
//! a `stwcx.` failure simply retry, so this is correctness-preserving.
|
||||
//! 3. `stwcx.` from a different `hw_id` than the reserver fails even if
|
||||
//! the line and gen would otherwise match — only the originating HW
|
||||
//! thread can commit its own reservation.
|
||||
//!
|
||||
//! Memory ordering: all CAS / store operations on the line slot use
|
||||
//! `AcqRel`; readers use `Acquire`. The store inside `stwcx.`'s payload
|
||||
//! itself (the actual data write) is the caller's responsibility — see
|
||||
//! [`crate::interpreter`]'s `stwcx.` arm.
|
||||
|
||||
use std::sync::atomic::{AtomicU16, AtomicU64, Ordering};
|
||||
|
||||
/// Real Xenon L2 cache-line size — the granule a reservation covers.
|
||||
pub const LINE_BYTES: u32 = 0x80;
|
||||
/// Mask to align an address to a cache-line boundary.
|
||||
pub const LINE_MASK: u32 = !(LINE_BYTES - 1);
|
||||
/// Number of bank entries in the reservation table. Power of two so the
|
||||
/// hash is a single AND. 32 KiB total at 8 B per entry.
|
||||
pub const NUM_LINES: usize = 4096;
|
||||
const HASH_MASK: u32 = (NUM_LINES as u32) - 1;
|
||||
|
||||
/// Pack `(line_addr, generation, hw_id)` into a single u64. The packed
|
||||
/// layout is:
|
||||
/// bits 63..32: line address (we only need the high bits since the
|
||||
/// low 7 are always zero — reserved range is line-aligned)
|
||||
/// bits 31..8: 24-bit generation
|
||||
/// bits 7..0: 8-bit `hw_id`
|
||||
///
|
||||
/// A packed value of `0` means "no reservation". Since we never reserve
|
||||
/// on guest virtual address `0` (the page is unmapped) and the
|
||||
/// generation increments from `1`, zero is a safe sentinel.
|
||||
#[inline]
|
||||
pub fn pack(line_addr: u32, generation: u32, hw_id: u8) -> u64 {
|
||||
debug_assert!(line_addr & !LINE_MASK == 0, "line_addr must be line-aligned");
|
||||
debug_assert!(generation < (1 << 24), "generation must fit in 24 bits");
|
||||
((line_addr as u64) << 32)
|
||||
| ((generation as u64 & 0xFF_FFFF) << 8)
|
||||
| (hw_id as u64)
|
||||
}
|
||||
|
||||
/// Split a packed slot value back into `(line, generation, hw_id)`.
///
/// Inverse of [`pack`]. The zero sentinel (no reservation) yields `None`;
/// any other value yields the high 32 bits as the line address, bits 31..8
/// as the 24-bit generation, and the low byte as the `hw_id`.
#[inline]
pub fn unpack(raw: u64) -> Option<(u32, u32, u8)> {
    (raw != 0).then(|| {
        let line = (raw >> 32) as u32;
        let generation = ((raw >> 8) & 0xFF_FFFF) as u32;
        let hw_id = raw as u8;
        (line, generation, hw_id)
    })
}
|
||||
|
||||
#[inline]
|
||||
fn hash(line_addr: u32) -> usize {
|
||||
((line_addr >> 7) & HASH_MASK) as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn align_to_line(addr: u32) -> u32 {
|
||||
addr & LINE_MASK
|
||||
}
|
||||
|
||||
/// Banked reservation table shared across all emulated HW threads. Built
|
||||
/// once per emulation instance; lives behind an `Arc` so worker host
|
||||
/// threads (M3) can hold their own clones without lifetime gymnastics.
|
||||
pub struct ReservationTable {
|
||||
lines: Vec<AtomicU64>,
|
||||
active_reservers: AtomicU16,
|
||||
next_gen: AtomicU64,
|
||||
/// Runtime activation flag. Default `false`. M2.8's
|
||||
/// `--reservations-table` flag (or M3 spawn) flips this to `true`,
|
||||
/// at which point the interpreter's `lwarx`/`stwcx.` arms route
|
||||
/// through the table; otherwise they use the legacy per-`PpcContext`
|
||||
/// reservation fields.
|
||||
enabled: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
impl Default for ReservationTable {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl ReservationTable {
|
||||
/// Construct a fresh table with all banks empty.
|
||||
pub fn new() -> Self {
|
||||
let mut lines = Vec::with_capacity(NUM_LINES);
|
||||
for _ in 0..NUM_LINES {
|
||||
lines.push(AtomicU64::new(0));
|
||||
}
|
||||
Self {
|
||||
lines,
|
||||
active_reservers: AtomicU16::new(0),
|
||||
// Start at 1 so the very first reservation gets a non-zero
|
||||
// gen and the packed slot value is non-zero (zero is the
|
||||
// "no reservation" sentinel).
|
||||
next_gen: AtomicU64::new(1),
|
||||
enabled: std::sync::atomic::AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
/// Activate the table. The interpreter's `lwarx`/`stwcx.` arms will
|
||||
/// route through this table on subsequent dispatches. Idempotent.
|
||||
pub fn enable(&self) {
|
||||
self.enabled
|
||||
.store(true, std::sync::atomic::Ordering::Release);
|
||||
}
|
||||
|
||||
/// Deactivate the table. The interpreter falls back to per-`PpcContext`
|
||||
/// reservation fields. Idempotent.
|
||||
pub fn disable(&self) {
|
||||
self.enabled
|
||||
.store(false, std::sync::atomic::Ordering::Release);
|
||||
}
|
||||
|
||||
/// Whether the table is currently active. The interpreter consults
|
||||
/// this on every `lwarx`/`stwcx.` to decide which path runs.
|
||||
pub fn is_enabled(&self) -> bool {
|
||||
self.enabled.load(std::sync::atomic::Ordering::Acquire)
|
||||
}
|
||||
|
||||
/// True when at least one reservation is currently outstanding.
|
||||
/// Plain `write_u32` consults this to skip the invalidation hook
|
||||
/// when no thread holds a reservation — the common case for
|
||||
/// non-atomic code.
|
||||
#[inline]
|
||||
pub fn has_active_reservers(&self) -> bool {
|
||||
self.active_reservers.load(Ordering::Relaxed) > 0
|
||||
}
|
||||
|
||||
/// `lwarx(addr)` — claim a reservation on the line containing `addr`.
|
||||
/// Returns the generation stamped into the slot; the interpreter
|
||||
/// stores this alongside the per-`PpcContext` `has_reservation` bit
|
||||
/// so a subsequent `stwcx.` can verify the same gen still holds.
|
||||
///
|
||||
/// If a different reservation already occupied the bank, it's
|
||||
/// silently overwritten — that thread's `stwcx.` will fail because
|
||||
/// the slot no longer matches its stamped gen. Matches Xenon
|
||||
/// behavior (a different core's lwarx on the same line displaces
|
||||
/// any prior reservation).
|
||||
pub fn reserve(&self, addr: u32, hw_id: u8) -> u32 {
|
||||
let line = align_to_line(addr);
|
||||
let generation = (self
|
||||
.next_gen
|
||||
.fetch_add(1, Ordering::Relaxed)
|
||||
& 0xFF_FFFF) as u32;
|
||||
let new_raw = pack(line, generation, hw_id);
|
||||
// Release: prior reads of the reservation target should
|
||||
// happen-before any thread that observes the new slot value.
|
||||
let prev = self.lines[hash(line)].swap(new_raw, Ordering::AcqRel);
|
||||
// If the previous slot was non-zero, the displaced reserver is
|
||||
// implicitly invalidated — decrement the active counter for it.
|
||||
// Else, increment for our new reservation. Net effect: the
|
||||
// counter equals the number of *bank slots* with a non-zero
|
||||
// value, which is an upper bound on actual reservers.
|
||||
if prev == 0 {
|
||||
self.active_reservers.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
generation
|
||||
}
|
||||
|
||||
/// `stwcx.(addr)` — try to commit a reservation. Returns `true` if
|
||||
/// the slot still holds `(line, my_gen, my_hw_id)` (in which case
|
||||
/// it's CAS'd back to zero, releasing the bank), `false` otherwise.
|
||||
/// The data store itself is the caller's responsibility — see
|
||||
/// [`crate::interpreter`]'s `stwcx.` arm.
|
||||
pub fn try_commit(&self, addr: u32, my_gen: u32, my_hw_id: u8) -> bool {
|
||||
let line = align_to_line(addr);
|
||||
let expected = pack(line, my_gen, my_hw_id);
|
||||
match self.lines[hash(line)].compare_exchange(
|
||||
expected,
|
||||
0,
|
||||
Ordering::AcqRel,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => {
|
||||
// Successfully released the slot; decrement the active
|
||||
// count.
|
||||
self.active_reservers.fetch_sub(1, Ordering::Relaxed);
|
||||
true
|
||||
}
|
||||
Err(_) => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Hook for plain (non-reserving) stores: invalidate any
|
||||
/// reservation on the containing line. Cheap when the bank is
|
||||
/// already empty (single Acquire load + branch).
|
||||
pub fn invalidate_for_write(&self, addr: u32) {
|
||||
let line = align_to_line(addr);
|
||||
let bank = &self.lines[hash(line)];
|
||||
let prev = bank.load(Ordering::Acquire);
|
||||
if prev == 0 {
|
||||
return;
|
||||
}
|
||||
// Verify the slot still holds a reservation on *this* line
|
||||
// before clearing — hash collisions mean the bank may hold a
|
||||
// reservation on an unrelated line that maps to the same slot.
|
||||
// Real Xenon has the same property (limited L2 associativity);
|
||||
// we mirror it here. A spurious bank match invalidates a
|
||||
// different line's reservation; the affected `stwcx.` retries —
|
||||
// sound, slightly less efficient.
|
||||
if let Some((bank_line, _generation, _hw)) = unpack(prev) {
|
||||
if bank_line != line {
|
||||
// Different line in the same bank — leave it alone (we
|
||||
// chose not to invalidate cross-line collisions to
|
||||
// reduce false-fail noise; real-HW behavior is similar
|
||||
// since L2 associativity sets cross-line constraints).
|
||||
return;
|
||||
}
|
||||
}
|
||||
// CAS-clear the bank if it still holds the value we observed.
|
||||
// If a concurrent `stwcx.` or `reserve` raced with us, the CAS
|
||||
// fails — that's fine; the line slot is now in a different
|
||||
// state and the displaced reservation will be picked up there.
|
||||
if bank
|
||||
.compare_exchange(prev, 0, Ordering::AcqRel, Ordering::Relaxed)
|
||||
.is_ok()
|
||||
{
|
||||
self.active_reservers.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
/// Drop a per-`PpcContext` reservation without committing. Called
|
||||
/// when the interpreter clears `has_reservation` due to a
|
||||
/// non-`stwcx.` event (context switch, exception, etc.). Safe to
|
||||
/// call when the table doesn't hold our reservation anymore (the
|
||||
/// CAS simply fails).
|
||||
pub fn release(&self, addr: u32, my_gen: u32, my_hw_id: u8) {
|
||||
let _ = self.try_commit(addr, my_gen, my_hw_id);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;
    use std::thread;

    #[test]
    fn pack_unpack_roundtrip() {
        let raw = pack(0x1000_0000, 42, 5);
        let (line, generation, hw) = unpack(raw).unwrap();
        assert_eq!(line, 0x1000_0000);
        assert_eq!(generation, 42);
        assert_eq!(hw, 5);
    }

    #[test]
    fn unpack_zero_is_none() {
        assert!(unpack(0).is_none());
    }

    #[test]
    fn reserve_then_commit_succeeds() {
        let t = ReservationTable::new();
        let gn = t.reserve(0x1234, 0);
        assert!(t.try_commit(0x1234, gn, 0));
        // Already released — second commit fails.
        assert!(!t.try_commit(0x1234, gn, 0));
    }

    #[test]
    fn other_hw_id_cannot_commit() {
        let t = ReservationTable::new();
        let gn = t.reserve(0x1234, 0);
        assert!(
            !t.try_commit(0x1234, gn, 1),
            "stwcx. from a different hw_id must fail"
        );
        // Original owner can still commit.
        assert!(t.try_commit(0x1234, gn, 0));
    }

    #[test]
    fn lwarx_displaces_prior_reservation() {
        let t = ReservationTable::new();
        let g0 = t.reserve(0x1234, 0);
        // Different HW thread's lwarx on the same line.
        let g1 = t.reserve(0x1234, 1);
        // Original reserver's stwcx. fails because the slot changed.
        assert!(!t.try_commit(0x1234, g0, 0));
        // New reserver's stwcx. succeeds.
        assert!(t.try_commit(0x1234, g1, 1));
    }

    #[test]
    fn invalidate_clears_matching_reservation() {
        let t = ReservationTable::new();
        let gn = t.reserve(0x1234, 0);
        t.invalidate_for_write(0x1238); // same line as 0x1234
        assert!(!t.try_commit(0x1234, gn, 0));
        assert_eq!(t.active_reservers.load(Ordering::Relaxed), 0);
    }

    #[test]
    fn invalidate_different_line_in_same_bank_is_noop() {
        let t = ReservationTable::new();
        // Force a hash collision: two distinct lines that map to one bank.
        let line_a = 0x0000_1000;
        let line_b = line_a + ((NUM_LINES as u32) << 7); // +0x80000 → same hash
        assert_eq!(hash(line_a), hash(line_b));
        let gn = t.reserve(line_a, 0);
        // Invalidating line_b must NOT clear line_a's reservation.
        t.invalidate_for_write(line_b);
        assert!(t.try_commit(line_a, gn, 0));
    }

    #[test]
    fn has_active_reservers_tracks_count() {
        let t = ReservationTable::new();
        assert!(!t.has_active_reservers());
        let g0 = t.reserve(0x1000, 0);
        assert!(t.has_active_reservers());
        let g1 = t.reserve(0x2000, 1);
        assert!(t.has_active_reservers());
        t.try_commit(0x1000, g0, 0);
        assert!(t.has_active_reservers());
        t.try_commit(0x2000, g1, 1);
        assert!(!t.has_active_reservers());
    }

    /// Stress test: 8 host threads each loop reserve + stwcx on the same
    /// line. A thread's commit succeeds only if no other thread displaced
    /// its reservation between its `reserve` and its `try_commit`, so the
    /// number of winners is not fixed — an attempt can lose, and every
    /// thread can lose in the same window.
    ///
    /// What the test checks is therefore (a) forward progress: a healthy
    /// fraction of attempts commit, and (b) bookkeeping: once all threads
    /// are done the occupied-bank counter is back to zero.
    #[test]
    fn concurrent_lwarx_stwcx_serializes() {
        let t = Arc::new(ReservationTable::new());
        const ROUNDS: u32 = 1000;
        const THREADS: u8 = 8;
        let total_successes = Arc::new(AtomicU64::new(0));

        let mut handles = Vec::new();
        for hw_id in 0..THREADS {
            let t_clone = t.clone();
            let s_clone = total_successes.clone();
            handles.push(
                thread::Builder::new()
                    .name(format!("res-stress-{hw_id}"))
                    .spawn(move || {
                        let mut wins = 0u64;
                        for _ in 0..ROUNDS {
                            let gn = t_clone.reserve(0x1234_5678, hw_id);
                            if t_clone.try_commit(0x1234_5678, gn, hw_id) {
                                wins += 1;
                            }
                        }
                        s_clone.fetch_add(wins, Ordering::Relaxed);
                    })
                    .expect("spawn"),
            );
        }
        for h in handles {
            h.join().expect("join");
        }
        let total = total_successes.load(Ordering::Relaxed);
        // Races can make any individual attempt fail (its reservation was
        // displaced before it could commit), so only assert progress: at
        // least 10% of attempts succeed, and active_reservers is back to
        // zero.
        let attempts = ROUNDS as u64 * THREADS as u64;
        assert!(
            total > attempts / 10,
            "expected at least 10% successful commits, got {total}/{attempts}"
        );
        assert_eq!(
            t.active_reservers.load(Ordering::Relaxed),
            0,
            "all reservations should have been resolved"
        );
    }
}
|
||||
1919
crates/xenia-cpu/src/scheduler.rs
Normal file
1919
crates/xenia-cpu/src/scheduler.rs
Normal file
File diff suppressed because it is too large
Load Diff
95
crates/xenia-cpu/src/trap.rs
Normal file
95
crates/xenia-cpu/src/trap.rs
Normal file
@@ -0,0 +1,95 @@
|
||||
//! TO-field evaluation for `tw`, `twi`, `td`, `tdi`.
|
||||
//!
|
||||
//! The TO field (5 bits) encodes which comparison outcomes trigger a trap:
|
||||
//!
|
||||
//! | bit | condition |
|
||||
//! |-----|-----------|
|
||||
//! | 0 | a < b (signed) |
|
||||
//! | 1 | a > b (signed) |
|
||||
//! | 2 | a == b |
|
||||
//! | 3 | a < b (unsigned) |
|
||||
//! | 4 | a > b (unsigned) |
|
||||
//!
|
||||
//! The bit numbering matches PowerISA ("MSB is bit 0"): TO[0] corresponds to
|
||||
//! the high bit of the 5-bit field, i.e. (to >> 4) & 1.
|
||||
//!
|
||||
//! `tw` / `twi` compare the low 32 bits of the operands (sign-extended back to
|
||||
//! 64 for the signed comparison); `td` / `tdi` compare the full 64 bits.
|
||||
|
||||
/// Operand width used when evaluating a trap condition.
#[derive(Clone, Copy, Debug)]
pub enum TrapWidth {
    /// `tw`, `twi`: compare the low 32 bits of each operand.
    Word,
    /// `td`, `tdi`: compare the full 64 bits of each operand.
    Doubleword,
}
|
||||
|
||||
const TO_SLT: u32 = 1 << 4; // a < b signed
|
||||
const TO_SGT: u32 = 1 << 3; // a > b signed
|
||||
const TO_EQ: u32 = 1 << 2; // a == b
|
||||
const TO_ULT: u32 = 1 << 1; // a < b unsigned
|
||||
const TO_UGT: u32 = 1 << 0; // a > b unsigned
|
||||
|
||||
/// Returns true when the trap should fire.
|
||||
pub fn evaluate(to: u32, a: u64, b: u64, width: TrapWidth) -> bool {
|
||||
let (sa, sb, ua, ub): (i64, i64, u64, u64) = match width {
|
||||
TrapWidth::Word => (
|
||||
a as i32 as i64,
|
||||
b as i32 as i64,
|
||||
a as u32 as u64,
|
||||
b as u32 as u64,
|
||||
),
|
||||
TrapWidth::Doubleword => (a as i64, b as i64, a, b),
|
||||
};
|
||||
|
||||
if (to & TO_SLT) != 0 && sa < sb { return true; }
|
||||
if (to & TO_SGT) != 0 && sa > sb { return true; }
|
||||
if (to & TO_EQ) != 0 && ua == ub { return true; }
|
||||
if (to & TO_ULT) != 0 && ua < ub { return true; }
|
||||
if (to & TO_UGT) != 0 && ua > ub { return true; }
|
||||
false
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Raw TO encodings used below (TO[0] is the MSB of the 5-bit field).
    const SLT_ONLY: u32 = 16; // 0b10000
    const EQ_ONLY: u32 = 4; // 0b00100
    const ULT_ONLY: u32 = 2; // 0b00010
    const ALL: u32 = 31; // 0b11111

    #[test]
    fn to_zero_never_traps() {
        for &(a, b) in &[(0u64, 0u64), (5, 3), (!0, 0)] {
            assert!(!evaluate(0, a, b, TrapWidth::Doubleword));
        }
    }

    #[test]
    fn to_31_always_traps_when_any_condition_holds() {
        // With every condition enabled, any ordering of a and b traps.
        assert!(evaluate(ALL, 1, 2, TrapWidth::Doubleword)); // slt+ult
        assert!(evaluate(ALL, 2, 1, TrapWidth::Doubleword)); // sgt+ugt
        assert!(evaluate(ALL, 7, 7, TrapWidth::Doubleword)); // eq
    }

    #[test]
    fn to_eq_only() {
        assert!(evaluate(EQ_ONLY, 5, 5, TrapWidth::Doubleword));
        assert!(!evaluate(EQ_ONLY, 5, 6, TrapWidth::Doubleword));
    }

    #[test]
    fn to_signed_vs_unsigned_on_negative() {
        // a = -1 (all-ones as u64). Signed: -1 < 0 holds.
        let neg1 = (-1i64) as u64;
        assert!(evaluate(SLT_ONLY, neg1, 0, TrapWidth::Doubleword));
        // Unsigned: all-ones < 0 does not hold.
        assert!(!evaluate(ULT_ONLY, neg1, 0, TrapWidth::Doubleword));
    }

    #[test]
    fn word_width_ignores_high_32_bits() {
        // Low 32 bits of `a` equal 1, high 32 bits differ from b's.
        let a = 0xDEAD_BEEF_0000_0001u64;
        assert!(evaluate(EQ_ONLY, a, 1, TrapWidth::Word));
        // The full 64-bit values are different.
        assert!(!evaluate(EQ_ONLY, a, 1, TrapWidth::Doubleword));
    }
}
|
||||
920
crates/xenia-cpu/src/vmx.rs
Normal file
920
crates/xenia-cpu/src/vmx.rs
Normal file
@@ -0,0 +1,920 @@
|
||||
//! VMX / AltiVec helper routines shared by the interpreter's 150+ vector
|
||||
//! opcode handlers.
|
||||
//!
|
||||
//! Big-endian lane indexing throughout: `Vec128::bytes[0]` is the most
|
||||
//! significant byte, which corresponds to PowerPC lane 0. Operations that
|
||||
//! care about "even" vs "odd" lanes follow the PPC convention (lane 0 = most
|
||||
//! significant = "even" for multiply-even/odd purposes).
|
||||
|
||||
use xenia_memory::MemoryAccess;
|
||||
use xenia_types::Vec128;
|
||||
|
||||
// ─── Lane accessors ────────────────────────────────────────────────────────
|
||||
|
||||
#[inline] pub fn as_i8x16(v: Vec128) -> [i8; 16] {
|
||||
let b = v.as_bytes();
|
||||
let mut r = [0i8; 16];
|
||||
for i in 0..16 { r[i] = b[i] as i8; }
|
||||
r
|
||||
}
|
||||
|
||||
#[inline] pub fn as_i16x8(v: Vec128) -> [i16; 8] {
|
||||
let u = v.as_u16x8();
|
||||
[u[0] as i16, u[1] as i16, u[2] as i16, u[3] as i16,
|
||||
u[4] as i16, u[5] as i16, u[6] as i16, u[7] as i16]
|
||||
}
|
||||
|
||||
#[inline] pub fn as_i32x4(v: Vec128) -> [i32; 4] {
|
||||
let u = v.as_u32x4();
|
||||
[u[0] as i32, u[1] as i32, u[2] as i32, u[3] as i32]
|
||||
}
|
||||
|
||||
#[inline] pub fn from_i8x16(r: [i8; 16]) -> Vec128 {
|
||||
let mut b = [0u8; 16];
|
||||
for i in 0..16 { b[i] = r[i] as u8; }
|
||||
Vec128::from_bytes(b)
|
||||
}
|
||||
|
||||
#[inline] pub fn from_i16x8(r: [i16; 8]) -> Vec128 {
|
||||
Vec128::from_u16x8_array([
|
||||
r[0] as u16, r[1] as u16, r[2] as u16, r[3] as u16,
|
||||
r[4] as u16, r[5] as u16, r[6] as u16, r[7] as u16,
|
||||
])
|
||||
}
|
||||
|
||||
#[inline] pub fn from_i32x4(r: [i32; 4]) -> Vec128 {
|
||||
Vec128::from_u32x4_array([r[0] as u32, r[1] as u32, r[2] as u32, r[3] as u32])
|
||||
}
|
||||
|
||||
// ─── Saturation helpers ────────────────────────────────────────────────────
|
||||
// Each returns (clamped_value, saturated_flag). Handlers OR the flags together
|
||||
// and call `ctx.set_vscr_sat(true)` once per instruction.
|
||||
|
||||
/// Unsigned 8-bit add clamped to `u8::MAX`; the flag is true iff it clamped.
#[inline]
pub fn sat_add_u8(a: u8, b: u8) -> (u8, bool) {
    match a.checked_add(b) {
        Some(sum) => (sum, false),
        None => (u8::MAX, true),
    }
}
|
||||
/// Unsigned 8-bit subtract clamped at zero; the flag is true iff it clamped.
#[inline]
pub fn sat_sub_u8(a: u8, b: u8) -> (u8, bool) {
    match a.checked_sub(b) {
        Some(diff) => (diff, false),
        None => (0, true),
    }
}
|
||||
/// Signed 8-bit add clamped to `i8::MIN..=i8::MAX`; flag is true iff clamped.
#[inline]
pub fn sat_add_i8(a: i8, b: i8) -> (i8, bool) {
    let overflowed = a.checked_add(b).is_none();
    (a.saturating_add(b), overflowed)
}
|
||||
/// Signed 8-bit subtract clamped to `i8::MIN..=i8::MAX`; flag is true iff clamped.
#[inline]
pub fn sat_sub_i8(a: i8, b: i8) -> (i8, bool) {
    let (_, overflowed) = a.overflowing_sub(b);
    (a.saturating_sub(b), overflowed)
}
|
||||
|
||||
/// Unsigned 16-bit add clamped to `u16::MAX`; the flag is true iff it clamped.
#[inline]
pub fn sat_add_u16(a: u16, b: u16) -> (u16, bool) {
    let (wrapped, carried) = a.overflowing_add(b);
    if carried {
        (u16::MAX, true)
    } else {
        (wrapped, false)
    }
}
|
||||
/// Unsigned 16-bit subtract clamped at zero; the flag is true iff it clamped.
#[inline]
pub fn sat_sub_u16(a: u16, b: u16) -> (u16, bool) {
    let (wrapped, borrowed) = a.overflowing_sub(b);
    if borrowed {
        (0, true)
    } else {
        (wrapped, false)
    }
}
|
||||
/// Signed 16-bit add clamped to `i16::MIN..=i16::MAX`; flag is true iff clamped.
#[inline]
pub fn sat_add_i16(a: i16, b: i16) -> (i16, bool) {
    let overflowed = a.checked_add(b).is_none();
    (a.saturating_add(b), overflowed)
}
|
||||
/// Signed 16-bit subtract clamped to `i16::MIN..=i16::MAX`; flag is true iff clamped.
#[inline]
pub fn sat_sub_i16(a: i16, b: i16) -> (i16, bool) {
    let (_, overflowed) = a.overflowing_sub(b);
    (a.saturating_sub(b), overflowed)
}
|
||||
|
||||
/// Unsigned 32-bit add clamped to `u32::MAX`; the flag is true iff it clamped.
#[inline]
pub fn sat_add_u32(a: u32, b: u32) -> (u32, bool) {
    a.checked_add(b)
        .map_or((u32::MAX, true), |sum| (sum, false))
}
|
||||
/// Unsigned 32-bit subtract clamped at zero; the flag is true iff it clamped.
#[inline]
pub fn sat_sub_u32(a: u32, b: u32) -> (u32, bool) {
    a.checked_sub(b).map_or((0, true), |diff| (diff, false))
}
|
||||
/// Signed 32-bit add clamped to `i32::MIN..=i32::MAX`; flag is true iff clamped.
#[inline]
pub fn sat_add_i32(a: i32, b: i32) -> (i32, bool) {
    let overflowed = a.checked_add(b).is_none();
    (a.saturating_add(b), overflowed)
}
|
||||
/// Signed 32-bit subtract clamped to `i32::MIN..=i32::MAX`; flag is true iff clamped.
#[inline]
pub fn sat_sub_i32(a: i32, b: i32) -> (i32, bool) {
    let (_, overflowed) = a.overflowing_sub(b);
    (a.saturating_sub(b), overflowed)
}
|
||||
|
||||
// Pack-with-saturation helpers — clamp a wider integer to the narrower type.
|
||||
/// Narrow `i16` to `i8`, clamping out-of-range values; flag is true iff clamped.
#[inline]
pub fn sat_i16_to_i8(v: i16) -> (i8, bool) {
    let clamped = v.clamp(i16::from(i8::MIN), i16::from(i8::MAX));
    (clamped as i8, clamped != v)
}
|
||||
/// Narrow `i16` to `u8`, clamping to `0..=255`; flag is true iff clamped.
#[inline]
pub fn sat_i16_to_u8(v: i16) -> (u8, bool) {
    let clamped = v.clamp(0, i16::from(u8::MAX));
    (clamped as u8, clamped != v)
}
|
||||
/// Narrow `u16` to `u8`, clamping to `u8::MAX`; flag is true iff clamped.
#[inline]
pub fn sat_u16_to_u8(v: u16) -> (u8, bool) {
    let clamped = v.min(u16::from(u8::MAX));
    (clamped as u8, clamped != v)
}
|
||||
/// Narrow `i32` to `i16`, clamping out-of-range values; flag is true iff clamped.
#[inline]
pub fn sat_i32_to_i16(v: i32) -> (i16, bool) {
    let clamped = v.clamp(i32::from(i16::MIN), i32::from(i16::MAX));
    (clamped as i16, clamped != v)
}
|
||||
/// Narrow `i32` to `u16`, clamping to `0..=65535`; flag is true iff clamped.
#[inline]
pub fn sat_i32_to_u16(v: i32) -> (u16, bool) {
    let clamped = v.clamp(0, i32::from(u16::MAX));
    (clamped as u16, clamped != v)
}
|
||||
/// Narrow `u32` to `u16`, clamping to `u16::MAX`; flag is true iff clamped.
#[inline]
pub fn sat_u32_to_u16(v: u32) -> (u16, bool) {
    let clamped = v.min(u32::from(u16::MAX));
    (clamped as u16, clamped != v)
}
|
||||
/// Narrow `i64` to `i32`, clamping out-of-range values; flag is true iff clamped.
#[inline]
pub fn sat_i64_to_i32(v: i64) -> (i32, bool) {
    let clamped = v.clamp(i64::from(i32::MIN), i64::from(i32::MAX));
    (clamped as i32, clamped != v)
}
|
||||
/// Narrow `i64` to `u32`, clamping to `0..=u32::MAX`; flag is true iff clamped.
#[inline]
pub fn sat_i64_to_u32(v: i64) -> (u32, bool) {
    let clamped = v.clamp(0, i64::from(u32::MAX));
    (clamped as u32, clamped != v)
}
|
||||
|
||||
// ─── Averages ──────────────────────────────────────────────────────────────
|
||||
// PPC avg is rounded up: (a + b + 1) / 2.
|
||||
/// Rounded-up average of two `u8` values: `(a + b + 1) / 2` without overflow.
#[inline]
pub fn avg_u8(a: u8, b: u8) -> u8 {
    let sum = u16::from(a) + u16::from(b) + 1;
    (sum / 2) as u8
}
|
||||
/// Rounded-up average of two `u16` values: `(a + b + 1) / 2` without overflow.
#[inline]
pub fn avg_u16(a: u16, b: u16) -> u16 {
    let sum = u32::from(a) + u32::from(b) + 1;
    (sum / 2) as u16
}
|
||||
/// Rounded-up average of two `u32` values: `(a + b + 1) / 2` without overflow.
#[inline]
pub fn avg_u32(a: u32, b: u32) -> u32 {
    let sum = u64::from(a) + u64::from(b) + 1;
    (sum / 2) as u32
}
|
||||
/// Signed average of two `i8` values, `floor((a + b + 1) / 2)`.
#[inline]
pub fn avg_i8(a: i8, b: i8) -> i8 {
    let sum = i32::from(a) + i32::from(b) + 1;
    // Floor division by 2 (Euclidean division rounds toward -inf here).
    sum.div_euclid(2) as i8
}
|
||||
/// Signed average of two `i16` values, `floor((a + b + 1) / 2)`.
#[inline]
pub fn avg_i16(a: i16, b: i16) -> i16 {
    let sum = i32::from(a) + i32::from(b) + 1;
    // Floor division by 2 (Euclidean division rounds toward -inf here).
    sum.div_euclid(2) as i16
}
|
||||
/// Signed average of two `i32` values, `floor((a + b + 1) / 2)`.
#[inline]
pub fn avg_i32(a: i32, b: i32) -> i32 {
    let sum = i64::from(a) + i64::from(b) + 1;
    // Floor division by 2 (Euclidean division rounds toward -inf here).
    sum.div_euclid(2) as i32
}
|
||||
|
||||
// ─── NaN-aware f32 min/max for vmaxfp / vminfp ────────────────────────────
|
||||
//
|
||||
// Altivec PEM: "If either element of vA or vB is a NaN, the corresponding
|
||||
// element of vD is set to the quiet NaN form of that NaN". Rust's `>` / `<`
|
||||
// comparison with NaN always returns false, so `if a > b { a } else { b }`
|
||||
// would silently pick `b` whenever `a` is NaN — losing NaN propagation.
|
||||
|
||||
#[inline]
|
||||
pub fn max_nan(a: f32, b: f32) -> f32 {
|
||||
if a.is_nan() { quiet_nan(a) }
|
||||
else if b.is_nan() { quiet_nan(b) }
|
||||
else if a > b { a } else { b }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn min_nan(a: f32, b: f32) -> f32 {
|
||||
if a.is_nan() { quiet_nan(a) }
|
||||
else if b.is_nan() { quiet_nan(b) }
|
||||
else if a < b { a } else { b }
|
||||
}
|
||||
|
||||
/// Turn a signalling NaN into the corresponding quiet NaN by setting the top
/// mantissa bit (`0x0040_0000`). Quiet NaNs and non-NaN values pass through
/// unchanged.
#[inline]
pub fn quiet_nan(x: f32) -> f32 {
    if x.is_nan() {
        f32::from_bits(x.to_bits() | 0x0040_0000)
    } else {
        x
    }
}
|
||||
|
||||
/// Replace a subnormal `f32` with a zero of the same sign; every other value
/// (including NaN, infinities and zeros) is returned untouched.
///
/// Intended for instructions whose definition flushes denormal inputs.
#[inline]
pub fn flush_denorm(x: f32) -> f32 {
    if !x.is_subnormal() {
        return x;
    }
    0.0f32.copysign(x)
}
|
||||
|
||||
// ─── Float ⇄ fixed-point conversions (scaled by 2^scale_bits) ─────────────
|
||||
//
|
||||
// vctsxs / vctuxs flush denormal inputs to 0 before scaling, per Altivec.
|
||||
#[inline] pub fn cvt_f32_to_i32_sat(x: f32, scale_bits: u32) -> (i32, bool) {
|
||||
// PPCBUG-433: AltiVec ISA saturates NaN to INT_MIN (0x80000000), not 0.
|
||||
// (vctuxs's NaN→0 is correct per AltiVec ISA — see PPCBUG-434.)
|
||||
if x.is_nan() { return (i32::MIN, true); }
|
||||
let x = flush_denorm(x);
|
||||
let scaled = (x as f64) * ((1u64 << scale_bits) as f64);
|
||||
if scaled >= i32::MAX as f64 { return (i32::MAX, true); }
|
||||
if scaled <= i32::MIN as f64 { return (i32::MIN, true); }
|
||||
(scaled.trunc() as i32, false)
|
||||
}
|
||||
#[inline] pub fn cvt_f32_to_u32_sat(x: f32, scale_bits: u32) -> (u32, bool) {
|
||||
if x.is_nan() { return (0, true); }
|
||||
let x = flush_denorm(x);
|
||||
let scaled = (x as f64) * ((1u64 << scale_bits) as f64);
|
||||
if scaled < 0.0 { return (0, true); }
|
||||
if scaled > u32::MAX as f64 { return (u32::MAX, true); }
|
||||
(scaled.trunc() as u32, false)
|
||||
}
|
||||
/// Convert a signed fixed-point value to `f32`: `v / 2^scale_bits`, computed
/// in `f64` and rounded once to `f32`.
#[inline]
pub fn cvt_i32_to_f32(v: i32, scale_bits: u32) -> f32 {
    let divisor = (1u64 << scale_bits) as f64;
    (f64::from(v) / divisor) as f32
}
|
||||
/// Convert an unsigned fixed-point value to `f32`: `v / 2^scale_bits`,
/// computed in `f64` and rounded once to `f32`.
#[inline]
pub fn cvt_u32_to_f32(v: u32, scale_bits: u32) -> f32 {
    let divisor = (1u64 << scale_bits) as f64;
    (f64::from(v) / divisor) as f32
}
|
||||
|
||||
// ─── Unaligned vector load/store ──────────────────────────────────────────
|
||||
//
|
||||
// lvlx/lvrx and stvlx/stvrx combine to perform any unaligned 16-byte access:
|
||||
// lvlx(EA) | lvrx(EA + 16) loads 16 bytes starting at unaligned EA.
|
||||
// stvlx(EA); stvrx(EA + 16) stores 16 bytes starting at unaligned EA.
|
||||
//
|
||||
// Semantics per the AltiVec manual (and xenia-canary ppc_emit_memory.cc):
|
||||
// lvlx: shift = EA & 0xF, n = 16 - shift. Loads mem[EA..EA+n] into
|
||||
// lanes VR[0..n], zeros VR[n..16].
|
||||
// lvrx: shift = EA & 0xF. If shift == 0, VR = 0. Otherwise loads
|
||||
// mem[EA-shift..EA] into lanes VR[16-shift..16], zeros VR[0..16-shift].
|
||||
// stvlx / stvrx are the symmetric stores.
|
||||
//
|
||||
// `Vec128::bytes[0]` is the most significant byte (PPC lane 0 in BE view).
|
||||
|
||||
pub fn load_vector_left(mem: &dyn MemoryAccess, ea: u32) -> Vec128 {
|
||||
let shift = (ea & 0xF) as usize;
|
||||
let n = 16 - shift;
|
||||
let mut bytes = [0u8; 16];
|
||||
for i in 0..n {
|
||||
bytes[i] = mem.read_u8(ea.wrapping_add(i as u32));
|
||||
}
|
||||
Vec128::from_bytes(bytes)
|
||||
}
|
||||
|
||||
pub fn load_vector_right(mem: &dyn MemoryAccess, ea: u32) -> Vec128 {
|
||||
let shift = (ea & 0xF) as usize;
|
||||
if shift == 0 { return Vec128::ZERO; }
|
||||
let base = ea & !0xFu32;
|
||||
let mut bytes = [0u8; 16];
|
||||
for i in 0..shift {
|
||||
bytes[16 - shift + i] = mem.read_u8(base.wrapping_add(i as u32));
|
||||
}
|
||||
Vec128::from_bytes(bytes)
|
||||
}
|
||||
|
||||
/// `stvlx`-style store: write the leading lanes of `v` to the bytes from
/// `ea` up to the end of its 16-byte block, one byte at a time in ascending
/// address order.
pub fn store_vector_left(mem: &dyn MemoryAccess, ea: u32, v: Vec128) {
    let count = 16 - (ea & 0xF) as usize;
    let lanes = v.as_bytes();
    for (offset, byte) in lanes.iter().take(count).enumerate() {
        mem.write_u8(ea.wrapping_add(offset as u32), *byte);
    }
}
|
||||
|
||||
/// `stvrx`-style store: write the trailing lanes of `v` to the bytes from
/// the start of `ea`'s 16-byte block up to (not including) `ea`. An aligned
/// `ea` writes nothing.
pub fn store_vector_right(mem: &dyn MemoryAccess, ea: u32, v: Vec128) {
    let misalign = (ea & 0xF) as usize;
    if misalign == 0 {
        return;
    }
    let block_start = ea & !0xFu32;
    let lanes = v.as_bytes();
    for (offset, byte) in lanes.iter().skip(16 - misalign).enumerate() {
        mem.write_u8(block_start.wrapping_add(offset as u32), *byte);
    }
}
|
||||
|
||||
// ─── 1-5-5-5 pixel pack (vpkpx / vupkhpx / vupklpx) ───────────────────────
|
||||
// PPC vpkpx takes a 32-bit RGB lane and packs it into a 16-bit 1-5-5-5 pixel.
|
||||
// vupkhpx / vupklpx reverse the operation.
|
||||
//
|
||||
// Format: input 32-bit word holds
|
||||
// bits 0-6: unused (0)
|
||||
// bit 7: alpha-select (→ bit 15 of output)
|
||||
// bits 8-15: R (top 5 bits kept)
|
||||
// bits 16-23: G (top 5 bits kept)
|
||||
// bits 24-31: B (top 5 bits kept)
|
||||
// Output 16-bit word:
|
||||
// bit 15: A (from input bit 7)
|
||||
// bits 10-14: R
|
||||
// bits 5-9: G
|
||||
// bits 0-4: B
|
||||
|
||||
/// Pack one 32-bit pixel into a 16-bit 1-5-5-5 pixel (`vpkpx`, per word).
///
/// The instruction is specified in PowerISA MSB-0 bit numbering, where bit 0
/// is the most significant bit of the word. Translated to ordinary shifts on
/// the `u32` value:
///
/// | ISA bits (MSB-0) | value bits (LSB-0) | output field  |
/// |------------------|--------------------|---------------|
/// | 7                | 24                 | bit 15 (A)    |
/// | 8..=12           | 23..=19            | bits 14..=10  |
/// | 16..=20          | 15..=11            | bits 9..=5    |
/// | 24..=28          | 7..=3              | bits 4..=0    |
///
/// i.e. the low bit of the top byte and the top five bits of each of the
/// three lower bytes.
#[inline]
pub fn pack_pixel_555(input: u32) -> u16 {
    let a = (input >> 24) & 0x01;
    let r = (input >> 19) & 0x1F;
    let g = (input >> 11) & 0x1F;
    let b = (input >> 3) & 0x1F;
    ((a << 15) | (r << 10) | (g << 5) | b) as u16
}
|
||||
|
||||
/// Unpack one 16-bit 1-5-5-5 pixel into a 32-bit word (`vupkhpx` /
/// `vupklpx`, per element).
///
/// Per the instruction definition the single top bit is sign-extended to
/// fill the most significant byte (`0x00` or `0xFF`), and each 5-bit channel
/// is *zero-extended* into its own byte — the channels are not rescaled to
/// the full 0..=255 range.
#[inline]
pub fn unpack_pixel_555(input: u16) -> u32 {
    let px = u32::from(input);
    let a8 = if px & 0x8000 != 0 { 0xFFu32 } else { 0 };
    let r = (px >> 10) & 0x1F;
    let g = (px >> 5) & 0x1F;
    let b = px & 0x1F;
    (a8 << 24) | (r << 16) | (g << 8) | b
}
|
||||
|
||||
// ─── VMX128 D3D pack/unpack dispatch ──────────────────────────────────────
|
||||
// `vpkd3d128` / `vupkd3d128` encode a small enum in the instruction word
|
||||
// (VX128_4 immediate field). The exact enum lives in canary's
|
||||
// ppc_emit_altivec.cc under PACK_TYPE_*; titles usually touch D3DCOLOR
|
||||
// (type 0) and a handful of texture-coordinate variants.
|
||||
//
|
||||
// Rather than risk getting a rarely-used sub-case wrong, we implement the
|
||||
// common types and fall back to a warning + pass-through for unknown types.
|
||||
// Returning the VB register value unchanged is always preferable to emitting
|
||||
// StepResult::Unimplemented because it keeps the interpreter running.
|
||||
|
||||
/// Pack-type encoding of `vpkd3d128` / `vupkd3d128`.
|
||||
///
|
||||
/// The immediate field lives at PPC bits 16-22 (VX128_3/4 IMM, 7 bits).
|
||||
/// Canary decodes `type = IMM >> 2` (top 5 bits) and `pack = IMM & 0x3`
|
||||
/// (low 2 bits, used only by `vpkd3d128` to select output-slot layout).
|
||||
/// Valid `type` values are 0..=6 per `ppc_emit_altivec.cc:2095-2118`:
|
||||
///
|
||||
/// | id | canary name | format |
|
||||
/// |----|-------------------|---------------------------------------|
|
||||
/// | 0 | VPACK_D3DCOLOR | 4 f32 [0,1] ↔ ARGB8 |
|
||||
/// | 1 | VPACK_NORMSHORT2 | 2 f32 [-1,1] ↔ 2× signed-normalized i16 |
|
||||
/// | 2 | VPACK_NORMPACKED32| 4 f32 [-1,1] ↔ UINT_2101010 (w:2,z:10,y:10,x:10) |
|
||||
/// | 3 | VPACK_FLOAT16_2 | 2 f32 ↔ 2× fp16 |
|
||||
/// | 4 | VPACK_NORMSHORT4 | 4 f32 [-1,1] ↔ 4× signed-normalized i16 |
|
||||
/// | 5 | VPACK_FLOAT16_4 | 4 f32 ↔ 4× fp16 |
|
||||
/// | 6 | VPACK_NORMPACKED64| 4 f32 [-1,1] ↔ ULONG_4202020 (w:4,z:20,y:20,x:20) |
|
||||
///
|
||||
/// Prior (M3-pre) this enum listed made-up "Normal16"/"Normal8"/"UByteN4"
|
||||
/// variants that didn't match canary; the immediate extraction was also
|
||||
/// wrong (LSB-numbered `>>6 & 0x7` instead of MSB-numbered `>>11 & 0x1F`
|
||||
/// against a 7-bit IMM field). M3 fixes both.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum D3dPackType {
    /// Type 0 — four f32 in [0, 1] ↔ ARGB8.
    D3dColor,
    /// Type 1 — two f32 in [-1, 1] ↔ two signed-normalized i16.
    NormShort2,
    /// Type 2 — four f32 in [-1, 1] ↔ 2:10:10:10 packed word.
    NormPacked32,
    /// Type 3 — two f32 ↔ two fp16.
    Float16_2,
    /// Type 4 — four f32 in [-1, 1] ↔ four signed-normalized i16.
    NormShort4,
    /// Type 5 — four f32 ↔ four fp16.
    Float16_4,
    /// Type 6 — four f32 in [-1, 1] ↔ 4:20:20:20 packed dword.
    NormPacked64,
    /// Any type value outside 0..=6; the raw value is carried along so the
    /// caller can report it.
    Other(u32),
}

impl D3dPackType {
    /// Map the already-extracted `type` bits (the IMM field with the 2-bit
    /// `pack` subfield shifted out by the caller) to a pack type.
    ///
    /// Values 0..=6 map to the named variants; everything else is returned
    /// as [`D3dPackType::Other`] holding the input unchanged.
    pub fn from_immediate(type_bits: u32) -> Self {
        match type_bits {
            0 => Self::D3dColor,
            1 => Self::NormShort2,
            2 => Self::NormPacked32,
            3 => Self::Float16_2,
            4 => Self::NormShort4,
            5 => Self::Float16_4,
            6 => Self::NormPacked64,
            other => Self::Other(other),
        }
    }
}
|
||||
|
||||
/// Pack an f32x4 vector of [R, G, B, A] in [0.0, 1.0] into a single D3DCOLOR
|
||||
/// value in lane 3 of the output.
|
||||
pub fn pack_d3dcolor(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let to_byte = |x: f32| -> u32 {
|
||||
let c = x.clamp(0.0, 1.0) * 255.0;
|
||||
(c + 0.5) as u32 & 0xFF
|
||||
};
|
||||
// D3DCOLOR is A,R,G,B in that byte order inside a u32.
|
||||
let word = (to_byte(f[3]) << 24) | (to_byte(f[0]) << 16) | (to_byte(f[1]) << 8) | to_byte(f[2]);
|
||||
Vec128::from_u32x4(0, 0, 0, word)
|
||||
}
|
||||
|
||||
/// Unpack a D3DCOLOR value (in lane 3 of the input) into an f32x4 [R, G, B, A].
|
||||
pub fn unpack_d3dcolor(v: Vec128) -> Vec128 {
|
||||
let word = v.u32x4(3);
|
||||
let a = ((word >> 24) & 0xFF) as f32 / 255.0;
|
||||
let r = ((word >> 16) & 0xFF) as f32 / 255.0;
|
||||
let g = ((word >> 8) & 0xFF) as f32 / 255.0;
|
||||
let b = (word & 0xFF) as f32 / 255.0;
|
||||
Vec128::from_f32x4(r, g, b, a)
|
||||
}
|
||||
|
||||
// ───────────────────────────────────────────────────────────────────────
|
||||
// First-Pixels M3 — pack/unpack for the remaining canary pack types.
|
||||
//
|
||||
// Conventions shared across all helpers:
|
||||
// * Input-to-`unpack_*` (packed data) lives in the *source* lane position
|
||||
// canary's HIR assumes: canonically the 32-bit word is in lane 3 and
|
||||
// the 64-bit value straddles lanes 2-3. We match that so the existing
|
||||
//     D3DCOLOR helpers' lane-3 convention is preserved across the whole
|
||||
// pack-type family.
|
||||
// * Output-from-`pack_*` sits in the same lane(s). The caller usually
|
||||
// follows with a permute to move it elsewhere (the VX128_4 `pack`
|
||||
// subfield controls that in `vpkd3d128`).
|
||||
//   * Range semantics match canary: normalized types use `max` = (1 << (N - 1)) - 1
|
||||
// for signed, clamp before rounding.
|
||||
// ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Convert a float to a signed-normalized 16-bit integer.
///
/// The input is clamped to [-1, 1], scaled by 32767 and rounded half away
/// from zero (±0.5 is added before the truncating cast). NaN ends up as 0
/// because the float-to-int cast maps NaN to zero.
#[inline]
fn norm_to_i16(x: f32) -> i16 {
    let scaled = x.clamp(-1.0, 1.0) * 32767.0;
    let bias = if scaled >= 0.0 { 0.5 } else { -0.5 };
    let rounded = (scaled + bias) as i32;
    rounded.clamp(-32768, 32767) as i16
}
|
||||
|
||||
/// Convert a signed 16-bit integer to a float by dividing by 32767.
///
/// `i16::MIN` therefore maps slightly below -1.0; no clamping is applied.
#[inline]
fn i16_to_norm(s: i16) -> f32 {
    const SCALE: f32 = 32767.0;
    f32::from(s) / SCALE
}
|
||||
|
||||
/// **NORMSHORT2** — 2 f32s in [-1, 1] → two 16-bit signed-normalized
|
||||
/// shorts packed as `(x << 16) | y` in lane 3 (high 32 bits of the word
|
||||
/// hold X; low 16 hold Y). Output lanes 0..=2 are zero-filled.
|
||||
pub fn pack_normshort2(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let x = norm_to_i16(f[0]) as u16 as u32;
|
||||
let y = norm_to_i16(f[1]) as u16 as u32;
|
||||
Vec128::from_u32x4(0, 0, 0, (x << 16) | y)
|
||||
}
|
||||
|
||||
pub fn unpack_normshort2(v: Vec128) -> Vec128 {
|
||||
let word = v.u32x4(3);
|
||||
let x = i16_to_norm((word >> 16) as i16);
|
||||
let y = i16_to_norm(word as i16);
|
||||
Vec128::from_f32x4(x, y, 0.0, 1.0)
|
||||
}
|
||||
|
||||
/// **NORMSHORT4** — 4 f32s in [-1, 1] → four 16-bit signed-normalized
|
||||
/// shorts packed across lanes 2-3 (big-endian dword order: X in the
|
||||
/// high word of lane 2, Y low of lane 2, Z high of lane 3, W low of lane
|
||||
/// 3).
|
||||
pub fn pack_normshort4(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let x = norm_to_i16(f[0]) as u16 as u32;
|
||||
let y = norm_to_i16(f[1]) as u16 as u32;
|
||||
let z = norm_to_i16(f[2]) as u16 as u32;
|
||||
let w = norm_to_i16(f[3]) as u16 as u32;
|
||||
Vec128::from_u32x4(0, 0, (x << 16) | y, (z << 16) | w)
|
||||
}
|
||||
|
||||
pub fn unpack_normshort4(v: Vec128) -> Vec128 {
|
||||
let hi = v.u32x4(2);
|
||||
let lo = v.u32x4(3);
|
||||
let x = i16_to_norm((hi >> 16) as i16);
|
||||
let y = i16_to_norm(hi as i16);
|
||||
let z = i16_to_norm((lo >> 16) as i16);
|
||||
let w = i16_to_norm(lo as i16);
|
||||
Vec128::from_f32x4(x, y, z, w)
|
||||
}
|
||||
|
||||
/// **NORMPACKED32** — UINT_2101010 layout, 4 f32s in [-1, 1] packed into
|
||||
/// 32 bits in lane 3. Per canary's comment `2_10_10_10 w_z_y_x`: the
|
||||
/// high 2 bits hold W (signed 2-bit, -2..=1), then Z/Y/X each use 10
|
||||
/// signed-normalized bits.
|
||||
pub fn pack_normpacked32(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
#[inline]
|
||||
fn n10(x: f32) -> u32 {
|
||||
let c = x.clamp(-1.0, 1.0) * 511.0;
|
||||
let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 };
|
||||
(r.clamp(-512, 511) as i32 as u32) & 0x3FF
|
||||
}
|
||||
#[inline]
|
||||
fn n2(x: f32) -> u32 {
|
||||
let c = x.clamp(-1.0, 1.0) * 1.0;
|
||||
let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 };
|
||||
(r.clamp(-2, 1) as i32 as u32) & 0x3
|
||||
}
|
||||
let x = n10(f[0]);
|
||||
let y = n10(f[1]);
|
||||
let z = n10(f[2]);
|
||||
let w = n2(f[3]);
|
||||
let word = (w << 30) | (z << 20) | (y << 10) | x;
|
||||
Vec128::from_u32x4(0, 0, 0, word)
|
||||
}
|
||||
|
||||
pub fn unpack_normpacked32(v: Vec128) -> Vec128 {
|
||||
let word = v.u32x4(3);
|
||||
#[inline]
|
||||
fn u10_to_norm(bits: u32) -> f32 {
|
||||
// Sign-extend the 10-bit field then normalize.
|
||||
let s = ((bits & 0x3FF) as i32) << 22 >> 22;
|
||||
(s as f32) / 511.0
|
||||
}
|
||||
#[inline]
|
||||
fn u2_to_norm(bits: u32) -> f32 {
|
||||
let s = ((bits & 0x3) as i32) << 30 >> 30;
|
||||
(s as f32).clamp(-1.0, 1.0)
|
||||
}
|
||||
let x = u10_to_norm(word);
|
||||
let y = u10_to_norm(word >> 10);
|
||||
let z = u10_to_norm(word >> 20);
|
||||
let w = u2_to_norm(word >> 30);
|
||||
Vec128::from_f32x4(x, y, z, w)
|
||||
}
|
||||
|
||||
/// **NORMPACKED64** — ULONG_4202020, 4 f32s in [-1, 1] packed into 64
|
||||
/// bits across lanes 2-3. Per canary's comment `4_20_20_20 w_z_y_x`:
|
||||
/// the high 4 bits of the dword hold W (signed 4-bit); the remaining 60
|
||||
/// bits hold 3× 20-bit signed-normalized Z/Y/X. Rare outside very few
|
||||
/// titles (canary notes 54540829).
|
||||
pub fn pack_normpacked64(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
#[inline]
|
||||
fn n20(x: f32) -> u64 {
|
||||
let c = x.clamp(-1.0, 1.0) * 524287.0; // 2^19 - 1
|
||||
let r = if c >= 0.0 { (c + 0.5) as i64 } else { (c - 0.5) as i64 };
|
||||
(r.clamp(-524288, 524287) as i64 as u64) & 0xF_FFFF
|
||||
}
|
||||
#[inline]
|
||||
fn n4(x: f32) -> u64 {
|
||||
let c = x.clamp(-1.0, 1.0) * 7.0;
|
||||
let r = if c >= 0.0 { (c + 0.5) as i64 } else { (c - 0.5) as i64 };
|
||||
(r.clamp(-8, 7) as i64 as u64) & 0xF
|
||||
}
|
||||
let x = n20(f[0]);
|
||||
let y = n20(f[1]);
|
||||
let z = n20(f[2]);
|
||||
let w = n4(f[3]);
|
||||
let dw: u64 = (w << 60) | (z << 40) | (y << 20) | x;
|
||||
Vec128::from_u32x4(0, 0, (dw >> 32) as u32, dw as u32)
|
||||
}
|
||||
|
||||
pub fn unpack_normpacked64(v: Vec128) -> Vec128 {
|
||||
let hi = v.u32x4(2) as u64;
|
||||
let lo = v.u32x4(3) as u64;
|
||||
let dw = (hi << 32) | lo;
|
||||
#[inline]
|
||||
fn u20_to_norm(bits: u64) -> f32 {
|
||||
let s = ((bits & 0xF_FFFF) as i64) << 44 >> 44;
|
||||
(s as f32) / 524287.0
|
||||
}
|
||||
#[inline]
|
||||
fn u4_to_norm(bits: u64) -> f32 {
|
||||
let s = ((bits & 0xF) as i64) << 60 >> 60;
|
||||
(s as f32) / 7.0
|
||||
}
|
||||
let x = u20_to_norm(dw);
|
||||
let y = u20_to_norm(dw >> 20);
|
||||
let z = u20_to_norm(dw >> 40);
|
||||
let w = u4_to_norm(dw >> 60);
|
||||
Vec128::from_f32x4(x, y, z, w)
|
||||
}
|
||||
|
||||
/// Convert an `f32` to the bit pattern of a half-precision float. Shared
/// by the FLOAT16_2 and FLOAT16_4 packers; done with manual bit-twiddling
/// on `f32::to_bits` because a stable `f16` type isn't available yet.
///
/// Behaviour by input class:
/// * NaN → quiet NaN (`0x200` mantissa), sign preserved; ±Inf → ±Inf.
/// * Unbiased exponent ≥ 16 → ±Inf.
/// * Unbiased exponent in -24..=-15 → half subnormal.
/// * Anything smaller (including f32 subnormals and zero) → signed zero.
/// * Otherwise → half normal.
///
/// Mantissa bits that do not fit are truncated, not rounded.
#[inline]
fn f32_to_f16_bits(f: f32) -> u16 {
    const HALF_EXP_ALL_ONES: u16 = 0x1F << 10;

    let raw = f.to_bits();
    let sign_bit = ((raw >> 31) as u16) << 15;
    let biased_exp = ((raw >> 23) & 0xFF) as i32;
    let fraction = raw & 0x007F_FFFF;

    if biased_exp == 0xFF {
        // Inf keeps an empty mantissa; any NaN collapses to one quiet NaN.
        let payload: u16 = if fraction == 0 { 0 } else { 0x200 };
        return sign_bit | HALF_EXP_ALL_ONES | payload;
    }

    match biased_exp - 127 {
        // Too large for a half exponent: overflow to infinity.
        e if e >= 16 => sign_bit | HALF_EXP_ALL_ONES,
        // Below the smallest half subnormal: flush to signed zero.
        e if e < -24 => sign_bit,
        // Half subnormal: restore the implicit leading 1 and shift the
        // 24-bit significand down into the 10-bit mantissa field.
        e if e <= -15 => {
            let significand = 0x0080_0000 | fraction;
            let dropped_bits = (-14 - e) + 13;
            sign_bit | (significand >> dropped_bits) as u16
        }
        // Half normal: rebias the exponent, keep the top 10 mantissa bits.
        e => {
            let half_exp = ((e + 15) as u16) & 0x1F;
            sign_bit | (half_exp << 10) | (fraction >> 13) as u16
        }
    }
}
|
||||
|
||||
/// Expand a half-precision bit pattern to `f32`.
///
/// Infinities, signed zeros, normals and subnormals convert exactly. Any
/// NaN becomes an `f32` NaN with only mantissa bit 22 set (sign kept,
/// payload discarded).
#[inline]
fn f16_bits_to_f32(h: u16) -> f32 {
    const F32_EXP_ALL_ONES: u32 = 0xFF << 23;

    let sign_bit = u32::from(h >> 15) << 31;
    let exp_field = i32::from((h >> 10) & 0x1F);
    let fraction = u32::from(h & 0x3FF);

    let magnitude = match (exp_field, fraction) {
        (0x1F, 0) => F32_EXP_ALL_ONES,
        (0x1F, _) => F32_EXP_ALL_ONES | 0x0040_0000,
        (0, 0) => 0,
        (0, _) => {
            // Subnormal: shift the highest set bit up to bit 10 (where it
            // becomes the implicit leading 1) and lower the exponent by the
            // same amount. `fraction` is a non-zero 10-bit value, so its
            // leading-zero count is 22..=31 and `shift` is 1..=10.
            let shift = fraction.leading_zeros() - 21;
            let normalized = fraction << shift;
            // (-14 - shift) + 127, kept within the 8-bit exponent field.
            let f32_exp = (113 - shift) & 0xFF;
            (f32_exp << 23) | ((normalized & 0x3FF) << 13)
        }
        (e, _) => {
            let f32_exp = ((e - 15 + 127) as u32) & 0xFF;
            (f32_exp << 23) | (fraction << 13)
        }
    };

    f32::from_bits(sign_bit | magnitude)
}
|
||||
|
||||
/// **FLOAT16_2** — two 32-bit floats → two half-floats packed into one
|
||||
/// 32-bit word (X in high 16 bits of lane 3, Y in low 16).
|
||||
pub fn pack_float16_2(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let x = f32_to_f16_bits(f[0]) as u32;
|
||||
let y = f32_to_f16_bits(f[1]) as u32;
|
||||
Vec128::from_u32x4(0, 0, 0, (x << 16) | y)
|
||||
}
|
||||
|
||||
pub fn unpack_float16_2(v: Vec128) -> Vec128 {
|
||||
let word = v.u32x4(3);
|
||||
let x = f16_bits_to_f32((word >> 16) as u16);
|
||||
let y = f16_bits_to_f32(word as u16);
|
||||
Vec128::from_f32x4(x, y, 0.0, 1.0)
|
||||
}
|
||||
|
||||
/// **FLOAT16_4** — four 32-bit floats → four half-floats packed across
|
||||
/// 64 bits (lanes 2-3).
|
||||
pub fn pack_float16_4(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let x = f32_to_f16_bits(f[0]) as u32;
|
||||
let y = f32_to_f16_bits(f[1]) as u32;
|
||||
let z = f32_to_f16_bits(f[2]) as u32;
|
||||
let w = f32_to_f16_bits(f[3]) as u32;
|
||||
Vec128::from_u32x4(0, 0, (x << 16) | y, (z << 16) | w)
|
||||
}
|
||||
|
||||
pub fn unpack_float16_4(v: Vec128) -> Vec128 {
|
||||
let hi = v.u32x4(2);
|
||||
let lo = v.u32x4(3);
|
||||
let x = f16_bits_to_f32((hi >> 16) as u16);
|
||||
let y = f16_bits_to_f32(hi as u16);
|
||||
let z = f16_bits_to_f32((lo >> 16) as u16);
|
||||
let w = f16_bits_to_f32(lo as u16);
|
||||
Vec128::from_f32x4(x, y, z, w)
|
||||
}
|
||||
|
||||
// ─── CR6 helpers used by integer compares ─────────────────────────────────
|
||||
// vcmp*. (record-form) updates CR6 in a compressed form:
|
||||
// CR6 = {all-true, 0, all-false, 0}
|
||||
// where each bit reflects the per-lane mask across the whole register.
|
||||
|
||||
#[inline] pub fn cr6_flags_from_mask(mask: Vec128) -> (bool, bool) {
|
||||
let b = mask.as_bytes();
|
||||
let mut any_set = false;
|
||||
let mut any_clear = false;
|
||||
for &byte in b.iter() {
|
||||
if byte != 0 { any_set = true; }
|
||||
if byte != 0xFF { any_clear = true; }
|
||||
}
|
||||
let all_true = !any_clear;
|
||||
let all_false = !any_set;
|
||||
(all_true, all_false)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    use std::cell::Cell;

    /// Flat, zero-initialised byte buffer used as guest memory in tests.
    /// Multi-byte accesses are big-endian; out-of-range addresses panic on
    /// the slice index.
    struct TestMem { data: Box<[Cell<u8>]> }

    impl TestMem {
        /// Allocate `size` zero bytes.
        fn new(size: usize) -> Self {
            Self { data: (0..size).map(|_| Cell::new(0)).collect() }
        }
    }

    impl MemoryAccess for TestMem {
        fn read_u8(&self, a: u32) -> u8 { self.data[a as usize].get() }
        fn read_u16(&self, a: u32) -> u16 {
            u16::from_be_bytes([self.data[a as usize].get(), self.data[a as usize + 1].get()])
        }
        fn read_u32(&self, a: u32) -> u32 {
            let a = a as usize;
            u32::from_be_bytes([
                self.data[a].get(), self.data[a+1].get(),
                self.data[a+2].get(), self.data[a+3].get(),
            ])
        }
        fn read_u64(&self, a: u32) -> u64 {
            let a = a as usize;
            u64::from_be_bytes([
                self.data[a].get(), self.data[a+1].get(),
                self.data[a+2].get(), self.data[a+3].get(),
                self.data[a+4].get(), self.data[a+5].get(),
                self.data[a+6].get(), self.data[a+7].get(),
            ])
        }
        fn write_u8(&self, a: u32, v: u8) { self.data[a as usize].set(v); }
        fn write_u16(&self, a: u32, v: u16) {
            let b = v.to_be_bytes();
            self.data[a as usize].set(b[0]);
            self.data[a as usize + 1].set(b[1]);
        }
        fn write_u32(&self, a: u32, v: u32) {
            let b = v.to_be_bytes(); let a = a as usize;
            for (i, byte) in b.iter().enumerate() { self.data[a+i].set(*byte); }
        }
        fn write_u64(&self, a: u32, v: u64) {
            let b = v.to_be_bytes(); let a = a as usize;
            for (i, byte) in b.iter().enumerate() { self.data[a+i].set(*byte); }
        }
        // The test buffer exposes no host pointers.
        fn translate(&self, _a: u32) -> Option<*const u8> { None }
        fn translate_mut(&self, _a: u32) -> Option<*mut u8> { None }
    }

    /// OR-ing a left load at an unaligned address with a right load 16
    /// bytes later must reproduce the 16 bytes starting at that address.
    #[test]
    fn lvlx_lvrx_round_trip() {
        let m = TestMem::new(0x40);
        for i in 0..0x30 { m.data[i].set((i as u8).wrapping_add(0x10)); }
        // Unaligned load from 0x13 should combine lvlx(0x13) | lvrx(0x23).
        let lo = load_vector_left(&m, 0x13);
        let hi = load_vector_right(&m, 0x23);
        let mut combined = [0u8; 16];
        let lob = lo.as_bytes();
        let hib = hi.as_bytes();
        for i in 0..16 { combined[i] = lob[i] | hib[i]; }
        for i in 0..16 {
            assert_eq!(combined[i], m.data[0x13 + i].get(), "lane {}", i);
        }
    }

    /// A left load from a 16-byte-aligned address returns the whole line.
    #[test]
    fn lvlx_aligned_is_full_load() {
        let m = TestMem::new(0x20);
        for i in 0..0x20 { m.data[i].set(i as u8); }
        let v = load_vector_left(&m, 0x10);
        let b = v.as_bytes();
        for i in 0..16 { assert_eq!(b[i], 0x10 + i as u8); }
    }

    /// A right load from a 16-byte-aligned address contributes nothing.
    #[test]
    fn lvrx_aligned_is_zero() {
        let m = TestMem::new(0x20);
        let v = load_vector_right(&m, 0x10);
        assert_eq!(v.as_bytes(), [0u8; 16]);
    }

    /// Signed 8-bit saturating add clamps at both ends and reports it.
    #[test]
    fn sat_add_signed_overflow() {
        assert_eq!(sat_add_i8(120, 10), (127, true));
        assert_eq!(sat_add_i8(-120, -10), (-128, true));
        assert_eq!(sat_add_i8(1, 2), (3, false));
    }

    /// Unsigned 8-bit saturating subtract clamps at zero and reports it.
    #[test]
    fn sat_sub_unsigned_underflow() {
        assert_eq!(sat_sub_u8(5, 10), (0, true));
        assert_eq!(sat_sub_u8(10, 5), (5, false));
    }

    /// The alpha bit survives packing, and a set alpha bit unpacks to 0xFF.
    #[test]
    fn pack_unpack_pixel_555() {
        let encoded = pack_pixel_555(0x80_F8_F8_F8);
        assert_eq!(encoded & 0x8000, 0x8000);
        let w = unpack_pixel_555(0x8000 | (0x1F << 10) | (0x1F << 5) | 0x1F);
        assert_eq!(w & 0xFF000000, 0xFF000000);
    }

    // ─── First-Pixels M3 pack/unpack roundtrip tests ───

    /// Quantization error tolerance for N-bit signed normalized values.
    /// `1.0 / ((1 << (bits - 1)) - 1)` is the step size.
    fn tol_normalized(bits: u32) -> f32 {
        1.0 / ((1u32 << (bits - 1)) - 1) as f32
    }

    /// NORMSHORT2 round-trips X/Y within one step and fills Z/W with 0/1.
    #[test]
    fn normshort2_roundtrip() {
        let v = Vec128::from_f32x4(0.5, -0.75, 0.0, 0.0);
        let packed = pack_normshort2(v);
        let back = unpack_normshort2(packed).as_f32x4();
        let tol = tol_normalized(16);
        assert!((back[0] - 0.5).abs() < tol, "x got {}", back[0]);
        assert!((back[1] - -0.75).abs() < tol, "y got {}", back[1]);
        assert_eq!(back[2], 0.0);
        assert_eq!(back[3], 1.0);
    }

    /// NORMSHORT4 round-trips all four lanes, including the ±1 extremes.
    #[test]
    fn normshort4_roundtrip_extremes() {
        let v = Vec128::from_f32x4(1.0, -1.0, 0.0, 0.25);
        let packed = pack_normshort4(v);
        let back = unpack_normshort4(packed).as_f32x4();
        let tol = tol_normalized(16);
        assert!((back[0] - 1.0).abs() < tol);
        assert!((back[1] - -1.0).abs() < tol);
        assert!((back[2] - 0.0).abs() < tol);
        assert!((back[3] - 0.25).abs() < tol);
    }

    /// NORMPACKED32 round-trips X/Y/Z within one 10-bit step and W exactly.
    #[test]
    fn normpacked32_roundtrip() {
        let v = Vec128::from_f32x4(0.5, -0.5, 0.9, -1.0);
        let packed = pack_normpacked32(v);
        let back = unpack_normpacked32(packed).as_f32x4();
        let tol10 = tol_normalized(10);
        assert!((back[0] - 0.5).abs() < tol10, "x got {}", back[0]);
        assert!((back[1] - -0.5).abs() < tol10, "y got {}", back[1]);
        assert!((back[2] - 0.9).abs() < tol10, "z got {}", back[2]);
        // The 2-bit W field only represents -1, 0 and 1 (raw -2 unpacks as
        // -1), so an input of -1.0 must come back exactly.
        assert_eq!(back[3], -1.0, "w got {}", back[3]);
    }

    /// NORMPACKED64 round-trips X/Y/Z within one 20-bit step and W within
    /// one 4-bit step.
    #[test]
    fn normpacked64_roundtrip() {
        let v = Vec128::from_f32x4(0.5, -0.25, 0.75, 0.5);
        let packed = pack_normpacked64(v);
        let back = unpack_normpacked64(packed).as_f32x4();
        let tol20 = tol_normalized(20);
        let tol4 = tol_normalized(4);
        assert!((back[0] - 0.5).abs() < tol20, "x got {}", back[0]);
        assert!((back[1] - -0.25).abs() < tol20, "y got {}", back[1]);
        assert!((back[2] - 0.75).abs() < tol20, "z got {}", back[2]);
        assert!((back[3] - 0.5).abs() < tol4, "w got {}", back[3]);
    }

    /// FLOAT16_2 round-trips exactly representable values and fills Z/W
    /// with 0/1.
    #[test]
    fn float16_2_roundtrip_normals() {
        // Half has ~3 decimal digits of precision. Pick values that
        // survive conversion cleanly: powers of 2 + simple fractions.
        let v = Vec128::from_f32x4(1.0, -2.5, 0.0, 0.0);
        let packed = pack_float16_2(v);
        let back = unpack_float16_2(packed).as_f32x4();
        assert_eq!(back[0], 1.0);
        assert_eq!(back[1], -2.5);
        assert_eq!(back[2], 0.0);
        assert_eq!(back[3], 1.0);
    }

    /// FLOAT16_4 round-trips exactly representable values in all lanes.
    #[test]
    fn float16_4_roundtrip_normals() {
        let v = Vec128::from_f32x4(0.5, -3.0, 16.0, -0.125);
        let packed = pack_float16_4(v);
        let back = unpack_float16_4(packed).as_f32x4();
        assert_eq!(back[0], 0.5);
        assert_eq!(back[1], -3.0);
        assert_eq!(back[2], 16.0);
        assert_eq!(back[3], -0.125);
    }

    /// Signed zeros and +inf survive; out-of-range values become infinity.
    #[test]
    fn float16_handles_zero_and_infinity() {
        // Zero should survive.
        assert_eq!(f16_bits_to_f32(f32_to_f16_bits(0.0)), 0.0);
        assert_eq!(f16_bits_to_f32(f32_to_f16_bits(-0.0)).to_bits(), (-0.0f32).to_bits());
        // +inf.
        let inf_back = f16_bits_to_f32(f32_to_f16_bits(f32::INFINITY));
        assert!(inf_back.is_infinite() && inf_back > 0.0);
        // Overflow → +inf.
        let overflow_back = f16_bits_to_f32(f32_to_f16_bits(65536.0));
        assert!(overflow_back.is_infinite());
    }

    /// Type values 0..=6 map to the named variants; 7 falls through to
    /// `Other`.
    #[test]
    fn pack_type_enum_maps_canary_values() {
        use D3dPackType::*;
        assert!(matches!(D3dPackType::from_immediate(0), D3dColor));
        assert!(matches!(D3dPackType::from_immediate(1), NormShort2));
        assert!(matches!(D3dPackType::from_immediate(2), NormPacked32));
        assert!(matches!(D3dPackType::from_immediate(3), Float16_2));
        assert!(matches!(D3dPackType::from_immediate(4), NormShort4));
        assert!(matches!(D3dPackType::from_immediate(5), Float16_4));
        assert!(matches!(D3dPackType::from_immediate(6), NormPacked64));
        assert!(matches!(D3dPackType::from_immediate(7), Other(7)));
    }
}
|
||||
550
crates/xenia-cpu/tests/disasm_goldens.rs
Normal file
550
crates/xenia-cpu/tests/disasm_goldens.rs
Normal file
@@ -0,0 +1,550 @@
|
||||
//! Assert-based goldens for the PPC disassembler.
|
||||
//!
|
||||
//! Each test owns an inline list of `(raw, addr, label)` cases. On a
|
||||
//! normal run, the test reads the corresponding fixture JSON and asserts
|
||||
//! that `format(decode(raw, addr))` reproduces every field exactly. On
|
||||
//! first creation (fixture file missing) or with `REGEN_GOLDENS=1` set,
|
||||
//! the test (re)writes the fixture from `format()` output.
|
||||
//!
|
||||
//! Workflow:
|
||||
//! ```sh
|
||||
//! cargo test -p xenia-cpu --test disasm_goldens # assert
|
||||
//! REGEN_GOLDENS=1 cargo test -p xenia-cpu --test disasm_goldens # regen
|
||||
//! ```
|
||||
//!
|
||||
//! The hand-encoded test cases below cover the silent-bug regression
|
||||
//! cases that lived in the old println-based `disasm_audit.rs` harness
|
||||
//! (now deleted).
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use xenia_cpu::decoder::{DecodedInstr, decode};
|
||||
use xenia_cpu::disasm::format;
|
||||
|
||||
/// One disassembled instruction as stored in a golden fixture file.
///
/// Numeric values are kept as `0x`-prefixed, zero-padded 8-digit uppercase
/// hex strings. The optional fields are left out of the serialized output
/// when `None` and default to `None` when absent on load.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
struct GoldenRow {
    /// Human-readable description of the case, taken from the test's case list.
    label: String,
    /// Raw instruction word, hex-formatted.
    raw: String,
    /// Address the instruction was decoded at, hex-formatted.
    addr: String,
    /// Mnemonic reported by `format()`.
    mnemonic: String,
    /// Operand text reported by `format()`.
    operands: String,
    /// `ext_mnemonic` reported by `format()`, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    ext_mnemonic: Option<String>,
    /// `ext_operands` reported by `format()`, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    ext_operands: Option<String>,
    /// `branch_target` reported by `format()`, hex-formatted, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    branch_target: Option<String>,
}
|
||||
|
||||
/// Top-level shape of a fixture JSON file: an object holding the ordered
/// list of golden rows.
#[derive(Debug, Deserialize, Serialize)]
struct GoldenFile {
    /// Rows in the same order as the test's case list.
    rows: Vec<GoldenRow>,
}
|
||||
|
||||
fn fixture_path(name: &str) -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("golden")
|
||||
.join(name)
|
||||
}
|
||||
|
||||
/// Assemble a VMX128 VX128-form (or VX128_R/_2) instruction word using
/// canary's split 7-bit register fields.
///
/// Field placement (PPC bit numbering, bit 0 = MSB):
/// * primary opcode `op6` (6 bits) — PPC 0-5
/// * `vd` — low 5 bits at PPC 6-10, high 2 bits at PPC 28-29
/// * `va` — low 5 bits at PPC 11-15, bit 5 at PPC 26, bit 6 at PPC 21
/// * `vb` — low 5 bits at PPC 16-20, high 2 bits at PPC 30-31
///
/// `secondary_bits` is OR-ed in unchanged; it carries whatever secondary
/// opcode, VC, Rc or key bits the caller needs.
fn encode_vx128(op6: u32, vd: u32, va: u32, vb: u32, secondary_bits: u32) -> u32 {
    let opcode = (op6 & 0x3F) << 26;
    let vd_bits = ((vd & 0x1F) << 21) | (((vd >> 5) & 0x3) << 2);
    let va_bits = ((va & 0x1F) << 16) | (((va >> 5) & 0x1) << 5) | (((va >> 6) & 0x1) << 10);
    let vb_bits = ((vb & 0x1F) << 11) | ((vb >> 5) & 0x3);
    opcode | vd_bits | va_bits | vb_bits | secondary_bits
}
|
||||
|
||||
fn build_rows(cases: &[(u32, u32, &str)]) -> Vec<GoldenRow> {
|
||||
cases
|
||||
.iter()
|
||||
.map(|&(raw, addr, label)| {
|
||||
let d = decode(raw, addr);
|
||||
let t = format(&d);
|
||||
GoldenRow {
|
||||
label: label.to_string(),
|
||||
raw: format!("0x{raw:08X}"),
|
||||
addr: format!("0x{addr:08X}"),
|
||||
mnemonic: t.mnemonic,
|
||||
operands: t.operands,
|
||||
ext_mnemonic: t.ext_mnemonic,
|
||||
ext_operands: t.ext_operands,
|
||||
branch_target: t.branch_target.map(|t| format!("0x{t:08X}")),
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Compare what `format()` produces against the committed JSON snapshot.
|
||||
/// Set `REGEN_GOLDENS=1` to overwrite the snapshot from current output.
|
||||
/// Missing snapshot is treated as "first creation": writes and panics so
|
||||
/// CI can't accidentally accept blank goldens.
|
||||
fn assert_or_regen(fixture_name: &str, cases: &[(u32, u32, &str)]) {
|
||||
let rows = build_rows(cases);
|
||||
let path = fixture_path(fixture_name);
|
||||
let regen = std::env::var("REGEN_GOLDENS").is_ok();
|
||||
|
||||
if regen || !path.exists() {
|
||||
if let Some(parent) = path.parent() {
|
||||
std::fs::create_dir_all(parent).unwrap();
|
||||
}
|
||||
let serialized = serde_json::to_string_pretty(&GoldenFile { rows }).unwrap();
|
||||
std::fs::write(&path, serialized + "\n").unwrap();
|
||||
if !regen {
|
||||
panic!(
|
||||
"Generated fixture {} (was missing). Inspect, commit, then re-run.",
|
||||
path.display()
|
||||
);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
let src = std::fs::read_to_string(&path).unwrap();
|
||||
let golden: GoldenFile = serde_json::from_str(&src).unwrap();
|
||||
assert_eq!(
|
||||
rows.len(),
|
||||
golden.rows.len(),
|
||||
"row count differs from {} (live={}, fixture={}). Run with REGEN_GOLDENS=1 if the test cases changed intentionally.",
|
||||
path.display(),
|
||||
rows.len(),
|
||||
golden.rows.len()
|
||||
);
|
||||
for (i, (got, expected)) in rows.iter().zip(golden.rows.iter()).enumerate() {
|
||||
assert_eq!(
|
||||
got, expected,
|
||||
"row {} ({}) differs in {}\n live: {got:#?}\n fixture: {expected:#?}",
|
||||
i,
|
||||
expected.label,
|
||||
path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Encoding helpers ────────────────────────────────────────────────────────
|
||||
// PPC bit numbering: bit 0 is MSB, bit 31 is LSB. Most helpers below emit
|
||||
// instructions in canonical hand-readable form: opcode << 26 | <fields>.
|
||||
|
||||
/// Encode an opcode-31 XO-form arithmetic instruction:
/// `31<<26 | rd<<21 | ra<<16 | rb<<11 | oe<<10 | xo<<1 | rc`.
/// Fields are not masked; callers pass in-range values.
#[allow(clippy::too_many_arguments)]
fn xform_xo3(rd: u32, ra: u32, rb: u32, oe: u32, xo: u32, rc: u32) -> u32 {
    const PRIMARY: u32 = 31 << 26;
    let registers = (rd << 21) | (ra << 16) | (rb << 11);
    let opcode_ext = (oe << 10) | (xo << 1);
    PRIMARY | registers | opcode_ext | rc
}
|
||||
|
||||
/// Encode an opcode-31 X-form logical/shift instruction:
/// `31<<26 | rs<<21 | ra<<16 | rb<<11 | xo<<1 | rc`.
/// Fields are not masked; callers pass in-range values.
fn xform_logic(rs: u32, ra: u32, rb: u32, xo: u32, rc: u32) -> u32 {
    const PRIMARY: u32 = 31 << 26;
    let registers = (rs << 21) | (ra << 16) | (rb << 11);
    PRIMARY | registers | (xo << 1) | rc
}
|
||||
|
||||
/// Encode a D-form instruction: `op<<26 | rt<<21 | ra<<16 | imm`, where the
/// signed 16-bit immediate is stored as its two's-complement bit pattern in
/// the low 16 bits.
fn dform(op: u32, rt: u32, ra: u32, imm: i16) -> u32 {
    let imm_bits = u32::from(imm as u16);
    (op << 26) | (rt << 21) | (ra << 16) | imm_bits
}
|
||||
|
||||
/// Encode an I-form branch (opcode 18): `18<<26 | LI<<2 | AA<<1 | LK`.
///
/// `target_disp` is the byte displacement; it is masked with
/// `0x03FF_FFFC`, which keeps bits 2..=25 and drops the two low bits.
fn iform_b(target_disp: i32, aa: u32, lk: u32) -> u32 {
    const PRIMARY: u32 = 18 << 26;
    const LI_MASK: u32 = 0x03FF_FFFC;
    let displacement = (target_disp as u32) & LI_MASK;
    PRIMARY | displacement | (aa << 1) | lk
}
|
||||
|
||||
/// Encode a B-form conditional branch (opcode 16):
/// `16<<26 | BO<<21 | BI<<16 | BD<<2 | AA<<1 | LK`.
///
/// `target_disp` is the byte displacement; it is masked with
/// `0x0000_FFFC`, which keeps bits 2..=15 and drops the two low bits.
fn bform_bc(bo: u32, bi: u32, target_disp: i32, aa: u32, lk: u32) -> u32 {
    const PRIMARY: u32 = 16 << 26;
    const BD_MASK: u32 = 0x0000_FFFC;
    let displacement = (target_disp as u32) & BD_MASK;
    let condition = (bo << 21) | (bi << 16);
    PRIMARY | condition | displacement | (aa << 1) | lk
}
|
||||
|
||||
/// Encode an XL-form `bclr` (opcode 19, extended opcode 16):
/// `19<<26 | BO<<21 | BI<<16 | 16<<1 | LK`.
fn xlform_bclr(bo: u32, bi: u32, lk: u32) -> u32 {
    const PRIMARY: u32 = 19 << 26;
    const XO_BCLR: u32 = 16 << 1;
    PRIMARY | (bo << 21) | (bi << 16) | XO_BCLR | lk
}
|
||||
|
||||
/// Encode an XL-form `bcctr` (opcode 19, extended opcode 528):
/// `19<<26 | BO<<21 | BI<<16 | 528<<1 | LK`.
fn xlform_bcctr(bo: u32, bi: u32, lk: u32) -> u32 {
    const PRIMARY: u32 = 19 << 26;
    const XO_BCCTR: u32 = 528 << 1;
    PRIMARY | (bo << 21) | (bi << 16) | XO_BCCTR | lk
}
|
||||
|
||||
/// Encode an M-form `rlwinm` (opcode 21):
/// `21<<26 | rs<<21 | ra<<16 | sh<<11 | mb<<6 | me<<1 | rc`.
/// Fields are not masked; callers pass in-range values.
fn rlwinm(rs: u32, ra: u32, sh: u32, mb: u32, me: u32, rc: u32) -> u32 {
    const PRIMARY: u32 = 21 << 26;
    let registers = (rs << 21) | (ra << 16);
    let rotate_and_mask = (sh << 11) | (mb << 6) | (me << 1);
    PRIMARY | registers | rotate_and_mask | rc
}
|
||||
|
||||
/// Encode an MD-form `rldicl` (opcode 30, extended opcode 0).
///
/// The 6-bit `sh` and `mb` values are split across the word:
/// * `sh[4:0]` at PPC bits 16-20 (host bits 11-15), `sh[5]` at PPC bit 30
///   (host bit 1);
/// * `mb[4:0]` at PPC bits 21-25 (host bits 6-10), `mb[5]` at PPC bit 26
///   (host bit 5).
///
/// `rs`, `ra` and `rc` are not masked.
fn rldicl(rs: u32, ra: u32, sh: u32, mb: u32, rc: u32) -> u32 {
    const PRIMARY: u32 = 30 << 26;
    // The extended opcode for rldicl is 0, so it contributes no bits.
    const XO_RLDICL: u32 = 0 << 2;

    let shift_bits = ((sh & 0x1F) << 11) | (((sh >> 5) & 1) << 1);
    let mask_bits = ((mb & 0x1F) << 6) | (((mb >> 5) & 1) << 5);
    PRIMARY | (rs << 21) | (ra << 16) | shift_bits | mask_bits | XO_RLDICL | rc
}
|
||||
|
||||
/// Encode `mfspr rd, spr` (opcode 31, extended opcode 339).
///
/// The 10-bit SPR number is stored with its two 5-bit halves swapped: the
/// low half goes into the upper part of the field and the high half into
/// the lower part.
fn mfspr(rd: u32, spr: u32) -> u32 {
    const PRIMARY: u32 = 31 << 26;
    const XO_MFSPR: u32 = 339 << 1;

    let low_half = spr & 0x1F;
    let high_half = (spr >> 5) & 0x1F;
    let spr_field = (low_half << 5) | high_half;
    PRIMARY | (rd << 21) | (spr_field << 11) | XO_MFSPR
}
|
||||
|
||||
/// Encode `mtspr spr, rs` (opcode 31, extended opcode 467).
///
/// The 10-bit SPR number is stored with its two 5-bit halves swapped: the
/// low half goes into the upper part of the field and the high half into
/// the lower part.
fn mtspr(rs: u32, spr: u32) -> u32 {
    const PRIMARY: u32 = 31 << 26;
    const XO_MTSPR: u32 = 467 << 1;

    let low_half = spr & 0x1F;
    let high_half = (spr >> 5) & 0x1F;
    let spr_field = (low_half << 5) | high_half;
    PRIMARY | (rs << 21) | (spr_field << 11) | XO_MTSPR
}
|
||||
|
||||
// ── Tests ───────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn base_mnemonics() {
|
||||
let cases: &[(u32, u32, &str)] = &[
|
||||
// X-form ALU (Rc and OE bits)
|
||||
(xform_xo3(3, 4, 5, 0, 266, 0), 0x82000000, "add r3,r4,r5"),
|
||||
(xform_xo3(3, 4, 5, 0, 266, 1), 0x82000000, "add. r3,r4,r5"),
|
||||
(xform_xo3(3, 4, 5, 1, 266, 0), 0x82000000, "addo r3,r4,r5"),
|
||||
(xform_xo3(3, 4, 5, 1, 266, 1), 0x82000000, "addo. r3,r4,r5"),
|
||||
(xform_xo3(3, 4, 0, 0, 104, 0), 0x82000000, "neg r3,r4"),
|
||||
(xform_xo3(3, 4, 5, 0, 235, 0), 0x82000000, "mullw r3,r4,r5"),
|
||||
(xform_xo3(3, 4, 5, 0, 491, 0), 0x82000000, "divw r3,r4,r5"),
|
||||
(xform_xo3(3, 4, 5, 0, 75, 1), 0x82000000, "mulhw. r3,r4,r5"),
|
||||
(xform_xo3(3, 4, 5, 0, 11, 1), 0x82000000, "mulhwu. r3,r4,r5"),
|
||||
(xform_xo3(3, 4, 5, 0, 233, 0), 0x82000000, "mulld r3,r4,r5"),
|
||||
// X-form logical
|
||||
(xform_logic(4, 3, 5, 28, 0), 0x82000000, "and r3,r4,r5"),
|
||||
(xform_logic(4, 3, 5, 444, 0), 0x82000000, "or r3,r4,r5 (non-mr: rs!=rb)"),
|
||||
(xform_logic(4, 3, 5, 316, 0), 0x82000000, "xor r3,r4,r5"),
|
||||
(xform_logic(4, 3, 5, 124, 0), 0x82000000, "nor r3,r4,r5"),
|
||||
(xform_logic(4, 3, 5, 476, 0), 0x82000000, "nand r3,r4,r5"),
|
||||
(xform_logic(4, 3, 5, 284, 0), 0x82000000, "eqv r3,r4,r5"),
|
||||
(xform_logic(4, 3, 5, 60, 0), 0x82000000, "andc r3,r4,r5"),
|
||||
(xform_logic(4, 3, 5, 412, 0), 0x82000000, "orc r3,r4,r5"),
|
||||
// X-form shift
|
||||
(xform_logic(4, 3, 5, 24, 0), 0x82000000, "slw r3,r4,r5"),
|
||||
(xform_logic(4, 3, 5, 536, 0), 0x82000000, "srw r3,r4,r5"),
|
||||
(xform_logic(4, 3, 5, 792, 0), 0x82000000, "sraw r3,r4,r5"),
|
||||
(xform_logic(4, 3, 5, 27, 0), 0x82000000, "sld r3,r4,r5"),
|
||||
(xform_logic(4, 3, 5, 539, 0), 0x82000000, "srd r3,r4,r5"),
|
||||
// srawi / sradi (immediate shifts)
|
||||
((31 << 26) | (4 << 21) | (3 << 16) | (16 << 11) | (824 << 1), 0x82000000, "srawi r3,r4,16"),
|
||||
// Atomics
|
||||
((31 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (150 << 1) | 1, 0x82000000, "stwcx. r3,r4,r5"),
|
||||
((31 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (214 << 1) | 1, 0x82000000, "stdcx. r3,r4,r5"),
|
||||
((31 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (20 << 1), 0x82000000, "lwarx r3,r4,r5"),
|
||||
((31 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (84 << 1), 0x82000000, "ldarx r3,r4,r5"),
|
||||
// Compares
|
||||
(dform(11, 0, 3, 16), 0x82000000, "cmpwi cr0, r3, 16"),
|
||||
(dform(11, 2 << 2, 3, 16), 0x82000000, "cmpwi cr2, r3, 16"),
|
||||
(dform(10, 0, 3, 16), 0x82000000, "cmplwi cr0, r3, 16"),
|
||||
((31 << 26) | (3 << 16) | (4 << 11), 0x82000000, "cmpw r3,r4 in cr0"),
|
||||
((31 << 26) | (1 << 21) | (3 << 16) | (4 << 11), 0x82000000, "cmpd r3,r4"),
|
||||
((31 << 26) | (3 << 16) | (4 << 11) | (32 << 1), 0x82000000, "cmplw r3,r4"),
|
||||
// D-form ALU/load/store
|
||||
(dform(14, 3, 1, 16), 0x82000000, "addi r3, r1, 16"),
|
||||
(dform(15, 3, 1, 0x100), 0x82000000, "addis r3, r1, 0x100 (ra!=0)"),
|
||||
(dform(7, 3, 4, 5), 0x82000000, "mulli r3, r4, 5"),
|
||||
(dform(8, 3, 4, 5), 0x82000000, "subfic r3, r4, 5"),
|
||||
(dform(12, 3, 4, 16), 0x82000000, "addic r3, r4, 16"),
|
||||
(dform(13, 3, 4, 16), 0x82000000, "addic. r3, r4, 16"),
|
||||
(dform(24, 3, 4, 0x10), 0x82000000, "ori r4, r3, 0x10 (non-nop)"),
|
||||
(dform(25, 3, 4, 0x10), 0x82000000, "oris r4, r3, 0x10"),
|
||||
(dform(26, 3, 4, 0x10), 0x82000000, "xori r4, r3, 0x10"),
|
||||
(dform(28, 3, 4, 0x10), 0x82000000, "andi. r4, r3, 0x10"),
|
||||
// Loads/stores D-form
|
||||
(dform(32, 5, 1, 0x20), 0x82000000, "lwz r5, 0x20(r1)"),
|
||||
(dform(36, 5, 1, 0x20), 0x82000000, "stw r5, 0x20(r1)"),
|
||||
(dform(34, 5, 1, 0x20), 0x82000000, "lbz r5, 0x20(r1)"),
|
||||
(dform(40, 5, 1, 0x20), 0x82000000, "lhz r5, 0x20(r1)"),
|
||||
(dform(48, 5, 1, 0x20), 0x82000000, "lfs f5, 0x20(r1)"),
|
||||
(dform(50, 5, 1, 0x20), 0x82000000, "lfd f5, 0x20(r1)"),
|
||||
(dform(54, 5, 1, 0x20), 0x82000000, "stfd f5, 0x20(r1)"),
|
||||
// DS-form 64-bit loads
|
||||
((58u32 << 26) | (5 << 21) | (1 << 16) | 0x20, 0x82000000, "ld r5, 0x20(r1)"),
|
||||
((62u32 << 26) | (5 << 21) | (1 << 16) | 0x20, 0x82000000, "std r5, 0x20(r1)"),
|
||||
// Sync / barrier (parameterless)
|
||||
((31 << 26) | (598 << 1), 0x82000000, "sync 0 (extends to sync)"),
|
||||
((19 << 26) | (150 << 1), 0x82000000, "isync"),
|
||||
((31 << 26) | (854 << 1), 0x82000000, "eieio"),
|
||||
// Cache hints
|
||||
((31 << 26) | (1 << 16) | (2 << 11) | (54 << 1), 0x82000000, "dcbst r1, r2"),
|
||||
((31 << 26) | (1 << 16) | (2 << 11) | (86 << 1), 0x82000000, "dcbf r1, r2"),
|
||||
((31 << 26) | (1 << 16) | (2 << 11) | (278 << 1), 0x82000000, "dcbt r1, r2"),
|
||||
((31 << 26) | (1 << 16) | (2 << 11) | (1014 << 1), 0x82000000, "dcbz r1, r2"),
|
||||
((31 << 26) | (1 << 21) | (1 << 16) | (2 << 11) | (1014 << 1), 0x82000000, "dcbz128 r1, r2"),
|
||||
// CR logical (without simplification triggers)
|
||||
((19 << 26) | (4 << 21) | (5 << 16) | (6 << 11) | (33 << 1), 0x82000000, "crnor 4,5,6 (no simplify)"),
|
||||
((19 << 26) | (4 << 21) | (5 << 16) | (6 << 11) | (257 << 1), 0x82000000, "crand 4,5,6"),
|
||||
((19 << 26) | (4 << 21) | (5 << 16) | (6 << 11) | (449 << 1), 0x82000000, "cror 4,5,6 (no simplify)"),
|
||||
// Trap (no simplification: TO=11 doesn't match the table)
|
||||
((31 << 26) | (11 << 21) | (3 << 16) | (4 << 11) | (4 << 1), 0x82000000, "tw 11, r3, r4 (uncommon TO)"),
|
||||
((2u32 << 26) | (11 << 21) | (3 << 16) | (123u32 & 0xFFFF), 0x82000000, "tdi 11, r3, 123"),
|
||||
// mtcr (extended): mtcrf 0xFF, r5
|
||||
((31 << 26) | (5 << 21) | (0xFF << 12) | (144 << 1), 0x82000000, "mtcrf 0xFF, r5 → mtcr"),
|
||||
// mfcr / mfmsr / mtmsr / mtmsrd
|
||||
((31 << 26) | (5 << 21) | (19 << 1), 0x82000000, "mfcr r5"),
|
||||
((31 << 26) | (5 << 21) | (83 << 1), 0x82000000, "mfmsr r5"),
|
||||
((31 << 26) | (5 << 21) | (146 << 1), 0x82000000, "mtmsr r5"),
|
||||
((31 << 26) | (5 << 21) | (178 << 1), 0x82000000, "mtmsrd r5"),
|
||||
// FPU base
|
||||
((63u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (21 << 1), 0x82000000, "fadd f3, f4, f5"),
|
||||
((63u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (20 << 1), 0x82000000, "fsub f3, f4, f5"),
|
||||
((63u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (18 << 1), 0x82000000, "fdiv f3, f4, f5"),
|
||||
((63u32 << 26) | (3 << 21) | (5 << 21) | (5 << 11) | (25 << 1), 0x82000000, "fmul f3, f0, f5 (encoded)"),
|
||||
((63u32 << 26) | (3 << 21) | (4 << 16) | (40 << 1), 0x82000000, "fneg f3, f4"),
|
||||
((63u32 << 26) | (3 << 21) | (4 << 16) | (72 << 1), 0x82000000, "fmr f3, f4"),
|
||||
// mtfsf — XFL form (Fix 1). FM at LSB bits 17-24 (PPC bits 7-14).
|
||||
// Encoding: opcode 63 | FM<<17 | frB<<11 | XO=711<<1 | Rc.
|
||||
((63u32 << 26) | (0xFF << 17) | (5 << 11) | (711 << 1), 0x82000000, "mtfsf 0xFF, f5 (Rc=0)"),
|
||||
((63u32 << 26) | (0xFF << 17) | (5 << 11) | (711 << 1) | 1, 0x82000000, "mtfsf. 0xFF, f5 (Rc=1)"),
|
||||
];
|
||||
assert_or_regen("base_mnemonics.json", cases);
|
||||
}
|
||||
|
||||
#[test]
fn extended_mnemonics() {
    // Golden test for simplified ("extended") mnemonics. Each row is
    // (raw instruction word, address, free-form label); the whole table is
    // handed to `assert_or_regen` together with the golden file name.
    //
    // Address column shared by every row (the `addr` field of the golden JSON).
    const ADDR: u32 = 0x8200_0000;

    let cases: &[(u32, u32, &str)] = &[
        // ---- D-form arithmetic / logical aliases ----
        // ori r0, r0, 0 is the canonical nop.
        (dform(24, 0, 0, 0), ADDR, "nop"),
        // addi with rA = 0 loads an immediate (li).
        (dform(14, 3, 0, 16), ADDR, "li r3, 16"),
        (dform(14, 3, 0, -1), ADDR, "li r3, -1"),
        // addi with a negative immediate reads as subi.
        (dform(14, 3, 4, -16), ADDR, "subi r3, r4, 16"),
        // addis with rA = 0 loads a shifted immediate (lis).
        (dform(15, 3, 0, 0x1234), ADDR, "lis r3, 0x1234"),
        // addis with a negative immediate reads as subis.
        // NOTE(review): the label says 0xFFFF, while the golden file records
        // the extended operands as "r3, r4, 0x1"; the label is descriptive only.
        (dform(15, 3, 4, -1), ADDR, "subis r3, r4, 0xFFFF"),

        // ---- X-form register aliases ----
        // or with both sources equal is a register move (mr / mr.).
        (xform_logic(4, 3, 4, 444, 0), ADDR, "mr r3, r4"),
        (xform_logic(4, 3, 4, 444, 1), ADDR, "mr. r3, r4"),
        // and with both sources equal is also treated as a move.
        (xform_logic(4, 3, 4, 28, 0), ADDR, "mr (via and)"),
        // nor with both sources equal is a bitwise not.
        (xform_logic(4, 3, 4, 124, 0), ADDR, "not r3, r4"),
        // subf becomes sub with the two sources swapped.
        (xform_xo3(3, 4, 5, 0, 40, 0), ADDR, "subf → sub r3, r5, r4"),

        // ---- rlwinm aliases ----
        (rlwinm(4, 3, 4, 0, 31 - 4, 0), ADDR, "slwi r3, r4, 4"),
        (rlwinm(4, 3, 32 - 4, 4, 31, 0), ADDR, "srwi r3, r4, 4"),
        (rlwinm(4, 3, 8, 0, 31, 0), ADDR, "rotlwi r3, r4, 8"),
        (rlwinm(4, 3, 0, 4, 31, 0), ADDR, "clrlwi r3, r4, 4"),
        (rlwinm(4, 3, 0, 0, 27, 0), ADDR, "clrrwi r3, r4, 4"),
        (rlwinm(4, 3, 8, 0, 7, 0), ADDR, "extlwi r3, r4, 8, 8"),
        // Same as slwi above but with the record bit set.
        (rlwinm(4, 3, 4, 0, 31 - 4, 1), ADDR, "slwi. r3, r4, 4"),
        // Regression row (Sylpheed): this form must stay a plain rlwinm.
        (rlwinm(11, 11, 0, 31, 31, 1), ADDR, "rlwinm. r11,r11,0,31,31 (no simplify)"),

        // ---- rldicl aliases ----
        (rldicl(4, 3, 0, 32, 0), ADDR, "clrldi r3, r4, 32"),
        (rldicl(4, 3, 64u32 - 8, 8, 0), ADDR, "srdi r3, r4, 8"),
        (rldicl(4, 3, 8, 0, 0), ADDR, "rotldi r3, r4, 8"),

        // ---- cmpi / cmpli → cmpwi / cmpdi / cmplwi / cmpldi ----
        (dform(11, 0, 3, 16), ADDR, "cmpwi cr0, r3, 16"),
        // NOTE(review): the second argument is already shifted into word
        // position, unlike every other dform call in this table — confirm how
        // dform treats it before relying on this row.
        (dform(11, (1 << 21) | (2 << 23), 3, 16) | (1 << 21), ADDR, "cmpdi (L=1) variant"),

        // ---- Branch-to-LR / branch-to-CTR ----
        // BO=20, BI=0: unconditional return / indirect jump.
        (xlform_bclr(20, 0, 0), ADDR, "blr"),
        (xlform_bclr(20, 0, 1), ADDR, "blrl"),
        (xlform_bcctr(20, 0, 0), ADDR, "bctr"),
        (xlform_bcctr(20, 0, 1), ADDR, "bctrl"),
        // Conditional returns.
        (xlform_bclr(12, 2, 0), ADDR, "beqlr (BO=12, BI=2 → cr0.eq true)"),
        (xlform_bclr(4, 2, 0), ADDR, "bnelr"),

        // ---- B-form conditional branches ----
        // BO=20 is "branch always".
        (bform_bc(20, 0, 0x40, 0, 0), ADDR, "bc → b 0x82000040"),
        (bform_bc(20, 0, 0x40, 0, 1), ADDR, "bc l → bl 0x82000040"),
        // BO=12 (branch if true) / BO=4 (branch if false) on each cr0 bit.
        (bform_bc(12, 2, 0x40, 0, 0), ADDR, "bc 12,cr0.eq → beq 0x82000040"),
        (bform_bc(4, 2, 0x40, 0, 0), ADDR, "bc 4,cr0.eq → bne 0x82000040"),
        (bform_bc(12, 0, 0x40, 0, 0), ADDR, "bc 12,cr0.lt → blt 0x82000040"),
        (bform_bc(4, 0, 0x40, 0, 0), ADDR, "bc 4,cr0.lt → bge 0x82000040"),
        (bform_bc(12, 1, 0x40, 0, 0), ADDR, "bc 12,cr0.gt → bgt 0x82000040"),
        (bform_bc(4, 1, 0x40, 0, 0), ADDR, "bc 4,cr0.gt → ble 0x82000040"),
        // Same condition taken from a CR field other than cr0.
        (bform_bc(12, 2 + 8, 0x40, 0, 0), ADDR, "bc 12, cr2.eq → beq cr2, 0x...040"),
        // Decrement-CTR branches.
        (bform_bc(16, 0, 0x40, 0, 0), ADDR, "bdnz 0x82000040"),
        (bform_bc(18, 0, 0x40, 0, 0), ADDR, "bdz 0x82000040"),

        // ---- I-form branches (relative / absolute, with and without link) ----
        (iform_b(0x40, 0, 0), ADDR, "b +0x40 → 0x82000040"),
        (iform_b(0x40, 0, 1), ADDR, "bl +0x40 → 0x82000040"),
        (iform_b(0x40, 1, 0), ADDR, "ba 0x40 absolute"),
        (iform_b(0x40, 1, 1), ADDR, "bla 0x40 absolute"),

        // ---- Trap-immediate aliases ----
        ((2u32 << 26) | (4 << 21) | (3 << 16) | (123u32 & 0xFFFF), ADDR, "tdeqi r3, 123"),
        ((3u32 << 26) | (16 << 21) | (3 << 16) | (123u32 & 0xFFFF), ADDR, "twlti r3, 123"),

        // ---- SPR moves: mfspr/mtspr on SPR 8, 9, 1 → LR, CTR, XER ----
        (mfspr(3, 8), ADDR, "mflr r3"),
        (mfspr(3, 9), ADDR, "mfctr r3"),
        (mfspr(3, 1), ADDR, "mfxer r3"),
        (mtspr(3, 8), ADDR, "mtlr r3"),
        (mtspr(3, 9), ADDR, "mtctr r3"),
        (mtspr(3, 1), ADDR, "mtxer r3"),

        // ---- CR-logical aliases ----
        // crnor with identical sources → crnot.
        ((19 << 26) | (4 << 21) | (5 << 16) | (5 << 11) | (33 << 1), ADDR, "crnot 4, 5"),
        // crxor with all three fields identical → crclr.
        ((19 << 26) | (4 << 21) | (4 << 16) | (4 << 11) | (193 << 1), ADDR, "crclr 4"),
        // creqv with all three fields identical → crset.
        ((19 << 26) | (4 << 21) | (4 << 16) | (4 << 11) | (289 << 1), ADDR, "crset 4"),
        // cror with identical sources → crmove.
        ((19 << 26) | (4 << 21) | (5 << 16) | (5 << 11) | (449 << 1), ADDR, "crmove 4, 5"),

        // sync with L=1 → lwsync.
        ((31 << 26) | (1 << 21) | (598 << 1), ADDR, "lwsync"),
        // tw 31, 0, 0 → trap.
        ((31 << 26) | (31 << 21) | (4 << 1), ADDR, "trap"),

        // ---- Fix 2 ----
        // With BO=20 neither the CTR test nor the CR test applies, so BI is a
        // don't-care: a non-zero BI must still yield blr / bctr.
        (xlform_bclr(20, 4, 0), ADDR, "blr (BO=20, BI=4 — BI is don't-care)"),
        (xlform_bclr(20, 7, 1), ADDR, "blrl (BO=20, BI=7)"),
        (xlform_bcctr(20, 4, 0), ADDR, "bctr (BO=20, BI=4)"),

        // ---- Fix 3 ----
        // Unsigned (logical-compare) trap aliases, TO = 1, 2, 5, 6, in both the
        // register form (tw/td) and the immediate form (twi/tdi).
        ((31u32 << 26) | (2 << 21) | (3 << 16) | (4 << 11) | (4 << 1), ADDR, "twllt r3, r4 (TO=2)"),
        ((31u32 << 26) | (1 << 21) | (3 << 16) | (4 << 11) | (4 << 1), ADDR, "twlgt r3, r4 (TO=1)"),
        ((31u32 << 26) | (5 << 21) | (3 << 16) | (4 << 11) | (68 << 1), ADDR, "tdlge r3, r4 (TO=5)"),
        ((31u32 << 26) | (6 << 21) | (3 << 16) | (4 << 11) | (4 << 1), ADDR, "twlle r3, r4 (TO=6)"),
        ((3u32 << 26) | (2 << 21) | (3 << 16) | (16u32 & 0xFFFF), ADDR, "twllti r3, 16"),
        ((2u32 << 26) | (5 << 21) | (3 << 16) | (16u32 & 0xFFFF), ADDR, "tdlgei r3, 16"),
    ];

    assert_or_regen("extended_mnemonics.json", cases);
}
|
||||
|
||||
#[test]
fn vmx128_registers() {
    // Golden test for vector-register rendering: standard VMX (5-bit register
    // fields) plus VMX128 encodings whose register numbers go up to 127.
    // Rows are (raw instruction word, address, free-form label).

    // vsrw128 with the given destination/source registers.
    // Secondary opcode 0x000001D0 (decode_op6 key5 = 0b011101).
    fn vsrw128(vd: u32, vb: u32) -> u32 {
        encode_vx128(6, vd, 0, vb, 0x000001D0)
    }

    // vpermwi128: PERMl at PPC 11-15, PERMh at PPC 23-25; key1 sets
    // PPC bit 22 and PPC bit 27.
    fn vpermwi128(vd: u32, vb: u32, perm: u32) -> u32 {
        let vd_lo = (vd & 0x1F) << 21;
        let vd_hi = ((vd >> 5) & 0x3) << 2; // VD128h
        let vb_lo = (vb & 0x1F) << 11;
        let vb_hi = (vb >> 5) & 0x3; // VB128h
        let perm_lo = (perm & 0x1F) << 16; // PERMl
        let perm_hi = ((perm >> 5) & 0x7) << 6; // PERMh at PPC 23-25
        let key1 = (1 << 9) | (1 << 4); // PPC bit 22 (high) + PPC bit 27 (low)
        let word = (6u32 << 26) | vd_lo | vd_hi | perm_lo | vb_lo | vb_hi | perm_hi | key1;
        // PPC bit 21 must be 0 for vpermwi128.
        word & !(1u32 << 10)
    }

    // vrlimi128: IMM at PPC 11-15, z at PPC 24-25; key2 = 0b1110001 over
    // PPC bits 21-23 + 26-27, i.e. bits 21, 22, 23 = 1, bit 26 = 0, bit 27 = 1.
    fn vrlimi128(vd: u32, vb: u32, imm: u32, z: u32) -> u32 {
        let vd_lo = (vd & 0x1F) << 21;
        let vd_hi = ((vd >> 5) & 0x3) << 2; // VD128h
        let vb_lo = (vb & 0x1F) << 11;
        let vb_hi = (vb >> 5) & 0x3; // VB128h
        let imm_bits = (imm & 0x1F) << 16;
        let z_bits = (z & 0x3) << 6; // PPC 24-25 = host bits 6-7
        let key2 = (1 << 10) | (1 << 9) | (1 << 8) | (1 << 4); // PPC 21, 22, 23, 27
        (6u32 << 26) | vd_lo | vd_hi | imm_bits | vb_lo | vb_hi | z_bits | key2
    }

    // Address column shared by every row.
    let addr = 0x82000000;

    // Standard VMX (primary opcode 4): 5-bit registers v0..v31. Checks that
    // the low-register path renders correctly through the new formatter.
    let std_vmx = [
        // Three-operand VX forms, selected by the low bits:
        // 0 = vaddubm, 10 = vaddfp, 1028 = vand, 1156 = vor, 1220 = vxor.
        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11), addr, "vaddubm v3, v4, v5"),
        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | 10, addr, "vaddfp v3, v4, v5"),
        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | 1028, addr, "vand v3, v4, v5"),
        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | 1156, addr, "vor v3, v4, v5"),
        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | 1220, addr, "vxor v3, v4, v5"),
        // Four-operand VA forms: 42 = vsel, 43 = vperm, 46 = vmaddfp.
        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (6 << 6) | 42, addr, "vsel v3,v4,v5,v6"),
        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (6 << 6) | 43, addr, "vperm v3,v4,v5,v6"),
        // vmaddfp prints its last two operands swapped (vd, va, vc, vb).
        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (6 << 6) | 46, addr, "vmaddfp v3, v4, v6, v5 (swap)"),
        // VSCR moves: 1540 = mfvscr (destination only), 1604 = mtvscr (vb only).
        ((4u32 << 26) | (3 << 21) | 1540, addr, "mfvscr v3"),
        ((4u32 << 26) | (5 << 11) | 1604, addr, "mtvscr v5"),
    ];

    // VMX128 primary opcode 5: vperm128 v3, v4, v5 with vc = 0.
    // Canary FormatVX128: VD low at PPC 6-10, VA low at PPC 11-15, VB low at
    // PPC 16-20, VC at PPC 23-25. key1 = (bit22<<5)|bit27 = 0 selects vperm128.
    let vmx128_op5 = [
        (encode_vx128(5, 3, 4, 5, 0), addr, "vperm128 v3, v4, v5, 0 (canary)"),
    ];

    // VMX128 primary opcode 6: exercise the full 0-127 vd128 range under the
    // canary layout. VD128h sits at PPC 28-29 (host bits 2-3) and does not
    // overlap the secondary opcode key, so vd may be any value 0-127.
    let vmx128_high = [
        (vsrw128(0, 12), addr, "vsrw128 v0, v0, v12 (canary, vd_hi=00)"),
        (vsrw128(32, 12), addr, "vsrw128 v32, v0, v12 (canary, VD128h=01)"),
        (vpermwi128(64, 12, 0xE4), addr, "vpermwi128 v64, v12, 0xE4 (canary, VD128h=10)"),
        (vrlimi128(96, 12, 4, 3), addr, "vrlimi128 v96, v12, 4, 3 (canary, VD128h=11)"),
        (vrlimi128(127, 95, 4, 3), addr, "vrlimi128 v127, v95, 4, 3 (canary)"),
    ];

    // Fix 4: VMX128 multiply-add four-operand layouts. Per canary the addend
    // is the VD register re-used, and the operand order differs by mnemonic:
    //   vmaddfp128   VD, VA, VB, VD  → "v3, v35, v5, v3"
    //   vmaddcfp128  VD, VA, VD, VB  → "v3, v35, v3, v5"
    //   vnmsubfp128  VD, VA, VD, VB  → "v3, v35, v3, v5"
    // The words are hand-built to match decode_op5's key2 secondary opcode
    // (vmaddfp128=0b001101, vmaddcfp128=0b010001, vnmsubfp128=0b010101) with
    // bit 22 = 0. Fields: vd = 3 (PPC 6-10), va = 35 (low bits 3 at PPC 11-15
    // plus VA128h = 1), vb = 5 (PPC 16-20). VD and VA differ on purpose so a
    // formatter that merely aliases VD would be caught.
    let vmx128_4op = [
        (0x146328F0u32, addr, "vmaddfp128 v3, v35, v5, v3"),
        (0x14632930u32, addr, "vmaddcfp128 v3, v35, v3, v5"),
        (0x14632970u32, addr, "vnmsubfp128 v3, v35, v3, v5"),
    ];

    // Concatenate the groups in their original order and check them as one table.
    let mut all = Vec::new();
    for group in [&std_vmx[..], &vmx128_op5[..], &vmx128_high[..], &vmx128_4op[..]] {
        all.extend_from_slice(group);
    }
    assert_or_regen("vmx128_registers.json", &all);
}
|
||||
|
||||
#[test]
fn sradi_shift_32_decodes_to_32() {
    // Regression guard for PPCBUG-040: a 6-bit shift of 32 has sh[4:0] = 0 and
    // sh[5] = 1, and sh64() must report 32 rather than 1.
    // Despite the test name, the word is built with rldicl (mb = 63), not
    // sradi; it is used here only to exercise the sh64() field extraction.
    let word = rldicl(3, 4, 32, 63, 0);
    let decoded: DecodedInstr = decode(word, 0);
    assert_eq!(decoded.sh64(), 32, "sh64 must return 32 for sh=32 (sh5=1, sh_lo=0)");
}
|
||||
|
||||
#[test]
fn sh64_shift_1_decodes_correctly() {
    // Low-bits-only case: sh = 1 has sh[4:0] = 1 and sh[5] = 0, so sh64()
    // must come back as 1.
    let word = rldicl(3, 4, 1, 0, 0);
    let decoded: DecodedInstr = decode(word, 0);
    assert_eq!(decoded.sh64(), 1, "sh64 must return 1 for sh=1");
}
|
||||
|
||||
#[test]
fn sh64_shift_63_decodes_correctly() {
    // All-bits-set case: sh = 63 has sh[4:0] = 0x1F and sh[5] = 1, so sh64()
    // must come back as 63.
    let word = rldicl(3, 4, 63, 0, 0);
    let decoded: DecodedInstr = decode(word, 0);
    assert_eq!(decoded.sh64(), 63, "sh64 must return 63 for sh=63");
}
|
||||
571
crates/xenia-cpu/tests/golden/base_mnemonics.json
Normal file
571
crates/xenia-cpu/tests/golden/base_mnemonics.json
Normal file
@@ -0,0 +1,571 @@
|
||||
{
|
||||
"rows": [
|
||||
{
|
||||
"label": "add r3,r4,r5",
|
||||
"raw": "0x7C642A14",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "add",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "add. r3,r4,r5",
|
||||
"raw": "0x7C642A15",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "add.",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "addo r3,r4,r5",
|
||||
"raw": "0x7C642E14",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "addo",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "addo. r3,r4,r5",
|
||||
"raw": "0x7C642E15",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "addo.",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "neg r3,r4",
|
||||
"raw": "0x7C6400D0",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "neg",
|
||||
"operands": "r3, r4"
|
||||
},
|
||||
{
|
||||
"label": "mullw r3,r4,r5",
|
||||
"raw": "0x7C6429D6",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mullw",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "divw r3,r4,r5",
|
||||
"raw": "0x7C642BD6",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "divw",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "mulhw. r3,r4,r5",
|
||||
"raw": "0x7C642897",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mulhw.",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "mulhwu. r3,r4,r5",
|
||||
"raw": "0x7C642817",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mulhwu.",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "mulld r3,r4,r5",
|
||||
"raw": "0x7C6429D2",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mulld",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "and r3,r4,r5",
|
||||
"raw": "0x7C832838",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "and",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "or r3,r4,r5 (non-mr: rs!=rb)",
|
||||
"raw": "0x7C832B78",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "or",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "xor r3,r4,r5",
|
||||
"raw": "0x7C832A78",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "xor",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "nor r3,r4,r5",
|
||||
"raw": "0x7C8328F8",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "nor",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "nand r3,r4,r5",
|
||||
"raw": "0x7C832BB8",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "nand",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "eqv r3,r4,r5",
|
||||
"raw": "0x7C832A38",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "eqv",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "andc r3,r4,r5",
|
||||
"raw": "0x7C832878",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "andc",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "orc r3,r4,r5",
|
||||
"raw": "0x7C832B38",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "orc",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "slw r3,r4,r5",
|
||||
"raw": "0x7C832830",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "slw",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "srw r3,r4,r5",
|
||||
"raw": "0x7C832C30",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "srw",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "sraw r3,r4,r5",
|
||||
"raw": "0x7C832E30",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "sraw",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "sld r3,r4,r5",
|
||||
"raw": "0x7C832836",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "sld",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "srd r3,r4,r5",
|
||||
"raw": "0x7C832C36",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "srd",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "srawi r3,r4,16",
|
||||
"raw": "0x7C838670",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "srawi",
|
||||
"operands": "r3, r4, 16"
|
||||
},
|
||||
{
|
||||
"label": "stwcx. r3,r4,r5",
|
||||
"raw": "0x7C64292D",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "stwcx.",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "stdcx. r3,r4,r5",
|
||||
"raw": "0x7C6429AD",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "stdcx.",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "lwarx r3,r4,r5",
|
||||
"raw": "0x7C642828",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "lwarx",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "ldarx r3,r4,r5",
|
||||
"raw": "0x7C6428A8",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "ldarx",
|
||||
"operands": "r3, r4, r5"
|
||||
},
|
||||
{
|
||||
"label": "cmpwi cr0, r3, 16",
|
||||
"raw": "0x2C030010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "cmpi",
|
||||
"operands": "0, r3, 16",
|
||||
"ext_mnemonic": "cmpwi",
|
||||
"ext_operands": "r3, 16"
|
||||
},
|
||||
{
|
||||
"label": "cmpwi cr2, r3, 16",
|
||||
"raw": "0x2D030010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "cmpi",
|
||||
"operands": "cr2, 0, r3, 16",
|
||||
"ext_mnemonic": "cmpwi",
|
||||
"ext_operands": "cr2, r3, 16"
|
||||
},
|
||||
{
|
||||
"label": "cmplwi cr0, r3, 16",
|
||||
"raw": "0x28030010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "cmpli",
|
||||
"operands": "0, r3, 0x10",
|
||||
"ext_mnemonic": "cmplwi",
|
||||
"ext_operands": "r3, 0x10"
|
||||
},
|
||||
{
|
||||
"label": "cmpw r3,r4 in cr0",
|
||||
"raw": "0x7C032000",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "cmp",
|
||||
"operands": "0, r3, r4",
|
||||
"ext_mnemonic": "cmpw",
|
||||
"ext_operands": "r3, r4"
|
||||
},
|
||||
{
|
||||
"label": "cmpd r3,r4",
|
||||
"raw": "0x7C232000",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "cmp",
|
||||
"operands": "1, r3, r4",
|
||||
"ext_mnemonic": "cmpd",
|
||||
"ext_operands": "r3, r4"
|
||||
},
|
||||
{
|
||||
"label": "cmplw r3,r4",
|
||||
"raw": "0x7C032040",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "cmpl",
|
||||
"operands": "0, r3, r4",
|
||||
"ext_mnemonic": "cmplw",
|
||||
"ext_operands": "r3, r4"
|
||||
},
|
||||
{
|
||||
"label": "addi r3, r1, 16",
|
||||
"raw": "0x38610010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "addi",
|
||||
"operands": "r3, r1, 16"
|
||||
},
|
||||
{
|
||||
"label": "addis r3, r1, 0x100 (ra!=0)",
|
||||
"raw": "0x3C610100",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "addis",
|
||||
"operands": "r3, r1, 0x100"
|
||||
},
|
||||
{
|
||||
"label": "mulli r3, r4, 5",
|
||||
"raw": "0x1C640005",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mulli",
|
||||
"operands": "r3, r4, 5"
|
||||
},
|
||||
{
|
||||
"label": "subfic r3, r4, 5",
|
||||
"raw": "0x20640005",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "subfic",
|
||||
"operands": "r3, r4, 5"
|
||||
},
|
||||
{
|
||||
"label": "addic r3, r4, 16",
|
||||
"raw": "0x30640010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "addic",
|
||||
"operands": "r3, r4, 16"
|
||||
},
|
||||
{
|
||||
"label": "addic. r3, r4, 16",
|
||||
"raw": "0x34640010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "addic.",
|
||||
"operands": "r3, r4, 16"
|
||||
},
|
||||
{
|
||||
"label": "ori r4, r3, 0x10 (non-nop)",
|
||||
"raw": "0x60640010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "ori",
|
||||
"operands": "r4, r3, 0x10"
|
||||
},
|
||||
{
|
||||
"label": "oris r4, r3, 0x10",
|
||||
"raw": "0x64640010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "oris",
|
||||
"operands": "r4, r3, 0x10"
|
||||
},
|
||||
{
|
||||
"label": "xori r4, r3, 0x10",
|
||||
"raw": "0x68640010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "xori",
|
||||
"operands": "r4, r3, 0x10"
|
||||
},
|
||||
{
|
||||
"label": "andi. r4, r3, 0x10",
|
||||
"raw": "0x70640010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "andi.",
|
||||
"operands": "r4, r3, 0x10"
|
||||
},
|
||||
{
|
||||
"label": "lwz r5, 0x20(r1)",
|
||||
"raw": "0x80A10020",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "lwz",
|
||||
"operands": "r5, 32(r1)"
|
||||
},
|
||||
{
|
||||
"label": "stw r5, 0x20(r1)",
|
||||
"raw": "0x90A10020",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "stw",
|
||||
"operands": "r5, 32(r1)"
|
||||
},
|
||||
{
|
||||
"label": "lbz r5, 0x20(r1)",
|
||||
"raw": "0x88A10020",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "lbz",
|
||||
"operands": "r5, 32(r1)"
|
||||
},
|
||||
{
|
||||
"label": "lhz r5, 0x20(r1)",
|
||||
"raw": "0xA0A10020",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "lhz",
|
||||
"operands": "r5, 32(r1)"
|
||||
},
|
||||
{
|
||||
"label": "lfs f5, 0x20(r1)",
|
||||
"raw": "0xC0A10020",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "lfs",
|
||||
"operands": "f5, 32(r1)"
|
||||
},
|
||||
{
|
||||
"label": "lfd f5, 0x20(r1)",
|
||||
"raw": "0xC8A10020",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "lfd",
|
||||
"operands": "f5, 32(r1)"
|
||||
},
|
||||
{
|
||||
"label": "stfd f5, 0x20(r1)",
|
||||
"raw": "0xD8A10020",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "stfd",
|
||||
"operands": "f5, 32(r1)"
|
||||
},
|
||||
{
|
||||
"label": "ld r5, 0x20(r1)",
|
||||
"raw": "0xE8A10020",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "ld",
|
||||
"operands": "r5, 32(r1)"
|
||||
},
|
||||
{
|
||||
"label": "std r5, 0x20(r1)",
|
||||
"raw": "0xF8A10020",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "std",
|
||||
"operands": "r5, 32(r1)"
|
||||
},
|
||||
{
|
||||
"label": "sync 0 (extends to sync)",
|
||||
"raw": "0x7C0004AC",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "sync",
|
||||
"operands": ""
|
||||
},
|
||||
{
|
||||
"label": "isync",
|
||||
"raw": "0x4C00012C",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "isync",
|
||||
"operands": ""
|
||||
},
|
||||
{
|
||||
"label": "eieio",
|
||||
"raw": "0x7C0006AC",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "eieio",
|
||||
"operands": ""
|
||||
},
|
||||
{
|
||||
"label": "dcbst r1, r2",
|
||||
"raw": "0x7C01106C",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "dcbst",
|
||||
"operands": "r1, r2"
|
||||
},
|
||||
{
|
||||
"label": "dcbf r1, r2",
|
||||
"raw": "0x7C0110AC",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "dcbf",
|
||||
"operands": "r1, r2"
|
||||
},
|
||||
{
|
||||
"label": "dcbt r1, r2",
|
||||
"raw": "0x7C01122C",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "dcbt",
|
||||
"operands": "r1, r2"
|
||||
},
|
||||
{
|
||||
"label": "dcbz r1, r2",
|
||||
"raw": "0x7C0117EC",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "dcbz",
|
||||
"operands": "r1, r2"
|
||||
},
|
||||
{
|
||||
"label": "dcbz128 r1, r2",
|
||||
"raw": "0x7C2117EC",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "dcbz128",
|
||||
"operands": "r1, r2"
|
||||
},
|
||||
{
|
||||
"label": "crnor 4,5,6 (no simplify)",
|
||||
"raw": "0x4C853042",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "crnor",
|
||||
"operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+eq"
|
||||
},
|
||||
{
|
||||
"label": "crand 4,5,6",
|
||||
"raw": "0x4C853202",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "crand",
|
||||
"operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+eq"
|
||||
},
|
||||
{
|
||||
"label": "cror 4,5,6 (no simplify)",
|
||||
"raw": "0x4C853382",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "cror",
|
||||
"operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+eq"
|
||||
},
|
||||
{
|
||||
"label": "tw 11, r3, r4 (uncommon TO)",
|
||||
"raw": "0x7D632008",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "tw",
|
||||
"operands": "11, r3, r4"
|
||||
},
|
||||
{
|
||||
"label": "tdi 11, r3, 123",
|
||||
"raw": "0x0963007B",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "tdi",
|
||||
"operands": "11, r3, 123"
|
||||
},
|
||||
{
|
||||
"label": "mtcrf 0xFF, r5 → mtcr",
|
||||
"raw": "0x7CAFF120",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mtcrf",
|
||||
"operands": "0xFF, r5",
|
||||
"ext_mnemonic": "mtcr",
|
||||
"ext_operands": "r5"
|
||||
},
|
||||
{
|
||||
"label": "mfcr r5",
|
||||
"raw": "0x7CA00026",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mfcr",
|
||||
"operands": "r5"
|
||||
},
|
||||
{
|
||||
"label": "mfmsr r5",
|
||||
"raw": "0x7CA000A6",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mfmsr",
|
||||
"operands": "r5"
|
||||
},
|
||||
{
|
||||
"label": "mtmsr r5",
|
||||
"raw": "0x7CA00124",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mtmsr",
|
||||
"operands": "r5"
|
||||
},
|
||||
{
|
||||
"label": "mtmsrd r5",
|
||||
"raw": "0x7CA00164",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mtmsrd",
|
||||
"operands": "r5"
|
||||
},
|
||||
{
|
||||
"label": "fadd f3, f4, f5",
|
||||
"raw": "0xFC64282A",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "fadd",
|
||||
"operands": "f3, f4, f5"
|
||||
},
|
||||
{
|
||||
"label": "fsub f3, f4, f5",
|
||||
"raw": "0xFC642828",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "fsub",
|
||||
"operands": "f3, f4, f5"
|
||||
},
|
||||
{
|
||||
"label": "fdiv f3, f4, f5",
|
||||
"raw": "0xFC642824",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "fdiv",
|
||||
"operands": "f3, f4, f5"
|
||||
},
|
||||
{
|
||||
"label": "fmul f3, f0, f5 (encoded)",
|
||||
"raw": "0xFCE02832",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "fmul",
|
||||
"operands": "f7, f0, f0"
|
||||
},
|
||||
{
|
||||
"label": "fneg f3, f4",
|
||||
"raw": "0xFC640050",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "fneg",
|
||||
"operands": "f3, f0"
|
||||
},
|
||||
{
|
||||
"label": "fmr f3, f4",
|
||||
"raw": "0xFC640090",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "fmr",
|
||||
"operands": "f3, f0"
|
||||
},
|
||||
{
|
||||
"label": "mtfsf 0xFF, f5 (Rc=0)",
|
||||
"raw": "0xFDFE2D8E",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mtfsf",
|
||||
"operands": "0xFF, f5"
|
||||
},
|
||||
{
|
||||
"label": "mtfsf. 0xFF, f5 (Rc=1)",
|
||||
"raw": "0xFDFE2D8F",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mtfsf.",
|
||||
"operands": "0xFF, f5"
|
||||
}
|
||||
]
|
||||
}
|
||||
623
crates/xenia-cpu/tests/golden/extended_mnemonics.json
Normal file
623
crates/xenia-cpu/tests/golden/extended_mnemonics.json
Normal file
@@ -0,0 +1,623 @@
|
||||
{
|
||||
"rows": [
|
||||
{
|
||||
"label": "nop",
|
||||
"raw": "0x60000000",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "ori",
|
||||
"operands": "r0, r0, 0x0",
|
||||
"ext_mnemonic": "nop",
|
||||
"ext_operands": ""
|
||||
},
|
||||
{
|
||||
"label": "li r3, 16",
|
||||
"raw": "0x38600010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "addi",
|
||||
"operands": "r3, r0, 16",
|
||||
"ext_mnemonic": "li",
|
||||
"ext_operands": "r3, 16"
|
||||
},
|
||||
{
|
||||
"label": "li r3, -1",
|
||||
"raw": "0x3860FFFF",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "addi",
|
||||
"operands": "r3, r0, -1",
|
||||
"ext_mnemonic": "li",
|
||||
"ext_operands": "r3, -1"
|
||||
},
|
||||
{
|
||||
"label": "subi r3, r4, 16",
|
||||
"raw": "0x3864FFF0",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "addi",
|
||||
"operands": "r3, r4, -16",
|
||||
"ext_mnemonic": "subi",
|
||||
"ext_operands": "r3, r4, 16"
|
||||
},
|
||||
{
|
||||
"label": "lis r3, 0x1234",
|
||||
"raw": "0x3C601234",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "addis",
|
||||
"operands": "r3, r0, 0x1234",
|
||||
"ext_mnemonic": "lis",
|
||||
"ext_operands": "r3, 0x1234"
|
||||
},
|
||||
{
|
||||
"label": "subis r3, r4, 0xFFFF",
|
||||
"raw": "0x3C64FFFF",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "addis",
|
||||
"operands": "r3, r4, 0xFFFF",
|
||||
"ext_mnemonic": "subis",
|
||||
"ext_operands": "r3, r4, 0x1"
|
||||
},
|
||||
{
|
||||
"label": "mr r3, r4",
|
||||
"raw": "0x7C832378",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "or",
|
||||
"operands": "r3, r4, r4",
|
||||
"ext_mnemonic": "mr",
|
||||
"ext_operands": "r3, r4"
|
||||
},
|
||||
{
|
||||
"label": "mr. r3, r4",
|
||||
"raw": "0x7C832379",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "or.",
|
||||
"operands": "r3, r4, r4",
|
||||
"ext_mnemonic": "mr.",
|
||||
"ext_operands": "r3, r4"
|
||||
},
|
||||
{
|
||||
"label": "mr (via and)",
|
||||
"raw": "0x7C832038",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "and",
|
||||
"operands": "r3, r4, r4",
|
||||
"ext_mnemonic": "mr",
|
||||
"ext_operands": "r3, r4"
|
||||
},
|
||||
{
|
||||
"label": "not r3, r4",
|
||||
"raw": "0x7C8320F8",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "nor",
|
||||
"operands": "r3, r4, r4",
|
||||
"ext_mnemonic": "not",
|
||||
"ext_operands": "r3, r4"
|
||||
},
|
||||
{
|
||||
"label": "subf → sub r3, r5, r4",
|
||||
"raw": "0x7C642850",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "subf",
|
||||
"operands": "r3, r4, r5",
|
||||
"ext_mnemonic": "sub",
|
||||
"ext_operands": "r3, r5, r4"
|
||||
},
|
||||
{
|
||||
"label": "slwi r3, r4, 4",
|
||||
"raw": "0x54832036",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "rlwinm",
|
||||
"operands": "r3, r4, 4, 0, 27",
|
||||
"ext_mnemonic": "slwi",
|
||||
"ext_operands": "r3, r4, 4"
|
||||
},
|
||||
{
|
||||
"label": "srwi r3, r4, 4",
|
||||
"raw": "0x5483E13E",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "rlwinm",
|
||||
"operands": "r3, r4, 28, 4, 31",
|
||||
"ext_mnemonic": "srwi",
|
||||
"ext_operands": "r3, r4, 4"
|
||||
},
|
||||
{
|
||||
"label": "rotlwi r3, r4, 8",
|
||||
"raw": "0x5483403E",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "rlwinm",
|
||||
"operands": "r3, r4, 8, 0, 31",
|
||||
"ext_mnemonic": "rotlwi",
|
||||
"ext_operands": "r3, r4, 8"
|
||||
},
|
||||
{
|
||||
"label": "clrlwi r3, r4, 4",
|
||||
"raw": "0x5483013E",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "rlwinm",
|
||||
"operands": "r3, r4, 0, 4, 31",
|
||||
"ext_mnemonic": "clrlwi",
|
||||
"ext_operands": "r3, r4, 4"
|
||||
},
|
||||
{
|
||||
"label": "clrrwi r3, r4, 4",
|
||||
"raw": "0x54830036",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "rlwinm",
|
||||
"operands": "r3, r4, 0, 0, 27",
|
||||
"ext_mnemonic": "clrrwi",
|
||||
"ext_operands": "r3, r4, 4"
|
||||
},
|
||||
{
|
||||
"label": "extlwi r3, r4, 8, 8",
|
||||
"raw": "0x5483400E",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "rlwinm",
|
||||
"operands": "r3, r4, 8, 0, 7",
|
||||
"ext_mnemonic": "extlwi",
|
||||
"ext_operands": "r3, r4, 8, 8"
|
||||
},
|
||||
{
|
||||
"label": "slwi. r3, r4, 4",
|
||||
"raw": "0x54832037",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "rlwinm.",
|
||||
"operands": "r3, r4, 4, 0, 27",
|
||||
"ext_mnemonic": "slwi.",
|
||||
"ext_operands": "r3, r4, 4"
|
||||
},
|
||||
{
|
||||
"label": "rlwinm. r11,r11,0,31,31 (no simplify)",
|
||||
"raw": "0x556B07FF",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "rlwinm.",
|
||||
"operands": "r11, r11, 0, 31, 31",
|
||||
"ext_mnemonic": "clrlwi.",
|
||||
"ext_operands": "r11, r11, 31"
|
||||
},
|
||||
{
|
||||
"label": "clrldi r3, r4, 32",
|
||||
"raw": "0x78830020",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "rldicl",
|
||||
"operands": "r3, r4, 0, 32",
|
||||
"ext_mnemonic": "clrldi",
|
||||
"ext_operands": "r3, r4, 32"
|
||||
},
|
||||
{
|
||||
"label": "srdi r3, r4, 8",
|
||||
"raw": "0x7883C202",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "rldicl",
|
||||
"operands": "r3, r4, 56, 8",
|
||||
"ext_mnemonic": "srdi",
|
||||
"ext_operands": "r3, r4, 8"
|
||||
},
|
||||
{
|
||||
"label": "rotldi r3, r4, 8",
|
||||
"raw": "0x78834000",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "rldicl",
|
||||
"operands": "r3, r4, 8, 0",
|
||||
"ext_mnemonic": "rotldi",
|
||||
"ext_operands": "r3, r4, 8"
|
||||
},
|
||||
{
|
||||
"label": "cmpwi cr0, r3, 16",
|
||||
"raw": "0x2C030010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "cmpi",
|
||||
"operands": "0, r3, 16",
|
||||
"ext_mnemonic": "cmpwi",
|
||||
"ext_operands": "r3, 16"
|
||||
},
|
||||
{
|
||||
"label": "cmpdi (L=1) variant",
|
||||
"raw": "0x2C230010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "cmpi",
|
||||
"operands": "1, r3, 16",
|
||||
"ext_mnemonic": "cmpdi",
|
||||
"ext_operands": "r3, 16"
|
||||
},
|
||||
{
|
||||
"label": "blr",
|
||||
"raw": "0x4E800020",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bclr",
|
||||
"operands": "20, lt",
|
||||
"ext_mnemonic": "blr",
|
||||
"ext_operands": ""
|
||||
},
|
||||
{
|
||||
"label": "blrl",
|
||||
"raw": "0x4E800021",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bclrl",
|
||||
"operands": "20, lt",
|
||||
"ext_mnemonic": "blrl",
|
||||
"ext_operands": ""
|
||||
},
|
||||
{
|
||||
"label": "bctr",
|
||||
"raw": "0x4E800420",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bcctr",
|
||||
"operands": "20, lt",
|
||||
"ext_mnemonic": "bctr",
|
||||
"ext_operands": ""
|
||||
},
|
||||
{
|
||||
"label": "bctrl",
|
||||
"raw": "0x4E800421",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bcctrl",
|
||||
"operands": "20, lt",
|
||||
"ext_mnemonic": "bctrl",
|
||||
"ext_operands": ""
|
||||
},
|
||||
{
|
||||
"label": "beqlr (BO=12, BI=2 → cr0.eq true)",
|
||||
"raw": "0x4D820020",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bclr",
|
||||
"operands": "12, eq",
|
||||
"ext_mnemonic": "beqlr",
|
||||
"ext_operands": ""
|
||||
},
|
||||
{
|
||||
"label": "bnelr",
|
||||
"raw": "0x4C820020",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bclr",
|
||||
"operands": "4, eq",
|
||||
"ext_mnemonic": "bnelr",
|
||||
"ext_operands": ""
|
||||
},
|
||||
{
|
||||
"label": "bc → b 0x82000040",
|
||||
"raw": "0x42800040",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bc",
|
||||
"operands": "20, lt, 0x82000040",
|
||||
"ext_mnemonic": "b",
|
||||
"ext_operands": "0x82000040",
|
||||
"branch_target": "0x82000040"
|
||||
},
|
||||
{
|
||||
"label": "bc l → bl 0x82000040",
|
||||
"raw": "0x42800041",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bcl",
|
||||
"operands": "20, lt, 0x82000040",
|
||||
"ext_mnemonic": "bl",
|
||||
"ext_operands": "0x82000040",
|
||||
"branch_target": "0x82000040"
|
||||
},
|
||||
{
|
||||
"label": "bc 12,cr0.eq → beq 0x82000040",
|
||||
"raw": "0x41820040",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bc",
|
||||
"operands": "12, eq, 0x82000040",
|
||||
"ext_mnemonic": "beq",
|
||||
"ext_operands": "0x82000040",
|
||||
"branch_target": "0x82000040"
|
||||
},
|
||||
{
|
||||
"label": "bc 4,cr0.eq → bne 0x82000040",
|
||||
"raw": "0x40820040",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bc",
|
||||
"operands": "4, eq, 0x82000040",
|
||||
"ext_mnemonic": "bne",
|
||||
"ext_operands": "0x82000040",
|
||||
"branch_target": "0x82000040"
|
||||
},
|
||||
{
|
||||
"label": "bc 12,cr0.lt → blt 0x82000040",
|
||||
"raw": "0x41800040",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bc",
|
||||
"operands": "12, lt, 0x82000040",
|
||||
"ext_mnemonic": "blt",
|
||||
"ext_operands": "0x82000040",
|
||||
"branch_target": "0x82000040"
|
||||
},
|
||||
{
|
||||
"label": "bc 4,cr0.lt → bge 0x82000040",
|
||||
"raw": "0x40800040",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bc",
|
||||
"operands": "4, lt, 0x82000040",
|
||||
"ext_mnemonic": "bge",
|
||||
"ext_operands": "0x82000040",
|
||||
"branch_target": "0x82000040"
|
||||
},
|
||||
{
|
||||
"label": "bc 12,cr0.gt → bgt 0x82000040",
|
||||
"raw": "0x41810040",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bc",
|
||||
"operands": "12, gt, 0x82000040",
|
||||
"ext_mnemonic": "bgt",
|
||||
"ext_operands": "0x82000040",
|
||||
"branch_target": "0x82000040"
|
||||
},
|
||||
{
|
||||
"label": "bc 4,cr0.gt → ble 0x82000040",
|
||||
"raw": "0x40810040",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bc",
|
||||
"operands": "4, gt, 0x82000040",
|
||||
"ext_mnemonic": "ble",
|
||||
"ext_operands": "0x82000040",
|
||||
"branch_target": "0x82000040"
|
||||
},
|
||||
{
|
||||
"label": "bc 12, cr2.eq → beq cr2, 0x...040",
|
||||
"raw": "0x418A0040",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bc",
|
||||
"operands": "12, 4*cr2+eq, 0x82000040",
|
||||
"ext_mnemonic": "beq",
|
||||
"ext_operands": "cr2, 0x82000040",
|
||||
"branch_target": "0x82000040"
|
||||
},
|
||||
{
|
||||
"label": "bdnz 0x82000040",
|
||||
"raw": "0x42000040",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bc",
|
||||
"operands": "16, lt, 0x82000040",
|
||||
"ext_mnemonic": "bdnz",
|
||||
"ext_operands": "0x82000040",
|
||||
"branch_target": "0x82000040"
|
||||
},
|
||||
{
|
||||
"label": "bdz 0x82000040",
|
||||
"raw": "0x42400040",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bc",
|
||||
"operands": "18, lt, 0x82000040",
|
||||
"ext_mnemonic": "bdz",
|
||||
"ext_operands": "0x82000040",
|
||||
"branch_target": "0x82000040"
|
||||
},
|
||||
{
|
||||
"label": "b +0x40 → 0x82000040",
|
||||
"raw": "0x48000040",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "b",
|
||||
"operands": "0x82000040",
|
||||
"branch_target": "0x82000040"
|
||||
},
|
||||
{
|
||||
"label": "bl +0x40 → 0x82000040",
|
||||
"raw": "0x48000041",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bl",
|
||||
"operands": "0x82000040",
|
||||
"branch_target": "0x82000040"
|
||||
},
|
||||
{
|
||||
"label": "ba 0x40 absolute",
|
||||
"raw": "0x48000042",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "ba",
|
||||
"operands": "0x00000040",
|
||||
"branch_target": "0x00000040"
|
||||
},
|
||||
{
|
||||
"label": "bla 0x40 absolute",
|
||||
"raw": "0x48000043",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bla",
|
||||
"operands": "0x00000040",
|
||||
"branch_target": "0x00000040"
|
||||
},
|
||||
{
|
||||
"label": "tdeqi r3, 123",
|
||||
"raw": "0x0883007B",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "tdi",
|
||||
"operands": "4, r3, 123",
|
||||
"ext_mnemonic": "tdeqi",
|
||||
"ext_operands": "r3, 123"
|
||||
},
|
||||
{
|
||||
"label": "twlti r3, 123",
|
||||
"raw": "0x0E03007B",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "twi",
|
||||
"operands": "16, r3, 123",
|
||||
"ext_mnemonic": "twlti",
|
||||
"ext_operands": "r3, 123"
|
||||
},
|
||||
{
|
||||
"label": "mflr r3",
|
||||
"raw": "0x7C6802A6",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mfspr",
|
||||
"operands": "r3, LR",
|
||||
"ext_mnemonic": "mflr",
|
||||
"ext_operands": "r3"
|
||||
},
|
||||
{
|
||||
"label": "mfctr r3",
|
||||
"raw": "0x7C6902A6",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mfspr",
|
||||
"operands": "r3, CTR",
|
||||
"ext_mnemonic": "mfctr",
|
||||
"ext_operands": "r3"
|
||||
},
|
||||
{
|
||||
"label": "mfxer r3",
|
||||
"raw": "0x7C6102A6",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mfspr",
|
||||
"operands": "r3, XER",
|
||||
"ext_mnemonic": "mfxer",
|
||||
"ext_operands": "r3"
|
||||
},
|
||||
{
|
||||
"label": "mtlr r3",
|
||||
"raw": "0x7C6803A6",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mtspr",
|
||||
"operands": "LR, r3",
|
||||
"ext_mnemonic": "mtlr",
|
||||
"ext_operands": "r3"
|
||||
},
|
||||
{
|
||||
"label": "mtctr r3",
|
||||
"raw": "0x7C6903A6",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mtspr",
|
||||
"operands": "CTR, r3",
|
||||
"ext_mnemonic": "mtctr",
|
||||
"ext_operands": "r3"
|
||||
},
|
||||
{
|
||||
"label": "mtxer r3",
|
||||
"raw": "0x7C6103A6",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mtspr",
|
||||
"operands": "XER, r3",
|
||||
"ext_mnemonic": "mtxer",
|
||||
"ext_operands": "r3"
|
||||
},
|
||||
{
|
||||
"label": "crnot 4, 5",
|
||||
"raw": "0x4C852842",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "crnor",
|
||||
"operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+gt",
|
||||
"ext_mnemonic": "crnot",
|
||||
"ext_operands": "4*cr1+lt, 4*cr1+gt"
|
||||
},
|
||||
{
|
||||
"label": "crclr 4",
|
||||
"raw": "0x4C842182",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "crxor",
|
||||
"operands": "4*cr1+lt, 4*cr1+lt, 4*cr1+lt",
|
||||
"ext_mnemonic": "crclr",
|
||||
"ext_operands": "4*cr1+lt"
|
||||
},
|
||||
{
|
||||
"label": "crset 4",
|
||||
"raw": "0x4C842242",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "creqv",
|
||||
"operands": "4*cr1+lt, 4*cr1+lt, 4*cr1+lt",
|
||||
"ext_mnemonic": "crset",
|
||||
"ext_operands": "4*cr1+lt"
|
||||
},
|
||||
{
|
||||
"label": "crmove 4, 5",
|
||||
"raw": "0x4C852B82",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "cror",
|
||||
"operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+gt",
|
||||
"ext_mnemonic": "crmove",
|
||||
"ext_operands": "4*cr1+lt, 4*cr1+gt"
|
||||
},
|
||||
{
|
||||
"label": "lwsync",
|
||||
"raw": "0x7C2004AC",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "sync",
|
||||
"operands": "",
|
||||
"ext_mnemonic": "lwsync",
|
||||
"ext_operands": ""
|
||||
},
|
||||
{
|
||||
"label": "trap",
|
||||
"raw": "0x7FE00008",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "tw",
|
||||
"operands": "31, r0, r0",
|
||||
"ext_mnemonic": "trap",
|
||||
"ext_operands": ""
|
||||
},
|
||||
{
|
||||
"label": "blr (BO=20, BI=4 — BI is don't-care)",
|
||||
"raw": "0x4E840020",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bclr",
|
||||
"operands": "20, 4*cr1+lt",
|
||||
"ext_mnemonic": "blr",
|
||||
"ext_operands": ""
|
||||
},
|
||||
{
|
||||
"label": "blrl (BO=20, BI=7)",
|
||||
"raw": "0x4E870021",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bclrl",
|
||||
"operands": "20, 4*cr1+so",
|
||||
"ext_mnemonic": "blrl",
|
||||
"ext_operands": ""
|
||||
},
|
||||
{
|
||||
"label": "bctr (BO=20, BI=4)",
|
||||
"raw": "0x4E840420",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "bcctr",
|
||||
"operands": "20, 4*cr1+lt",
|
||||
"ext_mnemonic": "bctr",
|
||||
"ext_operands": ""
|
||||
},
|
||||
{
|
||||
"label": "twllt r3, r4 (TO=2)",
|
||||
"raw": "0x7C432008",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "tw",
|
||||
"operands": "2, r3, r4",
|
||||
"ext_mnemonic": "twllt",
|
||||
"ext_operands": "r3, r4"
|
||||
},
|
||||
{
|
||||
"label": "twlgt r3, r4 (TO=1)",
|
||||
"raw": "0x7C232008",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "tw",
|
||||
"operands": "1, r3, r4",
|
||||
"ext_mnemonic": "twlgt",
|
||||
"ext_operands": "r3, r4"
|
||||
},
|
||||
{
|
||||
"label": "tdlge r3, r4 (TO=5)",
|
||||
"raw": "0x7CA32088",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "td",
|
||||
"operands": "5, r3, r4",
|
||||
"ext_mnemonic": "tdlge",
|
||||
"ext_operands": "r3, r4"
|
||||
},
|
||||
{
|
||||
"label": "twlle r3, r4 (TO=6)",
|
||||
"raw": "0x7CC32008",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "tw",
|
||||
"operands": "6, r3, r4",
|
||||
"ext_mnemonic": "twlle",
|
||||
"ext_operands": "r3, r4"
|
||||
},
|
||||
{
|
||||
"label": "twllti r3, 16",
|
||||
"raw": "0x0C430010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "twi",
|
||||
"operands": "2, r3, 16",
|
||||
"ext_mnemonic": "twllti",
|
||||
"ext_operands": "r3, 16"
|
||||
},
|
||||
{
|
||||
"label": "tdlgei r3, 16",
|
||||
"raw": "0x08A30010",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "tdi",
|
||||
"operands": "5, r3, 16",
|
||||
"ext_mnemonic": "tdlgei",
|
||||
"ext_operands": "r3, 16"
|
||||
}
|
||||
]
|
||||
}
|
||||
137
crates/xenia-cpu/tests/golden/vmx128_registers.json
Normal file
137
crates/xenia-cpu/tests/golden/vmx128_registers.json
Normal file
@@ -0,0 +1,137 @@
|
||||
{
|
||||
"rows": [
|
||||
{
|
||||
"label": "vaddubm v3, v4, v5",
|
||||
"raw": "0x10642800",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vaddubm",
|
||||
"operands": "v3, v4, v5"
|
||||
},
|
||||
{
|
||||
"label": "vaddfp v3, v4, v5",
|
||||
"raw": "0x1064280A",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vaddfp",
|
||||
"operands": "v3, v4, v5"
|
||||
},
|
||||
{
|
||||
"label": "vand v3, v4, v5",
|
||||
"raw": "0x10642C04",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vand",
|
||||
"operands": "v3, v4, v5"
|
||||
},
|
||||
{
|
||||
"label": "vor v3, v4, v5",
|
||||
"raw": "0x10642C84",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vor",
|
||||
"operands": "v3, v4, v5"
|
||||
},
|
||||
{
|
||||
"label": "vxor v3, v4, v5",
|
||||
"raw": "0x10642CC4",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vxor",
|
||||
"operands": "v3, v4, v5"
|
||||
},
|
||||
{
|
||||
"label": "vsel v3,v4,v5,v6",
|
||||
"raw": "0x106429AA",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vsel",
|
||||
"operands": "v3, v4, v5, v6"
|
||||
},
|
||||
{
|
||||
"label": "vperm v3,v4,v5,v6",
|
||||
"raw": "0x106429AB",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vperm",
|
||||
"operands": "v3, v4, v5, v6"
|
||||
},
|
||||
{
|
||||
"label": "vmaddfp v3, v4, v6, v5 (swap)",
|
||||
"raw": "0x106429AE",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vmaddfp",
|
||||
"operands": "v3, v4, v6, v5"
|
||||
},
|
||||
{
|
||||
"label": "mfvscr v3",
|
||||
"raw": "0x10600604",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mfvscr",
|
||||
"operands": "v3"
|
||||
},
|
||||
{
|
||||
"label": "mtvscr v5",
|
||||
"raw": "0x10002E44",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "mtvscr",
|
||||
"operands": "v5"
|
||||
},
|
||||
{
|
||||
"label": "vperm128 v3, v4, v5, 0 (canary)",
|
||||
"raw": "0x14642800",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vperm128",
|
||||
"operands": "v3, v4, v5, 0"
|
||||
},
|
||||
{
|
||||
"label": "vsrw128 v0, v0, v12 (canary, vd_hi=00)",
|
||||
"raw": "0x180061D0",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vsrw128",
|
||||
"operands": "v0, v0, v12"
|
||||
},
|
||||
{
|
||||
"label": "vsrw128 v32, v0, v12 (canary, VD128h=01)",
|
||||
"raw": "0x180061D4",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vsrw128",
|
||||
"operands": "v32, v0, v12"
|
||||
},
|
||||
{
|
||||
"label": "vpermwi128 v64, v12, 0xE4 (canary, VD128h=10)",
|
||||
"raw": "0x180463D8",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vpermwi128",
|
||||
"operands": "v64, v12, 0xE4"
|
||||
},
|
||||
{
|
||||
"label": "vrlimi128 v96, v12, 4, 3 (canary, VD128h=11)",
|
||||
"raw": "0x180467DC",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vrlimi128",
|
||||
"operands": "v96, v12, 4, 3"
|
||||
},
|
||||
{
|
||||
"label": "vrlimi128 v127, v95, 4, 3 (canary)",
|
||||
"raw": "0x1BE4FFDE",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vrlimi128",
|
||||
"operands": "v127, v95, 4, 3"
|
||||
},
|
||||
{
|
||||
"label": "vmaddfp128 v3, v35, v5, v3",
|
||||
"raw": "0x146328F0",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vmaddfp128",
|
||||
"operands": "v3, v35, v5, v3"
|
||||
},
|
||||
{
|
||||
"label": "vmaddcfp128 v3, v35, v3, v5",
|
||||
"raw": "0x14632930",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vmaddcfp128",
|
||||
"operands": "v3, v35, v3, v5"
|
||||
},
|
||||
{
|
||||
"label": "vnmsubfp128 v3, v35, v3, v5",
|
||||
"raw": "0x14632970",
|
||||
"addr": "0x82000000",
|
||||
"mnemonic": "vnmsubfp128",
|
||||
"operands": "v3, v35, v3, v5"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -42,15 +42,30 @@ impl Debugger {
|
||||
}
|
||||
}
|
||||
|
||||
/// Tier-3 perf: single branch that the hot interpreter loop checks
|
||||
/// before dispatching to [`pre_step`]/[`post_step`]. When the
|
||||
/// debugger is in "cold run" mode (not paused, no breakpoints,
|
||||
/// `StepMode::Run`, in-memory trace off), both hooks become dead
|
||||
/// code and we can skip the HashMap lookup + step-mode match + Vec
|
||||
/// maintenance entirely. The compiler reliably branch-predicts the
|
||||
/// stable branch direction across millions of instructions.
|
||||
#[inline]
|
||||
pub fn wants_hooks(&self) -> bool {
|
||||
self.trace_enabled
|
||||
|| self.paused
|
||||
|| self.break_pending
|
||||
|| !matches!(self.step_mode, StepMode::Run)
|
||||
|| !self.breakpoints.is_empty()
|
||||
}
|
||||
|
||||
/// Called before each instruction executes.
|
||||
pub fn pre_step(&mut self, ctx: &PpcContext, _mem: &dyn MemoryAccess) {
|
||||
// Check breakpoints
|
||||
if let Some(bp) = self.breakpoints.get(&ctx.pc) {
|
||||
if bp.enabled {
|
||||
if let Some(bp) = self.breakpoints.get(&ctx.pc)
|
||||
&& bp.enabled {
|
||||
self.break_pending = true;
|
||||
tracing::info!("Breakpoint hit at {:#010x}", ctx.pc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Called after each instruction executes.
|
||||
|
||||
@@ -11,3 +11,11 @@ tracing = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
byteorder = { workspace = true }
|
||||
metrics = { workspace = true }
|
||||
bytemuck = { workspace = true }
|
||||
crossbeam-channel = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
# Used to validate bundled WGSL placeholders compile cleanly. Matches the
|
||||
# wgpu-22 transitive dep so we don't pull in a second naga version.
|
||||
naga = { version = "22", features = ["wgsl-in"] }
|
||||
|
||||
1129
crates/xenia-gpu/src/draw_state.rs
Normal file
1129
crates/xenia-gpu/src/draw_state.rs
Normal file
File diff suppressed because it is too large
Load Diff
506
crates/xenia-gpu/src/edram.rs
Normal file
506
crates/xenia-gpu/src/edram.rs
Normal file
@@ -0,0 +1,506 @@
|
||||
//! CPU-side shadow of the Xenos GPU's 10 MiB EDRAM.
|
||||
//!
|
||||
//! The real console has 10 MiB of embedded DRAM organised as 2048 tiles,
|
||||
//! each 80 × 16 samples at 32 bits per sample (`xenos.h:223-285`,
|
||||
//! `kEdramTileCount = 2048`). 64bpp formats occupy two adjacent EDRAM
|
||||
//! tiles per logical tile.
|
||||
//!
|
||||
//! xenia-rs does not currently render through a real EDRAM (host draws go
|
||||
//! straight to wgpu attachments), but the resolve path still needs a
|
||||
//! concrete byte source. We keep a linear 10 MiB `Vec<u8>` here so:
|
||||
//!
|
||||
//! * clear-resolves can paint `RB_COLOR_CLEAR` / `RB_DEPTH_CLEAR` into the
|
||||
//! source tiles, which the resolve loop then copies into guest memory
|
||||
//! (this is the Sylpheed-first-pixels path);
|
||||
//! * future host→EDRAM readback code has a place to deposit pixels without
|
||||
//! touching the resolve API.
|
||||
//!
|
||||
//! Byte layout inside one tile: row-major, `80 * 16 * (bpp / 8)` bytes. At
|
||||
//! 32bpp, offset `= y * 80 * 4 + x * 4` from the tile base. Samples are
|
||||
//! stored as little-endian u32 bytes; any Xenon big-endian vs little-endian
|
||||
//! shuffling happens at the resolve write boundary, not inside EDRAM.
|
||||
//!
|
||||
//! Indexing wraps mod 2048 (`XE_GPU_REGISTER` `RB_COLOR_INFO.color_base` is
|
||||
//! 11-bit). Canary relies on this wraparound for tall surfaces that
|
||||
//! exceed the 10 MiB region.
|
||||
|
||||
/// Number of tiles in EDRAM. `xenos::kEdramTileCount`.
pub const EDRAM_TILE_COUNT: u32 = 2048;

/// Tile width in samples (X axis). `xenos::kEdramTileWidthSamples`.
pub const EDRAM_TILE_WIDTH_SAMPLES: u32 = 80;

/// Tile height in samples (Y axis). `xenos::kEdramTileHeightSamples`.
pub const EDRAM_TILE_HEIGHT_SAMPLES: u32 = 16;

/// Size of one tile at 32bpp: 80 × 16 samples × 4 bytes = 5120 bytes.
pub const EDRAM_TILE_BYTES_32BPP: u32 = 4 * EDRAM_TILE_WIDTH_SAMPLES * EDRAM_TILE_HEIGHT_SAMPLES;

/// Size of one logical tile at 64bpp: 80 × 16 samples × 8 bytes = 10_240
/// bytes, i.e. two adjacent 32bpp tiles.
pub const EDRAM_TILE_BYTES_64BPP: u32 = 2 * EDRAM_TILE_BYTES_32BPP;

/// Total EDRAM size in bytes: 2048 tiles × 5120 bytes = 10_485_760
/// (exactly 10 MiB).
pub const EDRAM_SIZE_BYTES: usize =
    EDRAM_TILE_COUNT as usize * EDRAM_TILE_BYTES_32BPP as usize;
|
||||
|
||||
/// 10 MiB shadow of the console's EDRAM. Owned by `GpuSystem` and lives for
|
||||
/// the lifetime of the GPU; no per-frame allocation.
|
||||
pub struct ShadowEdram {
|
||||
/// Linear backing store; `ShadowEdram::new` allocates it as
/// `EDRAM_SIZE_BYTES` zeroed bytes.
bytes: Vec<u8>,
|
||||
}
|
||||
|
||||
impl Default for ShadowEdram {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl ShadowEdram {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
bytes: vec![0u8; EDRAM_SIZE_BYTES],
|
||||
}
|
||||
}
|
||||
|
||||
/// Raw byte offset of a tile within the shadow buffer, wrapped mod 2048.
|
||||
#[inline]
|
||||
fn tile_byte_offset(tile_index: u32) -> usize {
|
||||
((tile_index % EDRAM_TILE_COUNT) * EDRAM_TILE_BYTES_32BPP) as usize
|
||||
}
|
||||
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
&self.bytes
|
||||
}
|
||||
|
||||
pub fn tile(&self, tile_index: u32) -> &[u8] {
|
||||
let off = Self::tile_byte_offset(tile_index);
|
||||
&self.bytes[off..off + EDRAM_TILE_BYTES_32BPP as usize]
|
||||
}
|
||||
|
||||
pub fn tile_mut(&mut self, tile_index: u32) -> &mut [u8] {
|
||||
let off = Self::tile_byte_offset(tile_index);
|
||||
&mut self.bytes[off..off + EDRAM_TILE_BYTES_32BPP as usize]
|
||||
}
|
||||
|
||||
/// Sample-space byte offset within the shadow buffer for one 32bpp
|
||||
/// sample at `(x_samples, y_samples)` in a surface whose EDRAM origin
|
||||
/// is `base_tiles` and whose row pitch is `pitch_tiles` 32bpp tiles.
|
||||
///
|
||||
/// Tile layout: a surface of pitch `P` tiles is laid out as a row of
|
||||
/// `P` tiles followed by the next 16-sample-tall row, etc. Sample
|
||||
/// `(x, y)` lives in tile `(y/16)*P + (x/80)`, at row `y % 16` and
|
||||
/// column `x % 80` within that tile.
|
||||
#[inline]
|
||||
fn sample_offset_32bpp(base_tiles: u16, pitch_tiles: u32, x: u32, y: u32) -> Option<usize> {
|
||||
if pitch_tiles == 0 {
|
||||
return None;
|
||||
}
|
||||
let tile_row = y / EDRAM_TILE_HEIGHT_SAMPLES;
|
||||
let tile_col = x / EDRAM_TILE_WIDTH_SAMPLES;
|
||||
let within_y = y % EDRAM_TILE_HEIGHT_SAMPLES;
|
||||
let within_x = x % EDRAM_TILE_WIDTH_SAMPLES;
|
||||
let tile_index =
|
||||
(base_tiles as u32).wrapping_add(tile_row * pitch_tiles + tile_col);
|
||||
let off = Self::tile_byte_offset(tile_index)
|
||||
+ (within_y * EDRAM_TILE_WIDTH_SAMPLES * 4 + within_x * 4) as usize;
|
||||
Some(off)
|
||||
}
|
||||
|
||||
/// Fill a `(w × h)`-sample rectangle at `(x, y)` with a constant 32bpp
|
||||
/// pattern. Coordinates are in *sample space* (already scaled through
|
||||
/// `sample_count_log2_x/y` for MSAA). Wraps mod 2048 tiles via
|
||||
/// `tile_byte_offset`.
|
||||
///
|
||||
/// The pattern is written as host-native little-endian bytes — the
|
||||
/// endian swap in [`crate::resolve::apply_endian_128`] converts to the
|
||||
/// byte order expected by the destination.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn fill_rect_32bpp(
|
||||
&mut self,
|
||||
base_tiles: u16,
|
||||
pitch_tiles: u32,
|
||||
x: u32,
|
||||
y: u32,
|
||||
w: u32,
|
||||
h: u32,
|
||||
pattern: u32,
|
||||
) {
|
||||
if w == 0 || h == 0 {
|
||||
return;
|
||||
}
|
||||
let le = pattern.to_le_bytes();
|
||||
for dy in 0..h {
|
||||
for dx in 0..w {
|
||||
if let Some(off) = Self::sample_offset_32bpp(
|
||||
base_tiles,
|
||||
pitch_tiles,
|
||||
x + dx,
|
||||
y + dy,
|
||||
) && off + 4 <= self.bytes.len()
|
||||
{
|
||||
self.bytes[off..off + 4].copy_from_slice(&le);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Read one 32bpp sample at `(x, y)` in sample coordinates. Returns 0
|
||||
/// if the surface pitch is zero (degenerate; caller should skip the
|
||||
/// resolve).
|
||||
pub fn read_sample_32bpp(
|
||||
&self,
|
||||
base_tiles: u16,
|
||||
pitch_tiles: u32,
|
||||
x: u32,
|
||||
y: u32,
|
||||
) -> u32 {
|
||||
match Self::sample_offset_32bpp(base_tiles, pitch_tiles, x, y) {
|
||||
Some(off) if off + 4 <= self.bytes.len() => u32::from_le_bytes([
|
||||
self.bytes[off],
|
||||
self.bytes[off + 1],
|
||||
self.bytes[off + 2],
|
||||
self.bytes[off + 3],
|
||||
]),
|
||||
_ => 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Write one 32bpp sample at `(x, y)` in sample coordinates. Mirror of
|
||||
/// [`Self::read_sample_32bpp`]. Used by the wgpu→ShadowEdram readback
|
||||
/// retile path and unit tests.
|
||||
pub fn write_sample_32bpp(
|
||||
&mut self,
|
||||
base_tiles: u16,
|
||||
pitch_tiles: u32,
|
||||
x: u32,
|
||||
y: u32,
|
||||
sample: u32,
|
||||
) {
|
||||
if let Some(off) = Self::sample_offset_32bpp(base_tiles, pitch_tiles, x, y)
|
||||
&& off + 4 <= self.bytes.len()
|
||||
{
|
||||
self.bytes[off..off + 4].copy_from_slice(&sample.to_le_bytes());
|
||||
}
|
||||
}
|
||||
|
||||
/// Bulk write a `(w × h)`-sample rectangle at `(x, y)` from a row-major
|
||||
/// linear `samples` buffer. The buffer length must be at least `w * h`;
|
||||
/// extra entries are ignored. Order: `samples[dy * w + dx]` lands at
|
||||
/// (x + dx, y + dy). This is the format the wgpu→ShadowEdram readback
|
||||
/// path uses after stripping wgpu's 256-byte row alignment.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn write_rect_32bpp(
|
||||
&mut self,
|
||||
base_tiles: u16,
|
||||
pitch_tiles: u32,
|
||||
x: u32,
|
||||
y: u32,
|
||||
w: u32,
|
||||
h: u32,
|
||||
samples: &[u32],
|
||||
) {
|
||||
if w == 0 || h == 0 {
|
||||
return;
|
||||
}
|
||||
let needed = (w as usize).saturating_mul(h as usize);
|
||||
debug_assert!(samples.len() >= needed, "write_rect_32bpp: samples too short");
|
||||
for dy in 0..h {
|
||||
let row_base = (dy as usize) * (w as usize);
|
||||
for dx in 0..w {
|
||||
let idx = row_base + dx as usize;
|
||||
if idx >= samples.len() {
|
||||
return;
|
||||
}
|
||||
self.write_sample_32bpp(base_tiles, pitch_tiles, x + dx, y + dy, samples[idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --- 64bpp helpers ----------------------------------------------------
|
||||
//
|
||||
// 64bpp formats (`k_16_16_16_16`, `k_16_16_16_16_FLOAT`, `k_32_32_FLOAT`)
|
||||
// occupy two adjacent EDRAM tiles per logical tile, doubling the row
|
||||
// pitch in tiles. Per Canary `xenos.h:321-325 IsColorRenderTargetFormat64bpp`
|
||||
// and `draw_util.cc:1260-1262` (`pitch_tiles = surface_pitch_tiles << is_64bpp`).
|
||||
//
|
||||
// Convention: callers pass the *32bpp-equivalent* `base_tiles` and
|
||||
// `pitch_tiles_32bpp` (i.e. the `RB_COLOR_INFO.color_base` and
|
||||
// `surface_pitch_tiles` decoded from registers). The 64bpp helpers
|
||||
// multiply both by 2 internally so the lo/hi pair lands in adjacent
|
||||
// tiles. `lo` is the lower-addressed 32bpp word; `hi` is the upper.
|
||||
|
||||
/// Read one 64bpp sample as `(lo, hi)` u32 pair. Doubled-tile addressing
|
||||
/// per Canary's `is_64bpp` convention.
|
||||
pub fn read_sample_64bpp(
|
||||
&self,
|
||||
base_tiles: u16,
|
||||
pitch_tiles_32bpp: u32,
|
||||
x: u32,
|
||||
y: u32,
|
||||
) -> (u32, u32) {
|
||||
let pitch64 = pitch_tiles_32bpp.saturating_mul(2);
|
||||
let base64 = (base_tiles as u32).saturating_mul(2) as u16;
|
||||
let lo = self.read_sample_32bpp(base64, pitch64, x.saturating_mul(2), y);
|
||||
let hi = self.read_sample_32bpp(base64, pitch64, x.saturating_mul(2) + 1, y);
|
||||
(lo, hi)
|
||||
}
|
||||
|
||||
/// Write one 64bpp sample as `(lo, hi)` u32 pair.
|
||||
pub fn write_sample_64bpp(
|
||||
&mut self,
|
||||
base_tiles: u16,
|
||||
pitch_tiles_32bpp: u32,
|
||||
x: u32,
|
||||
y: u32,
|
||||
lo: u32,
|
||||
hi: u32,
|
||||
) {
|
||||
let pitch64 = pitch_tiles_32bpp.saturating_mul(2);
|
||||
let base64 = (base_tiles as u32).saturating_mul(2) as u16;
|
||||
self.write_sample_32bpp(base64, pitch64, x.saturating_mul(2), y, lo);
|
||||
self.write_sample_32bpp(base64, pitch64, x.saturating_mul(2) + 1, y, hi);
|
||||
}
|
||||
|
||||
/// Bulk write a 64bpp rectangle from a row-major `(lo, hi)` linear
|
||||
/// buffer.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn write_rect_64bpp(
|
||||
&mut self,
|
||||
base_tiles: u16,
|
||||
pitch_tiles_32bpp: u32,
|
||||
x: u32,
|
||||
y: u32,
|
||||
w: u32,
|
||||
h: u32,
|
||||
samples: &[(u32, u32)],
|
||||
) {
|
||||
if w == 0 || h == 0 {
|
||||
return;
|
||||
}
|
||||
for dy in 0..h {
|
||||
let row_base = (dy as usize) * (w as usize);
|
||||
for dx in 0..w {
|
||||
let idx = row_base + dx as usize;
|
||||
if idx >= samples.len() {
|
||||
return;
|
||||
}
|
||||
let (lo, hi) = samples[idx];
|
||||
self.write_sample_64bpp(base_tiles, pitch_tiles_32bpp, x + dx, y + dy, lo, hi);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Fill a `(w × h)`-sample rectangle with a constant 64bpp pattern.
|
||||
/// `lo` lands at the low-addressed 32bpp word, `hi` at the high one
|
||||
/// — i.e. for clears, callers pass `(lo = RB_COLOR_CLEAR_LO,
|
||||
/// hi = RB_COLOR_CLEAR)` per Canary `draw_util.cc:1302-1303`.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn fill_rect_64bpp(
|
||||
&mut self,
|
||||
base_tiles: u16,
|
||||
pitch_tiles_32bpp: u32,
|
||||
x: u32,
|
||||
y: u32,
|
||||
w: u32,
|
||||
h: u32,
|
||||
lo: u32,
|
||||
hi: u32,
|
||||
) {
|
||||
if w == 0 || h == 0 {
|
||||
return;
|
||||
}
|
||||
for dy in 0..h {
|
||||
for dx in 0..w {
|
||||
self.write_sample_64bpp(
|
||||
base_tiles,
|
||||
pitch_tiles_32bpp,
|
||||
x + dx,
|
||||
y + dy,
|
||||
lo,
|
||||
hi,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every little-endian 32-bit word in `tile` equals `value`.
    fn assert_tile_filled(tile: impl AsRef<[u8]>, value: u32) {
        let want = value.to_le_bytes();
        for word in tile.as_ref().chunks_exact(4) {
            assert_eq!(word, want);
        }
    }

    /// Asserts that `tile` contains nothing but zero bytes.
    fn assert_tile_zeroed(tile: impl AsRef<[u8]>) {
        assert!(tile.as_ref().iter().all(|&b| b == 0));
    }

    #[test]
    fn shadow_edram_is_exactly_10_mib() {
        assert_eq!(EDRAM_SIZE_BYTES, 10 * 1024 * 1024);
        let edram = ShadowEdram::new();
        assert_eq!(edram.as_bytes().len(), 10 * 1024 * 1024);
    }

    #[test]
    fn fill_rect_writes_the_whole_first_tile() {
        let mut edram = ShadowEdram::new();
        edram.fill_rect_32bpp(0, 1, 0, 0, 80, 16, 0x11223344);
        // An 80x16 fill must cover every 4-byte sample of tile 0.
        assert_tile_filled(edram.tile(0), 0x11223344);
    }

    #[test]
    fn fill_rect_respects_pitch_and_base() {
        let mut edram = ShadowEdram::new();
        // Surface with pitch = 2 tiles and base = 5: a 160x16 fill covers
        // tiles 5 and 6 and must not touch tiles 4, 7 or 0.
        edram.fill_rect_32bpp(5, 2, 0, 0, 160, 16, 0xAABBCCDD);
        assert_tile_filled(edram.tile(5), 0xAABBCCDD);
        assert_tile_filled(edram.tile(6), 0xAABBCCDD);
        assert_tile_zeroed(edram.tile(4));
        assert_tile_zeroed(edram.tile(7));
        assert_tile_zeroed(edram.tile(0));
    }

    #[test]
    fn fill_rect_wraps_mod_2048() {
        let mut edram = ShadowEdram::new();
        // base = 2047 with pitch = 2: the first tile is 2047, the second
        // wraps around to tile 0.
        edram.fill_rect_32bpp(2047, 2, 0, 0, 160, 16, 0xDEAD_BEEF);
        assert_tile_filled(edram.tile(2047), 0xDEAD_BEEF);
        assert_tile_filled(edram.tile(0), 0xDEAD_BEEF);
    }

    #[test]
    fn read_sample_roundtrips_fill_rect() {
        let mut edram = ShadowEdram::new();
        edram.fill_rect_32bpp(3, 1, 0, 0, 80, 16, 0xCAFE_F00D);
        // First and last sample of the filled tile.
        assert_eq!(edram.read_sample_32bpp(3, 1, 0, 0), 0xCAFE_F00D);
        assert_eq!(edram.read_sample_32bpp(3, 1, 79, 15), 0xCAFE_F00D);
        // The neighbouring tile was never written.
        assert_eq!(edram.read_sample_32bpp(4, 1, 0, 0), 0);
    }

    #[test]
    fn zero_pitch_is_a_noop_read() {
        let edram = ShadowEdram::new();
        assert_eq!(edram.read_sample_32bpp(0, 0, 10, 10), 0);
    }

    /// Values stored by `write_sample_32bpp` come back unchanged from
    /// `read_sample_32bpp`.
    #[test]
    fn write_sample_32bpp_round_trips() {
        // Distinct value per coordinate so misaddressed samples are caught.
        let pattern = |x: u32, y: u32| 0xABCD_0000 | (y << 8) | x;

        let mut edram = ShadowEdram::new();
        for x in 0..80u32 {
            for y in 0..16u32 {
                edram.write_sample_32bpp(0, 1, x, y, pattern(x, y));
            }
        }
        for x in 0..80u32 {
            for y in 0..16u32 {
                assert_eq!(
                    edram.read_sample_32bpp(0, 1, x, y),
                    pattern(x, y),
                    "round-trip mismatch at ({x},{y})"
                );
            }
        }
    }

    /// `write_rect_32bpp` places row-major samples at the right sample
    /// offsets, including when the rectangle spans a tile boundary.
    #[test]
    fn write_rect_32bpp_crosses_tile_boundary() {
        let mut edram = ShadowEdram::new();
        // Surface pitch = 2 tiles → x in [0, 160), y in [0, 16). A 100x4
        // rect at (40, 4) straddles x = 80, the tile boundary.
        let w = 100u32;
        let h = 4u32;
        let samples: Vec<u32> = (0..h)
            .flat_map(|dy| (0..w).map(move |dx| 0x10000 | (dy << 8) | dx))
            .collect();
        edram.write_rect_32bpp(0, 2, 40, 4, w, h, &samples);
        // Spot-check the rect's first sample at (40, 4) and its last sample
        // (dx = 99, dy = 3) at (139, 7), which lies past the x = 80 boundary.
        assert_eq!(edram.read_sample_32bpp(0, 2, 40, 4), 0x1_0000);
        assert_eq!(
            edram.read_sample_32bpp(0, 2, 139, 7),
            0x10000 | (3 << 8) | 99
        );
    }

    /// `write_sample_64bpp` followed by `read_sample_64bpp` returns the same
    /// `(lo, hi)` pair for every coordinate written.
    #[test]
    fn write_read_sample_64bpp_roundtrips() {
        let mut edram = ShadowEdram::new();
        // 32bpp pitch = 1, base = 0 is addressed as pitch = 2, base = 0 once
        // doubled. A small 16x4 logical region keeps the test cheap.
        for x in 0..16u32 {
            for y in 0..4u32 {
                edram.write_sample_64bpp(0, 1, x, y, 0xAAAA_0000 | x, 0xBBBB_0000 | y);
            }
        }
        for x in 0..16u32 {
            for y in 0..4u32 {
                let pair = edram.read_sample_64bpp(0, 1, x, y);
                assert_eq!(pair.0, 0xAAAA_0000 | x);
                assert_eq!(pair.1, 0xBBBB_0000 | y);
            }
        }
    }

    /// `fill_rect_64bpp` stores both the lo and the hi clear word for every
    /// sample — the `RB_COLOR_CLEAR_LO` / `RB_COLOR_CLEAR` convention.
    #[test]
    fn fill_rect_64bpp_writes_both_words() {
        let mut edram = ShadowEdram::new();
        // 16x4 logical 64bpp samples on a surface with 32bpp pitch = 1.
        edram.fill_rect_64bpp(0, 1, 0, 0, 16, 4, 0xCAFE_F00D, 0xDEAD_BEEF);
        for x in 0..16u32 {
            for y in 0..4u32 {
                let pair = edram.read_sample_64bpp(0, 1, x, y);
                assert_eq!(pair.0, 0xCAFE_F00D);
                assert_eq!(pair.1, 0xDEAD_BEEF);
            }
        }
    }

    /// The 64bpp helpers must address with the doubled pitch: the two words
    /// of one logical sample land in adjacent 32bpp columns.
    #[test]
    fn sixty_four_bpp_uses_doubled_pitch() {
        let mut edram = ShadowEdram::new();
        edram.write_sample_64bpp(0, 1, 5, 0, 0x1111_1111, 0x2222_2222);
        // Viewed as a 32bpp surface with base = 0 and pitch = 2, the lo word
        // sits at x = 10 (5 << 1) and the hi word at x = 11.
        assert_eq!(edram.read_sample_32bpp(0, 2, 10, 0), 0x1111_1111);
        assert_eq!(edram.read_sample_32bpp(0, 2, 11, 0), 0x2222_2222);
    }

    /// Rect helpers called with a zero width or height write nothing.
    #[test]
    fn write_rect_empty_is_noop() {
        let mut edram = ShadowEdram::new();
        edram.write_rect_32bpp(0, 1, 0, 0, 0, 5, &[1, 2, 3]);
        edram.write_rect_32bpp(0, 1, 0, 0, 5, 0, &[1, 2, 3]);
        edram.fill_rect_64bpp(0, 1, 0, 0, 0, 5, 1, 2);
        edram.fill_rect_64bpp(0, 1, 0, 0, 5, 0, 1, 2);
        // The backing store must still be all zeroes.
        assert!(edram.as_bytes().iter().all(|&b| b == 0));
    }
}
|
||||
1823
crates/xenia-gpu/src/gpu_system.rs
Normal file
1823
crates/xenia-gpu/src/gpu_system.rs
Normal file
File diff suppressed because it is too large
Load Diff
1044
crates/xenia-gpu/src/handle.rs
Normal file
1044
crates/xenia-gpu/src/handle.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,21 +1,49 @@
|
||||
//! Xenos GPU emulation for xenia-rs.
|
||||
//!
|
||||
//! Modules:
|
||||
//! - [`pm4`]: packet format decoder + Type-3 opcode set.
|
||||
//! - [`ring_view`]: ring-buffer bookkeeping (base/size/read/write pointers).
|
||||
//! - [`register_file`]: 0x6000-entry register array backing the CP + state.
|
||||
//! - [`gpu_system`]: top-level `GpuSystem` + PM4 executor running one packet
|
||||
//! per call (see the plan's P2 for the design rationale).
|
||||
//!
|
||||
//! Legacy module `ring_drain` and `command_processor` are retained while P3+
|
||||
//! migrations finish; they will be removed once every caller is on
|
||||
//! [`gpu_system::GpuSystem`].
|
||||
|
||||
pub mod command_processor;
|
||||
pub mod draw_state;
|
||||
pub mod edram;
|
||||
pub mod gpu_system;
|
||||
pub mod handle;
|
||||
pub mod mmio_region;
|
||||
pub mod pm4;
|
||||
pub mod primitive;
|
||||
pub mod register_file;
|
||||
pub mod ring_drain;
|
||||
pub mod ring_view;
|
||||
pub mod render_target_cache;
|
||||
pub mod resolve;
|
||||
pub mod shader_metrics;
|
||||
pub mod shaders;
|
||||
pub mod texture_cache;
|
||||
pub mod tiled_address;
|
||||
pub mod translator;
|
||||
pub mod ucode;
|
||||
pub mod xenos_constants;
|
||||
|
||||
/// Stub GPU system for initial implementation.
|
||||
pub struct GpuSystem {
|
||||
pub register_file: register_file::RegisterFile,
|
||||
}
|
||||
|
||||
impl GpuSystem {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
register_file: register_file::RegisterFile::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for GpuSystem {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
pub use gpu_system::{
|
||||
ExecOutcome, GpuBlock, GpuMmio, GpuStats, GpuSystem, InterruptSource, PendingInterrupt,
|
||||
ShaderBlob, SwapNotification, WaitCmp,
|
||||
};
|
||||
pub use handle::{
|
||||
DrainReply, GpuBackend, GpuCommand, GpuDigestSnapshot, GpuHandle, GpuWorker,
|
||||
shutdown_and_join_with_timeout, spawn_gpu_worker, spawn_noop_worker,
|
||||
};
|
||||
pub use mmio_region::build_region as build_mmio_region;
|
||||
pub use pm4::{
|
||||
PacketHeader, PacketKind, PM4_INTERRUPT, PM4_NOP, PM4_XE_SWAP, SWAP_SIGNATURE,
|
||||
type3_opcode_name,
|
||||
};
|
||||
pub use ring_drain::{DrainResult, drain};
|
||||
pub use ring_view::RingBufferView;
|
||||
|
||||
217
crates/xenia-gpu/src/mmio_region.rs
Normal file
217
crates/xenia-gpu/src/mmio_region.rs
Normal file
@@ -0,0 +1,217 @@
|
||||
//! Construct an `xenia_memory::MmioRegion` that backs the Xenos GPU register
|
||||
//! aperture at guest physical `0x7FC80000` (per canary
|
||||
//! `graphics_system.cc:141-144` — `memory_->AddVirtualMappedRange(0x7FC80000,
|
||||
//! 0xFFFF0000, 0x0000FFFF, …)`).
|
||||
//!
|
||||
//! Only a handful of registers need a round-trip over the bus; everything
|
||||
//! else (the ALU / fetch constants, the RBBM state machine, …) lives inside
|
||||
//! `GpuSystem::register_file` and is driven by PM4 packets from the CP on
|
||||
//! the same host thread.
|
||||
//!
|
||||
//! The read/write closures capture `Arc<AtomicU32>` mailboxes cloned from
|
||||
//! [`crate::GpuMmio`]; [`crate::GpuSystem::sync_with_mmio`] samples them
|
||||
//! each scheduler round.
|
||||
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use xenia_memory::MmioRegion;
|
||||
|
||||
use crate::gpu_system::{reg, GpuMmio};
|
||||
|
||||
/// Xenos GPU register aperture base (guest physical address). Matches
|
||||
/// canary's `graphics_system.cc:141`.
|
||||
pub const APERTURE_BASE: u32 = 0x7FC8_0000;
|
||||
/// Mask used by `MmioRegion::contains` so any `0x7FC8xxxx` address hits.
|
||||
pub const APERTURE_MASK: u32 = 0xFFFF_0000;
|
||||
/// Total aperture size in bytes (enough for the low 16-bit register window).
|
||||
pub const APERTURE_SIZE: u32 = 0x0001_0000;
|
||||
|
||||
/// Build the [`MmioRegion`] to install on the guest memory.
|
||||
pub fn build_region(mmio: &GpuMmio) -> MmioRegion {
|
||||
let read_wptr = mmio.cp_rb_wptr.clone();
|
||||
let read_rptr = mmio.cp_rb_rptr.clone();
|
||||
let read_int_status = mmio.cp_int_status.clone();
|
||||
let read_int_ack = mmio.cp_int_ack.clone();
|
||||
let read_vblank_status = mmio.d1mode_vblank_vline_status.clone();
|
||||
let write_wptr = mmio.cp_rb_wptr.clone();
|
||||
let write_int_ack = mmio.cp_int_ack.clone();
|
||||
let write_vblank_status = mmio.d1mode_vblank_vline_status.clone();
|
||||
// M1.7 parker — captured into the WPTR write closure to wake a
|
||||
// parked GPU worker on every guest WPTR write. In inline mode the
|
||||
// mutex holds `None`, so the unpark site is a brief lock + no-op.
|
||||
let wake_pending = mmio.wake_pending.clone();
|
||||
let worker_thread = mmio.worker_thread.clone();
|
||||
|
||||
MmioRegion {
|
||||
base_address: APERTURE_BASE,
|
||||
mask: APERTURE_MASK,
|
||||
size: APERTURE_SIZE,
|
||||
read_callback: Box::new(move |addr: u32| {
|
||||
let reg_index = (addr & 0xFFFF) / 4;
|
||||
match reg_index {
|
||||
reg::CP_RB_WPTR => read_wptr.load(Ordering::Relaxed),
|
||||
reg::CP_RB_RPTR => read_rptr.load(Ordering::Relaxed),
|
||||
reg::CP_INT_STATUS => read_int_status.load(Ordering::Relaxed),
|
||||
// Games sometimes read-back the ack register to check interrupt ownership
|
||||
// — serve the last-written value.
|
||||
reg::CP_INT_ACK => read_int_ack.load(Ordering::Relaxed),
|
||||
reg::D1MODE_VBLANK_VLINE_STATUS => {
|
||||
read_vblank_status.load(Ordering::Relaxed)
|
||||
}
|
||||
_ => {
|
||||
tracing::trace!(
|
||||
reg = format_args!("{reg_index:#x}"),
|
||||
addr = format_args!("{addr:#010x}"),
|
||||
"gpu mmio: unmapped read (returning 0)"
|
||||
);
|
||||
0
|
||||
}
|
||||
}
|
||||
}),
|
||||
write_callback: Box::new(move |addr: u32, value: u32| {
|
||||
let reg_index = (addr & 0xFFFF) / 4;
|
||||
match reg_index {
|
||||
reg::CP_RB_WPTR => {
|
||||
// Release: any prior writes to ring memory the guest
|
||||
// performed before bumping WPTR must be visible to
|
||||
// the GPU consumer that Acquire-loads this atomic.
|
||||
write_wptr.store(value, Ordering::Release);
|
||||
// M1.7 parker wake: set the pending bit (Release) so
|
||||
// a worker swapping it on its way to `park_timeout`
|
||||
// sees `was_pending == true` and skips the park; AND
|
||||
// unpark the worker if it's already parked. Both are
|
||||
// necessary to defend against the race window between
|
||||
// the worker's `swap(false)` and `park_timeout()`.
|
||||
wake_pending.store(true, Ordering::Release);
|
||||
if let Ok(g) = worker_thread.lock() {
|
||||
if let Some(t) = g.as_ref() {
|
||||
t.unpark();
|
||||
}
|
||||
}
|
||||
tracing::trace!(
|
||||
value,
|
||||
addr = format_args!("{addr:#010x}"),
|
||||
"gpu mmio: CP_RB_WPTR write"
|
||||
);
|
||||
}
|
||||
// CP_INT_ACK clears interrupt bits; we just echo the value.
|
||||
reg::CP_INT_ACK => {
|
||||
write_int_ack.store(value, Ordering::Relaxed);
|
||||
}
|
||||
// D1MODE_VBLANK_VLINE_STATUS is write-1-to-clear per the
|
||||
// AMD M56 display-controller ref. Clear any bit the guest
|
||||
// writes a 1 to (leaving other bits untouched).
|
||||
reg::D1MODE_VBLANK_VLINE_STATUS => {
|
||||
let prev = write_vblank_status.load(Ordering::Relaxed);
|
||||
write_vblank_status.store(prev & !value, Ordering::Relaxed);
|
||||
}
|
||||
_ => {
|
||||
tracing::trace!(
|
||||
reg = format_args!("{reg_index:#x}"),
|
||||
addr = format_args!("{addr:#010x}"),
|
||||
value = format_args!("{value:#x}"),
|
||||
"gpu mmio: unmapped write (dropping)"
|
||||
);
|
||||
}
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Fresh set of mailboxes together with the region wired to them.
    fn build() -> (GpuMmio, MmioRegion) {
        let mmio = GpuMmio::new();
        let region = build_region(&mmio);
        (mmio, region)
    }

    /// Guest byte address of the register with dword index `reg_index`.
    fn bus_addr(reg_index: u32) -> u32 {
        APERTURE_BASE + reg_index * 4
    }

    /// A `D1MODE_VBLANK_VLINE_STATUS` read must surface the atomic's current
    /// value — Sylpheed's graphics-interrupt callback reads bit 0 to decide
    /// whether vblank actually fired; if we always returned 0 the callback
    /// would silently skip every frame's work.
    #[test]
    fn vblank_status_read_returns_stored_value() {
        let (mmio, region) = build();
        mmio.d1mode_vblank_vline_status
            .store(0x1, Ordering::Relaxed);
        let addr = bus_addr(reg::D1MODE_VBLANK_VLINE_STATUS);
        assert_eq!((region.read_callback)(addr), 0x1);
    }

    /// The guest clears the flag by writing 1 back (write-1-to-clear, per
    /// the AMD M56 display-controller ref and Canary's behavior). Unrelated
    /// bits are preserved so higher-bit status (VLINE_INT_OCCURRED etc.) can
    /// coexist with a clear of bit 0.
    #[test]
    fn vblank_status_write_1_to_clear() {
        let (mmio, region) = build();
        mmio.d1mode_vblank_vline_status
            .store(0b11, Ordering::Relaxed);
        let addr = bus_addr(reg::D1MODE_VBLANK_VLINE_STATUS);
        (region.write_callback)(addr, 0b01);
        let after = mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed);
        assert_eq!(after, 0b10, "bit 0 cleared, bit 1 preserved");
    }

    /// Writing 0 to a bit must NOT clear that bit — classic W1TC semantics.
    #[test]
    fn vblank_status_write_0_is_noop() {
        let (mmio, region) = build();
        mmio.d1mode_vblank_vline_status
            .store(0b11, Ordering::Relaxed);
        let addr = bus_addr(reg::D1MODE_VBLANK_VLINE_STATUS);
        (region.write_callback)(addr, 0x0);
        let after = mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed);
        assert_eq!(after, 0b11);
    }

    /// Regression: prior to the fix, `reg::CP_RB_WPTR` held a byte offset
    /// (`0x0714`) while the match arm compared against a *register index*
    /// (`(addr & 0xFFFF) / 4 == 0x01C5`). Guest MMIO writes to the WPTR
    /// therefore fell through to "unmapped" and the atomic never moved;
    /// only the `VdInitializeRingBuffer` / `extend_write_ptr` paths worked.
    ///
    /// This test and the three below check that each CP register reaches
    /// its atomic when accessed at the canonical `APERTURE_BASE + index*4`
    /// byte address.
    #[test]
    fn cp_rb_wptr_write_via_mmio_bus_reaches_atomic() {
        let (mmio, region) = build();
        let addr = bus_addr(reg::CP_RB_WPTR);
        assert_eq!(addr, 0x7FC8_0714, "byte offset must match Canary CP_RB_WPTR");
        (region.write_callback)(addr, 0x1234_5678);
        assert_eq!(mmio.cp_rb_wptr.load(Ordering::Relaxed), 0x1234_5678);
    }

    #[test]
    fn cp_int_ack_write_via_mmio_bus_reaches_atomic() {
        let (mmio, region) = build();
        let addr = bus_addr(reg::CP_INT_ACK);
        assert_eq!(addr, 0x7FC8_07D0, "byte offset must match Canary CP_INT_ACK");
        (region.write_callback)(addr, 0xDEAD_BEEF);
        assert_eq!(mmio.cp_int_ack.load(Ordering::Relaxed), 0xDEAD_BEEF);
    }

    #[test]
    fn cp_rb_rptr_read_via_mmio_bus_returns_atomic() {
        let (mmio, region) = build();
        mmio.cp_rb_rptr.store(0xCAFE_F00D, Ordering::Relaxed);
        let addr = bus_addr(reg::CP_RB_RPTR);
        assert_eq!((region.read_callback)(addr), 0xCAFE_F00D);
    }

    #[test]
    fn cp_int_status_read_via_mmio_bus_returns_atomic() {
        let (mmio, region) = build();
        mmio.cp_int_status.store(0x0000_0001, Ordering::Relaxed);
        let addr = bus_addr(reg::CP_INT_STATUS);
        assert_eq!((region.read_callback)(addr), 0x0000_0001);
    }
}
|
||||
286
crates/xenia-gpu/src/pm4.rs
Normal file
286
crates/xenia-gpu/src/pm4.rs
Normal file
@@ -0,0 +1,286 @@
|
||||
//! PM4 packet format — header decoding + Type-3 opcode set.
|
||||
//!
|
||||
//! Xenos PM4 packet layout mirrors `xenia-canary/src/xenia/gpu/packet_disassembler.cc`:
|
||||
//!
|
||||
//! - **Type 0** (`packet >> 30 == 0`): register-write run.
|
||||
//! `count = ((packet >> 16) & 0x3FFF) + 1`. Total dwords = `1 + count`.
|
||||
//! With `(packet >> 15) & 1 == 1`, all writes target the same register.
|
||||
//! - **Type 1** (`packet >> 30 == 1`): two-register write. Total dwords = 3.
|
||||
//! - **Type 2** (`packet >> 30 == 2`): NOP — a single skipped dword.
|
||||
//! - **Type 3** (`packet >> 30 == 3`): command.
|
||||
//! `opcode = (packet >> 8) & 0x7F`,
|
||||
//! `count = ((packet >> 16) & 0x3FFF) + 1`.
|
||||
//! Total dwords = `1 + count`.
|
||||
|
||||
/// The cookie canary writes alongside `PM4_XE_SWAP` so tooling can recognize
|
||||
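/// swap packets (`kSwapSignature`). The literal's bytes read `'X','N','E','X'`
/// from most- to least-significant, i.e. `'X','E','N','X'` in little-endian
/// memory order.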
|
||||
pub const SWAP_SIGNATURE: u32 = 0x584E_4558;
|
||||
|
||||
// ── Named Type-3 opcodes (from xenia-canary/src/xenia/gpu/xenos.h:1617-1679) ──
|
||||
|
||||
pub const PM4_ME_INIT: u8 = 0x48;
|
||||
pub const PM4_NOP: u8 = 0x10;
|
||||
pub const PM4_INDIRECT_BUFFER: u8 = 0x3F;
|
||||
pub const PM4_INDIRECT_BUFFER_PFD: u8 = 0x37;
|
||||
pub const PM4_WAIT_FOR_IDLE: u8 = 0x26;
|
||||
pub const PM4_WAIT_REG_MEM: u8 = 0x3C;
|
||||
pub const PM4_REG_RMW: u8 = 0x21;
|
||||
pub const PM4_REG_TO_MEM: u8 = 0x3E;
|
||||
pub const PM4_MEM_WRITE: u8 = 0x3D;
|
||||
pub const PM4_COND_WRITE: u8 = 0x45;
|
||||
pub const PM4_EVENT_WRITE: u8 = 0x46;
|
||||
pub const PM4_EVENT_WRITE_SHD: u8 = 0x58;
|
||||
pub const PM4_EVENT_WRITE_EXT: u8 = 0x5A;
|
||||
pub const PM4_EVENT_WRITE_ZPD: u8 = 0x5B;
|
||||
pub const PM4_DRAW_INDX: u8 = 0x22;
|
||||
pub const PM4_DRAW_INDX_2: u8 = 0x36;
|
||||
pub const PM4_VIZ_QUERY: u8 = 0x23;
|
||||
pub const PM4_SET_CONSTANT: u8 = 0x2D;
|
||||
pub const PM4_SET_CONSTANT2: u8 = 0x55;
|
||||
pub const PM4_SET_SHADER_CONSTANTS: u8 = 0x56;
|
||||
pub const PM4_LOAD_ALU_CONSTANT: u8 = 0x2F;
|
||||
pub const PM4_IM_LOAD: u8 = 0x27;
|
||||
pub const PM4_IM_LOAD_IMMEDIATE: u8 = 0x2B;
|
||||
pub const PM4_LOAD_CONSTANT_CONTEXT: u8 = 0x2E;
|
||||
pub const PM4_INVALIDATE_STATE: u8 = 0x3B;
|
||||
pub const PM4_INTERRUPT: u8 = 0x54;
|
||||
pub const PM4_SET_SHADER_BASES: u8 = 0x4A;
|
||||
pub const PM4_SET_BIN_MASK_LO: u8 = 0x60;
|
||||
pub const PM4_SET_BIN_MASK_HI: u8 = 0x61;
|
||||
pub const PM4_SET_BIN_SELECT_LO: u8 = 0x62;
|
||||
pub const PM4_SET_BIN_SELECT_HI: u8 = 0x63;
|
||||
pub const PM4_SET_BIN_MASK: u8 = 0x50;
|
||||
pub const PM4_SET_BIN_SELECT: u8 = 0x51;
|
||||
pub const PM4_CONTEXT_UPDATE: u8 = 0x5E;
|
||||
/// Xenia-specific: `VdSwap` writes this to trigger a present.
|
||||
pub const PM4_XE_SWAP: u8 = 0x64;
|
||||
|
||||
/// Map a Type-3 opcode to its mnemonic (the constant name without the
/// `PM4_` prefix), for use in tracing spans.
///
/// Opcodes without a named constant in this module map to `"UNKNOWN"`.
pub fn type3_opcode_name(op: u8) -> &'static str {
    // Arms are listed in ascending opcode order.
    match op {
        PM4_NOP => "NOP",
        PM4_REG_RMW => "REG_RMW",
        PM4_DRAW_INDX => "DRAW_INDX",
        PM4_VIZ_QUERY => "VIZ_QUERY",
        PM4_WAIT_FOR_IDLE => "WAIT_FOR_IDLE",
        PM4_IM_LOAD => "IM_LOAD",
        PM4_IM_LOAD_IMMEDIATE => "IM_LOAD_IMMEDIATE",
        PM4_SET_CONSTANT => "SET_CONSTANT",
        PM4_LOAD_CONSTANT_CONTEXT => "LOAD_CONSTANT_CONTEXT",
        PM4_LOAD_ALU_CONSTANT => "LOAD_ALU_CONSTANT",
        PM4_DRAW_INDX_2 => "DRAW_INDX_2",
        PM4_INDIRECT_BUFFER_PFD => "INDIRECT_BUFFER_PFD",
        PM4_INVALIDATE_STATE => "INVALIDATE_STATE",
        PM4_WAIT_REG_MEM => "WAIT_REG_MEM",
        PM4_MEM_WRITE => "MEM_WRITE",
        PM4_REG_TO_MEM => "REG_TO_MEM",
        PM4_INDIRECT_BUFFER => "INDIRECT_BUFFER",
        PM4_COND_WRITE => "COND_WRITE",
        PM4_EVENT_WRITE => "EVENT_WRITE",
        PM4_ME_INIT => "ME_INIT",
        PM4_SET_SHADER_BASES => "SET_SHADER_BASES",
        PM4_SET_BIN_MASK => "SET_BIN_MASK",
        PM4_SET_BIN_SELECT => "SET_BIN_SELECT",
        PM4_INTERRUPT => "INTERRUPT",
        PM4_SET_CONSTANT2 => "SET_CONSTANT2",
        PM4_SET_SHADER_CONSTANTS => "SET_SHADER_CONSTANTS",
        PM4_EVENT_WRITE_SHD => "EVENT_WRITE_SHD",
        PM4_EVENT_WRITE_EXT => "EVENT_WRITE_EXT",
        PM4_EVENT_WRITE_ZPD => "EVENT_WRITE_ZPD",
        PM4_CONTEXT_UPDATE => "CONTEXT_UPDATE",
        PM4_SET_BIN_MASK_LO => "SET_BIN_MASK_LO",
        PM4_SET_BIN_MASK_HI => "SET_BIN_MASK_HI",
        PM4_SET_BIN_SELECT_LO => "SET_BIN_SELECT_LO",
        PM4_SET_BIN_SELECT_HI => "SET_BIN_SELECT_HI",
        PM4_XE_SWAP => "XE_SWAP",
        _ => "UNKNOWN",
    }
}
|
||||
|
||||
/// Decoded single PM4 packet header.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct PacketHeader {
|
||||
pub kind: PacketKind,
|
||||
/// Total size of the packet (including header) in dwords.
|
||||
pub total_dwords: u32,
|
||||
}
|
||||
|
||||
/// The four PM4 packet types, each carrying the fields decoded from its
/// header dword.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum PacketKind {
    /// Type-0: a run of register writes.
    ///
    /// `base_index` is the index of the first register (register offset
    /// divided by 4), `count` is the number of data dwords, and `write_one`
    /// is set when all `count` dwords target that same register.
    Type0 {
        base_index: u32,
        count: u32,
        write_one: bool,
    },
    /// Type-1: a write to two registers.
    Type1 { reg_index_1: u32, reg_index_2: u32 },
    /// Type-2: NOP, a single dword that is skipped.
    Type2,
    /// Type-3: a command identified by `opcode` and followed by `count`
    /// data dwords; `predicated` mirrors bit 0 of the header.
    Type3 {
        opcode: u8,
        count: u32,
        predicated: bool,
    },
}
|
||||
|
||||
/// Encode the header dword of a Type-0 (register-write run) packet.
/// Mirrors canary's `MakePacketType0` at
/// `xenia-canary/src/xenia/gpu/xenos.h:1682`.
///
/// `reg_index` is the first register index and must fit in 15 bits.
/// `count` is the number of data dwords that follow and must be in
/// `1..=0x4000`; it is stored as `count - 1` in bits 29:16. Both
/// preconditions are checked with `debug_assert!` only.
pub fn make_packet_type0(reg_index: u16, count: u16) -> u32 {
    debug_assert!(reg_index <= 0x7FFF);
    debug_assert!(count >= 1 && count as u32 <= 0x4000);
    let count_field = (u32::from(count) - 1) & 0x3FFF;
    let reg_field = u32::from(reg_index) & 0x7FFF;
    // The type tag in bits 31:30 is 0 for Type-0, so nothing to OR in.
    (count_field << 16) | reg_field
}
|
||||
|
||||
/// Encode the header dword of a Type-2 (NOP) packet: type tag `2` in bits
/// 31:30, every other bit clear. The packet has no payload.
pub const fn make_packet_type2() -> u32 {
    const TYPE_TAG: u32 = 2;
    TYPE_TAG << 30
}
|
||||
|
||||
/// Encode the header dword of a Type-3 (command) packet. Mirrors canary's
/// `MakePacketType3`.
///
/// `opcode` is masked to 7 bits and placed in bits 14:8. `count` is the
/// number of data dwords that follow and must be in `1..=0x4000` (checked
/// with `debug_assert!` only); it is stored as `count - 1` in bits 29:16.
pub fn make_packet_type3(opcode: u8, count: u16) -> u32 {
    debug_assert!(count >= 1 && count as u32 <= 0x4000);
    let type_tag = 3u32 << 30;
    let count_field = ((u32::from(count) - 1) & 0x3FFF) << 16;
    let opcode_field = (u32::from(opcode) & 0x7F) << 8;
    type_tag | count_field | opcode_field
}
|
||||
|
||||
/// Decode a single PM4 packet header.
|
||||
pub fn decode(header: u32) -> PacketHeader {
|
||||
match header >> 30 {
|
||||
0 => {
|
||||
let count = ((header >> 16) & 0x3FFF) + 1;
|
||||
PacketHeader {
|
||||
kind: PacketKind::Type0 {
|
||||
base_index: header & 0x7FFF,
|
||||
count,
|
||||
write_one: (header >> 15) & 1 != 0,
|
||||
},
|
||||
total_dwords: 1 + count,
|
||||
}
|
||||
}
|
||||
1 => PacketHeader {
|
||||
kind: PacketKind::Type1 {
|
||||
reg_index_1: header & 0x7FF,
|
||||
reg_index_2: (header >> 11) & 0x7FF,
|
||||
},
|
||||
total_dwords: 3,
|
||||
},
|
||||
2 => PacketHeader {
|
||||
kind: PacketKind::Type2,
|
||||
total_dwords: 1,
|
||||
},
|
||||
3 => {
|
||||
let count = ((header >> 16) & 0x3FFF) + 1;
|
||||
PacketHeader {
|
||||
kind: PacketKind::Type3 {
|
||||
opcode: ((header >> 8) & 0x7F) as u8,
|
||||
count,
|
||||
predicated: (header & 1) != 0,
|
||||
},
|
||||
total_dwords: 1 + count,
|
||||
}
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn type2_is_one_dword() {
        // 0x80000000 has bits 31:30 = 0b10, i.e. a Type-2 header.
        let expected = PacketHeader {
            kind: PacketKind::Type2,
            total_dwords: 1,
        };
        assert_eq!(decode(0x8000_0000), expected);
    }

    #[test]
    fn type0_count_is_inclusive() {
        // Count field (bits 29:16) = 5 → 6 data dwords; base_index = 0x100;
        // write_one bit clear.
        let expected = PacketHeader {
            kind: PacketKind::Type0 {
                base_index: 0x100,
                count: 6,
                write_one: false,
            },
            total_dwords: 7,
        };
        assert_eq!(decode((5 << 16) | 0x100), expected);
    }

    #[test]
    fn type3_swap_packet() {
        // Hand-built copy of the header canary's VdSwap emits:
        // MakePacketType3(PM4_XE_SWAP, 4) → ((3<<30) | ((4-1)<<16) | (0x64<<8))
        let hdr_word = (3u32 << 30) | ((4u32 - 1) << 16) | ((PM4_XE_SWAP as u32) << 8);
        let expected = PacketHeader {
            kind: PacketKind::Type3 {
                opcode: PM4_XE_SWAP,
                count: 4,
                predicated: false,
            },
            total_dwords: 5,
        };
        assert_eq!(decode(hdr_word), expected);
    }

    #[test]
    fn opcode_names_are_present_for_common_ops() {
        let cases = [
            (PM4_NOP, "NOP"),
            (PM4_DRAW_INDX, "DRAW_INDX"),
            (PM4_XE_SWAP, "XE_SWAP"),
            (PM4_WAIT_REG_MEM, "WAIT_REG_MEM"),
            (0xFE, "UNKNOWN"),
        ];
        for (opcode, name) in cases {
            assert_eq!(type3_opcode_name(opcode), name);
        }
    }

    #[test]
    fn make_packet_helpers_round_trip_through_decode() {
        // Type-0: SHADER_CONSTANT_FETCH_00_0 (0x4800), count = 6.
        let t0 = decode(make_packet_type0(0x4800, 6));
        assert_eq!(
            t0.kind,
            PacketKind::Type0 {
                base_index: 0x4800,
                count: 6,
                write_one: false,
            }
        );
        assert_eq!(t0.total_dwords, 7);

        // Type-3: PM4_XE_SWAP, count = 4 (signature + addr + W + H).
        let t3 = decode(make_packet_type3(PM4_XE_SWAP, 4));
        assert_eq!(
            t3.kind,
            PacketKind::Type3 {
                opcode: PM4_XE_SWAP,
                count: 4,
                predicated: false,
            }
        );
        assert_eq!(t3.total_dwords, 5);

        // Type-2: NOP.
        let t2_word = make_packet_type2();
        assert_eq!(t2_word, 0x8000_0000);
        let t2 = decode(t2_word);
        assert_eq!(t2.kind, PacketKind::Type2);
        assert_eq!(t2.total_dwords, 1);
    }
}
|
||||
229
crates/xenia-gpu/src/primitive.rs
Normal file
229
crates/xenia-gpu/src/primitive.rs
Normal file
@@ -0,0 +1,229 @@
|
||||
//! Primitive processor — normalize Xenos primitives into host-GPU forms.
|
||||
//!
|
||||
//! wgpu only exposes `PrimitiveTopology::{PointList, LineList, LineStrip,
|
||||
//! TriangleList, TriangleStrip}`. For everything else (fans, quads,
|
||||
//! rectangles) we rewrite indices on the CPU side so the host just sees a
|
||||
//! triangle list. Ground truth: `xenia-canary/src/xenia/gpu/primitive_processor.h/cc`.
|
||||
//!
|
||||
//! P3 scope: only the shapes Sylpheed's UI + early gameplay paths need
|
||||
//! (list, strip, fan, quad). Rectangle expansion is a stub logged via
|
||||
//! `tracing::warn!` for later.
|
||||
|
||||
use crate::draw_state::{IndexSize, PrimitiveType};
|
||||
|
||||
/// Primitive topologies the host backend is asked to draw.
///
/// Every guest primitive handled by [`process`] ends up as one of these;
/// shapes with no direct counterpart (fans, quads) are rewritten into
/// [`HostTopology::TriangleList`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HostTopology {
    /// Independent points.
    PointList,
    /// Independent line segments, two vertices each.
    LineList,
    /// Connected line segments.
    LineStrip,
    /// Independent triangles, three vertices each. Also the target of every
    /// CPU-side expansion.
    TriangleList,
    /// Connected triangles sharing edges.
    TriangleStrip,
}
|
||||
|
||||
/// Result of primitive processing.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ProcessedPrimitive {
|
||||
pub topology: HostTopology,
|
||||
/// When the Xenos primitive needed client-side rewriting (fans, quads),
|
||||
/// this buffer holds the rewritten 16-bit or 32-bit index sequence.
|
||||
/// `None` means the input index buffer is usable as-is.
|
||||
pub rewritten_indices: Option<Vec<u32>>,
|
||||
/// Post-processing vertex count — equals the input count when indices
|
||||
/// pass through unchanged.
|
||||
pub host_vertex_count: u32,
|
||||
/// `true` if we rejected the primitive (unsupported shape) and the
|
||||
/// caller should skip this draw. Logged via `tracing::warn!`.
|
||||
pub rejected: bool,
|
||||
}
|
||||
|
||||
/// Normalize a draw.
|
||||
///
|
||||
/// `indices` is `None` for `AutoIndex` draws; otherwise it's the decoded
|
||||
/// index stream (already endian-converted / widened to u32 by the caller).
|
||||
pub fn process(
|
||||
primitive: PrimitiveType,
|
||||
vertex_count: u32,
|
||||
indices: Option<&[u32]>,
|
||||
) -> ProcessedPrimitive {
|
||||
match primitive {
|
||||
PrimitiveType::PointList => pass_through(HostTopology::PointList, vertex_count),
|
||||
PrimitiveType::LineList => pass_through(HostTopology::LineList, vertex_count),
|
||||
PrimitiveType::LineStrip => pass_through(HostTopology::LineStrip, vertex_count),
|
||||
PrimitiveType::TriangleList => pass_through(HostTopology::TriangleList, vertex_count),
|
||||
PrimitiveType::TriangleStrip => pass_through(HostTopology::TriangleStrip, vertex_count),
|
||||
PrimitiveType::TriangleFan => expand_fan(indices, vertex_count),
|
||||
PrimitiveType::RectangleList => expand_rectangles(indices, vertex_count),
|
||||
PrimitiveType::QuadList => expand_quads(indices, vertex_count),
|
||||
PrimitiveType::None | PrimitiveType::Unknown(_) => {
|
||||
tracing::warn!(?primitive, "gpu: rejecting unsupported primitive");
|
||||
metrics::counter!("gpu.primitive.rejected").increment(1);
|
||||
ProcessedPrimitive {
|
||||
topology: HostTopology::TriangleList,
|
||||
rewritten_indices: None,
|
||||
host_vertex_count: 0,
|
||||
rejected: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn pass_through(topology: HostTopology, vertex_count: u32) -> ProcessedPrimitive {
|
||||
ProcessedPrimitive {
|
||||
topology,
|
||||
rewritten_indices: None,
|
||||
host_vertex_count: vertex_count,
|
||||
rejected: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a triangle fan to a triangle list. Fan indices `[0, 1, 2, 3, 4]`
|
||||
/// expand to triangles `(0,1,2), (0,2,3), (0,3,4)` — 3 × (n-2) host indices.
|
||||
fn expand_fan(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive {
|
||||
if vertex_count < 3 {
|
||||
return ProcessedPrimitive {
|
||||
topology: HostTopology::TriangleList,
|
||||
rewritten_indices: Some(Vec::new()),
|
||||
host_vertex_count: 0,
|
||||
rejected: false,
|
||||
};
|
||||
}
|
||||
let mut out = Vec::with_capacity(3 * (vertex_count as usize - 2));
|
||||
let get = |i: u32| -> u32 {
|
||||
match indices {
|
||||
Some(buf) => buf[i as usize],
|
||||
None => i,
|
||||
}
|
||||
};
|
||||
let apex = get(0);
|
||||
for i in 1..vertex_count.saturating_sub(1) {
|
||||
out.push(apex);
|
||||
out.push(get(i));
|
||||
out.push(get(i + 1));
|
||||
}
|
||||
let host_vertex_count = out.len() as u32;
|
||||
ProcessedPrimitive {
|
||||
topology: HostTopology::TriangleList,
|
||||
rewritten_indices: Some(out),
|
||||
host_vertex_count,
|
||||
rejected: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a quad list (groups of 4) to a triangle list (groups of 6).
|
||||
fn expand_quads(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive {
|
||||
let quad_count = vertex_count / 4;
|
||||
let mut out = Vec::with_capacity(6 * quad_count as usize);
|
||||
let get = |i: u32| -> u32 {
|
||||
match indices {
|
||||
Some(buf) => buf[i as usize],
|
||||
None => i,
|
||||
}
|
||||
};
|
||||
for q in 0..quad_count {
|
||||
let base = q * 4;
|
||||
let a = get(base);
|
||||
let b = get(base + 1);
|
||||
let c = get(base + 2);
|
||||
let d = get(base + 3);
|
||||
out.extend_from_slice(&[a, b, c, a, c, d]);
|
||||
}
|
||||
let host_vertex_count = out.len() as u32;
|
||||
ProcessedPrimitive {
|
||||
topology: HostTopology::TriangleList,
|
||||
rewritten_indices: Some(out),
|
||||
host_vertex_count,
|
||||
rejected: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Rectangle lists: a Xenos-specific primitive where each group of 3
|
||||
/// vertices defines a right-angle rectangle by its three non-repeated
|
||||
/// corners (the 4th is derived). The uber-shader doesn't support this yet;
|
||||
/// the ucode translator will emulate it as a geometry-stage fake. For P3
|
||||
/// we emit an empty draw.
|
||||
fn expand_rectangles(_indices: Option<&[u32]>, _vertex_count: u32) -> ProcessedPrimitive {
|
||||
tracing::warn!("gpu: rectangle list primitive not yet implemented (P3 stub)");
|
||||
metrics::counter!("gpu.primitive.rejected", "reason" => "rectangle_list").increment(1);
|
||||
ProcessedPrimitive {
|
||||
topology: HostTopology::TriangleList,
|
||||
rewritten_indices: Some(Vec::new()),
|
||||
host_vertex_count: 0,
|
||||
rejected: true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Widen a u16 index buffer to u32. The primitive processor normalizes to
|
||||
/// u32 so downstream wgpu pipeline descriptors stay simple.
|
||||
pub fn widen_indices(raw: &[u8], size: IndexSize, count: u32) -> Vec<u32> {
|
||||
let mut out = Vec::with_capacity(count as usize);
|
||||
match size {
|
||||
IndexSize::Sixteen => {
|
||||
for i in 0..count as usize {
|
||||
let off = i * 2;
|
||||
if off + 2 > raw.len() {
|
||||
break;
|
||||
}
|
||||
// Xenos indices are big-endian on the wire.
|
||||
let be = u16::from_be_bytes([raw[off], raw[off + 1]]);
|
||||
out.push(be as u32);
|
||||
}
|
||||
}
|
||||
IndexSize::ThirtyTwo => {
|
||||
for i in 0..count as usize {
|
||||
let off = i * 4;
|
||||
if off + 4 > raw.len() {
|
||||
break;
|
||||
}
|
||||
let be = u32::from_be_bytes([raw[off], raw[off + 1], raw[off + 2], raw[off + 3]]);
|
||||
out.push(be);
|
||||
}
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Triangle lists need no rewriting: same topology, same count, no
    /// replacement index buffer.
    #[test]
    fn triangle_list_passes_through() {
        let result = process(PrimitiveType::TriangleList, 6, None);
        assert!(!result.rejected);
        assert_eq!(result.host_vertex_count, 6);
        assert!(result.rewritten_indices.is_none());
        assert_eq!(result.topology, HostTopology::TriangleList);
    }

    /// A five-vertex fan turns into triangles (0,1,2), (0,2,3), (0,3,4).
    #[test]
    fn fan_to_list_expands_correctly() {
        let result = process(PrimitiveType::TriangleFan, 5, None);
        assert_eq!(result.topology, HostTopology::TriangleList);
        assert_eq!(result.host_vertex_count, 9);
        let expanded = result.rewritten_indices.unwrap();
        assert_eq!(expanded, vec![0, 1, 2, 0, 2, 3, 0, 3, 4]);
    }

    /// Two auto-indexed quads become four triangles.
    #[test]
    fn quad_list_expansion() {
        let result = process(PrimitiveType::QuadList, 8, None);
        let expanded = result.rewritten_indices.unwrap();
        assert_eq!(expanded, vec![0, 1, 2, 0, 2, 3, 4, 5, 6, 4, 6, 7]);
    }

    /// Three big-endian u16 indices: 1, 2 and 0x1234.
    #[test]
    fn widen_u16_indices_big_endian() {
        let bytes = [0, 1, 0, 2, 0x12, 0x34];
        let widened = widen_indices(&bytes, IndexSize::Sixteen, 3);
        assert_eq!(widened, vec![1, 2, 0x1234]);
    }

    /// Unrecognised primitive types are flagged for the caller to skip.
    #[test]
    fn rejects_unknown_primitive() {
        let result = process(PrimitiveType::Unknown(0x2A), 3, None);
        assert!(result.rejected);
    }
}
|
||||
384
crates/xenia-gpu/src/render_target_cache.rs
Normal file
384
crates/xenia-gpu/src/render_target_cache.rs
Normal file
@@ -0,0 +1,384 @@
|
||||
//! EDRAM tile book + render-target key bookkeeping.
|
||||
//!
|
||||
//! Mirrors `xenia-canary/src/xenia/gpu/render_target_cache.h` at the data-
|
||||
//! structure level. Xenos's 10 MiB EDRAM is divided into 2048 "tiles" of
|
||||
//! 80×16 samples each; render targets claim a contiguous range of those
|
||||
//! tiles based on `(base_tiles, pitch_tiles_at_32bpp, msaa_samples, format,
|
||||
//! is_depth)`. Two render targets with overlapping tile ranges share the
|
||||
//! underlying EDRAM — canary tracks this with per-tile "Host vs Shared"
|
||||
//! ownership, which is what this module's `TileOwner` captures.
|
||||
//!
|
||||
//! P4 ships the **bookkeeping**. Actual host texture allocation per key (so
|
||||
//! the host can draw into a wgpu texture matching the guest's RT) is left to
|
||||
//! a future host-side cache built on top of this module; the same for
|
||||
//! format-conversion compute shaders (the plan's P5 territory).
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Total number of tiles the Xenos EDRAM is divided into (2^11 = 2048).
/// Mirrors canary's `xenos::kEdramTileCount`.
pub const EDRAM_TILE_COUNT: usize = 1 << 11;
|
||||
|
||||
/// Multisample mode stored in a [`RenderTargetKey`]. The discriminant is
/// the 2-bit encoding used when the key is packed (canary's
/// `xenos::MsaaSamples`: 1×, 2×, 4×).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MsaaSamples {
    /// One sample per pixel.
    X1 = 0,
    /// Two samples per pixel.
    X2 = 1,
    /// Four samples per pixel.
    X4 = 2,
}

impl MsaaSamples {
    /// Decode the low two bits of `raw`. The encoding `3` has no variant
    /// and is treated as [`MsaaSamples::X1`]; higher bits are ignored.
    pub fn from_raw(raw: u32) -> Self {
        let encoded = raw & 0x3;
        if encoded == 2 {
            MsaaSamples::X4
        } else if encoded == 1 {
            MsaaSamples::X2
        } else {
            MsaaSamples::X1
        }
    }

    /// Number of samples per pixel: 1, 2 or 4.
    pub fn count(self) -> u32 {
        match self {
            MsaaSamples::X1 => 1,
            MsaaSamples::X2 => 2,
            MsaaSamples::X4 => 4,
        }
    }
}
|
||||
|
||||
/// The packed EDRAM render-target identity. Bit layout matches
|
||||
/// `render_target_cache.h:251-321`'s `RenderTargetKey` union (26 bits used,
|
||||
/// stored as a single `u32` so it hashes cheaply). `pitch_tiles_at_32bpp`
|
||||
/// is always the 32bpp-equivalent pitch — 64bpp targets halve their tile
|
||||
/// pitch from the nominal tile grid (canary's `GetPitchTiles()` handles
|
||||
/// that).
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct RenderTargetKey {
|
||||
pub base_tiles: u16, // [0..2048)
|
||||
pub pitch_tiles_at_32bpp: u16, // 0..=256 in practice
|
||||
pub msaa_samples: MsaaSamples,
|
||||
pub is_depth: bool,
|
||||
/// Color format: `xenos::ColorRenderTargetFormat` when !is_depth.
|
||||
/// Depth format: `xenos::DepthRenderTargetFormat` when is_depth.
|
||||
pub resource_format: u8, // 4 bits
|
||||
}
|
||||
|
||||
impl RenderTargetKey {
|
||||
/// Pack into canary's 26-bit layout. Useful for compact storage /
|
||||
/// hashing when we add a LRU cache later on.
|
||||
pub fn pack(&self) -> u32 {
|
||||
(self.base_tiles as u32 & 0x7FF)
|
||||
| (((self.pitch_tiles_at_32bpp as u32) & 0xFF) << 11)
|
||||
| (((self.msaa_samples as u32) & 0x3) << 19)
|
||||
| ((self.is_depth as u32) << 21)
|
||||
| (((self.resource_format as u32) & 0xF) << 22)
|
||||
}
|
||||
|
||||
pub fn unpack(raw: u32) -> Self {
|
||||
Self {
|
||||
base_tiles: (raw & 0x7FF) as u16,
|
||||
pitch_tiles_at_32bpp: ((raw >> 11) & 0xFF) as u16,
|
||||
msaa_samples: MsaaSamples::from_raw((raw >> 19) & 0x3),
|
||||
is_depth: ((raw >> 21) & 1) != 0,
|
||||
resource_format: ((raw >> 22) & 0xF) as u8,
|
||||
}
|
||||
}
|
||||
|
||||
/// How many EDRAM tiles the whole surface occupies (rough estimate; a
|
||||
/// real height-aware calc needs viewport info). We conservatively use
|
||||
/// `pitch_tiles_at_32bpp * 1` until a draw tells us otherwise; callers
|
||||
/// that know the height can call [`tile_footprint_with_height`].
|
||||
pub fn tile_pitch(&self) -> u16 {
|
||||
// 64bpp formats pack two 32bpp tiles into one 64bpp tile.
|
||||
if self.is_64bpp() {
|
||||
self.pitch_tiles_at_32bpp / 2
|
||||
} else {
|
||||
self.pitch_tiles_at_32bpp
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_64bpp(&self) -> bool {
|
||||
if self.is_depth {
|
||||
false
|
||||
} else {
|
||||
// Canary: `ColorRenderTargetFormat::{k_16_16_16_16,
|
||||
// k_16_16_16_16_FLOAT, k_32_32_FLOAT}` are 64bpp; indices 4, 5, 7
|
||||
// in the enum. (Kept narrow because the enum is 4 bits wide.)
|
||||
matches!(self.resource_format, 4 | 5 | 7)
|
||||
}
|
||||
}
|
||||
|
||||
/// Tiles claimed by this RT if its surface height is `rows_of_tiles`
|
||||
/// (i.e. `ceil(height_in_samples / 16)`).
|
||||
pub fn tile_footprint_with_height(&self, rows_of_tiles: u16) -> u16 {
|
||||
self.tile_pitch().saturating_mul(rows_of_tiles)
|
||||
}
|
||||
}
|
||||
|
||||
/// Ownership state of a single EDRAM tile. The `u32` payload is the index
/// of a render target.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum TileOwner {
    /// Untouched; free to claim.
    #[default]
    None,
    /// Exactly one render target has exclusive ownership.
    Host(u32),
    /// Two or more render-target keys map to this tile (usually after a
    /// format change without an intervening clear). The named render target
    /// is the most recent owner, whose format should be honored for
    /// readback.
    Shared(u32),
}
|
||||
|
||||
|
||||
/// Bookkeeping across the 2048 EDRAM tiles. Not a GPU resource by itself —
|
||||
/// tracks which render target (by index) currently owns each tile.
|
||||
pub struct EdramTileBook {
|
||||
tiles: Vec<TileOwner>,
|
||||
}
|
||||
|
||||
impl Default for EdramTileBook {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl EdramTileBook {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
tiles: vec![TileOwner::None; EDRAM_TILE_COUNT],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn who_owns(&self, tile: u16) -> TileOwner {
|
||||
self.tiles
|
||||
.get(tile as usize)
|
||||
.copied()
|
||||
.unwrap_or(TileOwner::None)
|
||||
}
|
||||
|
||||
/// Mark `[base, base+count)` as owned by `rt_idx`. Pre-existing owners
|
||||
/// in the range are demoted to `Shared` (format reinterpretation).
|
||||
/// Returns the number of tiles newly claimed (not previously the same
|
||||
/// owner).
|
||||
pub fn claim(&mut self, base: u16, count: u16, rt_idx: u32) -> u32 {
|
||||
let mut newly_claimed = 0u32;
|
||||
for i in 0..(count as usize) {
|
||||
let t = base as usize + i;
|
||||
if t >= self.tiles.len() {
|
||||
break;
|
||||
}
|
||||
let prev = self.tiles[t];
|
||||
let already_ours = matches!(
|
||||
prev,
|
||||
TileOwner::Host(idx) | TileOwner::Shared(idx) if idx == rt_idx
|
||||
);
|
||||
match prev {
|
||||
TileOwner::None => {
|
||||
self.tiles[t] = TileOwner::Host(rt_idx);
|
||||
}
|
||||
TileOwner::Host(idx) if idx == rt_idx => {
|
||||
// re-claim of same RT — no-op
|
||||
}
|
||||
_ => {
|
||||
// Format change / shared range.
|
||||
self.tiles[t] = TileOwner::Shared(rt_idx);
|
||||
}
|
||||
}
|
||||
if !already_ours {
|
||||
newly_claimed += 1;
|
||||
}
|
||||
}
|
||||
newly_claimed
|
||||
}
|
||||
|
||||
/// Drop `rt_idx` from any tile it owns; tiles revert to `None` unless
|
||||
/// they were `Shared(rt_idx)` (in which case they also revert to
|
||||
/// `None`; the other sharer's ownership is lost — `release` is a
|
||||
/// coarse "this RT is gone" operation).
|
||||
pub fn release(&mut self, rt_idx: u32) {
|
||||
for t in self.tiles.iter_mut() {
|
||||
if matches!(
|
||||
*t,
|
||||
TileOwner::Host(idx) | TileOwner::Shared(idx) if idx == rt_idx
|
||||
) {
|
||||
*t = TileOwner::None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Count tiles currently assigned to any RT (Host or Shared).
|
||||
pub fn occupied_count(&self) -> u32 {
|
||||
self.tiles
|
||||
.iter()
|
||||
.filter(|o| !matches!(o, TileOwner::None))
|
||||
.count() as u32
|
||||
}
|
||||
}
|
||||
|
||||
/// Minimal per-RT descriptor stored alongside the tile book. P5's texture
|
||||
/// cache will expand this with the actual wgpu texture handle.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct RtDescriptor {
|
||||
pub key: RenderTargetKey,
|
||||
/// Number of times this key has been bound since creation. Rough
|
||||
/// proxy for activity / hot-RT identification.
|
||||
pub bind_count: u32,
|
||||
/// Draw index on first bind — handy for debugging divergence.
|
||||
pub first_draw_index: u32,
|
||||
}
|
||||
|
||||
/// Top-level cache: maps packed keys to small descriptors + the tile book.
|
||||
pub struct RenderTargetCache {
|
||||
next_idx: u32,
|
||||
by_key: HashMap<u32, u32>,
|
||||
descriptors: HashMap<u32, RtDescriptor>,
|
||||
pub tiles: EdramTileBook,
|
||||
}
|
||||
|
||||
impl Default for RenderTargetCache {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl RenderTargetCache {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
next_idx: 0,
|
||||
by_key: HashMap::new(),
|
||||
descriptors: HashMap::new(),
|
||||
tiles: EdramTileBook::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up or allocate an RT descriptor for `key`. `draw_index` is the
|
||||
/// current monotonic draw counter — recorded on first insert for
|
||||
/// provenance.
|
||||
pub fn bind(&mut self, key: RenderTargetKey, draw_index: u32) -> u32 {
|
||||
let packed = key.pack();
|
||||
if let Some(&idx) = self.by_key.get(&packed) {
|
||||
if let Some(d) = self.descriptors.get_mut(&idx) {
|
||||
d.bind_count += 1;
|
||||
}
|
||||
return idx;
|
||||
}
|
||||
let idx = self.next_idx;
|
||||
self.next_idx += 1;
|
||||
self.by_key.insert(packed, idx);
|
||||
self.descriptors.insert(
|
||||
idx,
|
||||
RtDescriptor {
|
||||
key,
|
||||
bind_count: 1,
|
||||
first_draw_index: draw_index,
|
||||
},
|
||||
);
|
||||
idx
|
||||
}
|
||||
|
||||
pub fn descriptor(&self, idx: u32) -> Option<&RtDescriptor> {
|
||||
self.descriptors.get(&idx)
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.descriptors.len()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.descriptors.is_empty()
|
||||
}
|
||||
|
||||
/// Claim tiles for the descriptor at `rt_idx`. `height_tiles` is
|
||||
/// `ceil(viewport_height_samples / 16)` — callers supply it because
|
||||
/// the key itself doesn't carry height.
|
||||
pub fn claim_tiles(&mut self, rt_idx: u32, height_tiles: u16) -> u32 {
|
||||
if let Some(d) = self.descriptors.get(&rt_idx) {
|
||||
let footprint = d.key.tile_footprint_with_height(height_tiles);
|
||||
self.tiles.claim(d.key.base_tiles, footprint, rt_idx)
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain 32bpp, 1× MSAA color target at tile 0 with an 80-tile pitch
    /// (1280 samples wide).
    fn color_key_1280() -> RenderTargetKey {
        RenderTargetKey {
            base_tiles: 0,
            pitch_tiles_at_32bpp: 80,
            msaa_samples: MsaaSamples::X1,
            is_depth: false,
            resource_format: 0,
        }
    }

    /// Every field survives a pack/unpack round trip.
    #[test]
    fn render_target_key_pack_roundtrip() {
        let original = RenderTargetKey {
            base_tiles: 1600,
            pitch_tiles_at_32bpp: 80,
            msaa_samples: MsaaSamples::X4,
            is_depth: true,
            resource_format: 0b1010,
        };
        let restored = RenderTargetKey::unpack(original.pack());
        assert_eq!(restored, original);
    }

    /// Claiming free tiles marks exactly the requested range as `Host`.
    #[test]
    fn tile_book_claim_marks_owners() {
        let mut book = EdramTileBook::new();
        assert_eq!(book.occupied_count(), 0);
        let claimed = book.claim(100, 10, 42);
        assert_eq!(claimed, 10);
        assert_eq!(book.who_owns(100), TileOwner::Host(42));
        assert_eq!(book.who_owns(109), TileOwner::Host(42));
        assert_eq!(book.who_owns(110), TileOwner::None);
    }

    /// Overlapping claims: tiles 100..105 stay `Host(1)`, 105..110 become
    /// `Shared(2)`, and 110..115 are fresh `Host(2)`.
    #[test]
    fn tile_book_claim_demotes_to_shared() {
        let mut book = EdramTileBook::new();
        book.claim(100, 10, 1);
        book.claim(105, 10, 2);
        assert_eq!(book.who_owns(104), TileOwner::Host(1));
        assert_eq!(book.who_owns(105), TileOwner::Shared(2));
        assert_eq!(book.who_owns(110), TileOwner::Host(2));
    }

    /// Releasing the only owner leaves the ledger empty.
    #[test]
    fn tile_book_release_frees_all() {
        let mut book = EdramTileBook::new();
        book.claim(0, 50, 7);
        book.release(7);
        assert_eq!(book.occupied_count(), 0);
    }

    /// Binding the same key twice yields the same index, bumps the bind
    /// count, and keeps the first draw index.
    #[test]
    fn rt_cache_bind_is_idempotent_by_key() {
        let mut cache = RenderTargetCache::new();
        let key = color_key_1280();
        let first = cache.bind(key, 0);
        let second = cache.bind(key, 1);
        assert_eq!(first, second);
        let desc = cache.descriptor(first).unwrap();
        assert_eq!(desc.first_draw_index, 0);
        assert_eq!(desc.bind_count, 2);
    }

    /// 720 samples tall / 16 per tile = 45 rows, so 80 * 45 = 3600 tiles
    /// are requested; the ledger only has 2048, so the claim is clamped.
    #[test]
    fn rt_cache_claim_tiles_tracks_footprint() {
        let mut cache = RenderTargetCache::new();
        let idx = cache.bind(color_key_1280(), 0);
        let claimed = cache.claim_tiles(idx, 45);
        assert_eq!(claimed, 2048);
        assert_eq!(cache.tiles.occupied_count(), 2048);
    }
}
|
||||
1260
crates/xenia-gpu/src/resolve.rs
Normal file
1260
crates/xenia-gpu/src/resolve.rs
Normal file
File diff suppressed because it is too large
Load Diff
169
crates/xenia-gpu/src/ring_drain.rs
Normal file
169
crates/xenia-gpu/src/ring_drain.rs
Normal file
@@ -0,0 +1,169 @@
|
||||
//! Ring-buffer drainer.
|
||||
//!
|
||||
//! Walks a guest PM4 ring buffer from `start_offset` forward, classifying each
|
||||
//! packet via [`crate::pm4`] and stopping when it either reaches the end of
|
||||
//! the window it was asked to scan, walks off a NOP-fill region, or hits a
|
||||
//! malformed header.
|
||||
//!
|
||||
//! It does **not** execute draws — that's deferred to a later phase. Its job
|
||||
//! is to (a) advance the read pointer far enough that games keep making
|
||||
//! progress, and (b) surface `PM4_XE_SWAP` packets so `VdSwap` can forward
|
||||
//! them to the host UI.
|
||||
|
||||
use xenia_memory::MemoryAccess;
|
||||
|
||||
use crate::pm4::{self, PacketKind};
|
||||
|
||||
/// What a single [`drain`] call accomplished.
#[derive(Debug, Default, Clone, Copy)]
pub struct DrainResult {
    /// Ring-relative dword offset at which the walk stopped.
    pub new_offset: u32,
    /// Number of packets stepped over during this call.
    pub packets_walked: u32,
    /// Whether a `PM4_XE_SWAP` packet was encountered.
    pub swap_seen: bool,
    /// Frontbuffer *physical* address read from dword 2 of the swap packet.
    /// Only meaningful when `swap_seen` is set.
    pub swap_frontbuffer_phys: u32,
    /// Width read from dword 3 of the swap packet. Only meaningful when
    /// `swap_seen` is set.
    pub swap_width: u32,
    /// Height read from dword 4 of the swap packet. Only meaningful when
    /// `swap_seen` is set.
    pub swap_height: u32,
}
|
||||
|
||||
/// Walk `max_packets` packets starting at dword offset `start_offset` in the
|
||||
/// ring buffer at guest address `ring_base` of size `ring_size_dwords`.
|
||||
///
|
||||
/// The offset is treated modulo `ring_size_dwords`. Walking stops when:
|
||||
/// - `max_packets` have been walked,
|
||||
/// - a `PM4_XE_SWAP` has been consumed (the swap is reported and we stop so
|
||||
/// the UI sees the frame boundary before further drain),
|
||||
/// - a header's declared total size would exceed the remaining budget,
|
||||
/// - the ring size is zero (drainer is a no-op).
|
||||
pub fn drain<M: MemoryAccess + ?Sized>(
|
||||
mem: &M,
|
||||
ring_base: u32,
|
||||
ring_size_dwords: u32,
|
||||
start_offset: u32,
|
||||
max_packets: u32,
|
||||
) -> DrainResult {
|
||||
if ring_size_dwords == 0 || ring_base == 0 {
|
||||
return DrainResult::default();
|
||||
}
|
||||
let mut result = DrainResult {
|
||||
new_offset: start_offset % ring_size_dwords,
|
||||
..DrainResult::default()
|
||||
};
|
||||
let mut offset = result.new_offset;
|
||||
for _ in 0..max_packets {
|
||||
let header_addr = ring_base.wrapping_add(offset.wrapping_mul(4));
|
||||
let header = mem.read_u32(header_addr);
|
||||
let packet = pm4::decode(header);
|
||||
// Refuse to walk past the ring in a single packet.
|
||||
if packet.total_dwords > ring_size_dwords {
|
||||
break;
|
||||
}
|
||||
// Type-3 PM4_XE_SWAP → record payload and stop.
|
||||
if let PacketKind::Type3 { opcode, .. } = packet.kind
|
||||
&& opcode == pm4::PM4_XE_SWAP {
|
||||
// Payload layout (from canary VdSwap_entry):
|
||||
// [0] XE_SWAP header
|
||||
// [1] kSwapSignature ("XNEX" = 0x584E4558)
|
||||
// [2] frontbuffer physical address
|
||||
// [3] width
|
||||
// [4] height
|
||||
let payload = |i: u32| {
|
||||
let addr =
|
||||
ring_base.wrapping_add(((offset + i) % ring_size_dwords).wrapping_mul(4));
|
||||
mem.read_u32(addr)
|
||||
};
|
||||
result.swap_seen = true;
|
||||
result.swap_frontbuffer_phys = payload(2);
|
||||
result.swap_width = payload(3);
|
||||
result.swap_height = payload(4);
|
||||
offset = (offset + packet.total_dwords) % ring_size_dwords;
|
||||
result.new_offset = offset;
|
||||
result.packets_walked += 1;
|
||||
return result;
|
||||
}
|
||||
offset = (offset + packet.total_dwords) % ring_size_dwords;
|
||||
result.new_offset = offset;
|
||||
result.packets_walked += 1;
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use xenia_memory::GuestMemory;
    use xenia_memory::page_table::MemoryProtect;

    /// Guest address of the test ring.
    const RING_BASE: u32 = 0x4000_0000;
    /// Type-2 NOP header.
    const NOP: u32 = 0x8000_0000;

    /// Guest memory with one readable/writable 4 KiB page at `RING_BASE`.
    fn build_mem() -> GuestMemory {
        let mut mem = GuestMemory::new().unwrap();
        mem.alloc(RING_BASE, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE)
            .unwrap();
        mem
    }

    /// Store `values` as consecutive ring dwords starting at `first_dword`.
    fn write_dwords(mem: &GuestMemory, first_dword: u32, values: &[u32]) {
        for (i, &value) in values.iter().enumerate() {
            mem.write_u32(RING_BASE + (first_dword + i as u32) * 4, value);
        }
    }

    /// Ten NOPs are available but the budget is five packets.
    #[test]
    fn walks_nops_until_budget_exhausted() {
        let mem = build_mem();
        write_dwords(&mem, 0, &[NOP; 10]);
        let outcome = drain(&mem, RING_BASE, 0x400, 0, 5);
        assert!(!outcome.swap_seen);
        assert_eq!(outcome.new_offset, 5);
        assert_eq!(outcome.packets_walked, 5);
    }

    /// Two NOPs followed by a PM4_XE_SWAP packet: the walk ends right
    /// after the swap and reports its payload.
    #[test]
    fn stops_at_swap_and_reports_payload() {
        let mem = build_mem();
        // MakePacketType3(PM4_XE_SWAP, 4) → (3<<30) | (3<<16) | (0x64<<8)
        let swap_hdr = (3u32 << 30) | (3u32 << 16) | ((pm4::PM4_XE_SWAP as u32) << 8);
        write_dwords(
            &mem,
            0,
            &[
                NOP,
                NOP,
                swap_hdr,
                pm4::SWAP_SIGNATURE,
                0xDEAD_F000, // frontbuffer phys
                1280,
                720,
            ],
        );
        let outcome = drain(&mem, RING_BASE, 0x400, 0, 16);
        assert!(outcome.swap_seen);
        assert_eq!(outcome.swap_frontbuffer_phys, 0xDEAD_F000);
        assert_eq!(outcome.swap_width, 1280);
        assert_eq!(outcome.swap_height, 720);
        assert_eq!(outcome.packets_walked, 3);
        // 2 NOPs (1 dword each) + 5-dword swap = 7
        assert_eq!(outcome.new_offset, 7);
    }

    /// Four-dword ring, starting on the last dword: the walker must wrap
    /// to offset 0 for the second packet.
    #[test]
    fn wraps_around_ring() {
        let mem = build_mem();
        write_dwords(&mem, 3, &[NOP]);
        write_dwords(&mem, 0, &[NOP]);
        let outcome = drain(&mem, RING_BASE, 4, 3, 2);
        assert_eq!(outcome.packets_walked, 2);
        assert_eq!(outcome.new_offset, 1);
    }

    /// A ring of size zero is never walked.
    #[test]
    fn zero_ring_size_is_noop() {
        let mem = build_mem();
        let outcome = drain(&mem, RING_BASE, 0, 0, 10);
        assert!(!outcome.swap_seen);
        assert_eq!(outcome.new_offset, 0);
        assert_eq!(outcome.packets_walked, 0);
    }
}
|
||||
123
crates/xenia-gpu/src/ring_view.rs
Normal file
123
crates/xenia-gpu/src/ring_view.rs
Normal file
@@ -0,0 +1,123 @@
|
||||
//! Primary ring buffer view.
|
||||
//!
|
||||
//! Games allocate a ring buffer in physical memory (via
|
||||
//! `MmAllocatePhysicalMemoryEx` with WRITE_COMBINE), then hand the base
|
||||
//! address + log2(size) to `VdInitializeRingBuffer`. They subsequently push
|
||||
//! PM4 packets into it, advancing the write-pointer by writing to a GPU
|
||||
//! register (`CP_RB_WPTR`) or via kernel-call shims.
|
||||
//!
|
||||
//! The GPU consumes packets from `read_offset_dwords` up to (but not past)
|
||||
//! the write pointer. After consuming enough bytes it writes `read_offset`
|
||||
//! into the guest-memory address registered by `VdEnableRingBufferRPtrWriteBack`
|
||||
//! so the game can know how much of the ring has been consumed.
|
||||
|
||||
/// Tracks the primary ring buffer as set up by the guest.
#[derive(Debug, Clone, Copy, Default)]
pub struct RingBufferView {
    /// Guest physical/virtual base address. `0` means uninitialized.
    pub base: u32,
    /// Size of the ring in dwords. `0` means uninitialized.
    pub size_dwords: u32,
    /// Dword offset the GPU has consumed up to (relative to `base`).
    pub read_offset_dwords: u32,
    /// Dword offset the guest has last written into (relative to `base`).
    /// Updated either by an MMIO write to `CP_RB_WPTR` or by the kernel
    /// (`VdSwap` is a hint — the game reserves a 64-dword slot in the ring
    /// for it).
    pub write_offset_dwords: u32,
    /// Guest address where `read_offset_dwords` is mirrored each time
    /// progress is made. `0` if the game never called
    /// `VdEnableRingBufferRPtrWriteBack`.
    pub rptr_writeback_addr: u32,
    /// Write-back block granularity in dwords (from the `log2` argument of
    /// `VdEnableRingBufferRPtrWriteBack`). Not used for scheduling; kept
    /// for observability.
    pub rptr_writeback_block_dwords: u32,
}

impl RingBufferView {
    /// Create an uninitialized view (all fields zero).
    pub fn new() -> Self {
        Self::default()
    }

    /// True if the guest has provided both a base address and a size.
    pub fn is_initialized(&self) -> bool {
        self.base != 0 && self.size_dwords != 0
    }

    /// True if the view is initialized and the read and write offsets
    /// differ, i.e. there is unread data.
    pub fn has_pending(&self) -> bool {
        self.is_initialized() && self.read_offset_dwords != self.write_offset_dwords
    }

    /// Number of dwords that can be consumed in one contiguous run.
    ///
    /// When the write offset is at or ahead of the read offset this is the
    /// distance between them. When the write offset has wrapped behind the
    /// read offset it is the distance to the end of the ring; the rest
    /// becomes visible after the read offset wraps. Returns `0` for an
    /// uninitialized view, and also if `read_offset_dwords` was set beyond
    /// `size_dwords` (the fields are public, so this cannot be ruled out).
    pub fn pending_dwords(&self) -> u32 {
        if !self.is_initialized() {
            return 0;
        }
        if self.write_offset_dwords >= self.read_offset_dwords {
            self.write_offset_dwords - self.read_offset_dwords
        } else {
            // Write has wrapped: readable up to the end of the ring.
            self.size_dwords.saturating_sub(self.read_offset_dwords)
        }
    }

    /// Advance the read pointer by `dwords`, wrapping at `size_dwords`.
    /// Does nothing while the ring size is zero.
    pub fn advance_read(&mut self, dwords: u32) {
        if self.size_dwords == 0 {
            return;
        }
        self.read_offset_dwords = self.offset_after(dwords);
    }

    /// Guest address of the dword `offset_dwords` past the current read
    /// pointer, wrapping at the end of the ring. `None` if uninitialized.
    pub fn addr_at_offset(&self, offset_dwords: u32) -> Option<u32> {
        if !self.is_initialized() {
            return None;
        }
        let off = self.offset_after(offset_dwords);
        Some(self.base.wrapping_add(off.wrapping_mul(4)))
    }

    /// `(read_offset_dwords + dwords) % size_dwords`, computed in 64 bits
    /// so a large `dwords` cannot overflow the sum. Callers must ensure
    /// `size_dwords != 0`.
    fn offset_after(&self, dwords: u32) -> u32 {
        let sum = u64::from(self.read_offset_dwords) + u64::from(dwords);
        // The remainder is below `size_dwords`, so it always fits in u32.
        (sum % u64::from(self.size_dwords)) as u32
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a view over a ring based at `0x4000_0000` with the given size
    /// and read/write offsets (all in dwords).
    fn ring(size_dwords: u32, read: u32, write: u32) -> RingBufferView {
        let mut ring = RingBufferView::new();
        ring.base = 0x4000_0000;
        ring.size_dwords = size_dwords;
        ring.read_offset_dwords = read;
        ring.write_offset_dwords = write;
        ring
    }

    #[test]
    fn uninitialized_view_reports_empty() {
        let fresh = RingBufferView::new();
        assert!(!fresh.is_initialized());
        assert!(!fresh.has_pending());
        assert_eq!(fresh.pending_dwords(), 0);
    }

    #[test]
    fn wrap_around_arithmetic() {
        // The write offset (2) is behind the read offset (14): the writer wrapped.
        let mut wrapped = ring(16, 14, 2);

        // Only the run up to end-of-ring is readable in one chunk.
        assert_eq!(wrapped.pending_dwords(), 2);
        wrapped.advance_read(2);
        assert_eq!(wrapped.read_offset_dwords, 0);
        // The read offset has wrapped as well; the last 2 dwords are now visible.
        assert_eq!(wrapped.pending_dwords(), 2);
    }

    #[test]
    fn addr_at_offset_wraps() {
        let small = ring(4, 3, 0);
        let expected = [0x4000_000Cu32, 0x4000_0000, 0x4000_0004];
        for (offset, addr) in expected.iter().enumerate() {
            assert_eq!(small.addr_at_offset(offset as u32), Some(*addr));
        }
    }
}
|
||||
350
crates/xenia-gpu/src/shader_metrics.rs
Normal file
350
crates/xenia-gpu/src/shader_metrics.rs
Normal file
@@ -0,0 +1,350 @@
|
||||
//! Host-side static analysis over a [`ParsedShader`], emitted once per unique
|
||||
//! shader blob. Produces the observability the plan's P3b/P3c sections call
|
||||
//! for (`gpu.shader.interpret{stage,kind}` + `gpu.shader.reject{reason}`), so
|
||||
//! the HUD can show when a game is reaching ops the WGSL interpreter falls
|
||||
//! back on.
|
||||
//!
|
||||
//! Analysis is intentionally cheap: it scans each exec clause's instruction
|
||||
//! triples, classifies them as ALU / vertex-fetch / texture-fetch using the
|
||||
//! owning clause's sequence bitmap, and bumps counters accordingly. No GPU
|
||||
//! readback is required — `reject` reasons are inferred from opcode values
|
||||
//! alone.
|
||||
|
||||
use metrics::counter;
|
||||
|
||||
use crate::ucode::alu::{decode_alu, sop, vop};
|
||||
use crate::ucode::control_flow::ControlFlowInstruction;
|
||||
use crate::ucode::fetch::{FetchInstruction, decode_fetch};
|
||||
use crate::ucode::ParsedShader;
|
||||
|
||||
/// Walk `parsed` once and emit `gpu.shader.interpret` + `gpu.shader.reject`
|
||||
/// counters. `stage` should be `"vs"` or `"ps"`.
|
||||
pub fn emit_for(parsed: &ParsedShader, stage: &'static str) {
|
||||
let mut alu_count: u64 = 0;
|
||||
let mut vfetch_count: u64 = 0;
|
||||
let mut tfetch_count: u64 = 0;
|
||||
let mut rejects: Vec<(&'static str, u64)> = Vec::new();
|
||||
|
||||
let mut features: Vec<&'static str> = Vec::new();
|
||||
for clause in &parsed.cf {
|
||||
match clause {
|
||||
ControlFlowInstruction::Exec {
|
||||
address,
|
||||
count,
|
||||
sequence,
|
||||
..
|
||||
} => {
|
||||
for i in 0..(*count as usize) {
|
||||
let triple_idx = *address as usize + i;
|
||||
let base = triple_idx * 3;
|
||||
if base + 2 >= parsed.instructions.len() {
|
||||
break;
|
||||
}
|
||||
let words = [
|
||||
parsed.instructions[base],
|
||||
parsed.instructions[base + 1],
|
||||
parsed.instructions[base + 2],
|
||||
];
|
||||
// sequence bit layout: 2 bits per triple, hi bit = is-fetch.
|
||||
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
|
||||
if is_fetch {
|
||||
match decode_fetch(words) {
|
||||
FetchInstruction::Vertex(_) => vfetch_count += 1,
|
||||
FetchInstruction::Texture(tf) => {
|
||||
tfetch_count += 1;
|
||||
match tf.dimension {
|
||||
0 => mark_feature(&mut features, "tfetch_1d"),
|
||||
2 => mark_feature(&mut features, "tfetch_3d"),
|
||||
3 => mark_feature(&mut features, "tfetch_cube"),
|
||||
_ => {}
|
||||
}
|
||||
if tf.dimension != 1 {
|
||||
bump(&mut rejects, "texfetch_dimension");
|
||||
}
|
||||
}
|
||||
FetchInstruction::Unknown { .. } => {
|
||||
bump(&mut rejects, "fetch_unknown");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
alu_count += 1;
|
||||
let alu = decode_alu(words);
|
||||
if !vec_op_supported(alu.vector_opcode) {
|
||||
bump(&mut rejects, "alu_vec_unsupported");
|
||||
}
|
||||
if !scl_op_supported(alu.scalar_opcode) {
|
||||
bump(&mut rejects, "alu_scl_unsupported");
|
||||
}
|
||||
// Feature-of-interest detection for future phases.
|
||||
// Transcendentals + kill + setp + cube/max4 are the
|
||||
// high-value signals: they tell us which of the
|
||||
// deferred capabilities Sylpheed actually exercises.
|
||||
match alu.vector_opcode {
|
||||
v if v == vop::CUBE => mark_feature(&mut features, "vec_cube"),
|
||||
v if v == vop::MAX4 => mark_feature(&mut features, "vec_max4"),
|
||||
v if v == vop::KILL_EQ
|
||||
|| v == vop::KILL_GT
|
||||
|| v == vop::KILL_GE
|
||||
|| v == vop::KILL_NE =>
|
||||
{
|
||||
mark_feature(&mut features, "vec_kill");
|
||||
}
|
||||
v if v == vop::CND_EQ
|
||||
|| v == vop::CND_GE
|
||||
|| v == vop::CND_GT =>
|
||||
{
|
||||
mark_feature(&mut features, "vec_cnd");
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
match alu.scalar_opcode {
|
||||
s if s == sop::EXP
|
||||
|| s == sop::LOG
|
||||
|| s == sop::LOGC
|
||||
|| s == sop::SIN
|
||||
|| s == sop::COS =>
|
||||
{
|
||||
mark_feature(&mut features, "scl_transcendental");
|
||||
}
|
||||
s if s == sop::RSQ
|
||||
|| s == sop::RSQC
|
||||
|| s == sop::RSQF
|
||||
|| s == sop::SQRT =>
|
||||
{
|
||||
mark_feature(&mut features, "scl_sqrt_family");
|
||||
}
|
||||
s if s == sop::SETP_EQ
|
||||
|| s == sop::SETP_NE
|
||||
|| s == sop::SETP_GT
|
||||
|| s == sop::SETP_GE
|
||||
|| s == sop::SETP_INV
|
||||
|| s == sop::SETP_POP
|
||||
|| s == sop::SETP_CLR
|
||||
|| s == sop::SETP_RSTR =>
|
||||
{
|
||||
mark_feature(&mut features, "scl_setp");
|
||||
}
|
||||
s if s == sop::KILLS_EQ
|
||||
|| s == sop::KILLS_GT
|
||||
|| s == sop::KILLS_GE
|
||||
|| s == sop::KILLS_NE
|
||||
|| s == sop::KILLS_ONE =>
|
||||
{
|
||||
mark_feature(&mut features, "scl_kills");
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
if alu.predicated {
|
||||
mark_feature(&mut features, "alu_predicated");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ControlFlowInstruction::LoopStart { .. }
|
||||
| ControlFlowInstruction::LoopEnd { .. } => {
|
||||
mark_feature(&mut features, "cf_loop");
|
||||
bump(&mut rejects, "cf_loop");
|
||||
}
|
||||
ControlFlowInstruction::CondJmp { .. } => {
|
||||
mark_feature(&mut features, "cf_cond_jmp");
|
||||
bump(&mut rejects, "cf_cond_jmp");
|
||||
}
|
||||
ControlFlowInstruction::CondCall { .. } | ControlFlowInstruction::Return => {
|
||||
mark_feature(&mut features, "cf_call_return");
|
||||
bump(&mut rejects, "cf_call_return");
|
||||
}
|
||||
ControlFlowInstruction::Unknown { .. } => {
|
||||
bump(&mut rejects, "cf_unknown");
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
counter!("gpu.shader.interpret", "stage" => stage, "kind" => "alu")
|
||||
.increment(alu_count);
|
||||
counter!("gpu.shader.interpret", "stage" => stage, "kind" => "vfetch")
|
||||
.increment(vfetch_count);
|
||||
counter!("gpu.shader.interpret", "stage" => stage, "kind" => "tfetch")
|
||||
.increment(tfetch_count);
|
||||
for (reason, n) in rejects {
|
||||
counter!("gpu.shader.reject", "stage" => stage, "reason" => reason).increment(n);
|
||||
}
|
||||
for name in features {
|
||||
counter!("gpu.feature.used", "stage" => stage, "name" => name).increment(1);
|
||||
}
|
||||
}
|
||||
|
||||
/// Append `name` to `buf` unless it is already present, so each feature is
/// listed at most once and in first-seen order.
fn mark_feature(buf: &mut Vec<&'static str>, name: &'static str) {
    let already_listed = buf.iter().any(|seen| *seen == name);
    if already_listed {
        return;
    }
    buf.push(name);
}
|
||||
|
||||
/// Add one to the tally kept for `reason` in `buf`, creating the entry with
/// a count of 1 the first time that reason is seen.
fn bump(buf: &mut Vec<(&'static str, u64)>, reason: &'static str) {
    let existing = buf.iter().position(|entry| entry.0 == reason);
    match existing {
        Some(idx) => buf[idx].1 += 1,
        None => buf.push((reason, 1)),
    }
}
|
||||
|
||||
fn vec_op_supported(op: u8) -> bool {
|
||||
matches!(
|
||||
op,
|
||||
vop::ADD
|
||||
| vop::MUL
|
||||
| vop::MAX
|
||||
| vop::MIN
|
||||
| vop::SEQ
|
||||
| vop::SGT
|
||||
| vop::SGE
|
||||
| vop::SNE
|
||||
| vop::FRC
|
||||
| vop::TRUNC
|
||||
| vop::FLOOR
|
||||
| vop::MAD
|
||||
| vop::CND_EQ
|
||||
| vop::CND_GE
|
||||
| vop::CND_GT
|
||||
| vop::DOT4
|
||||
| vop::DOT3
|
||||
| vop::DOT2_ADD
|
||||
| vop::MAX4
|
||||
| vop::KILL_EQ
|
||||
| vop::KILL_GT
|
||||
| vop::KILL_GE
|
||||
| vop::KILL_NE
|
||||
| vop::DST
|
||||
)
|
||||
}
|
||||
|
||||
fn scl_op_supported(op: u8) -> bool {
|
||||
matches!(
|
||||
op,
|
||||
sop::ADDS
|
||||
| sop::ADDS_PREV
|
||||
| sop::MULS
|
||||
| sop::MULS_PREV
|
||||
| sop::MAXS
|
||||
| sop::MINS
|
||||
| sop::SEQS
|
||||
| sop::SGTS
|
||||
| sop::SGES
|
||||
| sop::SNES
|
||||
| sop::FRCS
|
||||
| sop::TRUNCS
|
||||
| sop::FLOORS
|
||||
| sop::EXP
|
||||
| sop::LOG
|
||||
| sop::LOGC
|
||||
| sop::RCP
|
||||
| sop::RCPC
|
||||
| sop::RCPF
|
||||
| sop::RSQ
|
||||
| sop::RSQC
|
||||
| sop::RSQF
|
||||
| sop::SQRT
|
||||
| sop::SUBS
|
||||
| sop::SUBS_PREV
|
||||
| sop::SETP_EQ
|
||||
| sop::SETP_NE
|
||||
| sop::SETP_GT
|
||||
| sop::SETP_GE
|
||||
| sop::SETP_INV
|
||||
| sop::SETP_POP
|
||||
| sop::SETP_CLR
|
||||
| sop::SETP_RSTR
|
||||
| sop::KILLS_EQ
|
||||
| sop::KILLS_GT
|
||||
| sop::KILLS_GE
|
||||
| sop::KILLS_NE
|
||||
| sop::KILLS_ONE
|
||||
| sop::SIN
|
||||
| sop::COS
|
||||
| sop::RETAIN_PREV
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ucode::alu::{sop, vop};
    use crate::ucode::control_flow::ControlFlowInstruction;

    /// Pack the third ALU dword the way these tests need it: vector opcode in
    /// the low bits, scalar opcode shifted left by 6, and `0xF` at bit 12.
    fn alu_word2(vector_op: u8, scalar_op: u8) -> u32 {
        (vector_op as u32) | ((scalar_op as u32) << 6) | (0xF << 12)
    }

    /// One `Exec` clause covering two ALU triples, followed by `Exit`.
    /// Smoke test only: counter values are validated via metrics exporters
    /// elsewhere, so this just checks `emit_for` does not panic on a
    /// well-formed `ParsedShader`.
    #[test]
    fn emit_for_runs_on_synthetic_shader() {
        let add_word = alu_word2(vop::ADD, sop::ADDS);
        let cf = vec![
            ControlFlowInstruction::Exec {
                address: 0,
                count: 2,
                sequence: 0, // all ALU (no is-fetch bits)
                is_end: false,
                predicated: false,
                predicate_condition: false,
            },
            ControlFlowInstruction::Exit,
        ];
        let instructions = vec![0, 0, add_word, 0, 0, add_word];
        emit_for(&ParsedShader { cf, instructions }, "vs");
    }

    /// P8: a shader containing `LoopStart` should mark `cf_loop` as used so
    /// the HUD can surface which deferred feature a game triggers. Smoke test
    /// only: it checks the shader is processed without panicking.
    #[test]
    fn feature_detection_flags_loops_and_kills() {
        let kill_word = alu_word2(vop::KILL_EQ, sop::RETAIN_PREV);
        let cf = vec![
            ControlFlowInstruction::LoopStart {
                address: 0,
                loop_id: 0,
            },
            ControlFlowInstruction::Exec {
                address: 0,
                count: 1,
                sequence: 0,
                is_end: true,
                predicated: false,
                predicate_condition: false,
            },
        ];
        let instructions = vec![0, 0, kill_word];
        emit_for(&ParsedShader { cf, instructions }, "ps");
    }

    #[test]
    fn unsupported_ops_classified_as_rejects() {
        // Opcode 63 is outside our supported sets for both pipes.
        let bogus_word = alu_word2(63, 63);
        let cf = vec![ControlFlowInstruction::Exec {
            address: 0,
            count: 1,
            sequence: 0,
            is_end: true,
            predicated: false,
            predicate_condition: false,
        }];
        let instructions = vec![0, 0, bogus_word];
        let shader = ParsedShader { cf, instructions };

        // Confirm the static tables reject op 63, then smoke-run the shader.
        assert!(!vec_op_supported(63));
        assert!(!scl_op_supported(63));
        emit_for(&shader, "ps");
    }
}
|
||||
36
crates/xenia-gpu/src/shaders/mod.rs
Normal file
36
crates/xenia-gpu/src/shaders/mod.rs
Normal file
@@ -0,0 +1,36 @@
|
||||
//! Embedded WGSL shader sources used by the host pipeline.
|
||||
|
||||
/// Xenos uber-shader scaffold (P3). See the comment at the top of
|
||||
/// [`XENOS_INTERP_WGSL`] for scope/limits and the plan's P3b target state.
|
||||
pub const XENOS_INTERP_WGSL: &str = include_str!("xenos_interp.wgsl");
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Running the embedded source through naga's WGSL front end catches
    /// typos and layout mistakes at `cargo test` time without building a
    /// full pipeline.
    #[test]
    fn xenos_interp_wgsl_parses() {
        let module = naga::front::wgsl::parse_str(XENOS_INTERP_WGSL)
            .expect("xenos_interp.wgsl must parse cleanly");

        // True if the module exposes an entry point with this name and stage.
        let has_entry = |name: &str, stage: naga::ShaderStage| {
            module
                .entry_points
                .iter()
                .any(|entry| entry.name == name && entry.stage == stage)
        };

        // Sanity: at least one entry point, and both expected ones are there.
        assert!(!module.entry_points.is_empty());
        assert!(
            has_entry("vs_main", naga::ShaderStage::Vertex),
            "missing vs_main entry"
        );
        assert!(
            has_entry("fs_main", naga::ShaderStage::Fragment),
            "missing fs_main entry"
        );
    }
}
|
||||
1036
crates/xenia-gpu/src/shaders/xenos_interp.wgsl
Normal file
1036
crates/xenia-gpu/src/shaders/xenos_interp.wgsl
Normal file
File diff suppressed because it is too large
Load Diff
970
crates/xenia-gpu/src/texture_cache.rs
Normal file
970
crates/xenia-gpu/src/texture_cache.rs
Normal file
@@ -0,0 +1,970 @@
|
||||
//! Texture cache — P5.
|
||||
//!
|
||||
//! Two-layer design mirroring canary's `TextureCache`:
|
||||
//!
|
||||
//! * **CPU layer** (this module): owns decoded, linear, host-endian texel
|
||||
//! byte buffers keyed by [`TextureKey`]. `ensure_cached` consults the
|
||||
//! guest memory's page-version counter to decide whether the cached
|
||||
//! bytes are still fresh and re-decodes on miss or staleness.
|
||||
//! * **GPU layer** (xenia-ui `texture_cache_host`): owns the
|
||||
//! `wgpu::Texture` + `TextureView` for each cached key; pulls decoded
|
||||
//! bytes from this CPU layer on upload.
|
||||
//!
|
||||
//! Canary references: `texture_cache.h/.cc`, `texture_info.cc`, and
|
||||
//! `texture_info_formats.inl` for the format table.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::tiled_address;
|
||||
|
||||
/// Xenos texture formats — `xenos::TextureFormat` at `xenos.h:489-579`.
|
||||
/// Values are the raw enum numbers the guest writes into
|
||||
/// `xe_gpu_texture_fetch_t.format`.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
#[repr(u8)]
|
||||
pub enum TextureFormat {
|
||||
K1Reverse = 0,
|
||||
K1 = 1,
|
||||
K8 = 2,
|
||||
K1555 = 3,
|
||||
K565 = 4,
|
||||
K6_5_5 = 5,
|
||||
K8888 = 6,
|
||||
K1010102 = 7,
|
||||
K8_8 = 10,
|
||||
K4_4_4_4 = 15,
|
||||
K10_11_11 = 16,
|
||||
K11_11_10 = 17,
|
||||
Dxt1 = 18,
|
||||
Dxt2_3 = 19,
|
||||
Dxt4_5 = 20,
|
||||
K24_8 = 22,
|
||||
K24_8Float = 23,
|
||||
K16 = 24,
|
||||
K16_16 = 25,
|
||||
K16_16_16_16 = 26,
|
||||
K16Float = 30,
|
||||
K16_16Float = 31,
|
||||
K16_16_16_16Float = 32,
|
||||
K32 = 33,
|
||||
K32_32 = 34,
|
||||
K32_32_32_32 = 35,
|
||||
K32Float = 36,
|
||||
K32_32Float = 37,
|
||||
K32_32_32_32Float = 38,
|
||||
Unknown(u8),
|
||||
}
|
||||
|
||||
impl TextureFormat {
|
||||
pub fn from_raw(v: u8) -> Self {
|
||||
use TextureFormat::*;
|
||||
match v & 0x3F {
|
||||
0 => K1Reverse,
|
||||
1 => K1,
|
||||
2 => K8,
|
||||
3 => K1555,
|
||||
4 => K565,
|
||||
5 => K6_5_5,
|
||||
6 => K8888,
|
||||
7 => K1010102,
|
||||
10 => K8_8,
|
||||
15 => K4_4_4_4,
|
||||
16 => K10_11_11,
|
||||
17 => K11_11_10,
|
||||
18 => Dxt1,
|
||||
19 => Dxt2_3,
|
||||
20 => Dxt4_5,
|
||||
22 => K24_8,
|
||||
23 => K24_8Float,
|
||||
24 => K16,
|
||||
25 => K16_16,
|
||||
26 => K16_16_16_16,
|
||||
30 => K16Float,
|
||||
31 => K16_16Float,
|
||||
32 => K16_16_16_16Float,
|
||||
33 => K32,
|
||||
34 => K32_32,
|
||||
35 => K32_32_32_32,
|
||||
36 => K32Float,
|
||||
37 => K32_32Float,
|
||||
38 => K32_32_32_32Float,
|
||||
other => Unknown(other),
|
||||
}
|
||||
}
|
||||
|
||||
/// Block width/height in texels + bytes-per-block. For uncompressed
|
||||
/// formats block_w = block_h = 1. For DXT formats block_w = block_h =
|
||||
/// 4 (one 4×4 compressed block).
|
||||
pub fn block_info(self) -> BlockInfo {
|
||||
use TextureFormat::*;
|
||||
match self {
|
||||
K1Reverse | K1 => BlockInfo::new(1, 1, 1), // round up to 1 byte
|
||||
K8 => BlockInfo::new(1, 1, 1),
|
||||
K1555 | K565 | K6_5_5 | K4_4_4_4 | K16 | K16Float | K8_8 => BlockInfo::new(1, 1, 2),
|
||||
K8888 | K1010102 | K10_11_11 | K11_11_10 | K24_8 | K24_8Float | K16_16
|
||||
| K16_16Float | K32 | K32Float => BlockInfo::new(1, 1, 4),
|
||||
K16_16_16_16 | K16_16_16_16Float | K32_32 | K32_32Float => BlockInfo::new(1, 1, 8),
|
||||
K32_32_32_32 | K32_32_32_32Float => BlockInfo::new(1, 1, 16),
|
||||
Dxt1 => BlockInfo::new(4, 4, 8),
|
||||
Dxt2_3 | Dxt4_5 => BlockInfo::new(4, 4, 16),
|
||||
Unknown(_) => BlockInfo::new(1, 1, 4), // safe-ish fallback
|
||||
}
|
||||
}
|
||||
|
||||
/// True iff this format lands on a wgpu texture format we can
|
||||
/// natively bind — no CPU-side conversion per frame required. M5
|
||||
/// adds `k_5_6_5` (CPU-expanded to `Rgba8Unorm` on decode; still
|
||||
/// counts as supported for the host-cache wiring), `k_DXT2_3`
|
||||
/// (BC2), and `k_DXT4_5` (BC3).
|
||||
pub fn is_host_supported(self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
TextureFormat::K8888
|
||||
| TextureFormat::K565
|
||||
| TextureFormat::Dxt1
|
||||
| TextureFormat::Dxt2_3
|
||||
| TextureFormat::Dxt4_5
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Block geometry of a texture format: block width and height in texels and
/// the number of bytes one block occupies.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct BlockInfo {
    pub block_w: u8,
    pub block_h: u8,
    pub bytes_per_block: u8,
}

impl BlockInfo {
    /// Bundle the three block parameters; usable in `const` contexts.
    pub const fn new(block_w: u8, block_h: u8, bytes_per_block: u8) -> Self {
        Self {
            block_w,
            block_h,
            bytes_per_block,
        }
    }

    /// Base-2 logarithm of `bytes_per_block` for the sizes 1, 2, 4, 8 and 16;
    /// every other size yields 0.
    pub fn log2_bpb(self) -> u32 {
        match self.bytes_per_block {
            // For these powers of two the log is the trailing-zero count.
            bpb @ (1 | 2 | 4 | 8 | 16) => bpb.trailing_zeros(),
            _ => 0,
        }
    }
}
|
||||
|
||||
/// Xenos `Endian` enum from `xenos.h:198-204`. 2-bit field in fetch dword 1.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Endian {
    None = 0,
    Swap8In16 = 1,
    Swap8In32 = 2,
    Swap16In32 = 3,
}

impl Endian {
    /// Decode the 2-bit endian field; bits of `v` above the low two are
    /// ignored.
    pub fn from_raw(v: u8) -> Self {
        match v & 0x3 {
            0 => Endian::None,
            1 => Endian::Swap8In16,
            2 => Endian::Swap8In32,
            // The mask leaves only 3 here.
            _ => Endian::Swap16In32,
        }
    }

    /// Apply this endian mode's byte swap to one 32-bit unit. Matches
    /// canary's `shaders/endian.xesli:25-55` semantics; the WGSL translator
    /// uses the same mask-shift pattern.
    pub fn swap32(self, v: u32) -> u32 {
        match self {
            Endian::None => v,
            // Exchange the two bytes inside each 16-bit half.
            Endian::Swap8In16 => ((v << 8) & 0xFF00_FF00) | ((v >> 8) & 0x00FF_00FF),
            // Reverse all four bytes.
            Endian::Swap8In32 => u32::from_be_bytes(v.to_le_bytes()),
            // Exchange the two 16-bit halves.
            Endian::Swap16In32 => (v << 16) | (v >> 16),
        }
    }
}
|
||||
|
||||
/// Texture dimensionality (`xenos::DataDimension`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Dimension {
    D1 = 0,
    D2 = 1,
    D3Stacked = 2,
    Cube = 3,
}

impl Dimension {
    /// Decode the 2-bit dimension field; bits of `v` above the low two are
    /// ignored.
    pub fn from_raw(v: u8) -> Self {
        match v & 0x3 {
            0 => Dimension::D1,
            1 => Dimension::D2,
            2 => Dimension::D3Stacked,
            // The mask leaves only 3 here.
            _ => Dimension::Cube,
        }
    }
}
|
||||
|
||||
/// Identity of a cached texture. Matches canary's `TextureCache::TextureKey`
|
||||
/// at the semantic level — we exclude mip/border state for P5 since neither
|
||||
/// is populated yet.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct TextureKey {
|
||||
/// Guest physical base (byte address — already shifted left by 12 from
|
||||
/// the fetch-constant `base_address` field).
|
||||
pub base_address: u32,
|
||||
pub width: u16,
|
||||
pub height: u16,
|
||||
pub depth_or_slices: u16,
|
||||
pub format: TextureFormat,
|
||||
pub endian: Endian,
|
||||
pub dimension: Dimension,
|
||||
pub tiled: bool,
|
||||
/// Row pitch in texels, already aligned to 32. Canary stores pitch/32
|
||||
/// in the fetch constant; we keep the raw texel count to avoid
|
||||
/// callers remembering to shift.
|
||||
pub pitch_texels: u16,
|
||||
}
|
||||
|
||||
/// Decode a 6-dword texture fetch constant (layout at `xenos.h:1229-1329`).
|
||||
/// Returns `None` if the constant is obviously unset (all zeros) or if
|
||||
/// `type` is not the texture-constant marker.
|
||||
pub fn decode_fetch_constant(dwords: [u32; 6]) -> Option<TextureKey> {
|
||||
let d0 = dwords[0];
|
||||
let d1 = dwords[1];
|
||||
let d2 = dwords[2];
|
||||
let d5 = dwords[5];
|
||||
|
||||
// type: low 2 bits of dword 0 should be 2 (texture) per canary —
|
||||
// 0 = vertex, 2 = texture. An all-zero constant reads as type 0 so
|
||||
// `None` filters it out here.
|
||||
let ty = d0 & 0x3;
|
||||
if d0 == 0 && d1 == 0 {
|
||||
return None;
|
||||
}
|
||||
// Not a texture constant (e.g. 0 = vertex fetch constant reused).
|
||||
if ty != 2 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let pitch_5 = (d0 >> 22) & 0x1FF; // pitch/32 in texels
|
||||
let tiled = ((d0 >> 31) & 1) != 0;
|
||||
let format = TextureFormat::from_raw((d1 & 0x3F) as u8);
|
||||
let endian = Endian::from_raw(((d1 >> 6) & 0x3) as u8);
|
||||
let base_address = (d1 >> 12) << 12; // base >> 12, re-shifted.
|
||||
let dim = Dimension::from_raw(((d5 >> 9) & 0x3) as u8);
|
||||
|
||||
// Size decode depends on dimension.
|
||||
let (width, height, depth) = match dim {
|
||||
Dimension::D1 => ((d2 & 0x00FF_FFFF) as u16 + 1, 1u16, 1u16),
|
||||
Dimension::D2 => (
|
||||
(d2 & 0x1FFF) as u16 + 1,
|
||||
((d2 >> 13) & 0x1FFF) as u16 + 1,
|
||||
((d2 >> 26) & 0x3F) as u16 + 1,
|
||||
),
|
||||
Dimension::D3Stacked | Dimension::Cube => (
|
||||
(d2 & 0x7FF) as u16 + 1,
|
||||
((d2 >> 11) & 0x7FF) as u16 + 1,
|
||||
((d2 >> 22) & 0x3FF) as u16 + 1,
|
||||
),
|
||||
};
|
||||
|
||||
Some(TextureKey {
|
||||
base_address,
|
||||
width,
|
||||
height,
|
||||
depth_or_slices: depth,
|
||||
format,
|
||||
endian,
|
||||
dimension: dim,
|
||||
tiled,
|
||||
pitch_texels: ((pitch_5 as u16) * 32).max(width),
|
||||
})
|
||||
}
|
||||
|
||||
/// Decoded, linear, host-endian texture bytes ready for wgpu upload.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CachedTexture {
|
||||
pub key: TextureKey,
|
||||
pub version_when_uploaded: u64,
|
||||
/// Tightly packed. Layout depends on `key.format`:
|
||||
/// - `K8888` → `width*height*4` bytes in Rgba8Unorm order.
|
||||
/// - `Dxt1` → `ceil(w/4)*ceil(h/4)*8` bytes of raw BC1 blocks, after
|
||||
/// block-level detile + dword-endian swap.
|
||||
pub bytes: Vec<u8>,
|
||||
}
|
||||
|
||||
impl CachedTexture {
|
||||
pub fn byte_size(&self) -> usize {
|
||||
self.bytes.len()
|
||||
}
|
||||
}
|
||||
|
||||
/// Reasons a texture decode can fail. The `ensure_cached` caller maps these
/// to `gpu.texture.reject{reason}` metrics so the HUD surfaces when a texture
/// fell back.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecodeError {
    /// Presumably: no decoder exists for the texture's format. The decoders
    /// shown alongside this enum never return it themselves.
    UnsupportedFormat,
    /// The guest read came back short, or detiling reported an error.
    OutOfBounds,
    /// The key's width or height is zero.
    ZeroSize,
}
|
||||
|
||||
/// Read `len` bytes from guest memory starting at `addr`. Returns `None`
|
||||
/// if the span would exceed the memory's reported end; otherwise returns
|
||||
/// a freshly-allocated buffer with the bytes.
|
||||
///
|
||||
/// The `MemoryAccess` trait exposes per-byte reads only; we batch them in
|
||||
/// a single pass to avoid the per-byte virtual dispatch overhead for large
|
||||
/// textures (1 MiB frontbuffer = 1M dispatch calls).
|
||||
pub fn read_guest_bytes(
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
addr: u32,
|
||||
len: usize,
|
||||
) -> Vec<u8> {
|
||||
let mut out = Vec::with_capacity(len);
|
||||
for i in 0..len {
|
||||
let a = addr.wrapping_add(i as u32);
|
||||
out.push(mem.read_u8(a));
|
||||
if a < addr {
|
||||
// 32-bit overflow; unmap the tail.
|
||||
break;
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// Byte-swap the 32-bit dwords of `buf` in place according to `endian`.
|
||||
/// `buf.len()` should be a multiple of 4; tail bytes are left untouched.
|
||||
pub fn apply_endian_32(buf: &mut [u8], endian: Endian) {
|
||||
if matches!(endian, Endian::None) {
|
||||
return;
|
||||
}
|
||||
let mut i = 0;
|
||||
while i + 4 <= buf.len() {
|
||||
let v = u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]);
|
||||
let swapped = endian.swap32(v);
|
||||
buf[i..i + 4].copy_from_slice(&swapped.to_le_bytes());
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
|
||||
/// Decode a k_8_8_8_8 texture out of guest memory into `Rgba8Unorm` bytes.
|
||||
/// Applies Xenos→host channel swizzle (Xbox 360 stores BGRA in memory →
|
||||
/// we emit RGBA for wgpu) and the declared endian swap, then detiles via
|
||||
/// the Xenos Tiled2D formula.
|
||||
pub fn decode_k8888_tiled(
|
||||
key: &TextureKey,
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
) -> Result<Vec<u8>, DecodeError> {
|
||||
if key.width == 0 || key.height == 0 {
|
||||
return Err(DecodeError::ZeroSize);
|
||||
}
|
||||
let w = key.width as u32;
|
||||
let h = key.height as u32;
|
||||
let pitch_aligned = tiled_address::align_pitch_to_macro_tile(key.pitch_texels as u32);
|
||||
let total_bytes = (pitch_aligned * h * 4) as usize;
|
||||
let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
|
||||
if raw.len() < total_bytes {
|
||||
return Err(DecodeError::OutOfBounds);
|
||||
}
|
||||
apply_endian_32(&mut raw, key.endian);
|
||||
let mut linear = vec![0u8; (w * h * 4) as usize];
|
||||
if key.tiled {
|
||||
if tiled_address::detile_2d(&raw, &mut linear, w, h, pitch_aligned, 4).is_err() {
|
||||
return Err(DecodeError::OutOfBounds);
|
||||
}
|
||||
} else {
|
||||
// Non-tiled copy row-by-row honoring pitch.
|
||||
for y in 0..h as usize {
|
||||
let src = y * (pitch_aligned as usize) * 4;
|
||||
let dst = y * (w as usize) * 4;
|
||||
linear[dst..dst + (w as usize) * 4]
|
||||
.copy_from_slice(&raw[src..src + (w as usize) * 4]);
|
||||
}
|
||||
}
|
||||
// Xenos stores `k_8_8_8_8` in ARGB byte order (high nibble = A). After
|
||||
// endian.Swap8In32 guests' typical per-dword byte order becomes BGRA
|
||||
// in little-endian host bytes. Swap B↔R so we hand Rgba8Unorm to wgpu.
|
||||
for px in linear.chunks_exact_mut(4) {
|
||||
px.swap(0, 2);
|
||||
}
|
||||
Ok(linear)
|
||||
}
|
||||
|
||||
/// Decode a DXT-compressed texture to raw block bytes (no format
|
||||
/// conversion — wgpu understands `Bc{1,2,3}RgbaUnorm` natively so the
|
||||
/// GPU does the actual decompression on upload).
|
||||
///
|
||||
/// Xenos stores DXT blocks in 4×4 block-tiled order using the Tiled2D
|
||||
/// formula, with stride counted in blocks. `bytes_per_block` is 8 for
|
||||
/// BC1 (DXT1), 16 for BC2 (DXT2_3) and BC3 (DXT4_5).
|
||||
pub fn decode_dxt_tiled(
|
||||
key: &TextureKey,
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
bytes_per_block: u32,
|
||||
) -> Result<Vec<u8>, DecodeError> {
|
||||
if key.width == 0 || key.height == 0 {
|
||||
return Err(DecodeError::ZeroSize);
|
||||
}
|
||||
let block_w = 4u32;
|
||||
let block_h = 4u32;
|
||||
let w_blocks = (key.width as u32).div_ceil(block_w);
|
||||
let h_blocks = (key.height as u32).div_ceil(block_h);
|
||||
let pitch_blocks = tiled_address::align_pitch_to_macro_tile(
|
||||
(key.pitch_texels as u32).div_ceil(block_w),
|
||||
);
|
||||
let total_bytes = (pitch_blocks * h_blocks * bytes_per_block) as usize;
|
||||
let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
|
||||
if raw.len() < total_bytes {
|
||||
return Err(DecodeError::OutOfBounds);
|
||||
}
|
||||
// DXT blocks are stored as 4×u16 + 4×u8-indices (BC1) or similar
|
||||
// u16/u32-width fields for BC2/BC3; the Xbox 360's big-endian word
|
||||
// order requires an endian swap at the u16/u32 level regardless of
|
||||
// which BC-family format.
|
||||
apply_endian_32(&mut raw, key.endian);
|
||||
|
||||
let mut out = vec![0u8; (w_blocks * h_blocks * bytes_per_block) as usize];
|
||||
if key.tiled {
|
||||
if tiled_address::detile_2d(
|
||||
&raw,
|
||||
&mut out,
|
||||
w_blocks,
|
||||
h_blocks,
|
||||
pitch_blocks,
|
||||
bytes_per_block,
|
||||
)
|
||||
.is_err()
|
||||
{
|
||||
return Err(DecodeError::OutOfBounds);
|
||||
}
|
||||
} else {
|
||||
for y in 0..h_blocks as usize {
|
||||
let src = y * (pitch_blocks as usize) * (bytes_per_block as usize);
|
||||
let dst = y * (w_blocks as usize) * (bytes_per_block as usize);
|
||||
out[dst..dst + (w_blocks as usize) * (bytes_per_block as usize)]
|
||||
.copy_from_slice(&raw[src..src + (w_blocks as usize) * (bytes_per_block as usize)]);
|
||||
}
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/// BC1 / DXT1 — 8-byte blocks.
|
||||
pub fn decode_dxt1_tiled(
|
||||
key: &TextureKey,
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
) -> Result<Vec<u8>, DecodeError> {
|
||||
decode_dxt_tiled(key, mem, 8)
|
||||
}
|
||||
|
||||
/// BC2 / DXT2_3 — 16-byte blocks.
|
||||
pub fn decode_dxt23_tiled(
|
||||
key: &TextureKey,
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
) -> Result<Vec<u8>, DecodeError> {
|
||||
decode_dxt_tiled(key, mem, 16)
|
||||
}
|
||||
|
||||
/// BC3 / DXT4_5 — 16-byte blocks.
|
||||
pub fn decode_dxt45_tiled(
|
||||
key: &TextureKey,
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
) -> Result<Vec<u8>, DecodeError> {
|
||||
decode_dxt_tiled(key, mem, 16)
|
||||
}
|
||||
|
||||
/// **k_5_6_5** — 16-bit R:5 G:6 B:5 per texel (Xbox stores R in the high
|
||||
/// 5 bits of the 16-bit word). We unpack each texel into 4 bytes of
|
||||
/// `Rgba8Unorm` (A = 0xFF). wgpu doesn't ship `R5G6B5Unorm` as a
|
||||
/// sampled texture format on every backend, so CPU-side conversion is
|
||||
/// the safe path even if it's 2× the texture memory.
|
||||
///
|
||||
/// Tiling: Tiled2D at the **texel** level (block = 1 texel = 2 bytes),
|
||||
/// then we expand each 2-byte u16 into the 4-byte Rgba8 in the linear
|
||||
/// output buffer.
|
||||
pub fn decode_k565_tiled(
|
||||
key: &TextureKey,
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
) -> Result<Vec<u8>, DecodeError> {
|
||||
if key.width == 0 || key.height == 0 {
|
||||
return Err(DecodeError::ZeroSize);
|
||||
}
|
||||
let w = key.width as u32;
|
||||
let h = key.height as u32;
|
||||
// Pitch/block counts — block = 1 texel here, 2 bytes.
|
||||
let pitch_aligned = tiled_address::align_pitch_to_macro_tile(key.pitch_texels as u32);
|
||||
let total_bytes = (pitch_aligned * h * 2) as usize;
|
||||
let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
|
||||
if raw.len() < total_bytes {
|
||||
return Err(DecodeError::OutOfBounds);
|
||||
}
|
||||
// 16-bit word order is endian-swap-sensitive.
|
||||
apply_endian_32(&mut raw, key.endian);
|
||||
// Step 1: detile (bytes_per_block=2, tile in blocks=texels).
|
||||
let mut linear_u16 = vec![0u8; (w * h * 2) as usize];
|
||||
if key.tiled {
|
||||
if tiled_address::detile_2d(&raw, &mut linear_u16, w, h, pitch_aligned, 2).is_err() {
|
||||
return Err(DecodeError::OutOfBounds);
|
||||
}
|
||||
} else {
|
||||
for y in 0..h as usize {
|
||||
let src = y * (pitch_aligned as usize) * 2;
|
||||
let dst = y * (w as usize) * 2;
|
||||
linear_u16[dst..dst + (w as usize) * 2]
|
||||
.copy_from_slice(&raw[src..src + (w as usize) * 2]);
|
||||
}
|
||||
}
|
||||
// Step 2: expand each 16-bit RGB565 to Rgba8Unorm. The in-memory u16
|
||||
// is little-endian after `apply_endian_32` has normalized the word
|
||||
// order (we keep host-native byte ordering post-swap).
|
||||
let mut rgba = vec![0u8; (w * h * 4) as usize];
|
||||
for y in 0..h as usize {
|
||||
for x in 0..w as usize {
|
||||
let off = (y * w as usize + x) * 2;
|
||||
let lo = linear_u16[off];
|
||||
let hi = linear_u16[off + 1];
|
||||
let word = u16::from_le_bytes([lo, hi]);
|
||||
// 5 bits R (bits 11-15), 6 bits G (5-10), 5 bits B (0-4).
|
||||
// Expand to full-range u8: replicate high bits into low
|
||||
// (so 0b11111 → 0xFF, matching the standard 565→888 convention).
|
||||
let r5 = ((word >> 11) & 0x1F) as u8;
|
||||
let g6 = ((word >> 5) & 0x3F) as u8;
|
||||
let b5 = (word & 0x1F) as u8;
|
||||
let r = (r5 << 3) | (r5 >> 2);
|
||||
let g = (g6 << 2) | (g6 >> 4);
|
||||
let b = (b5 << 3) | (b5 >> 2);
|
||||
let o = (y * w as usize + x) * 4;
|
||||
rgba[o] = r;
|
||||
rgba[o + 1] = g;
|
||||
rgba[o + 2] = b;
|
||||
rgba[o + 3] = 0xFF;
|
||||
}
|
||||
}
|
||||
Ok(rgba)
|
||||
}
|
||||
|
||||
/// Version-aware CPU-side texture cache. Entries are keyed on
|
||||
/// `TextureKey.hash` and carry a `version_when_uploaded` watermark against
|
||||
/// the guest memory's page-version counter. `ensure_cached` queries
|
||||
/// `GuestMemory::max_page_version` over the texture's byte span; if the
|
||||
/// span has been written since cache time, the entry is re-decoded.
|
||||
pub struct TextureCache {
|
||||
entries: HashMap<TextureKey, CachedTexture>,
|
||||
/// Monotonic counter of decodes performed — HUD surface.
|
||||
pub decodes_total: u64,
|
||||
/// Count of stale-miss re-decodes.
|
||||
pub restale_total: u64,
|
||||
}
|
||||
|
||||
impl Default for TextureCache {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl TextureCache {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
entries: HashMap::new(),
|
||||
decodes_total: 0,
|
||||
restale_total: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.entries.len()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.entries.is_empty()
|
||||
}
|
||||
|
||||
pub fn get(&self, key: &TextureKey) -> Option<&CachedTexture> {
|
||||
self.entries.get(key)
|
||||
}
|
||||
|
||||
/// Return a cached (or freshly-decoded) texture. The caller supplies
|
||||
/// the current guest-memory page version covering the texture span;
|
||||
/// see [`max_page_version_for`].
|
||||
pub fn ensure_cached(
|
||||
&mut self,
|
||||
key: TextureKey,
|
||||
current_version: u64,
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
) -> Result<&CachedTexture, DecodeError> {
|
||||
// Fast path: fresh entry exists.
|
||||
if let Some(e) = self.entries.get(&key) {
|
||||
if e.version_when_uploaded >= current_version {
|
||||
return Ok(self.entries.get(&key).unwrap());
|
||||
}
|
||||
self.restale_total += 1;
|
||||
}
|
||||
let bytes = match key.format {
|
||||
TextureFormat::K8888 => decode_k8888_tiled(&key, mem)?,
|
||||
TextureFormat::K565 => decode_k565_tiled(&key, mem)?,
|
||||
TextureFormat::Dxt1 => decode_dxt1_tiled(&key, mem)?,
|
||||
TextureFormat::Dxt2_3 => decode_dxt23_tiled(&key, mem)?,
|
||||
TextureFormat::Dxt4_5 => decode_dxt45_tiled(&key, mem)?,
|
||||
_ => return Err(DecodeError::UnsupportedFormat),
|
||||
};
|
||||
self.decodes_total += 1;
|
||||
let entry = CachedTexture {
|
||||
key,
|
||||
version_when_uploaded: current_version,
|
||||
bytes,
|
||||
};
|
||||
self.entries.insert(key, entry);
|
||||
Ok(self.entries.get(&key).unwrap())
|
||||
}
|
||||
|
||||
pub fn byte_budget(&self) -> usize {
|
||||
self.entries.values().map(|e| e.byte_size()).sum()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    use std::cell::Cell;

    /// Flat byte array standing in for guest memory. `Cell` gives interior
    /// mutability so writes work through `&self`. Multi-byte accessors are
    /// big-endian; out-of-range reads yield 0 and out-of-range writes are
    /// dropped.
    struct FakeMem(Box<[Cell<u8>]>);

    impl FakeMem {
        fn from_vec(v: Vec<u8>) -> Self {
            FakeMem(v.into_iter().map(Cell::new).collect())
        }
    }

    impl xenia_memory::MemoryAccess for FakeMem {
        fn read_u8(&self, a: u32) -> u8 {
            self.0.get(a as usize).map(|c| c.get()).unwrap_or(0)
        }
        fn read_u16(&self, a: u32) -> u16 {
            u16::from_be_bytes([self.read_u8(a), self.read_u8(a + 1)])
        }
        fn read_u32(&self, a: u32) -> u32 {
            u32::from_be_bytes([
                self.read_u8(a),
                self.read_u8(a + 1),
                self.read_u8(a + 2),
                self.read_u8(a + 3),
            ])
        }
        fn read_u64(&self, a: u32) -> u64 {
            u64::from_be_bytes([
                self.read_u8(a),
                self.read_u8(a + 1),
                self.read_u8(a + 2),
                self.read_u8(a + 3),
                self.read_u8(a + 4),
                self.read_u8(a + 5),
                self.read_u8(a + 6),
                self.read_u8(a + 7),
            ])
        }
        fn write_u8(&self, a: u32, v: u8) {
            if let Some(slot) = self.0.get(a as usize) {
                slot.set(v);
            }
        }
        fn write_u16(&self, a: u32, v: u16) {
            let b = v.to_be_bytes();
            self.write_u8(a, b[0]);
            self.write_u8(a + 1, b[1]);
        }
        fn write_u32(&self, a: u32, v: u32) {
            for (i, byte) in v.to_be_bytes().iter().enumerate() {
                self.write_u8(a + i as u32, *byte);
            }
        }
        fn write_u64(&self, a: u32, v: u64) {
            for (i, byte) in v.to_be_bytes().iter().enumerate() {
                self.write_u8(a + i as u32, *byte);
            }
        }
        fn translate(&self, _: u32) -> Option<*const u8> {
            None
        }
        fn translate_mut(&self, _: u32) -> Option<*mut u8> {
            None
        }
    }

    #[test]
    fn format_block_info_matches_canary_expectations() {
        assert_eq!(
            TextureFormat::K8888.block_info(),
            BlockInfo::new(1, 1, 4)
        );
        assert_eq!(TextureFormat::Dxt1.block_info(), BlockInfo::new(4, 4, 8));
        assert_eq!(
            TextureFormat::Dxt4_5.block_info(),
            BlockInfo::new(4, 4, 16)
        );
    }

    #[test]
    fn endian_swap_variants() {
        assert_eq!(Endian::None.swap32(0x11223344), 0x11223344);
        assert_eq!(Endian::Swap8In16.swap32(0x11223344), 0x22114433);
        assert_eq!(Endian::Swap8In32.swap32(0x11223344), 0x44332211);
        assert_eq!(Endian::Swap16In32.swap32(0x11223344), 0x33441122);
    }

    #[test]
    fn decode_fetch_constant_rejects_empty() {
        let z = [0u32; 6];
        assert!(decode_fetch_constant(z).is_none());
    }

    #[test]
    fn decode_fetch_constant_parses_2d_k8888() {
        // Build a synthetic k_8_8_8_8 2D texture fetch constant:
        // dword0: pitch_5=40 (1280/32), tiled=1, type=2
        // dword1: format=6 (K8888), endian=2 (Swap8In32), base=0xAB000>>12
        // dword2: width-1=1279, height-1=719
        // dword5: dimension=1 (2D)
        let d0 = 0x8000_0000 | (40u32 << 22) | 2;
        let d1 = (0xAB000u32 >> 12 << 12) | (2u32 << 6) | 6u32;
        let d2 = 1279u32 | ((719u32) << 13);
        let d5 = 1u32 << 9;
        let k = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("parsed");
        assert_eq!(k.format, TextureFormat::K8888);
        assert_eq!(k.endian, Endian::Swap8In32);
        assert_eq!(k.width, 1280);
        assert_eq!(k.height, 720);
        assert_eq!(k.dimension, Dimension::D2);
        assert!(k.tiled);
        assert_eq!(k.pitch_texels, 1280);
    }

    #[test]
    fn decode_k8888_roundtrip_linear() {
        // Build a 4×4 non-tiled image with pitch=32 (one macro-tile row).
        // Each pixel at (x, y) stores ARGB = (0xFF, x, y, y*4+x) as a
        // big-endian dword. After Swap8In32 + B↔R swizzle, out[off..] must
        // be (x, y, y*4+x, 0xFF) in RGBA order.
        let w = 4u32;
        let h = 4u32;
        let pitch = 32u32;
        let mut bytes = vec![0u8; (pitch * h * 4) as usize];
        for y in 0..h {
            for x in 0..w {
                let off = ((y * pitch + x) * 4) as usize;
                let argb = (0xFFu32 << 24) | (x << 16) | (y << 8) | (y * 4 + x);
                bytes[off..off + 4].copy_from_slice(&argb.to_be_bytes());
            }
        }
        let mem = FakeMem::from_vec(bytes);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::K8888,
            endian: Endian::Swap8In32,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: pitch as u16,
        };
        let out = decode_k8888_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 16 * 4);
        assert_eq!(&out[0..4], &[0, 0, 0, 0xFF]);
        let off = ((3 * 4 + 3) * 4) as usize;
        assert_eq!(&out[off..off + 4], &[3, 3, 15, 0xFF]);
    }

    // ── First-Pixels M5 format tests ──────────────────────────────

    /// BC2 (DXT2_3) roundtrip: 16-byte blocks, 4×4 image = 1 block.
    /// The block holds the bytes 0, 1, …, 15; with no endian swap the
    /// decoder must hand the same 16 bytes back (pure passthrough).
    #[test]
    fn decode_dxt23_small_roundtrip() {
        // 4×4 texture = 1 BC2 block (16 bytes). With pitch_texels=32
        // (macro-tile-aligned) the block pitch is 8 (=32/4), and we
        // allocate 8*1*16 = 128 bytes of source.
        let mut bytes = vec![0u8; 128];
        for (i, b) in bytes.iter_mut().enumerate().take(16) {
            *b = i as u8;
        }
        let mem = FakeMem::from_vec(bytes);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::Dxt2_3,
            endian: Endian::None, // no swap — we can eyeball passthrough
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        let out = decode_dxt23_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 16); // 1 block × 16 bytes
        for (i, byte) in out.iter().enumerate() {
            assert_eq!(*byte, i as u8);
        }
    }

    /// BC3 (DXT4_5) uses the same 16-byte block infra as BC2; a
    /// parallel test prevents a regression that sneaks up via the
    /// generic `decode_dxt_tiled`.
    #[test]
    fn decode_dxt45_uses_16byte_blocks() {
        let mem = FakeMem::from_vec(vec![0xAAu8; 256]);
        let key = TextureKey {
            base_address: 0,
            width: 8,
            height: 4, // 2×1 blocks
            depth_or_slices: 1,
            format: TextureFormat::Dxt4_5,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        let out = decode_dxt45_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 2 * 16);
    }

    /// k_5_6_5: a single white texel (all bits set, 0xFFFF) should
    /// expand to RGBA8 white (0xFF, 0xFF, 0xFF, 0xFF). A single pure-red
    /// texel (R=31, G=0, B=0 → word 0xF800) should expand to R=255 G=0
    /// B=0 via the high-bit-replicate convention.
    #[test]
    fn decode_k565_texel_expansion() {
        // Memory layout for a 2×1 non-tiled k_5_6_5 image (pitch=32 texels
        // → 32 × 1 × 2 = 64 bytes). We store texel[0] = 0xFFFF (white),
        // texel[1] = 0xF800 (pure red).
        let mut bytes = vec![0u8; 64];
        // 0xFFFF
        bytes[0] = 0xFF;
        bytes[1] = 0xFF;
        // 0xF800: with Endian::None no swap is applied and the decoder
        // reads each word little-endian, so memory carries low byte first.
        bytes[2] = 0x00;
        bytes[3] = 0xF8;
        let mem = FakeMem::from_vec(bytes);
        let key = TextureKey {
            base_address: 0,
            width: 2,
            height: 1,
            depth_or_slices: 1,
            format: TextureFormat::K565,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        let out = decode_k565_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 2 * 4);
        // Texel 0: white.
        assert_eq!(&out[0..4], &[0xFF, 0xFF, 0xFF, 0xFF]);
        // Texel 1: pure red via 5-bit-expand (0b11111 → 0xFF).
        assert_eq!(&out[4..8], &[0xFF, 0x00, 0x00, 0xFF]);
    }

    #[test]
    fn is_host_supported_covers_m5_formats() {
        assert!(TextureFormat::K8888.is_host_supported());
        assert!(TextureFormat::K565.is_host_supported());
        assert!(TextureFormat::Dxt1.is_host_supported());
        assert!(TextureFormat::Dxt2_3.is_host_supported());
        assert!(TextureFormat::Dxt4_5.is_host_supported());
        // Unsupported formats should still report false.
        assert!(!TextureFormat::K16.is_host_supported());
        assert!(!TextureFormat::K32Float.is_host_supported());
    }

    #[test]
    fn texture_cache_caches_and_reuses() {
        let mut cache = TextureCache::new();
        let mem = FakeMem::from_vec(vec![0u8; 8 * 1024]);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::K8888,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        cache.ensure_cached(key, 0, &mem).unwrap();
        assert_eq!(cache.decodes_total, 1);
        // Same version: should hit cache.
        cache.ensure_cached(key, 0, &mem).unwrap();
        assert_eq!(cache.decodes_total, 1);
        // Higher version: stale → re-decode.
        cache.ensure_cached(key, 1, &mem).unwrap();
        assert_eq!(cache.decodes_total, 2);
        assert_eq!(cache.restale_total, 1);
    }

    /// End-to-end P5 test: a 6-dword fetch constant → decoded `TextureKey`
    /// → `ensure_cached` on fresh/version-bumped memory → stale re-decode.
    /// Mirrors what `vd_swap` does per frame.
    #[test]
    fn e2e_fetch_const_to_cache_with_versioning() {
        // 4×4 k_8_8_8_8 2D tiled texture, pitch=32 aligned. Note that
        // `0x100 >> 12 << 12` evaluates to 0, so no base bits are set in d1.
        let d0 = 0x8000_0000u32 | (1u32 << 22) | 2; // pitch_5=1, tiled, type=2
        let d1 = (0x100u32 >> 12 << 12) | (0u32 << 6) | 6; // K8888, endian=none
        let d2 = 3u32 | (3u32 << 13); // width-1=3, height-1=3
        let d5 = 1u32 << 9; // 2D
        let key = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("decoded");
        assert_eq!(key.format, TextureFormat::K8888);
        assert_eq!(key.width, 4);

        // No `mut` needed: FakeMem mutates through `Cell`.
        let mem = FakeMem::from_vec(vec![0xAAu8; 4 * 1024]);
        let mut cache = TextureCache::new();
        // v0 decode.
        let first = cache
            .ensure_cached(key, 0, &mem)
            .expect("initial decode")
            .clone();
        // Same version → cache hit.
        cache.ensure_cached(key, 0, &mem).expect("hit");
        assert_eq!(cache.decodes_total, 1);
        // Simulate the guest writing to the texture's pages: version bumps.
        for b in &mem.0[..16] {
            b.set(0xFF);
        }
        cache.ensure_cached(key, 1, &mem).expect("re-decode");
        assert_eq!(cache.decodes_total, 2);
        assert_eq!(cache.restale_total, 1);
        // Bytes differ from v0 (proof the re-decode happened).
        let second = cache.get(&key).unwrap();
        assert_ne!(first.bytes, second.bytes);
    }

    #[test]
    fn texture_cache_rejects_unsupported_format() {
        let mut cache = TextureCache::new();
        let mem = FakeMem::from_vec(vec![0u8; 1024]);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::K16,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        assert!(matches!(
            cache.ensure_cached(key, 0, &mem),
            Err(DecodeError::UnsupportedFormat)
        ));
    }
}
|
||||
178
crates/xenia-gpu/src/tiled_address.rs
Normal file
178
crates/xenia-gpu/src/tiled_address.rs
Normal file
@@ -0,0 +1,178 @@
|
||||
//! Xenos tiled-texture address formula (2D, Tiled2D layout).
|
||||
//!
|
||||
//! Port of `xenia-canary/src/xenia/gpu/texture_address.h:190-208` Tiled2D /
|
||||
//! `TiledCombine` helpers. The Xbox 360 GPU stores textures in a 32×32-block
|
||||
//! macro-tile pattern with bank+pipe interleave for its internal DRAM
|
||||
//! banks; this formula inverts that so we can read pixels out in linear
|
||||
//! order, given the tiled source buffer.
|
||||
//!
|
||||
//! We use this in two places during P4:
|
||||
//! - `vd_swap` frontbuffer detile (1280×720 k_8_8_8_8 usually).
|
||||
//! - Any place we need to read tiled guest memory into a host-linear
|
||||
//! buffer for CPU-side conversion before upload.
|
||||
|
||||
/// log2 of the macro-tile width (value from canary): `1 << 5` = 32 px.
pub const MACRO_TILE_WIDTH_LOG2: u32 = 5;
/// log2 of the 2D macro-tile height (value from canary): `1 << 5` = 32 px.
pub const MACRO_TILE_HEIGHT_2D_LOG2: u32 = 5;
|
||||
|
||||
/// Canary's `TiledCombine` helper — builds the final address by spreading
/// the bits of `outer_inner_bytes` apart and dropping the y-LSB, pipe and
/// bank bits into the gaps:
///
/// * `outer_inner_bytes` bits 0..=3 stay at bits 0..=3
/// * `y_lsb` lands at bit 4
/// * `outer_inner_bytes` bit 4 moves to bit 5
/// * `pipe` lands at bit 6 upward
/// * `outer_inner_bytes` bits 5..=7 move to bits 8..=10
/// * `bank` lands at bit 11
/// * `outer_inner_bytes` bits 8 and up move to bit 12 and up
#[inline]
fn tiled_combine(outer_inner_bytes: u32, bank: u32, pipe: u32, y_lsb: u32) -> u32 {
    let low_nibble = outer_inner_bytes & 0b1111;
    let bit_4 = ((outer_inner_bytes >> 4) & 0b1) << 5;
    let bits_5_to_7 = ((outer_inner_bytes >> 5) & 0b111) << 8;
    let upper = (outer_inner_bytes >> 8) << 12;
    let interleave = (y_lsb << 4) | (pipe << 6) | (bank << 11);
    interleave | low_nibble | bit_4 | bits_5_to_7 | upper
}
|
||||
|
||||
/// 2D tiled offset in bytes from (x, y) into a tiled surface with
|
||||
/// `pitch_aligned` pixel pitch (rounded to the macro-tile width) and
|
||||
/// `bytes_per_block_log2` bytes-per-element (e.g. 2 for RGBA8888 → 4-byte
|
||||
/// blocks → log2 = 2). Always non-negative for in-bounds inputs; returns
|
||||
/// `u32` rather than canary's signed `int` since our callers stay in
|
||||
/// unsigned arithmetic.
|
||||
///
|
||||
/// This is the canonical formula — do not simplify without re-reading
|
||||
/// `texture_address.h:190-208`; the bit-interleave cannot be expressed
|
||||
/// as a linear function.
|
||||
pub fn tiled_2d_offset(x: u32, y: u32, pitch_aligned: u32, bytes_per_block_log2: u32) -> u32 {
|
||||
let macro_tile_cols = pitch_aligned >> MACRO_TILE_WIDTH_LOG2;
|
||||
// Outer: which 32×32 macro tile we're in.
|
||||
let outer_blocks = (((y >> MACRO_TILE_HEIGHT_2D_LOG2) * macro_tile_cols)
|
||||
+ (x >> MACRO_TILE_WIDTH_LOG2))
|
||||
<< 6;
|
||||
// Inner: where we are within the 32×32 macro tile (Y dropped by 1 bit
|
||||
// because that bit becomes the `y_lsb` interleave bit below).
|
||||
let inner_blocks = (((y >> 1) & 0b111) << 3) | (x & 0b111);
|
||||
let outer_inner_bytes = (outer_blocks | inner_blocks) << bytes_per_block_log2;
|
||||
|
||||
let bank = (y >> 4) & 0b1;
|
||||
let pipe = ((x >> 3) & 0b11) ^ (((y >> 3) & 0b1) << 1);
|
||||
let y_lsb = y & 1;
|
||||
|
||||
tiled_combine(outer_inner_bytes, bank, pipe, y_lsb)
|
||||
}
|
||||
|
||||
/// Round `pitch_pixels` up to the nearest multiple of the macro-tile width
|
||||
/// (32 px). Xenos requires tile-aligned pitches; non-aligned values pad.
|
||||
#[inline]
|
||||
pub fn align_pitch_to_macro_tile(pitch_pixels: u32) -> u32 {
|
||||
let mask = (1u32 << MACRO_TILE_WIDTH_LOG2) - 1;
|
||||
(pitch_pixels + mask) & !mask
|
||||
}
|
||||
|
||||
/// Detile a 2D tiled surface into a linear destination buffer. The
|
||||
/// closure `block_bytes(src_offset) -> &[u8]` returns a slice pointing at
|
||||
/// one block in the tiled source, and the detiler writes it into `dst`
|
||||
/// at the linear (x, y) position.
|
||||
///
|
||||
/// `bpp` is the bytes-per-block (4 for RGBA8888, 1 for a1r5g5b5 stored as
|
||||
/// a single 16-bit block, etc.). `dst` must be at least
|
||||
/// `width * height * bpp` bytes long.
|
||||
///
|
||||
/// Returns `Err(())` if the source doesn't contain enough bytes for the
|
||||
/// largest offset the formula would produce (defensive — callers can
|
||||
/// downgrade silently).
|
||||
pub fn detile_2d(
|
||||
src: &[u8],
|
||||
dst: &mut [u8],
|
||||
width: u32,
|
||||
height: u32,
|
||||
pitch_pixels: u32,
|
||||
bpp: u32,
|
||||
) -> Result<(), ()> {
|
||||
let bpp_log2 = bpp.trailing_zeros();
|
||||
let pitch_aligned = align_pitch_to_macro_tile(pitch_pixels);
|
||||
let dst_pitch_bytes = (width * bpp) as usize;
|
||||
let bpp_u = bpp as usize;
|
||||
|
||||
for y in 0..height {
|
||||
for x in 0..width {
|
||||
let src_off = tiled_2d_offset(x, y, pitch_aligned, bpp_log2) as usize;
|
||||
if src_off + bpp_u > src.len() {
|
||||
return Err(());
|
||||
}
|
||||
let dst_off = (y as usize) * dst_pitch_bytes + (x as usize) * bpp_u;
|
||||
if dst_off + bpp_u > dst.len() {
|
||||
return Err(());
|
||||
}
|
||||
dst[dst_off..dst_off + bpp_u].copy_from_slice(&src[src_off..src_off + bpp_u]);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Block (0, 0) sits at byte offset 0 (checked here for pitch 64).
    #[test]
    fn origin_is_zero() {
        let offset = tiled_2d_offset(0, 0, 64, 2);
        assert_eq!(offset, 0);
    }

    /// Round-trip: fill a tiled buffer through `tiled_2d_offset`, detile it,
    /// and expect every pixel at its row-major position.
    #[test]
    fn roundtrip_small_pattern() {
        const W: u32 = 32;
        const H: u32 = 16;
        const BPP: u32 = 4;
        let pitch = align_pitch_to_macro_tile(W);

        // Row-major list of every coordinate, reused by all three phases.
        let coords: Vec<(u32, u32)> = (0..H)
            .flat_map(|y| (0..W).map(move |x| (x, y)))
            .collect();
        // Recognisable per-pixel payload: (x, y, x^y, 0xFF).
        let payload = |x: u32, y: u32| [x as u8, y as u8, (x ^ y) as u8, 0xFF];

        // Size the tiled buffer to cover the furthest block the formula hits.
        let tiled_len = coords
            .iter()
            .map(|&(x, y)| tiled_2d_offset(x, y, pitch, 2) as usize + 4)
            .max()
            .unwrap();
        let mut tiled = vec![0u8; tiled_len];
        for &(x, y) in &coords {
            let off = tiled_2d_offset(x, y, pitch, 2) as usize;
            tiled[off..off + 4].copy_from_slice(&payload(x, y));
        }

        let mut linear = vec![0u8; (W * H * BPP) as usize];
        detile_2d(&tiled, &mut linear, W, H, pitch, BPP).expect("detile ok");

        // Each 4-byte chunk of the linear image must match its coordinate.
        for (texel, &(x, y)) in linear.chunks_exact(BPP as usize).zip(&coords) {
            let expected = payload(x, y);
            assert_eq!(texel, &expected[..]);
        }
    }

    /// Every (x, y) in a 32×16 region maps to its own offset (no aliasing).
    #[test]
    fn neighbouring_pixels_have_distinct_offsets() {
        let offsets: std::collections::HashSet<u32> = (0..16u32)
            .flat_map(|y| (0..32u32).map(move |x| tiled_2d_offset(x, y, 32, 2)))
            .collect();
        assert_eq!(offsets.len(), 16 * 32);
    }

    /// Pitch alignment rounds up to a multiple of 32: 1280 stays 1280,
    /// 1281 becomes 1312, 31 becomes 32.
    #[test]
    fn align_pitch_rounds_up_to_32() {
        let cases = [(1280u32, 1280u32), (1281, 1312), (31, 32)];
        for &(input, expected) in &cases {
            assert_eq!(align_pitch_to_macro_tile(input), expected);
        }
    }
}
|
||||
744
crates/xenia-gpu/src/translator.rs
Normal file
744
crates/xenia-gpu/src/translator.rs
Normal file
@@ -0,0 +1,744 @@
|
||||
//! Xenos → WGSL direct translator (P7).
|
||||
//!
|
||||
//! Replaces the runtime uber-shader interpreter (P3b/P3c) for shaders whose
|
||||
//! feature set we cover. Emits a *standalone* WGSL module per shader
|
||||
//! instead of walking a ucode buffer at draw time — pipeline compilation
|
||||
//! happens once, then every subsequent dispatch is a direct `draw()`.
|
||||
//!
|
||||
//! The translator is deliberately narrow: when it encounters an opcode /
//! fetch format / CF shape it doesn't know, it returns
//! [`Translation::Reject`] and the caller falls back to the interpreter.
//! This keeps the op-coverage work incremental — future commits can add
//! one opcode at a time without invalidating the scaffolding.
|
||||
//!
|
||||
//! Current coverage (v1):
|
||||
//! * Linear CF: `Exec`/`ExecEnd`, `Alloc`, `Exit`. No loops / branches /
|
||||
//! calls / predicate-gated clauses.
|
||||
//! * ALU vector: `ADD`, `MUL`, `MAX`, `MIN`, `MAD`, `DP4`, `DP3`,
|
||||
//! `DP2_ADD`, `SEQ`, `SGT`, `SGE`, `SNE`, `FRC`, `FLOOR`.
|
||||
//! * ALU scalar: `ADDS`, `MULS`, `MAXS`, `MINS`, `RCP`, `RETAIN_PREV`.
|
||||
//! * Vertex fetch: `R32G32B32A32_FLOAT` only.
|
||||
//! * Texture fetch: 2D via the single `@group(1)` slot (same one P5/M6
|
||||
//! binds).
|
||||
//! * Exports: VS writes position + interpolator 0 (color); PS writes
|
||||
//! color0.
|
||||
//!
|
||||
//! When a shader exceeds this subset, [`translate`] returns
//! [`Translation::Reject`] with a reason string, and
//! `gpu.shader.translate_reject{reason}` is bumped by the caller.
|
||||
|
||||
use crate::ucode::alu::{decode_alu, sop, vop, AluInstruction};
|
||||
use crate::ucode::control_flow::{AllocKind, ControlFlowInstruction};
|
||||
use crate::ucode::fetch::{decode_fetch, FetchInstruction};
|
||||
use crate::ucode::ParsedShader;
|
||||
|
||||
/// Which programmable stage a translation is being emitted for.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Stage {
    /// Vertex stage.
    Vertex,
    /// Pixel (fragment) stage.
    Pixel,
}
|
||||
|
||||
/// Outcome of translating one shader stage. A refusal tells the caller to
/// fall back to the runtime uber-shader interpreter.
#[derive(Debug)]
pub enum Translation {
    /// The emitted WGSL body for *this stage only*. A VS body and a PS body
    /// are joined into one module via [`combine_stages`].
    Ok(String),
    /// The translator met an op or pattern it doesn't handle; the payload
    /// is the short reason string.
    Reject(&'static str),
}
|
||||
|
||||
/// Full WGSL module for a (VS, PS) pair ready to hand to
|
||||
/// `wgpu::Device::create_shader_module`. Shares the header across the two
|
||||
/// bodies so bindings, struct declarations, and helpers aren't duplicated.
|
||||
pub fn combine_stages(vs_body: &str, ps_body: &str) -> String {
|
||||
let mut out = String::with_capacity(4096 + vs_body.len() + ps_body.len());
|
||||
out.push_str(MODULE_HEADER);
|
||||
out.push_str(vs_body);
|
||||
out.push_str(ps_body);
|
||||
out
|
||||
}
|
||||
|
||||
/// Translate a single shader stage. Returns `None` on any unsupported
|
||||
/// feature with a short reason string that the caller plumbs into the
|
||||
/// `gpu.shader.translate_reject{reason}` metric.
|
||||
pub fn translate(parsed: &ParsedShader, stage: Stage) -> Translation {
|
||||
let mut ctx = EmitCtx::new(stage);
|
||||
// Emit the stage entry function body.
|
||||
if let Err(reason) = ctx.emit_stage_body(parsed) {
|
||||
return Translation::Reject(reason);
|
||||
}
|
||||
Translation::Ok(ctx.finish())
|
||||
}
|
||||
|
||||
/// Reject reasons, kept as static strings so reporting one never allocates.
pub mod reject {
    /// Unsupported vector ALU opcode.
    pub const VEC_OP_UNSUPPORTED: &str = "vec_op_unsupported";
    /// Unsupported scalar ALU opcode.
    pub const SCL_OP_UNSUPPORTED: &str = "scl_op_unsupported";
    /// Control flow contains a loop.
    pub const CF_LOOP: &str = "cf_loop";
    /// Control flow is conditional / predicated.
    pub const CF_COND: &str = "cf_cond";
    /// Control flow contains a call or return.
    pub const CF_CALL: &str = "cf_call";
    /// Control-flow instruction could not be identified.
    pub const CF_UNKNOWN: &str = "cf_unknown";
    /// Vertex fetch uses an unsupported format.
    pub const VFETCH_FMT: &str = "vfetch_fmt";
    /// Texture fetch is not 2D.
    pub const TFETCH_NON2D: &str = "tfetch_non2d";
    /// Instruction index falls outside the instruction stream.
    pub const INSTR_OOB: &str = "instr_oob";
}
|
||||
|
||||
/// Shader-module preamble (bindings, helpers, struct defs). The bindings
/// mirror the xenos pipeline's `@group(0)` + `@group(1)` layout from P5/M6
/// so we can use **the same bind-group slots** — only the pipeline object
/// differs between interpreter mode and translator mode.
///
/// Contents, in order: the `XenosDrawConstants` and `XenosConstants`
/// structs, five `@group(0)` bindings, the `@group(1)` texture + sampler,
/// the `VsOut` / `FsOut` stage I/O structs, and the `xe_rcp` / `gpu_swap`
/// helper functions.
// NOTE(review): the whitespace inside this literal was reconstructed from a
// copy that had lost its indentation; WGSL is not whitespace-sensitive, but
// confirm against the canonical file if byte-exact text matters.
const MODULE_HEADER: &str = r#"
struct XenosDrawConstants {
    draw_index: u32,
    vertex_count: u32,
    prim_kind: u32,
    _pad: u32,
};

struct XenosConstants {
    alu: array<vec4<f32>, 512>,
    fetch: array<u32, 256>,
    bool_consts: array<u32, 8>,
    loop_consts: array<u32, 32>,
};

@group(0) @binding(0) var<uniform> draw_ctx : XenosDrawConstants;
@group(0) @binding(1) var<storage, read> xenos_consts : XenosConstants;
@group(0) @binding(2) var<storage, read> vs_ucode : array<u32>;
@group(0) @binding(3) var<storage, read> ps_ucode : array<u32>;
@group(0) @binding(4) var<storage, read> vertex_buffer : array<u32>;

@group(1) @binding(0) var xenos_tex : texture_2d<f32>;
@group(1) @binding(1) var xenos_samp : sampler;

struct VsOut {
    @builtin(position) position: vec4<f32>,
    @location(0) color: vec4<f32>,
};

struct FsOut {
    @location(0) color0: vec4<f32>,
};

// Helper: reciprocal guarded against divide-by-zero.
fn xe_rcp(x: f32) -> f32 {
    return select(0.0, 1.0 / x, x != 0.0);
}

// GPUBUG-102: per-format byte-swap matching canary's `GpuSwapInline`
// (xenos.h:1090-1109). Xbox 360 vertex data is big-endian; the host is
// little-endian. The fetch constant's `endian` field (low 2 bits of
// dword_1) selects:
//   0 (kNone)   — no swap
//   1 (k8in16)  — swap bytes within halfwords
//   2 (k8in32)  — full byte reverse
//   3 (k16in32) — swap halfwords
fn gpu_swap(value: u32, endian: u32) -> u32 {
    switch endian {
        case 1u: { return ((value << 8u) & 0xFF00FF00u) | ((value >> 8u) & 0x00FF00FFu); }
        case 2u: {
            return ((value & 0x000000FFu) << 24u)
                | ((value & 0x0000FF00u) << 8u)
                | ((value & 0x00FF0000u) >> 8u)
                | ((value & 0xFF000000u) >> 24u);
        }
        case 3u: { return ((value >> 16u) & 0xFFFFu) | (value << 16u); }
        default: { return value; }
    }
}
"#;
|
||||
|
||||
struct EmitCtx {
|
||||
stage: Stage,
|
||||
out: String,
|
||||
indent: usize,
|
||||
}
|
||||
|
||||
impl EmitCtx {
|
||||
fn new(stage: Stage) -> Self {
|
||||
Self {
|
||||
stage,
|
||||
out: String::with_capacity(2048),
|
||||
indent: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn finish(self) -> String {
|
||||
self.out
|
||||
}
|
||||
|
||||
fn push(&mut self, s: &str) {
|
||||
for _ in 0..self.indent {
|
||||
self.out.push_str(" ");
|
||||
}
|
||||
self.out.push_str(s);
|
||||
self.out.push('\n');
|
||||
}
|
||||
|
||||
fn emit_stage_body(&mut self, parsed: &ParsedShader) -> Result<(), &'static str> {
|
||||
// Entry function + struct header.
|
||||
match self.stage {
|
||||
Stage::Vertex => {
|
||||
self.push("@vertex");
|
||||
self.push("fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {");
|
||||
}
|
||||
Stage::Pixel => {
|
||||
self.push("@fragment");
|
||||
self.push("fn fs_main(in: VsOut) -> FsOut {");
|
||||
}
|
||||
}
|
||||
self.indent = 1;
|
||||
// Register file + ps chain + export slots. All local `var`s so each
|
||||
// invocation gets its own state; translator-emitted code doesn't
|
||||
// need `var<private>` because we don't share across function calls.
|
||||
self.push("var r: array<vec4<f32>, 128>;");
|
||||
self.push("for (var i = 0u; i < 128u; i = i + 1u) { r[i] = vec4<f32>(0.0); }");
|
||||
self.push("var ps: f32 = 0.0;");
|
||||
match self.stage {
|
||||
Stage::Vertex => {
|
||||
// Seed r0 with vertex index for simple shaders that read it.
|
||||
self.push("r[0] = vec4<f32>(f32(vidx), 0.0, 0.0, 1.0);");
|
||||
// Synthetic export slots — match the interpreter's layout so
|
||||
// the fallback path and translator path produce the same
|
||||
// visual output on shaders both support.
|
||||
self.push("var opos: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);");
|
||||
self.push("var ocolor: vec4<f32> = vec4<f32>(1.0, 1.0, 1.0, 1.0);");
|
||||
}
|
||||
Stage::Pixel => {
|
||||
// Seed r0.xy with interpolated color lane so trivial shaders
|
||||
// that read r0 still produce something.
|
||||
self.push("r[0] = in.color;");
|
||||
self.push("var ocolor0: vec4<f32> = in.color;");
|
||||
}
|
||||
}
|
||||
|
||||
let mut current_alloc = AllocKind::Other;
|
||||
for clause in &parsed.cf {
|
||||
match clause {
|
||||
ControlFlowInstruction::Exec {
|
||||
address,
|
||||
count,
|
||||
sequence,
|
||||
is_end,
|
||||
predicated,
|
||||
..
|
||||
} => {
|
||||
if *predicated {
|
||||
return Err(reject::CF_COND);
|
||||
}
|
||||
self.emit_exec(parsed, *address, *count, *sequence, current_alloc)?;
|
||||
if *is_end {
|
||||
break;
|
||||
}
|
||||
}
|
||||
ControlFlowInstruction::Alloc { kind, .. } => {
|
||||
current_alloc = *kind;
|
||||
}
|
||||
ControlFlowInstruction::Exit => break,
|
||||
ControlFlowInstruction::LoopStart { .. }
|
||||
| ControlFlowInstruction::LoopEnd { .. } => return Err(reject::CF_LOOP),
|
||||
ControlFlowInstruction::CondJmp { .. } => return Err(reject::CF_COND),
|
||||
ControlFlowInstruction::CondCall { .. } | ControlFlowInstruction::Return => {
|
||||
return Err(reject::CF_CALL);
|
||||
}
|
||||
ControlFlowInstruction::Unknown { .. } => return Err(reject::CF_UNKNOWN),
|
||||
}
|
||||
}
|
||||
|
||||
match self.stage {
|
||||
Stage::Vertex => {
|
||||
self.push("var out: VsOut;");
|
||||
self.push("out.position = opos;");
|
||||
self.push("out.color = ocolor;");
|
||||
self.push("return out;");
|
||||
}
|
||||
Stage::Pixel => {
|
||||
self.push("var out: FsOut;");
|
||||
self.push("out.color0 = ocolor0;");
|
||||
self.push("return out;");
|
||||
}
|
||||
}
|
||||
self.indent = 0;
|
||||
self.push("}");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn emit_exec(
|
||||
&mut self,
|
||||
parsed: &ParsedShader,
|
||||
address: u32,
|
||||
count: u32,
|
||||
sequence: u32,
|
||||
current_alloc: AllocKind,
|
||||
) -> Result<(), &'static str> {
|
||||
for i in 0..(count as usize) {
|
||||
let triple_idx = address as usize + i;
|
||||
let base = triple_idx * 3;
|
||||
if base + 2 >= parsed.instructions.len() {
|
||||
return Err(reject::INSTR_OOB);
|
||||
}
|
||||
let words = [
|
||||
parsed.instructions[base],
|
||||
parsed.instructions[base + 1],
|
||||
parsed.instructions[base + 2],
|
||||
];
|
||||
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
|
||||
if is_fetch {
|
||||
match decode_fetch(words) {
|
||||
FetchInstruction::Vertex(vf) => self.emit_vfetch(&vf)?,
|
||||
FetchInstruction::Texture(tf) => {
|
||||
if tf.dimension != 1 {
|
||||
return Err(reject::TFETCH_NON2D);
|
||||
}
|
||||
self.emit_tfetch(&tf);
|
||||
}
|
||||
FetchInstruction::Unknown { .. } => return Err(reject::VFETCH_FMT),
|
||||
}
|
||||
} else {
|
||||
let alu = decode_alu(words);
|
||||
self.emit_alu(&alu, current_alloc)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn emit_alu(
|
||||
&mut self,
|
||||
alu: &AluInstruction,
|
||||
current_alloc: AllocKind,
|
||||
) -> Result<(), &'static str> {
|
||||
// GPUBUG-100/101: per-operand temp-vs-constant selector (w0
|
||||
// bits 29-31), 8-bit component-relative swizzle (w1 bytes 0-2),
|
||||
// and 1-bit negate (w1 bits 24-26). Pre-fix all three were
|
||||
// discarded, so every ALU read came back as r[low7] without
|
||||
// any swizzle / negation, dropping every shader's uniforms +
|
||||
// negative operands.
|
||||
let a = src_operand(alu.src_a, alu.src_a_is_temp, alu.src_a_swiz, alu.src_a_negate);
|
||||
let b = src_operand(alu.src_b, alu.src_b_is_temp, alu.src_b_swiz, alu.src_b_negate);
|
||||
let c = src_operand(alu.src_c, alu.src_c_is_temp, alu.src_c_swiz, alu.src_c_negate);
|
||||
|
||||
// Vector pipe.
|
||||
if alu.vector_write_mask != 0 {
|
||||
let expr = vector_expr(alu.vector_opcode, &a, &b, &c)
|
||||
.ok_or(reject::VEC_OP_UNSUPPORTED)?;
|
||||
let dst_reg = alu.vector_dest & 0x7F;
|
||||
if alu.vector_dest_is_export {
|
||||
self.emit_export(dst_reg, current_alloc, &expr, alu.vector_write_mask);
|
||||
} else {
|
||||
self.emit_masked_write(&format!("r[{dst_reg}u]"), &expr, alu.vector_write_mask);
|
||||
}
|
||||
}
|
||||
|
||||
// Scalar pipe. Binary ops use (src_a.x, src_b.x); ps-variants use
|
||||
// src_a.x + running ps. `scl_src_a` mirrors the interpreter's
|
||||
// `scalar_src_is_ps` selector.
|
||||
let scl_src_a = if alu.scalar_src_is_ps {
|
||||
"ps".to_string()
|
||||
} else {
|
||||
format!("{}.x", a)
|
||||
};
|
||||
let scl_src_b = format!("{}.x", b);
|
||||
let expr = scalar_expr(alu.scalar_opcode, &scl_src_a, &scl_src_b, "ps")
|
||||
.ok_or(reject::SCL_OP_UNSUPPORTED)?;
|
||||
self.push(&format!("ps = {expr};"));
|
||||
if alu.scalar_write_mask != 0 {
|
||||
let v = "vec4<f32>(ps, ps, ps, ps)";
|
||||
let dst_reg = alu.scalar_dest & 0x7F;
|
||||
self.emit_masked_write(&format!("r[{dst_reg}u]"), v, alu.scalar_write_mask);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn emit_masked_write(&mut self, lhs: &str, rhs: &str, mask: u8) {
|
||||
if mask == 0xF {
|
||||
self.push(&format!("{lhs} = {rhs};"));
|
||||
return;
|
||||
}
|
||||
self.push(&"{".to_string());
|
||||
self.indent += 1;
|
||||
self.push(&format!("let _prev = {lhs};"));
|
||||
self.push(&format!("let _new = {rhs};"));
|
||||
let mut components = Vec::new();
|
||||
let letters = ['x', 'y', 'z', 'w'];
|
||||
for (i, c) in letters.iter().enumerate() {
|
||||
if (mask >> i) & 1 == 1 {
|
||||
components.push(format!("_new.{c}"));
|
||||
} else {
|
||||
components.push(format!("_prev.{c}"));
|
||||
}
|
||||
}
|
||||
self.push(&format!(
|
||||
"{lhs} = vec4<f32>({}, {}, {}, {});",
|
||||
components[0], components[1], components[2], components[3]
|
||||
));
|
||||
self.indent -= 1;
|
||||
self.push("}");
|
||||
}
|
||||
|
||||
fn emit_export(&mut self, dst_reg: u8, alloc: AllocKind, expr: &str, mask: u8) {
|
||||
// Xenos's export "register" indexing within an alloc range is
|
||||
// normally (alloc_base + offset). Since our CF stream doesn't
|
||||
// carry per-export slot offsets cleanly, use `alloc` to pick the
|
||||
// target.
|
||||
let lhs = match (self.stage, alloc) {
|
||||
(Stage::Vertex, AllocKind::Position) => "opos",
|
||||
(Stage::Vertex, AllocKind::Interpolators) => "ocolor",
|
||||
(Stage::Vertex, AllocKind::Colors) => "ocolor",
|
||||
(Stage::Vertex, _) => "ocolor", // fall through — any other alloc
|
||||
(Stage::Pixel, AllocKind::Colors) => "ocolor0",
|
||||
(Stage::Pixel, _) => "ocolor0",
|
||||
};
|
||||
let _ = dst_reg; // per-slot export indexing reserved for a richer v2
|
||||
self.emit_masked_write(lhs, expr, mask);
|
||||
}
|
||||
|
||||
/// Emits a guarded vertex fetch into `r[dest]` as a single WGSL line.
///
/// v1 treats every vertex fetch as four consecutive 32-bit floats with a
/// stride of four dwords, indexed by `u32(r[src].x)`. The fetch-constant
/// slot is taken from bits 5..9 of `vf.raw[0]`; two consecutive entries of
/// `xenos_consts.fetch` supply the base address (dword 0, low two bits
/// masked off, then shifted right by 2) and the endian mode (low two bits of
/// dword 1). The write is skipped when `addr + 3` is not below
/// `arrayLength(&vertex_buffer)`.
///
/// GPUBUG-102: each dword goes through `gpu_swap(.., endian)` before the
/// bitcast; per the original note, guest vertex data is big-endian while the
/// host is little-endian, so un-swapped reads produced garbage positions.
///
/// Always returns `Ok(())` at present.
fn emit_vfetch(&mut self, vf: &crate::ucode::fetch::VertexFetch) -> Result<(), &'static str> {
    let slot = (vf.raw[0] >> 5) & 0x1F;
    let (fc0_idx, fc1_idx) = (slot * 2, slot * 2 + 1);
    let src_reg = vf.src_register & 0x7F;
    let dst_reg = vf.dest_register & 0x7F;

    // One swapped + bitcast load per output lane.
    let lanes = (0..4)
        .map(|k| format!("bitcast<f32>(gpu_swap(vertex_buffer[addr + {k}u], endian))"))
        .collect::<Vec<_>>()
        .join(", ");

    self.push(&format!(
        "{{ let fc0 = xenos_consts.fetch[{fc0_idx}u]; \
         let fc1 = xenos_consts.fetch[{fc1_idx}u]; \
         let endian = fc1 & 0x3u; \
         let base = (fc0 & 0xFFFFFFFCu) >> 2u; \
         let vidx = u32(r[{src_reg}u].x); \
         let addr = base + vidx * 4u; \
         let n = arrayLength(&vertex_buffer); \
         if (addr + 3u < n) {{ \
         r[{dst_reg}u] = vec4<f32>( {lanes}); \
         }} }}"
    ));
    Ok(())
}
|
||||
|
||||
/// Emits a texture sample into `r[dest]`, using `r[src].xy` as the UV and
/// an explicit LOD of 0.
///
/// v1 always samples the single bound `xenos_tex` / `xenos_samp` pair. Per
/// the original note, only the `fetch_const=0` texture is published there,
/// so a slot mismatch currently shows up as silent magenta.
fn emit_tfetch(&mut self, tf: &crate::ucode::fetch::TextureFetch) {
    let coords = tf.src_register & 0x7F;
    let target = tf.dest_register & 0x7F;
    let statement = format!(
        "r[{target}u] = textureSampleLevel(xenos_tex, xenos_samp, r[{coords}u].xy, 0.0);"
    );
    self.push(&statement);
}
|
||||
}
|
||||
|
||||
/// Renders the WGSL expression that reads one ALU source operand, with
/// swizzle and negate applied (abs is not handled — GPUBUG-100 deferred).
///
/// * `is_temp == true` reads `r[src_byte & 0x3F]`; the top two bits of the
///   byte never widen the register index.
/// * `is_temp == false` reads `xenos_consts.alu[src_byte]` using all 8 bits.
///
/// `swizzle` is component-relative: output lane `i` reads source lane
/// `((swizzle >> (2*i)) + i) & 3`, so `0x00` is the identity and is emitted
/// as the bare base expression. `negate` wraps the result in `(-…)` after
/// the swizzle. GPUBUG-100 / GPUBUG-101.
fn src_operand(src_byte: u8, is_temp: bool, swizzle: u8, negate: bool) -> String {
    const LANES: [char; 4] = ['x', 'y', 'z', 'w'];

    let register = match is_temp {
        true => format!("r[{}u]", src_byte & 0x3F),
        false => format!("xenos_consts.alu[{}u]", src_byte),
    };

    let value = if swizzle == 0 {
        register
    } else {
        let picks: Vec<String> = (0..4u32)
            .map(|lane| {
                let selected = ((u32::from(swizzle) >> (2 * lane)) + lane) & 3;
                format!("{register}.{}", LANES[selected as usize])
            })
            .collect();
        format!("vec4<f32>({})", picks.join(", "))
    };

    match negate {
        true => format!("(-{value})"),
        false => value,
    }
}
|
||||
|
||||
fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option<String> {
|
||||
let s = match op {
|
||||
vop::ADD => format!("({a} + {b})"),
|
||||
vop::MUL => format!("({a} * {b})"),
|
||||
vop::MAX => format!("max({a}, {b})"),
|
||||
vop::MIN => format!("min({a}, {b})"),
|
||||
vop::MAD => format!("({a} * {b} + {c})"),
|
||||
vop::DOT4 => format!("vec4<f32>(dot({a}, {b}))"),
|
||||
vop::DOT3 => format!("vec4<f32>(dot({a}.xyz, {b}.xyz))"),
|
||||
vop::DOT2_ADD => format!(
|
||||
"vec4<f32>({a}.x * {b}.x + {a}.y * {b}.y + {c}.x)"
|
||||
),
|
||||
vop::SEQ => format!(
|
||||
"vec4<f32>(select(0.0,1.0,{a}.x=={b}.x), select(0.0,1.0,{a}.y=={b}.y), select(0.0,1.0,{a}.z=={b}.z), select(0.0,1.0,{a}.w=={b}.w))"
|
||||
),
|
||||
vop::SGT => format!(
|
||||
"vec4<f32>(select(0.0,1.0,{a}.x>{b}.x), select(0.0,1.0,{a}.y>{b}.y), select(0.0,1.0,{a}.z>{b}.z), select(0.0,1.0,{a}.w>{b}.w))"
|
||||
),
|
||||
vop::SGE => format!(
|
||||
"vec4<f32>(select(0.0,1.0,{a}.x>={b}.x), select(0.0,1.0,{a}.y>={b}.y), select(0.0,1.0,{a}.z>={b}.z), select(0.0,1.0,{a}.w>={b}.w))"
|
||||
),
|
||||
vop::SNE => format!(
|
||||
"vec4<f32>(select(0.0,1.0,{a}.x!={b}.x), select(0.0,1.0,{a}.y!={b}.y), select(0.0,1.0,{a}.z!={b}.z), select(0.0,1.0,{a}.w!={b}.w))"
|
||||
),
|
||||
vop::FRC => format!("fract({a})"),
|
||||
vop::FLOOR => format!("floor({a})"),
|
||||
_ => return None,
|
||||
};
|
||||
Some(s)
|
||||
}
|
||||
|
||||
fn scalar_expr(op: u8, a: &str, b: &str, prev: &str) -> Option<String> {
|
||||
let s = match op {
|
||||
sop::ADDS => format!("({a} + {b})"),
|
||||
sop::ADDS_PREV => format!("({a} + {prev})"),
|
||||
sop::MULS => format!("({a} * {b})"),
|
||||
sop::MULS_PREV => format!("({a} * {prev})"),
|
||||
sop::MAXS => format!("max({a}, {b})"),
|
||||
sop::MINS => format!("min({a}, {b})"),
|
||||
sop::RCP => format!("xe_rcp({a})"),
|
||||
sop::RETAIN_PREV => prev.to_string(),
|
||||
_ => return None,
|
||||
};
|
||||
Some(s)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ucode::alu::{sop, vop};
    use crate::ucode::control_flow::ControlFlowInstruction;

    /// Word 2 of an ALU triple: `vector_opcode` writing all four lanes of r0,
    /// paired with a `RETAIN_PREV` scalar op.
    ///
    /// NOTE(review): `decode_alu` reads the scalar write mask from w2 bits
    /// 8..11, which overlap the scalar opcode field at bits 6..11. With
    /// `RETAIN_PREV` (50) in that field the scalar write mask therefore
    /// decodes as 0xC, not zero.
    fn alu_word2(vector_opcode: u32) -> u32 {
        vector_opcode | ((sop::RETAIN_PREV as u32) << 6) | (0xF << 12)
    }

    /// Unpredicated, shader-ending `Exec` clause that runs the single triple
    /// at address 0 as an ALU instruction (`sequence == 0`).
    fn terminal_exec() -> ControlFlowInstruction {
        ControlFlowInstruction::Exec {
            address: 0,
            count: 1,
            sequence: 0,
            is_end: true,
            predicated: false,
            predicate_condition: false,
        }
    }

    /// `Alloc(Position)` clause, so an exporting ALU op would target `opos`.
    fn position_alloc() -> ControlFlowInstruction {
        ControlFlowInstruction::Alloc {
            size: 1,
            kind: AllocKind::Position,
        }
    }

    /// One ALU instruction, `r0 = r0 + r0`, behind an `Alloc(Position)`.
    /// Word-0 bits 29-31 are all set so every operand resolves as a temp.
    fn synthetic_trivial_shader() -> ParsedShader {
        let w0 = 0b111u32 << 29;
        ParsedShader {
            cf: vec![position_alloc(), terminal_exec()],
            instructions: vec![w0, 0, alu_word2(vop::ADD as u32)],
        }
    }

    #[test]
    fn trivial_shader_translates() {
        let shader = synthetic_trivial_shader();
        let body = match translate(&shader, Stage::Vertex) {
            Translation::Ok(body) => body,
            Translation::Reject(r) => panic!("rejected: {r}"),
        };
        for needle in ["fn vs_main", "r[0u] = (r[0u] + r[0u]);", "return out;"] {
            assert!(body.contains(needle));
        }
    }

    #[test]
    fn combined_module_parses_as_wgsl() {
        let shader = synthetic_trivial_shader();
        let vs = match translate(&shader, Stage::Vertex) {
            Translation::Ok(body) => body,
            Translation::Reject(r) => panic!("VS rejected: {r}"),
        };
        let ps = match translate(&shader, Stage::Pixel) {
            Translation::Ok(body) => body,
            Translation::Reject(r) => panic!("PS rejected: {r}"),
        };
        let module = combine_stages(&vs, &ps);
        // naga is pinned as a dev-dep in this crate; a parse failure here
        // means the translator emitted invalid WGSL.
        if let Err(e) = naga::front::wgsl::parse_str(&module) {
            panic!(
                "emitted WGSL failed to parse:\n{}\n--- module ---\n{}",
                e, module
            );
        }
    }

    #[test]
    fn src_operand_decodes_temp_vs_constant_no_modifiers() {
        // GPUBUG-101: is_temp=true → r[low6]; is_temp=false →
        // xenos_consts.alu[full byte]. Identity swizzle (0x00) without
        // negate yields the bare base expression.
        let cases = [
            (0x00u8, true, "r[0u]"),
            (0x05, true, "r[5u]"),
            (0x3F, true, "r[63u]"),
            // For temps, bits 6/7 are reserved (abs/rel): they never widen
            // the register index even when set. Phase D2 will consume them.
            (0x80, true, "r[0u]"),
            (0xFF, true, "r[63u]"),
            // Constants use the full 8-bit index.
            (0x00, false, "xenos_consts.alu[0u]"),
            (0x05, false, "xenos_consts.alu[5u]"),
            (0xFF, false, "xenos_consts.alu[255u]"),
        ];
        for (src_byte, is_temp, expected) in cases {
            assert_eq!(src_operand(src_byte, is_temp, 0x00, false), expected);
        }
    }

    #[test]
    fn src_operand_applies_swizzle_and_negate() {
        // GPUBUG-100. The swizzle is component-relative: output lane i reads
        // source lane ((swiz >> (2*i)) + i) & 3. Absolute-style encodings do
        // not carry over (0x1B is not `.wzyx` here), so the mechanics are
        // pinned with hand-computed cases instead.
        //
        // Identity swizzle with negate: the bare register wrapped in `(-…)`.
        assert_eq!(src_operand(0x05, true, 0x00, true), "(-r[5u])");

        // swizzle = 0xFF:
        //   i=0: (0xFF + 0) & 3 = 3 → w
        //   i=1: (0x3F + 1) & 3 = 0 → x
        //   i=2: (0x0F + 2) & 3 = 1 → y
        //   i=3: (0x03 + 3) & 3 = 2 → z
        // giving `.wxyz`.
        assert_eq!(
            src_operand(0x05, true, 0xFF, false),
            "vec4<f32>(r[5u].w, r[5u].x, r[5u].y, r[5u].z)"
        );

        // Combined: negated constant with the same `.wxyz` swizzle.
        assert_eq!(
            src_operand(0x07, false, 0xFF, true),
            "(-vec4<f32>(xenos_consts.alu[7u].w, xenos_consts.alu[7u].x, xenos_consts.alu[7u].y, xenos_consts.alu[7u].z))"
        );
    }

    #[test]
    fn shader_using_c0_emits_xenos_consts_read() {
        // ALU: r0 = c0 + r0. All three source bytes of w0 are zero: src_a is
        // constant index 0 (selector bit 29 clear), src_b is temp index 0
        // (selector bit 30 set); src_c is unused and left as a constant.
        let w0 = 1u32 << 30;
        let shader = ParsedShader {
            cf: vec![position_alloc(), terminal_exec()],
            instructions: vec![w0, 0, alu_word2(vop::ADD as u32)],
        };
        let body = match translate(&shader, Stage::Vertex) {
            Translation::Ok(body) => body,
            Translation::Reject(r) => panic!("rejected: {r}"),
        };
        assert!(
            body.contains("xenos_consts.alu[0u]"),
            "expected c0 operand, got: {body}"
        );
        assert!(
            body.contains("r[0u]"),
            "expected r0 temp operand, got: {body}"
        );
    }

    #[test]
    fn vfetch_emit_includes_gpu_swap_helper_call() {
        // GPUBUG-102: `emit_vfetch` must route every lane through
        // `gpu_swap(...)` so the endian byte-swap is wired into the AOT path.
        let vf = crate::ucode::fetch::VertexFetch {
            fetch_const: 0,
            src_register: 0,
            dest_register: 0,
            dest_write_mask: 0xF,
            raw: [0; 3],
        };
        let mut ctx = EmitCtx::new(Stage::Vertex);
        ctx.emit_vfetch(&vf).expect("emit_vfetch");
        let body = ctx.finish();
        assert!(body.contains("gpu_swap("), "emitted vfetch body: {body}");
    }

    #[test]
    fn loop_clause_rejected() {
        let shader = ParsedShader {
            cf: vec![ControlFlowInstruction::LoopStart {
                address: 0,
                loop_id: 0,
            }],
            instructions: vec![],
        };
        let outcome = translate(&shader, Stage::Vertex);
        assert!(matches!(outcome, Translation::Reject(reject::CF_LOOP)));
    }

    #[test]
    fn unsupported_op_rejected() {
        // 29 = VOP_MAX_A, which is not in the v1 subset.
        let shader = ParsedShader {
            cf: vec![terminal_exec()],
            instructions: vec![0, 0, alu_word2(29)],
        };
        let outcome = translate(&shader, Stage::Vertex);
        assert!(matches!(
            outcome,
            Translation::Reject(reject::VEC_OP_UNSUPPORTED)
        ));
    }
}
|
||||
243
crates/xenia-gpu/src/ucode/alu.rs
Normal file
243
crates/xenia-gpu/src/ucode/alu.rs
Normal file
@@ -0,0 +1,243 @@
|
||||
//! Xenos ALU (vector + scalar) instruction decoder.
|
||||
//!
|
||||
//! An ALU instruction is 96 bits = 3 dwords. The three dwords encode:
|
||||
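//! - word0: source register bytes, the scalar `ps`-source flag, predicate
//!   flags, and the per-operand temp-vs-constant selectors
|
||||
//! - word1: per-operand swizzles and negate flags
|
||||
//! - word2: opcodes, write masks, destination registers, export flag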
|
||||
//!
|
||||
//! See `ucode.h:900-1400` for the full field map. This decoder captures the
|
||||
//! minimal shape the uber-shader needs; flags we don't interpret yet are
|
||||
//! retained as raw bits in `raw` for downstream inspection.
|
||||
|
||||
/// Decoded ALU instruction, as produced by [`decode_alu`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct AluInstruction {
    /// Vector ALU opcode; decoded from bits 0..=5 of word 2.
    pub vector_opcode: u8,
    /// Scalar ALU opcode; decoded from bits 6..=11 of word 2.
    pub scalar_opcode: u8,
    /// Destination register index for the vector result (7 bits, word 2
    /// bits 16..=22).
    pub vector_dest: u8,
    /// Destination register index for the scalar result (7 bits, word 2
    /// bits 24..=30).
    pub scalar_dest: u8,
    /// 4-bit write mask for the vector result (x/y/z/w), word 2 bits 12..=15.
    pub vector_write_mask: u8,
    /// 4-bit write mask for the scalar result, word 2 bits 8..=11.
    ///
    /// NOTE(review): these bits overlap the upper four bits of
    /// `scalar_opcode` (bits 6..=11), so the two fields cannot be set
    /// independently — confirm the intended layout against the reference.
    pub scalar_write_mask: u8,
    /// Set when the vector result should go to the export bank (position,
    /// interpolators, color, etc.) instead of the general register file.
    pub vector_dest_is_export: bool,
    /// Selects `ps` (the previous scalar result) as the scalar operand.
    pub scalar_src_is_ps: bool,
    /// Source register bytes (at most three for vector ops), i.e. canary's
    /// 8-bit `srcN_reg` fields. For **temp-typed** operands (see
    /// `src_a_is_temp` etc.) bit 7 is the abs flag, bit 6 the loop-relative
    /// flag and bits 5:0 the temp index. For **constant-typed** operands the
    /// whole byte is the constant index.
    pub src_a: u8,
    pub src_b: u8,
    pub src_c: u8,
    /// Per-operand "is temporary" flag: when true the matching `src_X` byte
    /// indexes a general register (r#), otherwise an ALU constant (c#).
    /// Decoded from word-0 bits 29-31 (canary's
    /// `src3_sel`/`src2_sel`/`src1_sel`). GPUBUG-101.
    pub src_a_is_temp: bool,
    pub src_b_is_temp: bool,
    pub src_c_is_temp: bool,
    /// Per-operand 8-bit component-relative swizzle (canary's `srcN_swiz`,
    /// ucode.h:2064-2066). Output component i reads source component
    /// `((swizzle >> (2*i)) + i) & 3`; the identity swizzle is `0x00`.
    /// GPUBUG-100.
    pub src_a_swiz: u8,
    pub src_b_swiz: u8,
    pub src_c_swiz: u8,
    /// Per-operand negate flags (canary's `srcN_reg_negate`, word-1 bits
    /// 24/25/26), applied after the swizzle. GPUBUG-100.
    pub src_a_negate: bool,
    pub src_b_negate: bool,
    pub src_c_negate: bool,
    /// Set when the instruction is predicated; it is skipped if the
    /// predicate doesn't match `predicate_condition`.
    pub predicated: bool,
    pub predicate_condition: bool,
    /// Raw dwords, preserved verbatim so the translator / interpreter can
    /// reach fields that are not parsed explicitly yet.
    pub raw: [u32; 3],
}

/// Decodes a 3-dword ALU triple into an [`AluInstruction`].
///
/// Word 0 supplies the source register bytes, the `ps` selector, the
/// predicate flags and the temp-vs-constant selectors; word 1 the swizzles
/// and negate flags; word 2 the opcodes, write masks and destinations. The
/// input is also stored unchanged in `raw`.
pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
    let [w0, w1, w2] = words;
    // `field` extracts a masked bit-field; `flag` tests a single bit.
    let field = |word: u32, shift: u32, mask: u32| ((word >> shift) & mask) as u8;
    let flag = |word: u32, bit: u32| (word >> bit) & 1 != 0;

    AluInstruction {
        vector_opcode: field(w2, 0, 0x3F),
        scalar_opcode: field(w2, 6, 0x3F),
        vector_dest: field(w2, 16, 0x7F),
        scalar_dest: field(w2, 24, 0x7F),
        vector_write_mask: field(w2, 12, 0xF),
        scalar_write_mask: field(w2, 8, 0xF),
        vector_dest_is_export: flag(w2, 23),
        scalar_src_is_ps: flag(w0, 26),
        src_a: field(w0, 0, 0xFF),
        src_b: field(w0, 8, 0xFF),
        src_c: field(w0, 16, 0xFF),
        // Word-0 bits 29-31 are the per-operand temp-vs-constant selectors
        // (canary `src3_sel`/`src2_sel`/`src1_sel`, ucode.h:2078-2086).
        // `src_a` is canary's third operand (low byte of w0), so its
        // selector is bit 29.
        src_a_is_temp: flag(w0, 29),
        src_b_is_temp: flag(w0, 30),
        src_c_is_temp: flag(w0, 31),
        src_a_swiz: field(w1, 0, 0xFF),
        src_b_swiz: field(w1, 8, 0xFF),
        src_c_swiz: field(w1, 16, 0xFF),
        src_a_negate: flag(w1, 24),
        src_b_negate: flag(w1, 25),
        src_c_negate: flag(w1, 26),
        predicated: flag(w0, 27),
        predicate_condition: flag(w0, 28),
        raw: words,
    }
}
|
||||
|
||||
/// Vector ALU opcodes referenced by name. Per the original note, the values
/// follow canary's `AluVectorOpcode` enum (`ucode.h:1354`). The table is
/// contiguous from 0 through 29.
pub mod vop {
    // Arithmetic and min/max.
    pub const ADD: u8 = 0;
    pub const MUL: u8 = 1;
    pub const MAX: u8 = 2;
    pub const MIN: u8 = 3;
    // Set-on-compare.
    pub const SEQ: u8 = 4;
    pub const SGT: u8 = 5;
    pub const SGE: u8 = 6;
    pub const SNE: u8 = 7;
    // Rounding.
    pub const FRC: u8 = 8;
    pub const TRUNC: u8 = 9;
    pub const FLOOR: u8 = 10;
    pub const MAD: u8 = 11;
    // Conditional select.
    pub const CND_EQ: u8 = 12;
    pub const CND_GE: u8 = 13;
    pub const CND_GT: u8 = 14;
    // Dot products and friends.
    pub const DOT4: u8 = 15;
    pub const DOT3: u8 = 16;
    pub const DOT2_ADD: u8 = 17;
    pub const CUBE: u8 = 18;
    pub const MAX4: u8 = 19;
    // Predicate set + push.
    pub const SETP_EQ_PUSH: u8 = 20;
    pub const SETP_NE_PUSH: u8 = 21;
    pub const SETP_GT_PUSH: u8 = 22;
    pub const SETP_GE_PUSH: u8 = 23;
    // Pixel kill.
    pub const KILL_EQ: u8 = 24;
    pub const KILL_GT: u8 = 25;
    pub const KILL_GE: u8 = 26;
    pub const KILL_NE: u8 = 27;
    pub const DST: u8 = 28;
    pub const MAX_A: u8 = 29;
}
|
||||
|
||||
/// Scalar ALU opcodes. Per the original note, the values follow canary's
/// `AluScalarOpcode` enum (`ucode.h:1001`). Value 41 has no named constant
/// in this table.
pub mod sop {
    // Arithmetic and min/max.
    pub const ADDS: u8 = 0;
    pub const ADDS_PREV: u8 = 1;
    pub const MULS: u8 = 2;
    pub const MULS_PREV: u8 = 3;
    pub const MULS_PREV2: u8 = 4;
    pub const MAXS: u8 = 5;
    pub const MINS: u8 = 6;
    // Set-on-compare.
    pub const SEQS: u8 = 7;
    pub const SGTS: u8 = 8;
    pub const SGES: u8 = 9;
    pub const SNES: u8 = 10;
    // Rounding.
    pub const FRCS: u8 = 11;
    pub const TRUNCS: u8 = 12;
    pub const FLOORS: u8 = 13;
    // Transcendentals.
    pub const EXP: u8 = 14;
    pub const LOGC: u8 = 15;
    pub const LOG: u8 = 16;
    pub const RCPC: u8 = 17;
    pub const RCPF: u8 = 18;
    pub const RCP: u8 = 19;
    pub const RSQC: u8 = 20;
    pub const RSQF: u8 = 21;
    pub const RSQ: u8 = 22;
    pub const MAXAS: u8 = 23;
    pub const MAXASF: u8 = 24;
    pub const SUBS: u8 = 25;
    pub const SUBS_PREV: u8 = 26;
    // Predicate manipulation.
    pub const SETP_EQ: u8 = 27;
    pub const SETP_NE: u8 = 28;
    pub const SETP_GT: u8 = 29;
    pub const SETP_GE: u8 = 30;
    pub const SETP_INV: u8 = 31;
    pub const SETP_POP: u8 = 32;
    pub const SETP_CLR: u8 = 33;
    pub const SETP_RSTR: u8 = 34;
    // Pixel kill.
    pub const KILLS_EQ: u8 = 35;
    pub const KILLS_GT: u8 = 36;
    pub const KILLS_GE: u8 = 37;
    pub const KILLS_NE: u8 = 38;
    pub const KILLS_ONE: u8 = 39;
    pub const SQRT: u8 = 40;
    // Constant-operand variants.
    pub const MULSC0: u8 = 42;
    pub const MULSC1: u8 = 43;
    pub const ADDSC0: u8 = 44;
    pub const ADDSC1: u8 = 45;
    pub const SUBSC0: u8 = 46;
    pub const SUBSC1: u8 = 47;
    pub const SIN: u8 = 48;
    pub const COS: u8 = 49;
    pub const RETAIN_PREV: u8 = 50;
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Regression: the table previously drifted from canary's values (e.g.
    /// `MAXS=6` when canary says 5, shifting everything through SQRT). Pins
    /// the most frequently used scalar and vector opcodes.
    #[test]
    fn opcodes_match_canary_values() {
        let scalar = [
            (sop::MAXS, 5),
            (sop::MINS, 6),
            (sop::SEQS, 7),
            (sop::EXP, 14),
            (sop::LOG, 16),
            (sop::RCP, 19),
            (sop::RSQ, 22),
            (sop::SUBS, 25),
            (sop::SETP_EQ, 27),
            (sop::KILLS_EQ, 35),
            (sop::SQRT, 40),
            (sop::SIN, 48),
            (sop::RETAIN_PREV, 50),
        ];
        for (actual, expected) in scalar {
            assert_eq!(actual, expected);
        }

        let vector = [
            (vop::SNE, 7),
            (vop::CND_EQ, 12),
            (vop::MAX4, 19),
            (vop::KILL_EQ, 24),
            (vop::DST, 28),
        ];
        for (actual, expected) in vector {
            assert_eq!(actual, expected);
        }
    }

    #[test]
    fn decode_extracts_opcodes_and_dests() {
        // Minimal ALU word: vector_opcode = ADD (0), scalar_opcode = RCP (19),
        // vector_dest = 3, scalar_dest = 7, vector_write_mask = 0xF.
        let vector_opcode = vop::ADD as u32;
        let scalar_opcode = (sop::RCP as u32) << 6;
        let vector_write_mask = 0xF << 12;
        let vector_dest = 3u32 << 16;
        let scalar_dest = 7u32 << 24;
        let w2 = vector_opcode | scalar_opcode | vector_write_mask | vector_dest | scalar_dest;

        let alu = decode_alu([0, 0, w2]);
        assert_eq!(alu.vector_opcode, vop::ADD);
        assert_eq!(alu.scalar_opcode, sop::RCP);
        assert_eq!(alu.vector_dest, 3);
        assert_eq!(alu.scalar_dest, 7);
        assert_eq!(alu.vector_write_mask, 0xF);
    }
}
|
||||
173
crates/xenia-gpu/src/ucode/control_flow.rs
Normal file
173
crates/xenia-gpu/src/ucode/control_flow.rs
Normal file
@@ -0,0 +1,173 @@
|
||||
//! Xenos control-flow clause decoder.
|
||||
//!
|
||||
//! A shader's CF block is a sequence of 48-bit clauses packed two-per-
|
||||
//! three-dword row. Each clause encodes an opcode and type-specific fields
|
||||
//! (exec addr/count, loop start/end, branch target, etc.).
|
||||
//!
|
||||
//! Spec at `xenia-canary/src/xenia/gpu/ucode.h:87-256`. We cover the subset
|
||||
//! the uber-shader needs: `Exec*`, `Loop*`, `Alloc`, `Jmp`, `Call/Ret`,
|
||||
//! `Exit`. Unknown opcodes are classified as `Unknown { opcode }` so the
|
||||
//! translator can log + degrade.
|
||||
|
||||
/// Parsed representation of one CF clause.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum ControlFlowInstruction {
|
||||
/// `kExec` / `kExecEnd` — execute a range of ALU/fetch instructions.
|
||||
Exec {
|
||||
/// Instruction-block dword index where this clause's instructions start,
|
||||
/// expressed in **triple units** (each inst = 3 dwords).
|
||||
address: u32,
|
||||
/// Number of triples to execute.
|
||||
count: u32,
|
||||
/// The ALU-vs-fetch sequence bitmap (2 bits per instruction).
|
||||
sequence: u32,
|
||||
/// True when this clause ends the shader.
|
||||
is_end: bool,
|
||||
/// True if predicated; skip when predicate != predicate_condition.
|
||||
predicated: bool,
|
||||
predicate_condition: bool,
|
||||
},
|
||||
/// `kLoopStart` — begin a `aL` loop referencing a loop constant.
|
||||
LoopStart { address: u32, loop_id: u32 },
|
||||
/// `kLoopEnd` — close the loop; `address` points at the matching start.
|
||||
LoopEnd { address: u32, loop_id: u32 },
|
||||
/// `kCondJmp` — conditional jump to another CF index.
|
||||
CondJmp {
|
||||
target: u32,
|
||||
predicated: bool,
|
||||
predicate_condition: bool,
|
||||
},
|
||||
/// `kCondCall` — call into another CF subroutine.
|
||||
CondCall { target: u32 },
|
||||
/// `kReturn` — return from subroutine.
|
||||
Return,
|
||||
/// `kAlloc` — pre-allocate export registers (position, interpolators, colors).
|
||||
Alloc { size: u32, kind: AllocKind },
|
||||
/// Exit the shader (terminal).
|
||||
Exit,
|
||||
/// Unknown / unhandled opcode.
|
||||
Unknown { opcode: u8 },
|
||||
}
|
||||
|
||||
/// Export target selected by a `kAlloc` clause.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AllocKind {
    Position,
    Interpolators,
    Colors,
    Memexport,
    /// Any encoding outside the four named kinds.
    Other,
}

impl AllocKind {
    /// Map the low three bits of `b` to an [`AllocKind`].
    ///
    /// Values 0..=3 select `Position`, `Interpolators`, `Colors` and
    /// `Memexport` in that order; 4..=7 yield `Other`. Bits above the low
    /// three are ignored.
    fn from_bits(b: u32) -> Self {
        const NAMED: [AllocKind; 4] = [
            AllocKind::Position,
            AllocKind::Interpolators,
            AllocKind::Colors,
            AllocKind::Memexport,
        ];
        NAMED
            .get((b & 0x7) as usize)
            .copied()
            .unwrap_or(AllocKind::Other)
    }
}
|
||||
|
||||
/// Decode one row (three consecutive CF dwords) into two CF clauses.
|
||||
///
|
||||
/// Word layout per canary (`ucode.h:218-256`):
|
||||
/// - word0 + lo16(word1) → CF_A's 48-bit payload
|
||||
/// - hi16(word1) + word2 → CF_B's 48-bit payload
|
||||
///
|
||||
/// The opcode lives in the top 4 bits of the 48-bit payload (= bits 44..47).
|
||||
pub fn decode_cf_pair(word0: u32, word1: u32, word2: u32) -> (ControlFlowInstruction, ControlFlowInstruction) {
|
||||
// Build each 48-bit value as u64; LE within the clause.
|
||||
let a = (word0 as u64) | ((word1 as u64 & 0xFFFF) << 32);
|
||||
let b = ((word1 as u64 >> 16) & 0xFFFF) | ((word2 as u64) << 16);
|
||||
(decode_single(a), decode_single(b))
|
||||
}
|
||||
|
||||
fn decode_single(payload: u64) -> ControlFlowInstruction {
|
||||
// Top 4 bits of the 48-bit payload.
|
||||
let opcode = ((payload >> 44) & 0xF) as u8;
|
||||
// Predicate bit + condition live at the 28..30 range for exec/jmp. Rough
|
||||
// extraction — good enough for the interpreter, which logs unknowns.
|
||||
let predicated = ((payload >> 28) & 1) != 0;
|
||||
let predicate_condition = ((payload >> 29) & 1) != 0;
|
||||
|
||||
match opcode {
|
||||
0 => ControlFlowInstruction::Exec {
|
||||
address: (payload & 0xFFF) as u32,
|
||||
count: ((payload >> 12) & 0x7) as u32,
|
||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||
is_end: false,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
},
|
||||
1 => ControlFlowInstruction::Exit,
|
||||
2 => ControlFlowInstruction::Exec {
|
||||
address: (payload & 0xFFF) as u32,
|
||||
count: ((payload >> 12) & 0x7) as u32,
|
||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||
is_end: true,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
},
|
||||
6 => ControlFlowInstruction::LoopStart {
|
||||
address: (payload & 0x3FF) as u32,
|
||||
loop_id: ((payload >> 16) & 0x1F) as u32,
|
||||
},
|
||||
7 => ControlFlowInstruction::LoopEnd {
|
||||
address: (payload & 0x3FF) as u32,
|
||||
loop_id: ((payload >> 16) & 0x1F) as u32,
|
||||
},
|
||||
8 => ControlFlowInstruction::CondCall {
|
||||
target: (payload & 0x3FF) as u32,
|
||||
},
|
||||
9 => ControlFlowInstruction::Return,
|
||||
10 => ControlFlowInstruction::CondJmp {
|
||||
target: (payload & 0x3FF) as u32,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
},
|
||||
12 => ControlFlowInstruction::Alloc {
|
||||
size: (payload & 0x7) as u32,
|
||||
kind: AllocKind::from_bits(((payload >> 4) & 0x7) as u32),
|
||||
},
|
||||
other => ControlFlowInstruction::Unknown { opcode: other },
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Split a 48-bit CF_A payload into the `(word0, word1)` values that
    /// `decode_cf_pair` expects: low 32 bits, then bits 32..47.
    fn split_clause_a(payload: u64) -> (u32, u32) {
        (
            (payload & 0xFFFF_FFFF) as u32,
            ((payload >> 32) & 0xFFFF) as u32,
        )
    }

    #[test]
    fn opcode_exit_decodes() {
        // opcode 1 (Exit) in bits 44..47 of A's 48-bit payload.
        let (word0, word1) = split_clause_a(1u64 << 44);
        let cf = decode_cf_pair(word0, word1, 0).0;
        assert_eq!(cf, ControlFlowInstruction::Exit);
    }

    #[test]
    fn opcode_exec_end_carries_address_count() {
        // opcode 2 (ExecEnd), address=4, count=2, sequence=0.
        let (word0, word1) = split_clause_a((2u64 << 44) | (2u64 << 12) | 4);
        match decode_cf_pair(word0, word1, 0).0 {
            ControlFlowInstruction::Exec {
                address,
                count,
                is_end,
                ..
            } => {
                assert_eq!(address, 4);
                assert_eq!(count, 2);
                assert!(is_end);
            }
            other => panic!("expected Exec, got {other:?}"),
        }
    }
}
|
||||
117
crates/xenia-gpu/src/ucode/fetch.rs
Normal file
117
crates/xenia-gpu/src/ucode/fetch.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
//! Xenos fetch (vertex + texture) instruction decoder.
|
||||
//!
|
||||
//! Like ALU instructions, fetches are 96 bits (3 dwords). The opcode lives
|
||||
//! in the low 5 bits of word0. We split them into `VertexFetch` and
|
||||
//! `TextureFetch` structurally because their operand layouts differ.
|
||||
//!
|
||||
//! Reference: `xenia-canary/src/xenia/gpu/ucode.h:690-877`.
|
||||
|
||||
/// Decoded fetch instruction.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum FetchInstruction {
|
||||
Vertex(VertexFetch),
|
||||
Texture(TextureFetch),
|
||||
/// Unknown / minor variants we don't model yet.
|
||||
Unknown { opcode: u8, raw: [u32; 3] },
|
||||
}
|
||||
|
||||
/// Fields extracted from a vertex-fetch instruction.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct VertexFetch {
    /// Vertex fetch constant index (0..=95).
    ///
    /// NOTE(review): `decode_fetch` currently fills this from a 5-bit
    /// field (0..=31); confirm how the documented 0..=95 range is formed.
    pub fetch_const: u8,
    /// Source register index (vertex index in r#).
    pub src_register: u8,
    /// Destination register that receives the fetched value.
    pub dest_register: u8,
    /// 4-bit write mask.
    pub dest_write_mask: u8,
    /// The three instruction dwords exactly as passed to the decoder.
    pub raw: [u32; 3],
}
|
||||
|
||||
/// Fields extracted from a texture-fetch instruction.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TextureFetch {
    /// Texture fetch constant index (0..=31).
    pub fetch_const: u8,
    /// Source register index.
    pub src_register: u8,
    /// Destination register index.
    pub dest_register: u8,
    /// 4-bit write mask.
    pub dest_write_mask: u8,
    /// Dimension: 0=1D, 1=2D, 2=3D/stacked, 3=cube.
    pub dimension: u8,
    /// The three instruction dwords exactly as passed to the decoder.
    pub raw: [u32; 3],
}
|
||||
|
||||
/// Fetch opcodes, as found in the low 5 bits of word0. From `ucode.h`.
///
/// Only `VERTEX_FETCH` and `TEXTURE_FETCH` are decoded structurally by
/// `decode_fetch`; the remaining values are listed for reference.
pub mod op {
    pub const VERTEX_FETCH: u8 = 0x00;
    pub const TEXTURE_FETCH: u8 = 0x01;
    pub const GET_TEXTURE_BORDER_COLOR_FRAC: u8 = 0x16;
    pub const GET_TEXTURE_COMPUTED_LOD: u8 = 0x17;
    pub const GET_TEXTURE_WEIGHTS: u8 = 0x18;
    pub const GET_TEXTURE_GRADIENTS: u8 = 0x19;
    pub const SET_TEXTURE_LOD: u8 = 0x1A;
    pub const SET_TEXTURE_GRADIENTS_HORZ: u8 = 0x1B;
    pub const SET_TEXTURE_GRADIENTS_VERT: u8 = 0x1C;
}
|
||||
|
||||
pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction {
|
||||
let w0 = words[0];
|
||||
let w1 = words[1];
|
||||
let opcode = (w0 & 0x1F) as u8;
|
||||
match opcode {
|
||||
op::VERTEX_FETCH => FetchInstruction::Vertex(VertexFetch {
|
||||
fetch_const: ((w0 >> 5) & 0x1F) as u8,
|
||||
src_register: ((w0 >> 17) & 0x7F) as u8,
|
||||
dest_register: ((w0 >> 10) & 0x7F) as u8,
|
||||
dest_write_mask: ((w1 >> 23) & 0xF) as u8,
|
||||
raw: words,
|
||||
}),
|
||||
op::TEXTURE_FETCH => FetchInstruction::Texture(TextureFetch {
|
||||
fetch_const: ((w0 >> 5) & 0x1F) as u8,
|
||||
src_register: ((w0 >> 17) & 0x7F) as u8,
|
||||
dest_register: ((w0 >> 10) & 0x7F) as u8,
|
||||
dest_write_mask: ((w1 >> 23) & 0xF) as u8,
|
||||
dimension: ((w1 >> 29) & 0x3) as u8,
|
||||
raw: words,
|
||||
}),
|
||||
_ => FetchInstruction::Unknown { opcode, raw: words },
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn decode_vertex_fetch() {
        // opcode=0 (vertex), fetch_const=5, src=2, dest=7.
        let w0 = (5 << 5) | (7 << 10) | (2 << 17);
        match decode_fetch([w0, 0, 0]) {
            FetchInstruction::Vertex(vf) => {
                assert_eq!(vf.fetch_const, 5);
                assert_eq!(vf.src_register, 2);
                assert_eq!(vf.dest_register, 7);
            }
            other => panic!("expected Vertex, got {other:?}"),
        }
    }

    #[test]
    fn decode_texture_fetch() {
        // opcode=1 (texture), fetch_const=3, dest=4, src=1, dimension=2.
        let w0 = 1 | (3 << 5) | (4 << 10) | (1 << 17);
        let w1 = 2 << 29;
        match decode_fetch([w0, w1, 0]) {
            FetchInstruction::Texture(tf) => {
                assert_eq!(tf.fetch_const, 3);
                assert_eq!(tf.dimension, 2);
            }
            other => panic!("expected Texture, got {other:?}"),
        }
    }

    #[test]
    fn unknown_opcode_is_classified() {
        // 0x16 = GET_TEXTURE_BORDER_COLOR_FRAC, which has no structural decoding.
        let decoded = decode_fetch([0x16, 0, 0]);
        assert!(matches!(
            decoded,
            FetchInstruction::Unknown { opcode: 0x16, .. }
        ));
    }
}
|
||||
249
crates/xenia-gpu/src/ucode/mod.rs
Normal file
249
crates/xenia-gpu/src/ucode/mod.rs
Normal file
@@ -0,0 +1,249 @@
|
||||
//! Xenos (ATI R500-family) shader microcode decoder.
|
||||
//!
|
||||
//! Ground truth: `xenia-canary/src/xenia/gpu/ucode.h`. We parse only what a
|
||||
//! shader *interpreter* (P3 uber-shader) needs: control-flow clauses, ALU
|
||||
//! instructions (vector + scalar pipes), and fetch instructions (vertex +
|
||||
//! texture). The uber-shader consumes this IR directly; when a WGSL-emitting
|
||||
//! translator comes online in P7, it reuses the same parser.
|
||||
//!
|
||||
//! ## Binary layout
|
||||
//!
|
||||
//! A compiled shader has two sections back-to-back:
|
||||
//!
|
||||
//! 1. **Control-flow block** — a sequence of 48-bit clauses stored in pairs. Canary packs
|
||||
//! two clauses into three 32-bit words:
|
||||
//! ```text
|
||||
//! word0 word1 word2
|
||||
//! [-CF_A (48)-][-CF_B (48)-]
|
||||
//! ```
|
||||
//! Word 0 is the low 32 of CF_A; word 1's low 16 bits finish CF_A and
|
||||
//! its high 16 bits start CF_B; word 2 holds CF_B's remaining 32 bits.
|
||||
//!
|
||||
//! 2. **Instruction block** — variable-size array of 96-bit ALU / fetch
|
||||
//! instructions. Each control-flow clause of kind `Exec*` references a
|
||||
//! contiguous range of these by `(address, count)`, both counted in 3-dword instructions.
|
||||
//!
|
||||
//! We read big-endian dwords straight out of guest memory (the `raw`
|
||||
//! `&[u32]` slice is already host-endian-corrected by the PM4 executor that
|
||||
//! cached the shader blob). See `ucode.h:218-256` for the exec clause bit
|
||||
//! layout and `:700-877` for the fetch/ALU mix.
|
||||
|
||||
pub mod alu;
|
||||
pub mod control_flow;
|
||||
pub mod fetch;
|
||||
|
||||
use self::alu::AluInstruction;
|
||||
use self::control_flow::{AllocKind, ControlFlowInstruction, decode_cf_pair};
|
||||
use self::fetch::FetchInstruction;
|
||||
|
||||
/// Clause-kind codes written into the packed shader consumed by WGSL.
/// Must stay in sync with the `CF_KIND_*` constants in
/// `shaders/xenos_interp.wgsl`.
pub mod cf_kind {
    pub const EXEC: u32 = 0;
    pub const EXEC_END: u32 = 1;
    pub const ALLOC: u32 = 2;
    pub const EXIT: u32 = 3;
    pub const LOOP_START: u32 = 4;
    pub const LOOP_END: u32 = 5;
    pub const COND_JMP: u32 = 6;
    pub const COND_CALL: u32 = 7;
    pub const RETURN: u32 = 8;
    /// Marks a clause whose opcode the decoder did not recognise.
    pub const UNKNOWN: u32 = 15;
}
|
||||
|
||||
/// Export-target codes emitted for `Alloc` clauses in the packed shader.
pub mod cf_alloc_kind {
    pub const POSITION: u32 = 0;
    pub const INTERPOLATORS: u32 = 1;
    pub const COLORS: u32 = 2;
    pub const MEMEXPORT: u32 = 3;
    pub const OTHER: u32 = 4;
}
|
||||
|
||||
/// Pack a [`ParsedShader`] into the dense dword layout the WGSL runtime
|
||||
/// interpreter expects:
|
||||
///
|
||||
/// ```text
|
||||
/// [0] cf_count
|
||||
/// [1 .. 1 + cf_count*3] CF table: (kind, primary, aux) triples per clause
|
||||
/// [1 + cf_count*3 ..] raw 3-dword instruction stream (ALU/fetch)
|
||||
/// ```
|
||||
///
|
||||
/// The CF table lets WGSL walk clauses without reconstructing bit-packed
|
||||
/// layouts on the GPU. Semantics per `kind`:
|
||||
///
|
||||
/// | kind | primary | aux |
|
||||
/// |-------------|----------------------------|------------------------------|
|
||||
/// | EXEC/EXEC_END | address (in triples) | (sequence<<8) \| count |
|
||||
/// | ALLOC | alloc_kind (see cf_alloc_kind) | size |
|
||||
/// | EXIT | 0 | 0 |
|
||||
/// | LOOP_START | address | loop_id |
|
||||
/// | LOOP_END | address | loop_id |
|
||||
/// | COND_JMP | target | predicate flags |
|
||||
/// | COND_CALL | target | 0 |
|
||||
/// | RETURN | 0 | 0 |
|
||||
/// | UNKNOWN | opcode | 0 |
|
||||
pub fn pack_for_wgsl(parsed: &ParsedShader) -> Vec<u32> {
|
||||
let cf_count = parsed.cf.len() as u32;
|
||||
let mut out = Vec::with_capacity(1 + (cf_count as usize) * 3 + parsed.instructions.len());
|
||||
out.push(cf_count);
|
||||
for clause in &parsed.cf {
|
||||
let (kind, primary, aux) = encode_cf(*clause);
|
||||
out.push(kind);
|
||||
out.push(primary);
|
||||
out.push(aux);
|
||||
}
|
||||
out.extend_from_slice(&parsed.instructions);
|
||||
out
|
||||
}
|
||||
|
||||
fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) {
|
||||
use ControlFlowInstruction::*;
|
||||
match c {
|
||||
Exec {
|
||||
address,
|
||||
count,
|
||||
sequence,
|
||||
is_end,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
} => {
|
||||
let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
|
||||
let kind = if is_end { cf_kind::EXEC_END } else { cf_kind::EXEC }
|
||||
| (pred_bits << 8);
|
||||
(kind, address, (sequence << 8) | count)
|
||||
}
|
||||
Alloc { size, kind } => {
|
||||
let akind = match kind {
|
||||
AllocKind::Position => cf_alloc_kind::POSITION,
|
||||
AllocKind::Interpolators => cf_alloc_kind::INTERPOLATORS,
|
||||
AllocKind::Colors => cf_alloc_kind::COLORS,
|
||||
AllocKind::Memexport => cf_alloc_kind::MEMEXPORT,
|
||||
AllocKind::Other => cf_alloc_kind::OTHER,
|
||||
};
|
||||
(cf_kind::ALLOC, akind, size)
|
||||
}
|
||||
Exit => (cf_kind::EXIT, 0, 0),
|
||||
LoopStart { address, loop_id } => (cf_kind::LOOP_START, address, loop_id),
|
||||
LoopEnd { address, loop_id } => (cf_kind::LOOP_END, address, loop_id),
|
||||
CondJmp {
|
||||
target,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
} => {
|
||||
let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
|
||||
(cf_kind::COND_JMP, target, pred_bits)
|
||||
}
|
||||
CondCall { target } => (cf_kind::COND_CALL, target, 0),
|
||||
Return => (cf_kind::RETURN, 0, 0),
|
||||
Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0),
|
||||
}
|
||||
}
|
||||
|
||||
/// One instruction word set from the instruction-block section. Xenos packs
|
||||
/// ALU and fetch instructions identically (96 bits each); the owning exec
|
||||
/// clause's "sequence" bitmap decides which is which.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum DecodedInstruction {
|
||||
/// ALU pipe (vector ALU + optional co-issued scalar ALU).
|
||||
Alu(AluInstruction),
|
||||
/// Vertex or texture fetch.
|
||||
Fetch(FetchInstruction),
|
||||
}
|
||||
|
||||
/// Parsed shader: the control-flow clause list + the raw 32-bit instruction
|
||||
/// words. The uber-shader / translator is expected to index into
|
||||
/// `instructions` based on `(clause.address * 3, clause.count * 3)`.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct ParsedShader {
|
||||
pub cf: Vec<ControlFlowInstruction>,
|
||||
/// Raw instruction dwords. Each 3-dword triple is one ALU or fetch
|
||||
/// instruction; the owning `Exec` clause's `sequence` bitmap picks the
|
||||
/// kind.
|
||||
pub instructions: Vec<u32>,
|
||||
}
|
||||
|
||||
/// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire
|
||||
/// microcode buffer (control flow + instructions). Heuristic: CF dword count
|
||||
/// is encoded in the first word's low 12 bits of the last exec clause —
|
||||
/// canary iterates until it hits a clause of kind `Exit`. We do the same.
|
||||
pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
|
||||
let mut cf = Vec::new();
|
||||
// CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's
|
||||
// layout). Walk pairs of 3 dwords per pair of clauses.
|
||||
let mut i = 0usize;
|
||||
while i + 2 < raw_dwords.len() {
|
||||
let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]);
|
||||
let (first, second) = a;
|
||||
let seen_exit = matches!(
|
||||
first,
|
||||
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
||||
) || matches!(
|
||||
second,
|
||||
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
||||
);
|
||||
cf.push(first);
|
||||
cf.push(second);
|
||||
i += 3;
|
||||
if seen_exit {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Everything after `i` dwords is the instruction block.
|
||||
let instructions = raw_dwords[i..].to_vec();
|
||||
ParsedShader { cf, instructions }
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_blob_parses_empty() {
        let p = parse_shader(&[]);
        assert!(p.cf.is_empty());
        assert!(p.instructions.is_empty());
    }

    #[test]
    fn pack_for_wgsl_layout_is_correct() {
        // Build a tiny ParsedShader by hand and verify the packed form.
        let parsed = ParsedShader {
            cf: vec![
                ControlFlowInstruction::Exec {
                    address: 0x10,
                    count: 3,
                    sequence: 0b1010,
                    is_end: false,
                    predicated: false,
                    predicate_condition: false,
                },
                ControlFlowInstruction::Exit,
            ],
            instructions: vec![0x1111, 0x2222, 0x3333],
        };
        let packed = pack_for_wgsl(&parsed);
        assert_eq!(packed[0], 2, "cf_count");
        // First clause: EXEC, address=0x10, aux = (sequence<<8)|count = 0x0A03
        assert_eq!(packed[1] & 0xFF, cf_kind::EXEC);
        assert_eq!(packed[2], 0x10);
        assert_eq!(packed[3], (0b1010 << 8) | 3);
        // Second clause: EXIT
        assert_eq!(packed[4] & 0xFF, cf_kind::EXIT);
        // Instruction block starts at 1 + 2*3 = 7
        assert_eq!(packed[7..], [0x1111, 0x2222, 0x3333]);
    }

    #[test]
    fn trivial_exit_clause_stops_parsing() {
        // Clause A's opcode occupies bits 44..47 of its 48-bit payload,
        // which are bits 12..15 of word1's LOW half. (The high half of
        // word1 belongs to clause B.)
        let w0 = 0u32; // clause A body
        let w1 = 1u32 << 12; // opcode = 1 (EXIT) for clause A
        let w2 = 0u32; // clause B decodes as opcode 0
        // More than one further row of data follows, so the assertion on
        // `instructions` only holds if the Exit clause really stopped the
        // walk rather than the buffer simply running out.
        let p = parse_shader(&[w0, w1, w2, 0xDEAD_BEEF, 0x1, 0x2, 0x3]);
        assert_eq!(p.cf.len(), 2);
        assert_eq!(p.cf[0], ControlFlowInstruction::Exit);
        // Exit detected → every remaining dword is instruction data.
        assert_eq!(p.instructions, vec![0xDEAD_BEEF, 0x1, 0x2, 0x3]);
    }
}
|
||||
124
crates/xenia-gpu/src/xenos_constants.rs
Normal file
124
crates/xenia-gpu/src/xenos_constants.rs
Normal file
@@ -0,0 +1,124 @@
|
||||
//! The "Xenos constants" block the WGSL interpreter consumes per draw.
|
||||
//!
|
||||
//! Mirrors the Xenos register-file regions that carry the per-draw constant
|
||||
//! values shaders reference at runtime:
|
||||
//!
|
||||
//! | Region | Base | Count | Size |
|
||||
//! |--------|------|-------|------|
|
||||
//! | ALU | 0x4000 | 512 × vec4<f32> | 8 KB |
|
||||
//! | Fetch | 0x4800 | 256 × u32 | 1 KB |
|
||||
//! | Bool | 0x4900 | 8 × u32 | 32 B |
|
||||
//! | Loop | 0x4908 | 32 × u32 | 128 B |
|
||||
//!
|
||||
//! Total: ~9.2 KB, well under the 64 KB min uniform buffer size on all wgpu
|
||||
//! backends. The `XenosConstantsBlock` is declared `#[repr(C)]` + bytemuck
|
||||
//! `Pod` so it can be `bytemuck::bytes_of()`'d directly into a wgpu uniform
|
||||
//! buffer. The matching WGSL `struct XenosConstants` lives in
|
||||
//! `shaders/xenos_interp.wgsl`.
|
||||
|
||||
use bytemuck::{Pod, Zeroable};
|
||||
|
||||
use crate::register_file::RegisterFile;
|
||||
|
||||
/// Number of ALU constants (each a `vec4<f32>`).
pub const ALU_CONSTANT_COUNT: usize = 512;
/// Number of fetch-constant dwords.
pub const FETCH_CONSTANT_COUNT: usize = 256;
/// Number of bool-constant dwords.
pub const BOOL_CONSTANT_COUNT: usize = 8;
/// Number of loop-constant dwords.
pub const LOOP_CONSTANT_COUNT: usize = 32;

/// Register index of the first ALU constant.
pub const CONST_BASE_ALU: u32 = 0x4000;
/// Register index of the first fetch constant.
pub const CONST_BASE_FETCH: u32 = 0x4800;
/// Register index of the first bool constant.
pub const CONST_BASE_BOOL: u32 = 0x4900;
/// Register index of the first loop constant.
pub const CONST_BASE_LOOP: u32 = 0x4908;
|
||||
|
||||
/// Per-draw constants block uploaded once to the uniform buffer at
|
||||
/// `@group(0) @binding(1)`.
|
||||
#[repr(C)]
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct XenosConstantsBlock {
|
||||
pub alu: [[f32; 4]; ALU_CONSTANT_COUNT],
|
||||
pub fetch: [u32; FETCH_CONSTANT_COUNT],
|
||||
pub bool_consts: [u32; BOOL_CONSTANT_COUNT],
|
||||
pub loop_consts: [u32; LOOP_CONSTANT_COUNT],
|
||||
}
|
||||
|
||||
// SAFETY: all fields are Pod arrays of Pod primitives; `#[repr(C)]` fixes
|
||||
// the layout. `bytemuck` derives `Pod` only when alignment + padding line
|
||||
// up, so manual `unsafe impl` is the right tool here.
|
||||
unsafe impl Zeroable for XenosConstantsBlock {}
|
||||
unsafe impl Pod for XenosConstantsBlock {}
|
||||
|
||||
impl Default for XenosConstantsBlock {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
alu: [[0.0; 4]; ALU_CONSTANT_COUNT],
|
||||
fetch: [0; FETCH_CONSTANT_COUNT],
|
||||
bool_consts: [0; BOOL_CONSTANT_COUNT],
|
||||
loop_consts: [0; LOOP_CONSTANT_COUNT],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl XenosConstantsBlock {
|
||||
/// Size in bytes — exposed for tests + wgpu buffer sizing.
|
||||
pub const SIZE: usize = std::mem::size_of::<Self>();
|
||||
|
||||
/// Snapshot the constants from a Xenos `RegisterFile` into a dense,
|
||||
/// host-friendly layout the WGSL interpreter expects. ALU constants
|
||||
/// (vec4 each) are 4 consecutive registers; fetch constants are u32.
|
||||
pub fn snapshot(rf: &RegisterFile) -> Self {
|
||||
let mut out = Self::default();
|
||||
for i in 0..ALU_CONSTANT_COUNT {
|
||||
let base = CONST_BASE_ALU + (i as u32) * 4;
|
||||
out.alu[i] = [
|
||||
f32::from_bits(rf.read(base)),
|
||||
f32::from_bits(rf.read(base + 1)),
|
||||
f32::from_bits(rf.read(base + 2)),
|
||||
f32::from_bits(rf.read(base + 3)),
|
||||
];
|
||||
}
|
||||
for i in 0..FETCH_CONSTANT_COUNT {
|
||||
out.fetch[i] = rf.read(CONST_BASE_FETCH + i as u32);
|
||||
}
|
||||
for i in 0..BOOL_CONSTANT_COUNT {
|
||||
out.bool_consts[i] = rf.read(CONST_BASE_BOOL + i as u32);
|
||||
}
|
||||
for i in 0..LOOP_CONSTANT_COUNT {
|
||||
out.loop_consts[i] = rf.read(CONST_BASE_LOOP + i as u32);
|
||||
}
|
||||
out
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Layout sanity: (512·16) + (256·4) + (8·4) + (32·4) =
    /// 8192 + 1024 + 32 + 128 = 9376 bytes. A different value means the
    /// constant counts changed or the compiler inserted padding; either
    /// must be caught here because the WGSL struct layout in
    /// `xenos_interp.wgsl` depends on it.
    #[test]
    fn xenos_constants_block_size_is_stable() {
        assert_eq!(XenosConstantsBlock::SIZE, 9376);
    }

    #[test]
    fn snapshot_roundtrip_from_register_file() {
        let mut rf = RegisterFile::new();
        // Recognisable pattern: alu[0] = (1.0, 2.0, 3.0, 4.0).
        for (reg, value) in (CONST_BASE_ALU..).zip([1.0f32, 2.0, 3.0, 4.0]) {
            rf.write(reg, value.to_bits());
        }
        rf.write(CONST_BASE_FETCH + 5, 0xDEAD_BEEF);
        rf.write(CONST_BASE_BOOL, 0x1234);
        rf.write(CONST_BASE_LOOP + 3, 0x5678);

        let snap = XenosConstantsBlock::snapshot(&rf);
        assert_eq!(snap.alu[0], [1.0, 2.0, 3.0, 4.0]);
        assert_eq!(snap.fetch[5], 0xDEAD_BEEF);
        assert_eq!(snap.bool_consts[0], 0x1234);
        assert_eq!(snap.loop_consts[3], 0x5678);
    }
}
|
||||
@@ -6,5 +6,6 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
xenia-types = { workspace = true }
|
||||
xenia-memory = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
|
||||
@@ -1,9 +1,24 @@
|
||||
//! Human input device system.
|
||||
//!
|
||||
//! Holds the guest-facing `X_INPUT_*` struct layouts (big-endian, matching the
|
||||
//! Xbox 360 ABI exactly) and the host-neutral `GamepadState` snapshot used by
|
||||
//! both the UI (writer) and the kernel's `XamInputGetState` handler (reader).
|
||||
|
||||
use xenia_memory::{GuestMemory, MemoryAccess};
|
||||
|
||||
/// Human input device system stub.
|
||||
pub struct InputSystem {
|
||||
pub gamepad: GamepadState,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Copy)]
|
||||
/// Host-side gamepad snapshot.
|
||||
///
|
||||
/// Kept as POD so it can be dropped into an `AtomicCell<GamepadState>` for
|
||||
/// lock-free UI→CPU state transfer. Layout intentionally matches the in-memory
|
||||
/// subset of `X_INPUT_GAMEPAD` (minus endianness), but the wire serializer in
|
||||
/// [`write_input_state`] handles the big-endian conversion explicitly.
|
||||
#[derive(Default, Clone, Copy, Debug)]
|
||||
#[repr(C)]
|
||||
pub struct GamepadState {
|
||||
pub buttons: u16,
|
||||
pub left_trigger: u8,
|
||||
@@ -14,7 +29,8 @@ pub struct GamepadState {
|
||||
pub right_stick_y: i16,
|
||||
}
|
||||
|
||||
/// Xbox 360 button flags
|
||||
/// Xbox 360 button flags. Values match `X_INPUT_GAMEPAD_BUTTON` in
|
||||
/// `xenia-canary/src/xenia/hid/input.h`.
|
||||
pub mod buttons {
|
||||
pub const DPAD_UP: u16 = 0x0001;
|
||||
pub const DPAD_DOWN: u16 = 0x0002;
|
||||
@@ -26,12 +42,28 @@ pub mod buttons {
|
||||
pub const RIGHT_THUMB: u16 = 0x0080;
|
||||
pub const LEFT_SHOULDER: u16 = 0x0100;
|
||||
pub const RIGHT_SHOULDER: u16 = 0x0200;
|
||||
pub const GUIDE: u16 = 0x0400;
|
||||
pub const A: u16 = 0x1000;
|
||||
pub const B: u16 = 0x2000;
|
||||
pub const X: u16 = 0x4000;
|
||||
pub const Y: u16 = 0x8000;
|
||||
}
|
||||
|
||||
/// Guest-visible error codes returned by the `XamInput*` exports.
pub mod errors {
    /// `ERROR_SUCCESS`
    pub const SUCCESS: u32 = 0;
    /// `ERROR_DEVICE_NOT_CONNECTED`
    pub const DEVICE_NOT_CONNECTED: u32 = 0x48F;
    /// `ERROR_EMPTY` (used by `XamInputGetKeystroke` when no event is queued)
    pub const EMPTY: u32 = 0x10D2;
}
|
||||
|
||||
/// Device sub-types reported through `X_INPUT_CAPABILITIES::SubType`.
pub mod subtype {
    /// Standard gamepad.
    pub const GAMEPAD: u8 = 0x01;
}
|
||||
|
||||
impl InputSystem {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
@@ -45,3 +77,117 @@ impl Default for InputSystem {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Serialize a [`GamepadState`] into the 16-byte big-endian `X_INPUT_STATE`
|
||||
/// struct at `out_ptr` in guest memory.
|
||||
///
|
||||
/// Layout (matches `xenia-canary/src/xenia/hid/input.h`):
|
||||
///
|
||||
/// | Offset | Size | Field | Endianness |
|
||||
/// |--------|------|----------------|------------|
|
||||
/// | 0x00 | 4 | packet_number | BE u32 |
|
||||
/// | 0x04 | 2 | buttons | BE u16 |
|
||||
/// | 0x06 | 1 | left_trigger | u8 |
|
||||
/// | 0x07 | 1 | right_trigger | u8 |
|
||||
/// | 0x08 | 2 | thumb_lx | BE i16 |
|
||||
/// | 0x0A | 2 | thumb_ly | BE i16 |
|
||||
/// | 0x0C | 2 | thumb_rx | BE i16 |
|
||||
/// | 0x0E | 2 | thumb_ry | BE i16 |
|
||||
pub fn write_input_state(
|
||||
mem: &GuestMemory,
|
||||
out_ptr: u32,
|
||||
packet_number: u32,
|
||||
state: &GamepadState,
|
||||
) {
|
||||
if out_ptr == 0 {
|
||||
return;
|
||||
}
|
||||
mem.write_u32(out_ptr, packet_number);
|
||||
mem.write_u16(out_ptr + 0x04, state.buttons);
|
||||
mem.write_u8(out_ptr + 0x06, state.left_trigger);
|
||||
mem.write_u8(out_ptr + 0x07, state.right_trigger);
|
||||
mem.write_u16(out_ptr + 0x08, state.left_stick_x as u16);
|
||||
mem.write_u16(out_ptr + 0x0A, state.left_stick_y as u16);
|
||||
mem.write_u16(out_ptr + 0x0C, state.right_stick_x as u16);
|
||||
mem.write_u16(out_ptr + 0x0E, state.right_stick_y as u16);
|
||||
}
|
||||
|
||||
/// Serialize an `X_INPUT_CAPABILITIES` block for a standard wired controller.
|
||||
///
|
||||
/// Layout (20 bytes, matches `xenia-canary`):
|
||||
///
|
||||
/// | Offset | Size | Field |
|
||||
/// |--------|------|------------------|
|
||||
/// | 0x00 | 1 | Type (gamepad=1) |
|
||||
/// | 0x01 | 1 | SubType |
|
||||
/// | 0x02 | 2 | Flags |
|
||||
/// | 0x04 | 12 | Gamepad state |
|
||||
/// | 0x10 | 2 | Vibration.left |
|
||||
/// | 0x12 | 2 | Vibration.right |
|
||||
pub fn write_input_capabilities(mem: &GuestMemory, out_ptr: u32) {
|
||||
if out_ptr == 0 {
|
||||
return;
|
||||
}
|
||||
// Type = DEVTYPE_GAMEPAD (1), SubType = STANDARD (1), Flags = 0
|
||||
mem.write_u8(out_ptr, 1);
|
||||
mem.write_u8(out_ptr + 0x01, subtype::GAMEPAD);
|
||||
mem.write_u16(out_ptr + 0x02, 0);
|
||||
// Gamepad capabilities: buttons = all standard bits, triggers+sticks = full range.
|
||||
// Games typically AND the gamepad mask to decide which controls exist; advertising
|
||||
// everything is the safe default.
|
||||
mem.write_u16(out_ptr + 0x04, 0xF3FF); // buttons: all except GUIDE
|
||||
mem.write_u8(out_ptr + 0x06, 0xFF); // left_trigger range
|
||||
mem.write_u8(out_ptr + 0x07, 0xFF); // right_trigger range
|
||||
mem.write_u16(out_ptr + 0x08, 0xFFFFu16); // lx
|
||||
mem.write_u16(out_ptr + 0x0A, 0xFFFFu16); // ly
|
||||
mem.write_u16(out_ptr + 0x0C, 0xFFFFu16); // rx
|
||||
mem.write_u16(out_ptr + 0x0E, 0xFFFFu16); // ry
|
||||
// Vibration: both motors full range
|
||||
mem.write_u16(out_ptr + 0x10, 0xFFFFu16);
|
||||
mem.write_u16(out_ptr + 0x12, 0xFFFFu16);
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use xenia_memory::page_table::MemoryProtect;

    #[test]
    fn write_input_state_layout_is_big_endian() {
        let mut mem = GuestMemory::new().unwrap();
        mem.alloc(
            0x4000_0000,
            0x1000,
            MemoryProtect::READ | MemoryProtect::WRITE,
        )
        .unwrap();

        let state = GamepadState {
            buttons: buttons::A | buttons::START, // 0x1010
            left_trigger: 0x80,
            right_trigger: 0x40,
            left_stick_x: 0x0102,
            left_stick_y: -1,
            right_stick_x: 0x0304,
            right_stick_y: 0x0506,
        };
        write_input_state(&mem, 0x4000_0000, 0xDEAD_BEEF, &state);

        // (offset, expected byte) for all 16 bytes of the struct.
        let expected = [
            // packet_number BE
            (0x0, 0xDE),
            (0x1, 0xAD),
            (0x2, 0xBE),
            (0x3, 0xEF),
            // buttons BE (0x1010 → high byte 0x10 first)
            (0x4, 0x10),
            (0x5, 0x10),
            // triggers
            (0x6, 0x80),
            (0x7, 0x40),
            // thumb_lx BE
            (0x8, 0x01),
            (0x9, 0x02),
            // thumb_ly = -1 → 0xFFFF
            (0xA, 0xFF),
            (0xB, 0xFF),
            // thumb_rx BE
            (0xC, 0x03),
            (0xD, 0x04),
            // thumb_ry BE
            (0xE, 0x05),
            (0xF, 0x06),
        ];
        for (offset, byte) in expected {
            assert_eq!(mem.read_u8(0x4000_0000 + offset), byte);
        }
    }
}
|
||||
|
||||
@@ -8,6 +8,10 @@ license.workspace = true
|
||||
xenia-types = { workspace = true }
|
||||
xenia-memory = { workspace = true }
|
||||
xenia-cpu = { workspace = true }
|
||||
xenia-vfs = { workspace = true }
|
||||
xenia-hid = { workspace = true }
|
||||
xenia-gpu = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
metrics = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
|
||||
357
crates/xenia-kernel/src/audit.rs
Normal file
357
crates/xenia-kernel/src/audit.rs
Normal file
@@ -0,0 +1,357 @@
|
||||
//! Per-handle audit trail for diagnosing HLE sync gaps.
|
||||
//!
|
||||
//! When enabled (via `--trace-handles` / `XENIA_TRACE_HANDLES=1`), the kernel
|
||||
//! records every handle's create/signal/wait/wake events into a bounded
|
||||
//! ring per handle. `dump_thread_diagnostic` (in `xenia-app`) prints the
|
||||
//! trail at end-of-run, which lets a session see *who* signaled (or failed
|
||||
//! to signal) a given handle and *who* parked on it.
|
||||
//!
|
||||
//! The harness is behavior-neutral: when `enabled = false` (the default),
|
||||
//! every record method is an `#[inline]` no-op. When enabled, each record
|
||||
//! costs an O(1) HashMap probe + a `VecDeque::push_back` with a bounded
|
||||
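//! `pop_front`, which caps each of a handle's three rings at
//! `AUDIT_RING_CAPACITY` entries (creation-time stacks and class probes are
//! stored separately and are not capped).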
|
||||
//!
|
||||
//! See [project_xenia_rs_scheduler.md] note on the latent
|
||||
//! `scheduler.deadlock_recoveries` event during boot — this harness exists
|
||||
//! to identify which kernel API should signal handles
|
||||
//! `0x10FC / 0x1014 / 0x1104 / 0x10DC / 0x10F0` but doesn't.
|
||||
|
||||
use std::collections::{HashMap, HashSet, VecDeque};
|
||||
|
||||
/// Maximum number of events retained per ring (signals, waits, wakes) for a
/// single handle. Bounded so a long-running session doesn't OOM if a handle
/// is signaled millions of times: once a ring is full, the oldest entry is
/// evicted to make room for the newest.
pub const AUDIT_RING_CAPACITY: usize = 32;
|
||||
|
||||
/// One audit record. Captured at the export's call site so `lr` points at
/// the guest caller (one instruction past the `bl` to the kernel thunk).
///
/// Entries are plain `Copy` data and comparable, so rings can be
/// snapshotted and asserted on cheaply.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct HandleAuditEntry {
    /// Per-thread timebase tick at the time of the event. Useful for
    /// ordering events across threads — same units as
    /// `Scheduler::ctx(0).timebase`.
    pub cycle: u64,
    /// Guest thread id (NOT hw_id — `tid` survives migration).
    pub tid: u32,
    /// Caller's LR (the guest pc one past the `bl` to the export).
    pub lr: u32,
    /// Stable, kernel-internal label naming the source export, e.g.
    /// "KeSetEvent", "NtSetEvent", "wake_eligible_waiters".
    pub source: &'static str,
    /// Free-form auxiliary data, interpreted by whoever reads the trail.
    /// For signals: previous_state. For waits:
    /// `(alertable, timeout_ns_or_max)` packed. For wakes: `gpr[3]` set.
    pub aux: u64,
}
|
||||
|
||||
/// Per-handle audit trail. Lives in `KernelState::audit.trails`.
|
||||
#[derive(Debug)]
|
||||
pub struct HandleAuditTrail {
|
||||
/// Stable label: "Event/Manual", "Event/Auto", "Semaphore", "Timer/Manual",
|
||||
/// "Timer/Auto", "Mutant", "Thread". Used for filtering in the dump.
|
||||
pub kind: &'static str,
|
||||
/// When/who/where the handle was minted.
|
||||
pub created: HandleAuditEntry,
|
||||
/// KRNBUG-AUDIT-002 producer-trace. Captured frames at allocation
|
||||
/// time, only populated when the handle is in `HandleAudit::focus`
|
||||
/// AND the create site routed through the `_with_stack` variant.
|
||||
/// Frame layout: `(frame_pointer, saved_lr_for_caller_of_that_frame)`.
|
||||
/// Index 0 is the live frame: `(ctx.gpr[1], ctx.lr)`. Index 1+ comes
|
||||
/// from walking the PPC back-chain. An empty vec means either the
|
||||
/// handle wasn't in focus or the create site didn't capture a stack.
|
||||
pub created_stack: Vec<(u32, u32)>,
|
||||
/// KRNBUG-AUDIT-003 class probes. Each entry is one already-formatted
|
||||
/// "frame=N r31=0x... vtable=0x... class=..." line, captured at
|
||||
/// allocation time from the live PPC context (frame 0: ctx.gpr[31] /
|
||||
/// r30 / r3) and the standard prologue spill area at `[fp - 12]` /
|
||||
/// `[fp - 16]` for deeper frames. Pre-formatted because the source
|
||||
/// memory is overwritten once tid=1 leaves the static-init phase, so
|
||||
/// the probe must run at the create call site, not at end-of-run.
|
||||
pub created_class_probes: Vec<String>,
|
||||
/// Bounded ring of signal events.
|
||||
pub signals: VecDeque<HandleAuditEntry>,
|
||||
/// Bounded ring of wait-entry events (one per `Wait*` call).
|
||||
pub waits: VecDeque<HandleAuditEntry>,
|
||||
/// Bounded ring of wake events (one per scheduler-side wake).
|
||||
pub wakes: VecDeque<HandleAuditEntry>,
|
||||
}
|
||||
|
||||
impl HandleAuditTrail {
|
||||
fn new(kind: &'static str, created: HandleAuditEntry) -> Self {
|
||||
Self {
|
||||
kind,
|
||||
created,
|
||||
created_stack: Vec::new(),
|
||||
created_class_probes: Vec::new(),
|
||||
signals: VecDeque::with_capacity(AUDIT_RING_CAPACITY),
|
||||
waits: VecDeque::with_capacity(AUDIT_RING_CAPACITY),
|
||||
wakes: VecDeque::with_capacity(AUDIT_RING_CAPACITY),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The audit table itself. Lives on `KernelState`; opt-in via `enabled`.
|
||||
///
|
||||
/// `focus` + `ghost_trails` form the **parked-waiter diagnostic** added for
|
||||
/// audit-2026-05-fix Phase 2 (KRNBUG-AUDIT-001). When `focus` is non-empty,
|
||||
/// `record_signal_attempt` keeps a "ghost trail" for handles in the focus
|
||||
/// set even if no `record_create` ever observed them — i.e. the guest hand-
|
||||
/// initialized a `KEVENT` (via `KeInitializeEvent` or a raw write) and the
|
||||
/// existing `record_signal` would silently drop the attempt. Ghost trails
|
||||
/// are the only way to distinguish "guest never called Nt/KeSetEvent on
|
||||
/// this handle" from "signal landed but waiter wasn't woken".
|
||||
#[derive(Debug, Default)]
|
||||
pub struct HandleAudit {
|
||||
pub trails: HashMap<u32, HandleAuditTrail>,
|
||||
pub enabled: bool,
|
||||
/// Focus set: when non-empty, signals targeting handles in this set are
|
||||
/// captured even when no `record_create` exists. Populated from
|
||||
/// `--trace-handles=0x1004,0x100c,...`. Empty = whole-table audit.
|
||||
pub focus: HashSet<u32>,
|
||||
/// Ghost trails for never-created handles whose signals we still want
|
||||
/// to see. Keyed by handle. Only populated for handles in `focus`.
|
||||
pub ghost_trails: HashMap<u32, GhostTrail>,
|
||||
}
|
||||
|
||||
/// A ghost trail is a signal-only timeline for a handle that was never
|
||||
/// `record_create`d. We don't have a `kind` because we never saw a creation;
|
||||
/// callers rendering the report should label these as `<UNCREATED>`.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct GhostTrail {
|
||||
pub signals: VecDeque<HandleAuditEntry>,
|
||||
}
|
||||
|
||||
impl HandleAudit {
|
||||
/// Push an entry into a bounded ring, dropping the oldest when full.
|
||||
#[inline]
|
||||
fn push_bounded(ring: &mut VecDeque<HandleAuditEntry>, entry: HandleAuditEntry) {
|
||||
if ring.len() == AUDIT_RING_CAPACITY {
|
||||
ring.pop_front();
|
||||
}
|
||||
ring.push_back(entry);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn record_create(&mut self, handle: u32, kind: &'static str, entry: HandleAuditEntry) {
|
||||
if !self.enabled {
|
||||
return;
|
||||
}
|
||||
self.trails
|
||||
.insert(handle, HandleAuditTrail::new(kind, entry));
|
||||
}
|
||||
|
||||
/// Same as `record_create`, but additionally stores a captured guest
|
||||
/// stack trace on the trail (`created_stack`). Intended for handles
|
||||
/// in `focus` so the dump can name the actual subsystem caller of the
|
||||
/// kernel API rather than just the immediate wrapper return.
|
||||
#[inline]
|
||||
pub fn record_create_with_stack(
|
||||
&mut self,
|
||||
handle: u32,
|
||||
kind: &'static str,
|
||||
entry: HandleAuditEntry,
|
||||
stack: Vec<(u32, u32)>,
|
||||
) {
|
||||
self.record_create_with_stack_and_probes(handle, kind, entry, stack, Vec::new());
|
||||
}
|
||||
|
||||
/// Variant of `record_create_with_stack` that also accepts pre-
|
||||
/// formatted class-probe strings (KRNBUG-AUDIT-003). Each string is
|
||||
/// one frame's RTTI/vtable readout: `frame=N candidate=r31 this=0x...
|
||||
/// vtable=0x... class=...` or the RTTI-stripped fallback. Caller
|
||||
/// formats them so this module remains memory-layout-agnostic.
|
||||
#[inline]
|
||||
pub fn record_create_with_stack_and_probes(
|
||||
&mut self,
|
||||
handle: u32,
|
||||
kind: &'static str,
|
||||
entry: HandleAuditEntry,
|
||||
stack: Vec<(u32, u32)>,
|
||||
class_probes: Vec<String>,
|
||||
) {
|
||||
if !self.enabled {
|
||||
return;
|
||||
}
|
||||
let mut trail = HandleAuditTrail::new(kind, entry);
|
||||
trail.created_stack = stack;
|
||||
trail.created_class_probes = class_probes;
|
||||
self.trails.insert(handle, trail);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn record_signal(&mut self, handle: u32, entry: HandleAuditEntry) {
|
||||
if !self.enabled {
|
||||
return;
|
||||
}
|
||||
if let Some(trail) = self.trails.get_mut(&handle) {
|
||||
Self::push_bounded(&mut trail.signals, entry);
|
||||
return;
|
||||
}
|
||||
// No primary trail. Fall through to ghost-trail logic so signals
|
||||
// targeting focus-set handles are not silently dropped.
|
||||
self.record_signal_attempt_ghost(handle, entry);
|
||||
}
|
||||
|
||||
/// Record a signal attempt that targeted a focus-set handle but had no
|
||||
/// primary trail (i.e. the handle was never `record_create`d via one
|
||||
/// of our audit hook sites). Inserts into `ghost_trails`. Bounded by
|
||||
/// `AUDIT_RING_CAPACITY` per handle. No-op when `enabled = false` or
|
||||
/// `handle` is not in `focus`.
|
||||
///
|
||||
/// Public for direct invocation from internal kernel signal sites that
|
||||
/// don't go through `record_signal` (e.g. `signal_io_completion_event`,
|
||||
/// IRQ-callback paths) — those callers should both `record_signal`
|
||||
/// (for the primary-trail case) AND fall through here.
|
||||
#[inline]
|
||||
pub fn record_signal_attempt_ghost(&mut self, handle: u32, entry: HandleAuditEntry) {
|
||||
if !self.enabled || !self.focus.contains(&handle) {
|
||||
return;
|
||||
}
|
||||
let ghost = self.ghost_trails.entry(handle).or_default();
|
||||
Self::push_bounded(&mut ghost.signals, entry);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn record_wait(&mut self, handle: u32, entry: HandleAuditEntry) {
|
||||
if !self.enabled {
|
||||
return;
|
||||
}
|
||||
if let Some(trail) = self.trails.get_mut(&handle) {
|
||||
Self::push_bounded(&mut trail.waits, entry);
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn record_wake(&mut self, handle: u32, entry: HandleAuditEntry) {
|
||||
if !self.enabled {
|
||||
return;
|
||||
}
|
||||
if let Some(trail) = self.trails.get_mut(&handle) {
|
||||
Self::push_bounded(&mut trail.wakes, entry);
|
||||
}
|
||||
}
|
||||
|
||||
/// Convenience: `(signal_count, wait_count, wake_count)` for a handle.
|
||||
/// Returns `None` if no trail exists.
|
||||
pub fn counts(&self, handle: u32) -> Option<(usize, usize, usize)> {
|
||||
self.trails
|
||||
.get(&handle)
|
||||
.map(|t| (t.signals.len(), t.waits.len(), t.wakes.len()))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an entry with fixed tid/lr so tests only vary cycle and source.
    fn entry(cycle: u64, source: &'static str) -> HandleAuditEntry {
        HandleAuditEntry { cycle, tid: 1, lr: 0x8200_0000, source, aux: 0 }
    }

    /// An enabled audit table with an empty focus set.
    fn enabled_audit() -> HandleAudit {
        HandleAudit { enabled: true, ..HandleAudit::default() }
    }

    #[test]
    fn disabled_audit_is_a_noop() {
        let mut a = HandleAudit::default();
        a.record_create(0x1000, "Event/Auto", entry(0, "NtCreateEvent"));
        a.record_signal(0x1000, entry(1, "NtSetEvent"));
        assert!(a.trails.is_empty());
    }

    #[test]
    fn enabled_records_create_and_events() {
        let mut a = enabled_audit();
        a.record_create(0x1014, "Event/Auto", entry(0, "NtCreateEvent"));
        a.record_signal(0x1014, entry(10, "NtSetEvent"));
        a.record_wait(0x1014, entry(5, "NtWaitForSingleObjectEx"));
        a.record_wake(0x1014, entry(11, "wake_eligible_waiters"));

        let counts = a.counts(0x1014).unwrap();
        assert_eq!(counts, (1, 1, 1));
    }

    #[test]
    fn signal_for_unknown_handle_is_dropped() {
        let mut a = enabled_audit();
        // No `record_create` first → handle has no trail. Without focus,
        // the signal is silently dropped (legacy behavior).
        a.record_signal(0x9999, entry(1, "NtSetEvent"));
        assert!(a.trails.is_empty());
        assert!(a.ghost_trails.is_empty());
    }

    #[test]
    fn wait_and_wake_for_unknown_handle_are_dropped() {
        let mut a = enabled_audit();
        a.focus.insert(0x1004);
        // Ghost trails only track signals: waits/wakes without a primary
        // trail vanish even for focus-set handles.
        a.record_wait(0x1004, entry(1, "NtWaitForSingleObjectEx"));
        a.record_wake(0x1004, entry(2, "wake_eligible_waiters"));
        assert!(a.trails.is_empty());
        assert!(a.ghost_trails.is_empty());
        assert!(a.counts(0x1004).is_none());
    }

    #[test]
    fn signal_for_focus_handle_lands_in_ghost_trail() {
        let mut a = enabled_audit();
        a.focus.insert(0x1004);
        // No `record_create` for 0x1004 — but it's in the focus set.
        a.record_signal(0x1004, entry(1, "NtSetEvent"));
        a.record_signal(0x1004, entry(2, "KeSetEvent"));
        // 0x9999 NOT in focus → still dropped.
        a.record_signal(0x9999, entry(3, "NtSetEvent"));

        assert!(a.trails.is_empty());
        let ghost = a.ghost_trails.get(&0x1004).expect("ghost trail expected");
        assert_eq!(ghost.signals.len(), 2);
        assert!(!a.ghost_trails.contains_key(&0x9999));
    }

    #[test]
    fn ghost_trail_does_not_double_record_when_primary_exists() {
        let mut a = enabled_audit();
        a.focus.insert(0x1004);
        a.record_create(0x1004, "Event/Manual", entry(0, "NtCreateEvent"));
        a.record_signal(0x1004, entry(1, "NtSetEvent"));

        assert_eq!(a.trails[&0x1004].signals.len(), 1);
        assert!(a.ghost_trails.is_empty());
    }

    #[test]
    fn ring_is_bounded_to_capacity() {
        let mut a = enabled_audit();
        a.record_create(0x10FC, "Event/Auto", entry(0, "NtCreateEvent"));
        for i in 0..(AUDIT_RING_CAPACITY * 3) as u64 {
            a.record_signal(0x10FC, entry(i, "NtSetEvent"));
        }
        let trail = &a.trails[&0x10FC];
        assert_eq!(trail.signals.len(), AUDIT_RING_CAPACITY);
        // Oldest should have been dropped — the first remaining entry is at
        // cycle = 2 * AUDIT_RING_CAPACITY (i.e. 64 if capacity = 32).
        let first = trail.signals.front().unwrap();
        assert_eq!(first.cycle, (AUDIT_RING_CAPACITY * 2) as u64);
    }

    #[test]
    fn ghost_ring_is_bounded_to_capacity() {
        let mut a = enabled_audit();
        a.focus.insert(0x1004);
        for i in 0..(AUDIT_RING_CAPACITY * 3) as u64 {
            a.record_signal(0x1004, entry(i, "NtSetEvent"));
        }
        let ghost = &a.ghost_trails[&0x1004];
        assert_eq!(ghost.signals.len(), AUDIT_RING_CAPACITY);
        assert_eq!(ghost.signals.front().unwrap().cycle, (AUDIT_RING_CAPACITY * 2) as u64);
    }

    #[test]
    fn unknown_handle_counts_returns_none() {
        let a = HandleAudit::default();
        assert!(a.counts(0x10FC).is_none());
    }

    #[test]
    fn create_with_stack_stores_frames() {
        let mut a = enabled_audit();
        let frames = vec![
            (0x7000_0100, 0x824a_9f6c),
            (0x7000_0200, 0x824a_b020),
            (0x7000_0300, 0x82bb_aa00),
        ];
        a.record_create_with_stack(
            0x1004,
            "Event/Manual",
            entry(0, "NtCreateEvent"),
            frames.clone(),
        );
        let trail = &a.trails[&0x1004];
        assert_eq!(trail.created_stack, frames);
        assert!(trail.created_class_probes.is_empty());
    }

    #[test]
    fn create_with_stack_and_probes_stores_probes() {
        let mut a = enabled_audit();
        let probes = vec![String::from("frame=0 probe"), String::from("frame=1 probe")];
        a.record_create_with_stack_and_probes(
            0x1004,
            "Event/Manual",
            entry(0, "NtCreateEvent"),
            vec![(0x7000_0100, 0x824a_9f6c)],
            probes.clone(),
        );
        let trail = &a.trails[&0x1004];
        assert_eq!(trail.created_class_probes, probes);
        assert_eq!(trail.created_stack.len(), 1);
    }

    #[test]
    fn create_with_stack_disabled_is_noop() {
        let mut a = HandleAudit::default();
        a.record_create_with_stack(
            0x1004,
            "Event/Manual",
            entry(0, "NtCreateEvent"),
            vec![(0x7000_0000, 0x8200_0000)],
        );
        assert!(a.trails.is_empty());
    }
}
|
||||
File diff suppressed because it is too large
Load Diff
516
crates/xenia-kernel/src/interrupts.rs
Normal file
516
crates/xenia-kernel/src/interrupts.rs
Normal file
@@ -0,0 +1,516 @@
|
||||
//! Graphics interrupt + synthetic v-sync bookkeeping (P6).
|
||||
//!
|
||||
//! The Xbox 360 graphics driver calls `VdSetGraphicsInterruptCallback` to
|
||||
//! register a single per-process callback that the OS invokes on:
|
||||
//!
|
||||
//! 1. **V-sync** — at 60 Hz; source code 0 (`INTERRUPT_SOURCE_VSYNC`).
|
||||
//! 2. **Command-processor interrupt** — when `PM4_INTERRUPT` fires from the
|
||||
//! guest-issued command stream; source code 1 (`INTERRUPT_SOURCE_CP`).
|
||||
//!
|
||||
//! Canary's [xboxkrnl_video.cc:303-310](xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc#L303-L310)
|
||||
//! dispatches the callback on HW thread 0. We follow the same convention.
|
||||
//!
|
||||
//! The delivery model is cooperative: we inject the callback entry into HW
|
||||
//! thread 0 at the top of a scheduler round when it's safe (not mid-export,
|
||||
//! not already inside another interrupt). When the callback returns to
|
||||
//! [`LR_HALT_SENTINEL`] the main loop restores the saved [`PpcContext`]
|
||||
//! fields and the HW thread picks up where it left off.
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use xenia_cpu::context::{CrField, PpcContext};
|
||||
use xenia_cpu::ThreadRef;
|
||||
|
||||
/// Interrupt source code for v-sync (the 60 Hz tick).
pub const INTERRUPT_SOURCE_VSYNC: u32 = 0;
/// Interrupt source code for a command-processor interrupt (`PM4_INTERRUPT`
/// fired from the guest-issued command stream).
pub const INTERRUPT_SOURCE_CP: u32 = 1;
|
||||
|
||||
/// Guest-registered V-sync / graphics-interrupt callback (from
/// `VdSetGraphicsInterruptCallback`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct GraphicsInterruptCallback {
    /// Guest address the interrupted thread's `pc` is diverted to when the
    /// callback is injected.
    pub callback_pc: u32,
    /// Opaque value supplied at registration and handed back to the
    /// callback (the inject simulation in this file's tests passes it in
    /// `r4`, with the interrupt source in `r3`).
    pub user_data: u32,
}
|
||||
|
||||
/// Snapshot of the fields we mutate when diverting a HW thread into an
|
||||
/// interrupt callback. Restored when the callback returns to
|
||||
/// `LR_HALT_SENTINEL`.
|
||||
///
|
||||
/// We save **all PPC volatile registers** (r0, r2–r12) plus `r1` (SP),
|
||||
/// `pc`, `lr`, `ctr`, and `cr`. Non-volatile regs (r13–r31) are preserved
|
||||
/// by the callback's own `__savegprlr_N` prologue/epilogue per the PPC
|
||||
/// ELF ABI, so they don't need stashing here.
|
||||
///
|
||||
/// **SP (`gpr[1]`) is included because the injector decrements it by
|
||||
/// [`CALLBACK_STACK_PAD`] before the callback runs** — see that constant's
|
||||
/// docs for why. Without this, the callback's `__savegprlr_N` prologue
|
||||
/// overwrites the interrupted function's own stack-saved LR (which lives
|
||||
/// at `[r1 - 8]`), and when the interrupted function later tries to
|
||||
/// return, `bclr` jumps to `LR_HALT_SENTINEL` and the thread exits
|
||||
/// prematurely.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct SavedCallbackCtx {
|
||||
pub pc: u32,
|
||||
pub lr: u64,
|
||||
pub ctr: u64,
|
||||
/// All PPC volatile GPRs (r0, r2–r12) plus r1 (SP) in index order.
|
||||
/// Index 0 = r0, 1 = r1, 2 = r2, …, 12 = r12. Index 13..32 unused.
|
||||
pub gprs: [u64; 13],
|
||||
pub cr: [CrField; 8],
|
||||
pub source: u32,
|
||||
}
|
||||
|
||||
/// Bytes the injector reserves below the interrupted thread's SP before
/// running the ISR callback. Matches Canary's
/// [`Processor::Execute`](../../../../xenia-canary/src/xenia/cpu/processor.cc#L383)
/// which decrements `r[1]` by `64 + 112 = 176` before
/// `function->Call(...)` and restores afterwards. The pad must be larger
/// than any plausible sum of `__savegprlr_N`'s save-area (up to 64 B for
/// r25-r31 + 8 B for LR) plus the callback's own `stwu r1,-N(r1)` frame
/// (the Sylpheed vsync ISR uses 128 B).
///
/// NOTE(review): taken literally, the figures above add up to
/// 64 + 8 + 128 = 200 B, which exceeds this 176 B pad. Presumably the
/// ISR's 128 B `stwu` frame already contains its register save area, so
/// the two are not additive — confirm against the ISR prologue.
///
/// Pre-fix: the ISR's `__savegprlr_25` stored the callback's saved LR
/// (= `LR_HALT_SENTINEL`, from injection) at `[r1 - 8]` — exactly where
/// the interrupted thread's current `bl`-saved LR lived. The
/// interrupted function's return site got stomped with `SENTINEL`, so
/// `__restgprlr_N -> bclr` jumped to the halt sentinel and the thread
/// exited through the wrong path. Manifested in Sylpheed as tid=5
/// (producer for the render queue) terminating at cycle 7.5M, starving
/// both `0x10fc` (main's completion wait) and the PKEVENT that tid=6
/// polls — no second `VdSwap`, no first pixel.
pub const CALLBACK_STACK_PAD: u32 = 64 + 112;
|
||||
|
||||
impl SavedCallbackCtx {
|
||||
pub fn capture(ctx: &PpcContext, source: u32) -> Self {
|
||||
let mut gprs = [0u64; 13];
|
||||
for i in 0..13 {
|
||||
gprs[i] = ctx.gpr[i];
|
||||
}
|
||||
Self {
|
||||
pc: ctx.pc,
|
||||
lr: ctx.lr,
|
||||
ctr: ctx.ctr,
|
||||
gprs,
|
||||
cr: ctx.cr,
|
||||
source,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn restore(self, ctx: &mut PpcContext) {
|
||||
ctx.pc = self.pc;
|
||||
ctx.lr = self.lr;
|
||||
ctx.ctr = self.ctr;
|
||||
for i in 0..13 {
|
||||
ctx.gpr[i] = self.gprs[i];
|
||||
}
|
||||
ctx.cr = self.cr;
|
||||
}
|
||||
}
|
||||
|
||||
/// Maximum pending sources held in the FIFO queue before new ones are
/// dropped. Four is enough to absorb a short burst (a few v-syncs arriving
/// while a callback from a prior one is still running) without letting
/// runaway delivery swamp the guest.
pub const INTERRUPT_QUEUE_CAP: usize = 4;
|
||||
|
||||
/// All interrupt bookkeeping — single field on `KernelState`.
|
||||
///
|
||||
/// **First-Pixels M2 (2026-04-20)** — changed from a single-slot
|
||||
/// `pending_source: Option<u32>` coalesce to a bounded FIFO so bursts
|
||||
/// don't drop silently, and dropped `VSYNC_INSTR_PERIOD` from 500k to
|
||||
/// 150k so cadence approximates 60 Hz at the current ~10 MIPS interpreter
|
||||
/// throughput. Combined with the `HwState::ServicingIrq` variant added to
|
||||
/// `xenia-cpu::scheduler`, interrupts can now be delivered even when HW 0
|
||||
/// is `Blocked(WaitAny)` — the injector stashes the block into the new
|
||||
/// variant and the restore path re-blocks when the callback returns,
|
||||
/// unless a `wake()` during the callback resolved the wait.
|
||||
/// M2.5 — per-slot pending-IRQ bitmask. Each `AtomicU8` holds one bit per
|
||||
/// interrupt source (currently 2 sources: VSYNC=bit 0, CP=bit 1) destined
|
||||
/// for that specific HW slot. Used by the M3 parallel path: T_main (or
|
||||
/// the GPU thread) sets a bit Release on the target slot's atomic; the
|
||||
/// target T_cpu_i checks the bit Acquire at its quantum boundary and
|
||||
/// self-injects without taking another thread's slot lock.
|
||||
///
|
||||
/// The 6-element fixed-size array mirrors `xenia_cpu::scheduler::HW_THREAD_COUNT`.
|
||||
pub type PendingLocalIrq = [std::sync::atomic::AtomicU8;
|
||||
xenia_cpu::scheduler::HW_THREAD_COUNT];
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct InterruptState {
|
||||
/// Registered callback (set by `VdSetGraphicsInterruptCallback`).
|
||||
pub callback: Option<GraphicsInterruptCallback>,
|
||||
/// Bounded FIFO of pending interrupt sources awaiting injection.
|
||||
/// Push-back on queue, pop-front on inject. Over-cap pushes drop.
|
||||
pub pending: VecDeque<u32>,
|
||||
/// When `Some`, some HW thread is currently running a callback; on
|
||||
/// return-to-sentinel we restore this and clear the flag.
|
||||
pub saved: Option<SavedCallbackCtx>,
|
||||
/// Which guest thread the current callback was injected into.
|
||||
/// Required because we no longer anchor delivery to HW 0 — any
|
||||
/// non-Exited thread is a valid target. Meaningful only while
|
||||
/// `saved.is_some()`. Stored as a `ThreadRef` so per-slot
|
||||
/// runqueues don't get ambiguous addressing.
|
||||
pub injected_ref: Option<ThreadRef>,
|
||||
/// Monotonic count of delivered interrupts.
|
||||
pub delivered: u64,
|
||||
/// Dropped interrupts (callback unset, queue full, or thread
|
||||
/// exited/idle at inject time).
|
||||
pub dropped: u64,
|
||||
/// Instruction-count accumulator for the synthetic v-sync ticker
|
||||
/// (legacy path used by unit tests via `tick_vsync_instr`). Production
|
||||
/// uses `tick_vsync_wallclock` instead — see [`KRNBUG-D08`].
|
||||
pub vsync_accumulator: u64,
|
||||
/// Last observed instruction count for the legacy instruction-count
|
||||
/// ticker. `tick_vsync_instr` diffs against this to advance
|
||||
/// `vsync_accumulator`.
|
||||
pub last_instr_count: u64,
|
||||
/// Wall-clock anchor for the production v-sync ticker. `None` until
|
||||
/// the first `tick_vsync_wallclock` call (lazy init so unit tests
|
||||
/// that never invoke that function don't construct an Instant).
|
||||
/// Each call fires `(elapsed / VSYNC_PERIOD)` v-syncs and advances
|
||||
/// the anchor by that many full periods.
|
||||
pub last_vsync_instant: Option<Instant>,
|
||||
/// M2.5 — per-slot pending-IRQ bits. Set by the producer (M3's
|
||||
/// IRQ-routing logic on `T_main`) with `Release`; consumed by the
|
||||
/// target T_cpu_i with `Acquire` at quantum boundary. Unused under
|
||||
/// the lockstep path (M2's single-host-thread model still uses
|
||||
/// `pending` + `try_inject_graphics_interrupt`); the field is wired
|
||||
/// here so M3's per-HW-thread path is a flag flip, not a refactor.
|
||||
pub pending_local_irq: PendingLocalIrq,
|
||||
}
|
||||
|
||||
/// How many guest instructions correspond to one synthetic v-sync.
///
/// **Legacy** — drives `tick_vsync_instr` only. Production uses
/// `tick_vsync_wallclock` with [`VSYNC_PERIOD`], because audit M11
/// observed this instruction-count proxy drift from 629 v-syncs/100M in
/// lockstep down to ~2 under `--parallel`, where the dispatcher executes
/// more PPC instructions per tick call. The constant is kept because unit
/// tests still drive the instruction-count ticker for determinism.
pub const VSYNC_INSTR_PERIOD: u64 = 150_000;
|
||||
|
||||
/// Wall-clock period for the **production** v-sync ticker: 16 666 667 ns,
/// i.e. 1/60 s rounded to the nearest nanosecond (≈60 Hz). KRNBUG-D08 —
/// converting from the instruction-count proxy fixes the `--parallel`
/// rate drop while keeping lockstep cadence stable (instruction-count was
/// *also* an approximation; wall-clock is the canonical Xbox 360 v-sync
/// source).
pub const VSYNC_PERIOD: Duration = Duration::from_nanos(16_666_667);
|
||||
|
||||
impl InterruptState {
|
||||
/// Record a new callback registration.
|
||||
pub fn set_callback(&mut self, callback_pc: u32, user_data: u32) {
|
||||
self.callback = Some(GraphicsInterruptCallback {
|
||||
callback_pc,
|
||||
user_data,
|
||||
});
|
||||
}
|
||||
|
||||
/// Queue an interrupt for the next safe injection point.
|
||||
pub fn queue_interrupt(&mut self, source: u32) {
|
||||
if self.callback.is_none() {
|
||||
self.dropped += 1;
|
||||
return;
|
||||
}
|
||||
if self.pending.len() >= INTERRUPT_QUEUE_CAP {
|
||||
self.dropped += 1;
|
||||
return;
|
||||
}
|
||||
self.pending.push_back(source);
|
||||
}
|
||||
|
||||
/// Peek at the next pending source without removing it.
|
||||
pub fn peek_next(&self) -> Option<u32> {
|
||||
self.pending.front().copied()
|
||||
}
|
||||
|
||||
/// Pop the next pending source (called by the injector after it has
|
||||
/// committed to dispatching it).
|
||||
pub fn take_next(&mut self) -> Option<u32> {
|
||||
self.pending.pop_front()
|
||||
}
|
||||
|
||||
/// **Legacy** — instruction-count v-sync ticker. Kept for unit tests
|
||||
/// that need a deterministic clock source. Production code calls
|
||||
/// `tick_vsync_wallclock` instead. Returns `true` if at least one
|
||||
/// v-sync was queued.
|
||||
pub fn tick_vsync_instr(&mut self, current_instr_count: u64) -> bool {
|
||||
let delta = current_instr_count.saturating_sub(self.last_instr_count);
|
||||
self.last_instr_count = current_instr_count;
|
||||
self.vsync_accumulator = self.vsync_accumulator.saturating_add(delta);
|
||||
if self.vsync_accumulator < VSYNC_INSTR_PERIOD {
|
||||
return false;
|
||||
}
|
||||
let periods = self.vsync_accumulator / VSYNC_INSTR_PERIOD;
|
||||
self.vsync_accumulator %= VSYNC_INSTR_PERIOD;
|
||||
for _ in 0..periods {
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// **Production** — wall-clock v-sync ticker. Fires
|
||||
/// `floor(elapsed / VSYNC_PERIOD)` v-syncs since the last call and
|
||||
/// advances the anchor by that many full periods (so a long pause
|
||||
/// doesn't lose all the v-syncs it spans, and a quick succession of
|
||||
/// calls doesn't over-fire). KRNBUG-D08 — replaces the legacy
|
||||
/// instruction-count proxy that drifted under `--parallel`.
|
||||
/// Returns `true` if at least one v-sync was queued.
|
||||
pub fn tick_vsync_wallclock(&mut self) -> bool {
|
||||
let now = Instant::now();
|
||||
let anchor = match self.last_vsync_instant {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
self.last_vsync_instant = Some(now);
|
||||
return false;
|
||||
}
|
||||
};
|
||||
let elapsed = now.saturating_duration_since(anchor);
|
||||
let period_ns = VSYNC_PERIOD.as_nanos() as u64;
|
||||
let elapsed_ns = elapsed.as_nanos() as u64;
|
||||
let periods = elapsed_ns / period_ns;
|
||||
if periods == 0 {
|
||||
return false;
|
||||
}
|
||||
// Advance the anchor by the number of full periods consumed,
|
||||
// not to `now`. That lets a long pause distribute its missed
|
||||
// v-syncs evenly without lazy-batching the entire backlog into
|
||||
// one tick (over-fire would interleave dozens of callback
|
||||
// injections back-to-back). Cap at INTERRUPT_QUEUE_CAP so a
|
||||
// clock that jumped forward (system suspend) doesn't try to
|
||||
// queue more than the FIFO can hold.
|
||||
let advance = Duration::from_nanos(periods * period_ns);
|
||||
self.last_vsync_instant = Some(anchor + advance);
|
||||
let to_queue = (periods as usize).min(INTERRUPT_QUEUE_CAP);
|
||||
for _ in 0..to_queue {
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// Is HW thread 0 currently in a callback?
|
||||
pub fn is_in_callback(&self) -> bool {
|
||||
self.saved.is_some()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn queue_interrupt_drops_without_callback() {
|
||||
let mut s = InterruptState::default();
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
assert_eq!(s.dropped, 1);
|
||||
assert!(s.pending.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn queue_interrupt_fifo_preserves_order() {
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_CP);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
assert_eq!(s.dropped, 0);
|
||||
// FIFO: take_next hands them out in push order.
|
||||
assert_eq!(s.take_next(), Some(INTERRUPT_SOURCE_VSYNC));
|
||||
assert_eq!(s.take_next(), Some(INTERRUPT_SOURCE_CP));
|
||||
assert_eq!(s.take_next(), Some(INTERRUPT_SOURCE_VSYNC));
|
||||
assert_eq!(s.take_next(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn queue_interrupt_caps_at_queue_size() {
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
for _ in 0..INTERRUPT_QUEUE_CAP {
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
}
|
||||
// Over-cap: drops rather than evicting the oldest.
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
assert_eq!(s.dropped, 2);
|
||||
assert_eq!(s.pending.len(), INTERRUPT_QUEUE_CAP);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tick_vsync_instr_fires_at_new_150k_threshold() {
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
assert_eq!(VSYNC_INSTR_PERIOD, 150_000);
|
||||
assert!(!s.tick_vsync_instr(VSYNC_INSTR_PERIOD - 1));
|
||||
assert!(s.pending.is_empty());
|
||||
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD));
|
||||
assert_eq!(s.peek_next(), Some(INTERRUPT_SOURCE_VSYNC));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tick_vsync_instr_drains_multiple_periods_in_one_call() {
|
||||
// Long kernel export → big instr delta → multiple v-syncs must
|
||||
// be delivered, not lost.
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD * 3 + 10));
|
||||
assert_eq!(s.pending.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tick_vsync_wallclock_first_call_sets_anchor() {
|
||||
// First call seeds the anchor and never fires. KRNBUG-D08:
|
||||
// initial wall-clock state has no prior reference, so we can't
|
||||
// know the elapsed delta yet.
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
assert!(!s.tick_vsync_wallclock());
|
||||
assert!(s.pending.is_empty());
|
||||
assert!(s.last_vsync_instant.is_some());
|
||||
}
|
||||
|
||||
#[test]
fn tick_vsync_wallclock_fires_after_period() {
    // Waits out one full v-sync period (16.667 ms) plus a small margin and
    // checks that exactly one v-sync gets queued. The sleep is a one-shot
    // ~17 ms cost, which is acceptable in --release tests.
    let one_period_plus_margin = VSYNC_PERIOD + Duration::from_millis(2);

    let mut irq = InterruptState::default();
    irq.set_callback(0x1000, 0xAB);

    // First call only seeds the wall-clock anchor.
    irq.tick_vsync_wallclock();
    std::thread::sleep(one_period_plus_margin);

    let fired = irq.tick_vsync_wallclock();
    assert!(fired);
    assert_eq!(irq.pending.len(), 1);
    assert_eq!(irq.peek_next(), Some(INTERRUPT_SOURCE_VSYNC));
}
|
||||
|
||||
#[test]
fn tick_vsync_wallclock_caps_burst_at_queue_cap() {
    // When several periods elapse between ticks, at most
    // INTERRUPT_QUEUE_CAP v-syncs are queued (the FIFO cannot hold more).
    // Six periods (~100 ms) are slept here and exactly
    // INTERRUPT_QUEUE_CAP entries are expected afterwards.
    let six_periods_plus_margin = VSYNC_PERIOD * 6 + Duration::from_millis(2);

    let mut irq = InterruptState::default();
    irq.set_callback(0x1000, 0xAB);

    // First call only seeds the wall-clock anchor.
    irq.tick_vsync_wallclock();
    std::thread::sleep(six_periods_plus_margin);

    let fired = irq.tick_vsync_wallclock();
    assert!(fired);
    assert_eq!(irq.pending.len(), INTERRUPT_QUEUE_CAP);
}
|
||||
|
||||
/// Hand-simulated version of the main loop's inject → run → restore
/// sequence. No guest code is executed: the callback "returning" is
/// modelled by copying `lr` into `pc` (what a `blr` would do), which lands
/// on `LR_HALT_SENTINEL` — the condition the main loop uses to trigger
/// the restore.
#[test]
fn inject_restore_roundtrip_smoke() {
    // State of the interrupted thread that must survive the callback.
    let mut cpu = PpcContext::new();
    cpu.pc = 0x1000_0000;
    cpu.lr = 0xCAFE_BABE;
    cpu.gpr[3] = 0x1234;
    cpu.gpr[4] = 0x5678;

    let mut irq = InterruptState::default();
    irq.set_callback(0x2000_0000, 0xDEAD);

    // Inject step: snapshot the context, then divert pc/lr/r3/r4 to the
    // callback entry point and its arguments.
    irq.saved = Some(SavedCallbackCtx::capture(&cpu, INTERRUPT_SOURCE_VSYNC));
    cpu.pc = 0x2000_0000;
    cpu.lr = xenia_cpu::context::LR_HALT_SENTINEL;
    cpu.gpr[3] = INTERRUPT_SOURCE_VSYNC as u64;
    cpu.gpr[4] = 0xDEAD;
    assert!(irq.is_in_callback());

    // Callback "returns": pc takes the value of lr, i.e. the sentinel.
    cpu.pc = cpu.lr as u32;

    // Restore step, as performed when pc == LR_HALT_SENTINEL while a
    // callback is in flight.
    let snapshot = irq.saved.take().unwrap();
    snapshot.restore(&mut cpu);
    irq.delivered += 1;

    assert_eq!(cpu.pc, 0x1000_0000);
    assert_eq!(cpu.lr, 0xCAFE_BABE);
    assert_eq!(cpu.gpr[3], 0x1234);
    assert_eq!(cpu.gpr[4], 0x5678);
    assert!(!irq.is_in_callback());
    assert_eq!(irq.delivered, 1);
}
|
||||
|
||||
#[test]
fn saved_ctx_roundtrip() {
    // Give pc/lr/r3/r4 recognisable values and snapshot them.
    let mut cpu = PpcContext::new();
    cpu.pc = 0x11223344;
    cpu.lr = 0xDEADBEEF;
    cpu.gpr[3] = 0xAAAA;
    cpu.gpr[4] = 0xBBBB;
    let snapshot = SavedCallbackCtx::capture(&cpu, INTERRUPT_SOURCE_VSYNC);

    // Wipe the live context so the restore has something to undo.
    cpu.pc = 0;
    cpu.lr = 0;
    cpu.gpr[3] = 0;
    cpu.gpr[4] = 0;

    snapshot.restore(&mut cpu);

    assert_eq!(cpu.pc, 0x11223344);
    assert_eq!(cpu.lr, 0xDEADBEEF);
    assert_eq!(cpu.gpr[3], 0xAAAA);
    assert_eq!(cpu.gpr[4], 0xBBBB);
}
|
||||
|
||||
/// Round-trips SP plus the whole volatile GPR set through a saved context.
///
/// Regression test for the 2026-04-24 IRQ-injection fix: the ISR
/// callback's prologue clobbers `[r1 - 8]` on the interrupted thread's
/// stack unless the injector first lowers SP by [`CALLBACK_STACK_PAD`],
/// and the saved context must put SP (and the other PPC volatile
/// registers) back afterwards.
#[test]
fn saved_ctx_covers_sp_and_all_volatile_gprs() {
    let mut cpu = PpcContext::new();
    cpu.pc = 0xAAAA_BBBB;
    cpu.lr = 0x1111_2222;
    cpu.ctr = 0x3333_4444;

    // r0..r12: the volatile set, each tagged with its own index.
    for (idx, reg) in cpu.gpr[..13].iter_mut().enumerate() {
        *reg = 0x1000 + idx as u64;
    }
    // r13..r31 are non-volatile: the callback is expected to preserve
    // them itself, so the saved context deliberately leaves them out.
    for (offset, reg) in cpu.gpr[13..32].iter_mut().enumerate() {
        *reg = 0xDEAD_0000 + (13 + offset) as u64;
    }

    let snapshot = SavedCallbackCtx::capture(&cpu, INTERRUPT_SOURCE_VSYNC);

    // What the real injector changes (see try_inject_graphics_interrupt
    // in main.rs): pc, lr, r1, r3 and r4.
    cpu.pc = 0xCAFE;
    cpu.lr = xenia_cpu::context::LR_HALT_SENTINEL;
    cpu.gpr[1] = cpu.gpr[1].wrapping_sub(CALLBACK_STACK_PAD as u64);
    cpu.gpr[3] = INTERRUPT_SOURCE_VSYNC as u64;
    cpu.gpr[4] = 0xBEEF;

    // What a callback might additionally trash: volatile registers that
    // the injector itself never touches.
    cpu.gpr[0] = 0xFEED_FACE;
    cpu.gpr[7] = 0x9999;
    cpu.gpr[12] = 0xABCD;

    snapshot.restore(&mut cpu);

    // Every volatile GPR is back at its pre-injection value.
    for (idx, &reg) in cpu.gpr[..13].iter().enumerate() {
        assert_eq!(
            reg,
            0x1000 + idx as u64,
            "volatile r{} clobbered by callback was not restored",
            idx
        );
    }
    // SP in particular is back at its value from before the pad.
    assert_eq!(cpu.gpr[1], 0x1001, "SP must be restored to pre-injection");

    // Non-volatile registers were never captured, so they keep whatever
    // the callback left there — untouched here, since 13..32 were not
    // modified after the capture.
    for (offset, &reg) in cpu.gpr[13..32].iter().enumerate() {
        assert_eq!(reg, 0xDEAD_0000 + (13 + offset) as u64);
    }

    assert_eq!(cpu.pc, 0xAAAA_BBBB);
    assert_eq!(cpu.lr, 0x1111_2222);
    assert_eq!(cpu.ctr, 0x3333_4444);
}
|
||||
}
|
||||
@@ -1,6 +1,22 @@
|
||||
pub mod audit;
|
||||
pub mod exports;
|
||||
pub mod interrupts;
|
||||
pub mod objects;
|
||||
pub mod path;
|
||||
pub mod state;
|
||||
pub mod thread;
|
||||
pub mod ui_bridge;
|
||||
pub mod xam;
|
||||
pub mod xaudio;
|
||||
|
||||
pub use interrupts::{
|
||||
GraphicsInterruptCallback, InterruptState, SavedCallbackCtx, INTERRUPT_SOURCE_CP,
|
||||
INTERRUPT_SOURCE_VSYNC, VSYNC_INSTR_PERIOD,
|
||||
};
|
||||
pub use state::{KernelState, ModuleId};
|
||||
pub use thread::{allocate_thread_image, ThreadImage};
|
||||
pub use ui_bridge::{SwapInfo, UiBridge};
|
||||
pub use xaudio::{
|
||||
XAudioClient, XAudioState, INTERRUPT_SOURCE_AUDIO, XAUDIO_INSTR_PERIOD, XAUDIO_MAX_CLIENTS,
|
||||
XAUDIO_PERIOD,
|
||||
};
|
||||
|
||||
@@ -1,12 +1,112 @@
|
||||
//! Kernel object tracking for HLE.
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use xenia_cpu::ThreadRef;
|
||||
|
||||
/// Kernel object types tracked by handle.
|
||||
///
|
||||
/// Waitable variants (`Event`, `Semaphore`, `Mutex`, `Thread`, `Timer`, `NotifyListener`) carry an in-place
|
||||
/// waiter list so wait/set/release sites keep invariants local — dropping the
|
||||
/// object implicitly drops its waiters. Waiters are stored as `ThreadRef`
|
||||
/// (post-Axis-1) — a bare `hw_id: u8` would have been ambiguous under per-slot
|
||||
/// runqueues where multiple guest threads share one HW slot.
|
||||
#[derive(Debug)]
|
||||
pub enum KernelObject {
|
||||
Event { manual_reset: bool, signaled: bool },
|
||||
Semaphore { count: i32, max: i32 },
|
||||
File { path: String },
|
||||
Thread { id: u32 },
|
||||
Timer,
|
||||
Mutex,
|
||||
Event {
|
||||
manual_reset: bool,
|
||||
signaled: bool,
|
||||
/// Guest threads parked on this event.
|
||||
waiters: Vec<ThreadRef>,
|
||||
},
|
||||
Semaphore {
|
||||
count: i32,
|
||||
max: i32,
|
||||
waiters: Vec<ThreadRef>,
|
||||
},
|
||||
File {
|
||||
/// Normalized VFS path (e.g. "default.xex", "media/shared/foo.pkg").
|
||||
path: String,
|
||||
/// Full file size in bytes.
|
||||
size: u64,
|
||||
/// Current read/write cursor.
|
||||
position: u64,
|
||||
/// Whole-file buffer — VFS reads the entire file up front so
|
||||
/// subsequent NtReadFile calls are O(1) slice copies.
|
||||
/// `Arc<Vec<u8>>` so duplicate handles could share backing storage.
|
||||
data: Arc<Vec<u8>>,
|
||||
/// Directory-enumeration cursor consumed by `NtQueryDirectoryFile`.
|
||||
/// `None` before the first call; `Some(N)` = next VFS entry index
|
||||
/// to emit. Reset to `Some(0)` when the guest passes
|
||||
/// `restart_scan=1`. Unused on non-directory files.
|
||||
dir_enum_pos: Option<usize>,
|
||||
/// AUDIT-038 — when `Some`, this file is backed by a real host-FS
|
||||
/// path (the cache: persistent VFS) rather than the in-memory
|
||||
/// `data` buffer. NtReadFile / NtWriteFile / NtSetInformationFile
|
||||
/// route through `std::fs` against this path. Mirrors canary's
|
||||
/// `HostPathDevice` (xenia-canary/src/xenia/vfs/devices/
|
||||
/// host_path_device.cc) which symlinks `cache:` → `\CACHE`.
|
||||
/// `None` for disc-VFS reads, root-of-device opens, and synth
|
||||
/// stubs (those keep the in-memory zero-byte semantics).
|
||||
host_path: Option<PathBuf>,
|
||||
},
|
||||
Thread {
|
||||
id: u32,
|
||||
/// HW thread slot currently running this guest thread (None once exited
|
||||
/// — `exit_code` becomes Some).
|
||||
hw_id: Option<u8>,
|
||||
/// None while the thread is running; populated on ExTerminateThread
|
||||
/// or halt-sentinel return.
|
||||
exit_code: Option<u32>,
|
||||
/// Guest threads parked in KeWaitForSingleObject on this thread handle.
|
||||
waiters: Vec<ThreadRef>,
|
||||
},
|
||||
Timer {
|
||||
/// Xbox 360 timer_type 0 = NotificationTimer (manual-reset),
|
||||
/// 1 = SynchronizationTimer (auto-reset). Same shape as Event.
|
||||
manual_reset: bool,
|
||||
signaled: bool,
|
||||
/// Absolute tick-space deadline; None when disarmed.
|
||||
deadline: Option<u64>,
|
||||
/// Period in ticks (same units as `deadline`); 0 = one-shot.
|
||||
period_ticks: u64,
|
||||
/// Original ms value (canary's SetTimer keeps it for diagnostics).
|
||||
period_ms: u32,
|
||||
/// APC routine (deferred — see `timer_apc` warn in nt_set_timer_ex).
|
||||
callback_routine: u32,
|
||||
callback_arg: u32,
|
||||
waiters: Vec<ThreadRef>,
|
||||
},
|
||||
Mutex {
|
||||
/// HW thread id currently holding the mutex; None when free.
|
||||
owner: Option<u8>,
|
||||
recursion: u32,
|
||||
waiters: Vec<ThreadRef>,
|
||||
},
|
||||
NotifyListener {
|
||||
mask: u64,
|
||||
max_version: u32,
|
||||
queue: VecDeque<(u32, u32)>,
|
||||
waiters: Vec<ThreadRef>,
|
||||
},
|
||||
}
|
||||
|
||||
impl KernelObject {
|
||||
/// Returns the per-object waiter list for the 5 sync variants (Event,
|
||||
/// Semaphore, Thread, Timer, Mutex) and `None` for `File`. Used by
|
||||
/// deadline-expiry scrub in `KernelState::handle_timeout_wake` so a
|
||||
/// timed-out waiter isn't left stranded in a handle's waiters list.
|
||||
pub fn waiters_mut(&mut self) -> Option<&mut Vec<ThreadRef>> {
|
||||
match self {
|
||||
KernelObject::Event { waiters, .. }
|
||||
| KernelObject::Semaphore { waiters, .. }
|
||||
| KernelObject::Thread { waiters, .. }
|
||||
| KernelObject::Timer { waiters, .. }
|
||||
| KernelObject::Mutex { waiters, .. }
|
||||
| KernelObject::NotifyListener { waiters, .. } => Some(waiters),
|
||||
KernelObject::File { .. } => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
139
crates/xenia-kernel/src/path.rs
Normal file
139
crates/xenia-kernel/src/path.rs
Normal file
@@ -0,0 +1,139 @@
|
||||
//! Path normalization for kernel file I/O.
|
||||
//!
|
||||
//! Guests pass file paths inside an `OBJECT_ATTRIBUTES` struct that points at
|
||||
//! an `ANSI_STRING` descriptor. Those paths come in several Xbox-flavored
|
||||
//! forms — NT device paths (`\Device\Cdrom0\...`), drive letters (`D:\...`,
|
||||
//! `d:\...`), or symbolic link prefixes (`game:\...`). We strip whichever
|
||||
//! prefix applies and return a plain slash-separated path relative to the
|
||||
//! mounted VFS root, so `VfsDevice::read_file` can look it up directly.
|
||||
|
||||
use xenia_memory::{GuestMemory, MemoryAccess};
|
||||
|
||||
/// Xbox `ANSI_STRING`:
|
||||
/// u16 Length
|
||||
/// u16 MaximumLength
|
||||
/// u32 Buffer (guest pointer)
|
||||
fn read_ansi_string(mem: &GuestMemory, ptr: u32) -> Option<String> {
|
||||
if ptr == 0 {
|
||||
return None;
|
||||
}
|
||||
let length = mem.read_u16(ptr) as u32;
|
||||
let buffer = mem.read_u32(ptr + 4);
|
||||
if buffer == 0 || length == 0 {
|
||||
return Some(String::new());
|
||||
}
|
||||
let mut out = String::with_capacity(length as usize);
|
||||
for i in 0..length {
|
||||
let c = mem.read_u8(buffer + i);
|
||||
if c == 0 {
|
||||
break;
|
||||
}
|
||||
out.push(c as char);
|
||||
}
|
||||
Some(out)
|
||||
}
|
||||
|
||||
/// Xbox `OBJECT_ATTRIBUTES`:
|
||||
/// u32 RootDirectory (handle)
|
||||
/// u32 Name (pointer to ANSI_STRING)
|
||||
/// u32 Attributes
|
||||
fn read_object_attributes_name(mem: &GuestMemory, obj_attrs_ptr: u32) -> Option<String> {
|
||||
if obj_attrs_ptr == 0 {
|
||||
return None;
|
||||
}
|
||||
let name_ptr = mem.read_u32(obj_attrs_ptr + 4);
|
||||
read_ansi_string(mem, name_ptr)
|
||||
}
|
||||
|
||||
/// Known Xbox device prefixes that need to be stripped before looking a path
/// up in the VFS. The list mirrors the symbolic links xenia-canary sets up
/// at boot (see `xboxkrnl_io.cc`). Matching is ASCII case-insensitive and
/// the first matching entry wins, so longer prefixes must precede their
/// own shorter forms (e.g. `Harddisk0\Partition1\` before `Harddisk0\`).
const DEVICE_PREFIXES: &[&str] = &[
    "\\Device\\Cdrom0\\",
    "\\Device\\Harddisk0\\Partition1\\",
    "\\Device\\Harddisk0\\Partition0\\",
    "\\Device\\Harddisk0\\",
    "\\Device\\Mu0\\",
    "\\Device\\Mu1\\",
    "\\Device\\Mass0\\",
    "\\Device\\Mass1\\",
    "\\Device\\Mass2\\",
    "\\SystemRoot\\",
    "\\??\\",
    "game:\\",
    "d:\\",
    "D:\\",
];

/// Returns `path` with `prefix` removed if `path` starts with `prefix`
/// under ASCII case-insensitive comparison, and `None` otherwise.
fn strip_prefix_ignore_ascii_case<'a>(path: &'a str, prefix: &str) -> Option<&'a str> {
    // Compare raw bytes so no lowercased copy of either string is needed.
    let head = path.as_bytes().get(..prefix.len())?;
    if head.eq_ignore_ascii_case(prefix.as_bytes()) {
        // A match means the leading bytes equal the prefix up to ASCII
        // case, so `prefix.len()` is a char boundary; `get` keeps this
        // panic-free regardless.
        path.get(prefix.len()..)
    } else {
        None
    }
}

/// Strip any Xbox device prefix and normalize backslashes to forward slashes.
/// Returns the path relative to the VFS root.
///
/// Steps, in order:
/// 1. surrounding whitespace is trimmed;
/// 2. the first entry of [`DEVICE_PREFIXES`] that matches
///    (ASCII case-insensitively) is removed — at most one prefix;
/// 3. any leading `\` or `/` characters are dropped;
/// 4. every remaining `\` becomes `/`.
///
/// A path with no recognised prefix goes through steps 1, 3 and 4 only.
pub fn normalize_path(raw: &str) -> String {
    let trimmed = raw.trim();

    let relative = DEVICE_PREFIXES
        .iter()
        .find_map(|prefix| strip_prefix_ignore_ascii_case(trimmed, prefix))
        .unwrap_or(trimmed);

    // Single pass over the leading separators, then one allocation for the
    // canonical forward-slash form.
    relative
        .trim_start_matches(|c: char| c == '\\' || c == '/')
        .replace('\\', "/")
}
|
||||
|
||||
/// Convenience: read the OBJECT_ATTRIBUTES struct at `obj_attrs_ptr` and
|
||||
/// return a normalized VFS path. Returns `None` if the struct pointer or its
|
||||
/// inner name pointer is null.
|
||||
pub fn object_attributes_to_vfs_path(mem: &GuestMemory, obj_attrs_ptr: u32) -> Option<String> {
|
||||
let raw = read_object_attributes_name(mem, obj_attrs_ptr)?;
|
||||
if raw.is_empty() {
|
||||
return None;
|
||||
}
|
||||
Some(normalize_path(&raw))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `raw` normalizes to `expected`; failures are reported
    /// at the calling test's line.
    #[track_caller]
    fn check(raw: &str, expected: &str) {
        assert_eq!(normalize_path(raw), expected);
    }

    #[test]
    fn strips_device_cdrom() {
        check("\\Device\\Cdrom0\\default.xex", "default.xex");
    }

    #[test]
    fn strips_drive_letter_lowercase() {
        check("d:\\media\\shared\\foo.pkg", "media/shared/foo.pkg");
    }

    #[test]
    fn strips_drive_letter_uppercase() {
        check("D:\\default.xex", "default.xex");
    }

    #[test]
    fn strips_game_prefix() {
        check("game:\\data\\whatever.bin", "data/whatever.bin");
    }

    #[test]
    fn preserves_relative_path() {
        check("scripts/init.lua", "scripts/init.lua");
    }

    #[test]
    fn handles_partition1() {
        check(
            "\\Device\\Harddisk0\\Partition1\\content\\abc.sav",
            "content/abc.sav",
        );
    }
}
|
||||
File diff suppressed because it is too large
Load Diff
68
crates/xenia-kernel/src/thread.rs
Normal file
68
crates/xenia-kernel/src/thread.rs
Normal file
@@ -0,0 +1,68 @@
|
||||
//! Guest-thread image allocation — shared by the initial thread setup in
|
||||
//! `xenia-app/src/main.rs` and `ExCreateThread`. Stack, PCR, and TLS blocks
|
||||
//! all come from the existing kernel bump allocators so layout is consistent.
|
||||
|
||||
use xenia_memory::{GuestMemory, MemoryAccess};
|
||||
|
||||
use crate::state::KernelState;
|
||||
|
||||
/// Guest addresses describing one thread's memory image, handed to
/// `Scheduler::spawn` or to the initial-thread setup. Mirrors
/// xenia-canary's per-thread allocations: a stack, a PCR, and a TLS block.
#[derive(Debug, Clone, Copy)]
pub struct ThreadImage {
    /// Lowest address of the stack; the initial SP is
    /// `stack_base + stack_size`.
    pub stack_base: u32,
    /// Stack size in bytes as actually allocated by
    /// `allocate_thread_image` (page-rounded, or the default when 0 was
    /// requested).
    pub stack_size: u32,
    /// Guest address of the thread's PCR block.
    pub pcr_base: u32,
    /// Guest address of the thread's TLS block.
    pub tls_base: u32,
}
|
||||
|
||||
/// Allocate stack + PCR + TLS for one guest thread and initialize the PCR
|
||||
/// fields that games read in their thread prolog.
|
||||
///
|
||||
/// - Stack comes from `KernelState::stack_alloc` (bump allocator at
|
||||
/// 0x7100_0000 upward). The returned base is the *bottom*; callers
|
||||
/// compute SP as `base + size`.
|
||||
/// - PCR and TLS are fixed 4 KiB pages allocated via `heap_alloc` so they
|
||||
/// land in the user heap region together with other kernel metadata.
|
||||
/// - `hw_thread_id` is written at PCR+0x2C so `KeGetCurrentProcessorNumber`-
|
||||
/// style reads from r13 resolve correctly even though we never register
|
||||
/// that export.
|
||||
pub fn allocate_thread_image(
|
||||
kernel: &mut KernelState,
|
||||
mem: &GuestMemory,
|
||||
stack_size: u32,
|
||||
hw_thread_id: u8,
|
||||
) -> Option<ThreadImage> {
|
||||
// Round stack size to a page and give games a minimum that matches
|
||||
// xenia-canary's 16 MiB default when callers request 0 (common for
|
||||
// ExCreateThread when the caller lets the kernel pick).
|
||||
let stack_size = if stack_size == 0 {
|
||||
0x10_0000
|
||||
} else {
|
||||
(stack_size + 0xFFF) & !0xFFF
|
||||
};
|
||||
// stack_alloc returns top-of-stack; we need the base.
|
||||
let stack_top = kernel.stack_alloc(stack_size, mem)?;
|
||||
let stack_base = stack_top - stack_size;
|
||||
|
||||
let pcr_base = kernel.heap_alloc(0x1000, mem)?;
|
||||
let tls_base = kernel.heap_alloc(0x1000, mem)?;
|
||||
|
||||
// PCR layout (canary xboxkrnl/xboxkrnl_module.cc, simplified):
|
||||
// +0x000 tls_ptr → TLS block base
|
||||
// +0x02C current_processor_id → HW thread id (0..5)
|
||||
// +0x100 current_thread → placeholder non-zero tag
|
||||
// +0x150 dpc_active → 0 (no DPC queued)
|
||||
mem.write_u32(pcr_base, tls_base);
|
||||
mem.write_u32(pcr_base + 0x2C, hw_thread_id as u32);
|
||||
mem.write_u32(pcr_base + 0x100, 0x1000);
|
||||
mem.write_u32(pcr_base + 0x150, 0);
|
||||
|
||||
Some(ThreadImage {
|
||||
stack_base,
|
||||
stack_size,
|
||||
pcr_base,
|
||||
tls_base,
|
||||
})
|
||||
}
|
||||
185
crates/xenia-kernel/src/ui_bridge.rs
Normal file
185
crates/xenia-kernel/src/ui_bridge.rs
Normal file
@@ -0,0 +1,185 @@
|
||||
//! Bridge between the kernel (CPU-thread side) and a host UI (main-thread side).
|
||||
//!
|
||||
//! The kernel side needs to:
|
||||
//! - snapshot the latest host gamepad each time a guest calls
|
||||
//! `XamInputGetState`, and
|
||||
//! - signal the UI when the guest calls `VdSwap` so the UI can upload the
|
||||
//! guest's frontbuffer to a wgpu texture and present it.
|
||||
//!
|
||||
//! Both directions are expressed as trait-object closures so that `xenia-kernel`
|
||||
//! does not have to depend on winit/wgpu/gilrs. The [`UiBridge`] is installed
|
||||
//! on [`KernelState::ui`] by `cmd_exec` when `--ui` is passed.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, AtomicU64};
|
||||
|
||||
use xenia_gpu::texture_cache::TextureKey;
|
||||
use xenia_gpu::xenos_constants::XenosConstantsBlock;
|
||||
use xenia_hid::GamepadState;
|
||||
use xenia_memory::MemoryAccess;
|
||||
|
||||
/// Per-frame information handed to the UI whenever the guest presents.
///
/// The leading fields mirror the "interesting" arguments to `VdSwap` in
/// `xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc` — the raw
/// frontbuffer pointer, its dimensions, and the format/color-space enum
/// values the guest passed through. The remaining fields are running
/// counters and last-draw details surfaced for the HUD.
#[derive(Clone, Copy, Debug)]
pub struct SwapInfo {
    /// Guest physical/virtual address of the frontbuffer to present.
    pub frontbuffer_addr: u32,
    /// Frontbuffer width in pixels, as reported by the guest.
    pub width: u32,
    /// Frontbuffer height in pixels, as reported by the guest.
    pub height: u32,
    /// Xenos texture format enum. The guest passes a pointer which is
    /// dereferenced before this is filled in; 0 means "unknown / guest
    /// passed a null pointer".
    pub texture_format: u32,
    /// Color-space enum (sRGB / BT.709 / …).
    pub color_space: u32,
    /// Monotonically increasing frame counter maintained by the kernel;
    /// useful for HUD display and deduping.
    pub frame_index: u64,
    /// Total PM4 `DRAW_INDX*` packets the GPU has captured since boot, so
    /// the HUD can show progress even before the full uber-shader
    /// pipeline is wired in.
    pub draws_total: u64,
    /// Total PM4 packets executed across all opcodes — a signal for
    /// "is the GPU actually getting anything at all to consume?".
    pub packets_total: u64,
    /// Xenos primitive-type code of the most recent draw (0 = none yet).
    pub last_draw_prim: u32,
    /// Vertex count of the most recent draw.
    pub last_draw_vertex_count: u32,
    /// Indirect-buffer jumps so far — a signal for "is the game driving
    /// the ring buffer through IBs?".
    pub indirect_buffer_jumps: u64,
    /// WAIT_REG_MEM stalls observed on the GPU slot.
    pub wait_reg_mem_blocks: u64,
    /// CPU instruction count summed across all 6 HW threads. Mirrors the
    /// `cycle_count` field each `PpcContext` maintains and gives the HUD
    /// a live "how far has the guest run?" readout.
    pub instructions_total: u64,
    /// P3b: VS shader blob key active at the most recent DRAW_INDX*
    /// (0 = none). The UI uses it to index into `handles.shader_blobs` so
    /// the Xenos uber-shader interpreter can upload the matching
    /// microcode.
    pub vs_blob_key: u32,
    /// PS shader blob key active at the most recent DRAW_INDX*.
    pub ps_blob_key: u32,
    /// P4: total EDRAM→memory resolves (TILE_FLUSH events) fired since
    /// boot. Non-zero means the game is committing pixels.
    pub resolves_total: u64,
    /// Portion of `resolves_total` whose byte-copy path succeeded and
    /// wrote at least one sample into guest memory.
    pub resolves_copied_total: u64,
    /// Portion of `resolves_total` skipped by the byte-copy path because
    /// of an unsupported format / MSAA mode / 3D destination.
    pub resolves_skipped_total: u64,
    /// P4: unique RT keys seen in the GPU's internal render-target cache.
    /// Grows as the game exercises new RT footprints.
    pub unique_render_targets: u64,
    /// P6: total graphics-interrupt callbacks delivered (v-sync + CP).
    /// Non-zero means `VdSetGraphicsInterruptCallback` has been wired end
    /// to end and callbacks are actually running.
    pub interrupts_delivered: u64,
    /// P6: graphics interrupts that were queued but dropped (callback
    /// unset, thread 0 blocked, or already inside another callback).
    pub interrupts_dropped: u64,
}
|
||||
|
||||
/// Handles the kernel uses to talk to a running host UI.
|
||||
///
|
||||
/// None of the closures are allowed to block for long — they are called from
|
||||
/// the CPU interpreter thread on the hot path.
|
||||
#[derive(Clone)]
|
||||
pub struct UiBridge {
|
||||
/// Snapshot the host gamepad. Called from `XamInputGetState`.
|
||||
pub gamepad: Arc<dyn Fn() -> GamepadState + Send + Sync>,
|
||||
/// Report that the guest completed a frame. The closure gets the swap
|
||||
/// metadata plus a borrow of guest memory so it can copy the frontbuffer
|
||||
/// bytes into a UI-owned staging buffer before returning. Called from
|
||||
/// `VdSwap` on the CPU thread.
|
||||
pub post_swap: Arc<dyn Fn(SwapInfo, &dyn MemoryAccess) + Send + Sync>,
|
||||
/// Indicates the UI wants the CPU loop to stop. Checked periodically by
|
||||
/// the interpreter loop.
|
||||
pub shutdown: Arc<AtomicBool>,
|
||||
/// Set to `true` when a gamepad is present. `XamInputGetState` returns
|
||||
/// `ERROR_DEVICE_NOT_CONNECTED` when this is `false`.
|
||||
pub gamepad_connected: Arc<AtomicBool>,
|
||||
/// Live CPU instruction counter mirror. The app's run loop publishes
|
||||
/// the sum of `ctx.cycle_count` across HW threads here every ~8k
|
||||
/// instructions so the HUD can report progress between VdSwap events.
|
||||
pub instructions_counter: Arc<AtomicU64>,
|
||||
/// P3b asset publish: `vd_swap` snapshots the GPU's `shader_blobs` and
|
||||
/// constants register region and feeds them to the UI so the Xenos
|
||||
/// uber-shader interpreter has the microcode + constants needed to
|
||||
/// execute the guest draw. Split from `post_swap` so the asset wire
|
||||
/// stays optional — if the UI doesn't need them (headless mode) the
|
||||
/// closure is a no-op.
|
||||
pub publish_xenos_assets:
|
||||
Arc<dyn Fn(HashMap<u32, Vec<u32>>, XenosConstantsBlock) + Send + Sync>,
|
||||
/// P4 frontbuffer publish: at each `VdSwap`, the kernel CPU-side
|
||||
/// detiles the guest frontbuffer (k_8_8_8_8 Tiled2D) into a linear
|
||||
/// RGBA8 buffer and hands it to the UI. The closure receives
|
||||
/// `(width, height, bytes)` — the UI uploads it as a texture.
|
||||
pub publish_frontbuffer:
|
||||
Arc<dyn Fn(u32, u32, Vec<u8>) + Send + Sync>,
|
||||
/// P5 primary texture publish: at each `VdSwap`, the kernel thread
|
||||
/// decodes the PS shader's primary-texture fetch constant (slot 0
|
||||
/// for now) and hands the decoded linear bytes + key to the UI so
|
||||
/// the xenos pipeline can bind a real texture at `@group(1)`.
|
||||
/// Receives `(TextureKey, bytes)`; when `None` is sent the UI
|
||||
/// reverts to its magenta stub.
|
||||
pub publish_texture:
|
||||
Arc<dyn Fn(Option<(TextureKey, Vec<u8>)>) + Send + Sync>,
|
||||
}
|
||||
|
||||
impl UiBridge {
|
||||
/// Snapshot input state (user 0 only; higher indices are unconnected).
|
||||
pub fn snapshot_gamepad(&self) -> GamepadState {
|
||||
(self.gamepad)()
|
||||
}
|
||||
|
||||
/// True iff a gamepad is connected for user 0.
|
||||
pub fn is_connected(&self, user_index: u32) -> bool {
|
||||
user_index == 0
|
||||
&& self
|
||||
.gamepad_connected
|
||||
.load(std::sync::atomic::Ordering::Relaxed)
|
||||
}
|
||||
|
||||
/// Push a swap event to the UI thread.
|
||||
pub fn notify_swap(&self, info: SwapInfo, mem: &dyn MemoryAccess) {
|
||||
(self.post_swap)(info, mem);
|
||||
}
|
||||
|
||||
/// Snapshot current shader blobs + constants and hand them to the UI.
|
||||
/// Call from `vd_swap` so the UI has the matching assets for every
|
||||
/// draw captured in this frame.
|
||||
pub fn publish_assets(
|
||||
&self,
|
||||
blobs: HashMap<u32, Vec<u32>>,
|
||||
constants: XenosConstantsBlock,
|
||||
) {
|
||||
(self.publish_xenos_assets)(blobs, constants);
|
||||
}
|
||||
|
||||
/// True iff the UI asked for shutdown.
|
||||
pub fn should_shutdown(&self) -> bool {
|
||||
self.shutdown.load(std::sync::atomic::Ordering::Relaxed)
|
||||
}
|
||||
|
||||
/// Hand a detiled frontbuffer frame to the UI. Called at most once per
|
||||
/// `VdSwap`. `bytes` must be `width * height * 4` bytes in
|
||||
/// `Rgba8Unorm` order (the UI pipeline's expected layout).
|
||||
pub fn publish_frontbuffer(&self, width: u32, height: u32, bytes: Vec<u8>) {
|
||||
(self.publish_frontbuffer)(width, height, bytes);
|
||||
}
|
||||
|
||||
/// Hand one decoded guest texture to the UI. `Some` = update the bound
|
||||
/// slot; `None` = revert to the magenta stub.
|
||||
pub fn publish_texture(&self, tex: Option<(TextureKey, Vec<u8>)>) {
|
||||
(self.publish_texture)(tex);
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,10 @@
|
||||
//! HLE kernel export implementations (xam.xex).
|
||||
|
||||
use crate::state::{KernelState, ModuleId};
|
||||
use crate::objects::KernelObject;
|
||||
use crate::state::{GuestMemoryPcr, KernelState, ModuleId};
|
||||
use crate::thread::allocate_thread_image;
|
||||
use xenia_cpu::PpcContext;
|
||||
use xenia_cpu::scheduler::SpawnParams;
|
||||
use xenia_memory::{GuestMemory, MemoryAccess};
|
||||
|
||||
pub fn register_exports(state: &mut KernelState) {
|
||||
@@ -12,10 +15,10 @@ pub fn register_exports(state: &mut KernelState) {
|
||||
state.register_export(Xam, 0x02, "NetDll_WSACleanup", stub_success);
|
||||
|
||||
// Input
|
||||
state.register_export(Xam, 0x0190, "XamInputGetCapabilities", xam_input_not_connected);
|
||||
state.register_export(Xam, 0x0191, "XamInputGetState", xam_input_not_connected);
|
||||
state.register_export(Xam, 0x0192, "XamInputSetState", xam_input_not_connected);
|
||||
state.register_export(Xam, 0x0198, "XamInputGetKeystrokeEx", xam_input_not_connected);
|
||||
state.register_export(Xam, 0x0190, "XamInputGetCapabilities", xam_input_get_capabilities);
|
||||
state.register_export(Xam, 0x0191, "XamInputGetState", xam_input_get_state);
|
||||
state.register_export(Xam, 0x0192, "XamInputSetState", xam_input_set_state);
|
||||
state.register_export(Xam, 0x0198, "XamInputGetKeystrokeEx", xam_input_get_keystroke);
|
||||
|
||||
// Inactivity
|
||||
state.register_export(Xam, 0x01A0, "XamEnableInactivityProcessing", stub_success);
|
||||
@@ -42,7 +45,7 @@ pub fn register_exports(state: &mut KernelState) {
|
||||
// User
|
||||
state.register_export(Xam, 0x020A, "XamUserGetXUID", xam_user_get_xuid);
|
||||
state.register_export(Xam, 0x020E, "XamUserGetName", xam_user_get_name);
|
||||
state.register_export(Xam, 0x0210, "XamUserGetSigninState", stub_return_zero);
|
||||
state.register_export(Xam, 0x0210, "XamUserGetSigninState", xam_user_get_signin_state);
|
||||
state.register_export(Xam, 0x0219, "XamUserReadProfileSettings", xam_user_read_profile_settings);
|
||||
state.register_export(Xam, 0x021A, "XamUserWriteProfileSettings", stub_success);
|
||||
|
||||
@@ -94,47 +97,196 @@ pub fn register_exports(state: &mut KernelState) {
|
||||
|
||||
// ===== Generic stubs =====
|
||||
|
||||
fn stub_success(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
fn stub_success(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
fn stub_return_zero(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
fn stub_return_zero(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
fn stub_error_no_more_files(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
fn stub_error_no_more_files(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
|
||||
ctx.gpr[3] = 0x12; // ERROR_NO_MORE_FILES
|
||||
}
|
||||
|
||||
// ===== Input =====
|
||||
|
||||
fn xam_input_not_connected(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
ctx.gpr[3] = 0x48F; // ERROR_DEVICE_NOT_CONNECTED
|
||||
/// Helper: pack a `GamepadState` into a 12-byte key used to detect input
|
||||
/// changes. Cheap to compare across frames.
|
||||
fn gamepad_key(state: &xenia_hid::GamepadState) -> u128 {
|
||||
let mut bytes = [0u8; 16];
|
||||
bytes[0..2].copy_from_slice(&state.buttons.to_be_bytes());
|
||||
bytes[2] = state.left_trigger;
|
||||
bytes[3] = state.right_trigger;
|
||||
bytes[4..6].copy_from_slice(&state.left_stick_x.to_be_bytes());
|
||||
bytes[6..8].copy_from_slice(&state.left_stick_y.to_be_bytes());
|
||||
bytes[8..10].copy_from_slice(&state.right_stick_x.to_be_bytes());
|
||||
bytes[10..12].copy_from_slice(&state.right_stick_y.to_be_bytes());
|
||||
u128::from_be_bytes(bytes)
|
||||
}
|
||||
|
||||
fn xam_input_get_capabilities(
|
||||
ctx: &mut PpcContext,
|
||||
mem: &GuestMemory,
|
||||
state: &mut KernelState,
|
||||
) {
|
||||
// r3 = user_index, r4 = flags, r5 = out X_INPUT_CAPABILITIES*
|
||||
let user = ctx.gpr[3] as u32;
|
||||
let out_ptr = ctx.gpr[5] as u32;
|
||||
let connected = state.ui.as_ref().is_some_and(|ui| ui.is_connected(user));
|
||||
if !connected {
|
||||
ctx.gpr[3] = xenia_hid::errors::DEVICE_NOT_CONNECTED as u64;
|
||||
return;
|
||||
}
|
||||
xenia_hid::write_input_capabilities(mem, out_ptr);
|
||||
ctx.gpr[3] = xenia_hid::errors::SUCCESS as u64;
|
||||
}
|
||||
|
||||
fn xam_input_get_state(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
// r3 = user_index, r4 = flags, r5 = out X_INPUT_STATE*
|
||||
let user = ctx.gpr[3] as u32;
|
||||
let out_ptr = ctx.gpr[5] as u32;
|
||||
let Some(ui) = state.ui.as_ref() else {
|
||||
ctx.gpr[3] = xenia_hid::errors::DEVICE_NOT_CONNECTED as u64;
|
||||
return;
|
||||
};
|
||||
if !ui.is_connected(user) {
|
||||
ctx.gpr[3] = xenia_hid::errors::DEVICE_NOT_CONNECTED as u64;
|
||||
return;
|
||||
}
|
||||
let gamepad = ui.snapshot_gamepad();
|
||||
let key = gamepad_key(&gamepad);
|
||||
if key != state.last_input_bytes {
|
||||
state.input_packet_number = state.input_packet_number.wrapping_add(1);
|
||||
state.last_input_bytes = key;
|
||||
}
|
||||
xenia_hid::write_input_state(mem, out_ptr, state.input_packet_number, &gamepad);
|
||||
ctx.gpr[3] = xenia_hid::errors::SUCCESS as u64;
|
||||
}
|
||||
|
||||
fn xam_input_set_state(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
|
||||
// r3 = user_index, r4 = flags, r5 = X_INPUT_VIBRATION*
|
||||
// Rumble is out of scope for Phase 1; we accept the call and return
|
||||
// success so games don't retry in a tight loop, but we never actually
|
||||
// shake anything.
|
||||
let user = ctx.gpr[3] as u32;
|
||||
let connected = state.ui.as_ref().is_some_and(|ui| ui.is_connected(user));
|
||||
if !connected {
|
||||
ctx.gpr[3] = xenia_hid::errors::DEVICE_NOT_CONNECTED as u64;
|
||||
return;
|
||||
}
|
||||
ctx.gpr[3] = xenia_hid::errors::SUCCESS as u64;
|
||||
}
|
||||
|
||||
fn xam_input_get_keystroke(
|
||||
ctx: &mut PpcContext,
|
||||
_mem: &GuestMemory,
|
||||
_state: &mut KernelState,
|
||||
) {
|
||||
// No keyboard input in Phase 1 — always "queue empty". Games that only
|
||||
// use the gamepad ignore this return code; those that drive text entry
|
||||
// through the keystroke queue simply get a permanently empty queue, which
|
||||
// manifests as no virtual-keyboard input — acceptable for minimal UI.
|
||||
ctx.gpr[3] = xenia_hid::errors::EMPTY as u64;
|
||||
}
|
||||
|
||||
// ===== Loader =====
|
||||
|
||||
fn xam_loader_launch_title(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
/// `XamLoaderLaunchTitle` — not implemented: emits a warning so the call is
/// visible in logs, then returns 0 in r3 without launching anything.
fn xam_loader_launch_title(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
    const RESULT: u64 = 0;
    tracing::warn!("XamLoaderLaunchTitle called");
    ctx.gpr[3] = RESULT;
}
|
||||
|
||||
fn xam_loader_terminate_title(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
/// `XamLoaderTerminateTitle` — not implemented: emits a warning so the call
/// is visible in logs, then returns 0 in r3 without terminating anything.
fn xam_loader_terminate_title(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
    const RESULT: u64 = 0;
    tracing::warn!("XamLoaderTerminateTitle called");
    ctx.gpr[3] = RESULT;
}
|
||||
|
||||
// ===== Task =====
|
||||
|
||||
fn xam_task_schedule(ctx: &mut PpcContext, _mem: &mut GuestMemory, state: &mut KernelState) {
|
||||
let handle = state.alloc_handle();
|
||||
tracing::info!("XamTaskSchedule: handle={:#x}", handle);
|
||||
ctx.gpr[3] = 0;
|
||||
/// `XamTaskSchedule(callback, message, optional_ptr, handle_ptr_out)` —
|
||||
/// spawn a guest thread that runs `callback(message)` asynchronously.
|
||||
/// Mirrors xenia-canary's `XamTaskSchedule_entry` (xam_task.cc:43-80):
|
||||
/// stack is `max(0x4000, page-aligned default)`, the new thread enters at
|
||||
/// `callback` with `message` in r3, and the resulting thread handle is
|
||||
/// written to `handle_ptr_out`.
|
||||
fn xam_task_schedule(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
let callback = ctx.gpr[3] as u32;
|
||||
let message_ptr = ctx.gpr[4] as u32;
|
||||
let optional_ptr = ctx.gpr[5] as u32;
|
||||
let handle_ptr = ctx.gpr[6] as u32;
|
||||
|
||||
if optional_ptr != 0 {
|
||||
let v1 = mem.read_u32(optional_ptr);
|
||||
let v2 = mem.read_u32(optional_ptr + 4);
|
||||
tracing::info!("XamTaskSchedule: args v1={:#010x} v2={:#010x}", v1, v2);
|
||||
}
|
||||
|
||||
let stack_size = std::cmp::max(0x4000u32, (0x10_0000u32 + 0xFFF) & !0xFFF);
|
||||
|
||||
let Some(image) = allocate_thread_image(state, mem, stack_size, 0) else {
|
||||
tracing::error!("XamTaskSchedule: failed to allocate thread image");
|
||||
ctx.gpr[3] = 0xC000_009A; // STATUS_INSUFFICIENT_RESOURCES
|
||||
return;
|
||||
};
|
||||
|
||||
use std::sync::atomic::Ordering;
|
||||
let tid = state.next_thread_id.fetch_add(1, Ordering::Relaxed);
|
||||
let handle = state.alloc_handle_for(KernelObject::Thread {
|
||||
id: tid,
|
||||
hw_id: None,
|
||||
exit_code: None,
|
||||
waiters: Vec::new(),
|
||||
});
|
||||
|
||||
let tls_slot_count = state.next_tls_index.load(Ordering::Relaxed);
|
||||
let params = SpawnParams {
|
||||
entry: callback,
|
||||
start_context: message_ptr,
|
||||
stack_base: image.stack_base,
|
||||
stack_size: image.stack_size,
|
||||
pcr_base: image.pcr_base,
|
||||
tls_base: image.tls_base,
|
||||
thread_handle: handle,
|
||||
guest_tid: tid,
|
||||
create_suspended: false,
|
||||
is_initial: false,
|
||||
tls_slot_count,
|
||||
affinity_mask: 0,
|
||||
priority: 0,
|
||||
ideal_processor: None,
|
||||
};
|
||||
match state.scheduler.spawn(params, &mut GuestMemoryPcr(mem)) {
|
||||
Ok(hw_id) => {
|
||||
metrics::counter!("scheduler.spawn.ok").increment(1);
|
||||
if let Some(KernelObject::Thread { hw_id: slot, .. }) = state.objects.get_mut(&handle) {
|
||||
*slot = Some(hw_id);
|
||||
}
|
||||
if handle_ptr != 0 {
|
||||
mem.write_u32(handle_ptr, handle);
|
||||
}
|
||||
state.audit_create_with_ctx(handle, "Thread", ctx, mem, "XamTaskSchedule");
|
||||
tracing::info!(
|
||||
"XamTaskSchedule: tid={} handle={:#x} hw={} callback={:#010x} message={:#010x}",
|
||||
tid,
|
||||
handle,
|
||||
hw_id,
|
||||
callback,
|
||||
message_ptr,
|
||||
);
|
||||
ctx.gpr[3] = 0; // STATUS_SUCCESS
|
||||
}
|
||||
Err(_) => {
|
||||
metrics::counter!("scheduler.spawn.rejected").increment(1);
|
||||
tracing::error!("XamTaskSchedule: no free HW thread slot");
|
||||
ctx.gpr[3] = 0xC000_009A;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ===== Alloc =====
|
||||
|
||||
fn xam_alloc(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelState) {
|
||||
fn xam_alloc(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
// r3 = flags, r4 = size, r5 = out_ptr_ptr
|
||||
let size = ctx.gpr[4] as u32;
|
||||
let out_ptr = ctx.gpr[5] as u32;
|
||||
@@ -154,7 +306,7 @@ fn xam_alloc(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelStat
|
||||
|
||||
// ===== User =====
|
||||
|
||||
fn xam_user_get_xuid(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
fn xam_user_get_xuid(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
|
||||
// r3 = user_index, r4 = xuid_ptr
|
||||
let xuid_ptr = ctx.gpr[4] as u32;
|
||||
if xuid_ptr != 0 {
|
||||
@@ -163,7 +315,7 @@ fn xam_user_get_xuid(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut K
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
fn xam_user_get_name(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
fn xam_user_get_name(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
|
||||
// r3 = user_index, r4 = buffer, r5 = buffer_size
|
||||
let buffer = ctx.gpr[4] as u32;
|
||||
if buffer != 0 {
|
||||
@@ -172,14 +324,20 @@ fn xam_user_get_name(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut K
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
fn xam_user_read_profile_settings(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
/// `XamUserReadProfileSettings` — there is no profile to read, so every call
/// fails with `ERROR_NOT_FOUND` (0x48B) in r3.
fn xam_user_read_profile_settings(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
    const ERROR_NOT_FOUND: u64 = 0x0000_048B;
    ctx.gpr[3] = ERROR_NOT_FOUND;
}
|
||||
|
||||
/// `XamUserGetSigninState` — r3 = user_index.
///
/// Returns 1 in r3 for user 0 and 0 for every other index.
fn xam_user_get_signin_state(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
    let is_first_user = (ctx.gpr[3] as u32) == 0;
    ctx.gpr[3] = u64::from(is_first_user);
}
|
||||
|
||||
// ===== System =====
|
||||
|
||||
fn xam_get_execution_id(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelState) {
|
||||
fn xam_get_execution_id(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
// r3 = execution_id_ptr_ptr — write pointer to execution info
|
||||
let ptr_ptr = ctx.gpr[3] as u32;
|
||||
if ptr_ptr != 0 {
|
||||
@@ -197,25 +355,124 @@ fn xam_get_execution_id(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
fn xam_get_system_version(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
/// `XamGetSystemVersion` — returns the fixed system version word
/// `0x2000_0000` in r3.
fn xam_get_system_version(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
    const SYSTEM_VERSION: u64 = 0x2000_0000;
    ctx.gpr[3] = SYSTEM_VERSION;
}
|
||||
|
||||
// ===== Notify =====
|
||||
|
||||
fn xam_notify_create_listener(ctx: &mut PpcContext, _mem: &mut GuestMemory, state: &mut KernelState) {
|
||||
let handle = state.alloc_handle();
|
||||
/// Listener-mask bit selecting system notifications (mask index 0).
const K_X_NOTIFY_SYSTEM: u64 = 0x0000_0001;
/// Listener-mask bit selecting Live notifications (mask index 1).
const K_X_NOTIFY_LIVE: u64 = 0x0000_0002;

/// System notification id: UI state.
const K_X_NOTIFICATION_SYSTEM_UI: u32 = 0x09;
/// System notification id: sign-in changed.
const K_X_NOTIFICATION_SYSTEM_SIGN_IN_CHANGED: u32 = 0x0A;
/// Live notification id: connection changed.
const K_X_NOTIFICATION_LIVE_CONNECTION_CHANGED: u32 = 0x0200_0001;
/// Live notification id: link state changed.
const K_X_NOTIFICATION_LIVE_LINK_STATE_CHANGED: u32 = 0x0200_0003;
|
||||
|
||||
/// Extracts the 6-bit mask index stored in bits 25..=30 of a notification
/// id. The result selects which bit of a listener's mask the id is checked
/// against.
fn notification_mask_index(id: u32) -> u32 {
    const SHIFT: u32 = 25;
    const FIELD: u32 = 0x3F;
    let shifted = id >> SHIFT;
    shifted & FIELD
}
|
||||
|
||||
/// Extracts the 9-bit version field stored in bits 16..=24 of a
/// notification id.
fn notification_version(id: u32) -> u32 {
    const SHIFT: u32 = 16;
    const FIELD: u32 = 0x1FF;
    let shifted = id >> SHIFT;
    shifted & FIELD
}
|
||||
|
||||
fn enqueue_notification(obj: &mut KernelObject, id: u32, data: u32) {
|
||||
if let KernelObject::NotifyListener { mask, max_version, queue, .. } = obj {
|
||||
let idx = notification_mask_index(id);
|
||||
if (*mask & (1u64 << idx)) == 0 {
|
||||
return;
|
||||
}
|
||||
if notification_version(id) > *max_version {
|
||||
return;
|
||||
}
|
||||
queue.push_back((id, data));
|
||||
}
|
||||
}
|
||||
|
||||
fn xam_notify_create_listener(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
let mask = ctx.gpr[3];
|
||||
let mut max_version = ctx.gpr[4] as u32;
|
||||
if max_version > 10 {
|
||||
max_version = 10;
|
||||
}
|
||||
let handle = state.alloc_handle_for(KernelObject::NotifyListener {
|
||||
mask,
|
||||
max_version,
|
||||
queue: std::collections::VecDeque::new(),
|
||||
waiters: Vec::new(),
|
||||
});
|
||||
|
||||
let mut startup_pending: Vec<(u32, u32)> = Vec::new();
|
||||
if !state.has_notified_startup && (mask & K_X_NOTIFY_SYSTEM) != 0 {
|
||||
state.has_notified_startup = true;
|
||||
startup_pending.push((K_X_NOTIFICATION_SYSTEM_UI, 0));
|
||||
startup_pending.push((K_X_NOTIFICATION_SYSTEM_SIGN_IN_CHANGED, 1));
|
||||
}
|
||||
if !state.has_notified_live_startup && (mask & K_X_NOTIFY_LIVE) != 0 {
|
||||
state.has_notified_live_startup = true;
|
||||
startup_pending.push((K_X_NOTIFICATION_LIVE_CONNECTION_CHANGED, 0x001510F1));
|
||||
startup_pending.push((K_X_NOTIFICATION_LIVE_LINK_STATE_CHANGED, 0));
|
||||
}
|
||||
if let Some(obj) = state.objects.get_mut(&handle) {
|
||||
for (id, data) in startup_pending {
|
||||
enqueue_notification(obj, id, data);
|
||||
}
|
||||
}
|
||||
|
||||
let _ = mem;
|
||||
ctx.gpr[3] = handle as u64;
|
||||
}
|
||||
|
||||
fn xnotify_get_next(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
// r3 = handle, r4 = id_ptr, r5 = param_ptr
|
||||
ctx.gpr[3] = 0; // FALSE (no notifications)
|
||||
fn xnotify_get_next(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
let handle = ctx.gpr[3] as u32;
|
||||
let match_id = ctx.gpr[4] as u32;
|
||||
let id_ptr = ctx.gpr[5] as u32;
|
||||
let param_ptr = ctx.gpr[6] as u32;
|
||||
|
||||
if param_ptr != 0 {
|
||||
mem.write_u32(param_ptr, 0);
|
||||
}
|
||||
if id_ptr == 0 {
|
||||
ctx.gpr[3] = 0;
|
||||
return;
|
||||
}
|
||||
mem.write_u32(id_ptr, 0);
|
||||
|
||||
let Some(KernelObject::NotifyListener { queue, .. }) = state.objects.get_mut(&handle) else {
|
||||
ctx.gpr[3] = 0;
|
||||
return;
|
||||
};
|
||||
|
||||
let dequeued = if match_id != 0 {
|
||||
let pos = queue.iter().position(|&(id, _)| id == match_id);
|
||||
match pos {
|
||||
Some(p) => {
|
||||
let (id, data) = queue.remove(p).unwrap();
|
||||
Some((id, data))
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
} else {
|
||||
queue.pop_front()
|
||||
};
|
||||
|
||||
match dequeued {
|
||||
Some((id, data)) => {
|
||||
mem.write_u32(id_ptr, id);
|
||||
if param_ptr != 0 {
|
||||
mem.write_u32(param_ptr, data);
|
||||
}
|
||||
ctx.gpr[3] = 1;
|
||||
}
|
||||
None => {
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ===== Session =====
|
||||
|
||||
fn xam_session_create_handle(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelState) {
|
||||
fn xam_session_create_handle(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
// r3 = handle_ptr
|
||||
let handle_ptr = ctx.gpr[3] as u32;
|
||||
let handle = state.alloc_handle();
|
||||
@@ -227,19 +484,19 @@ fn xam_session_create_handle(ctx: &mut PpcContext, mem: &mut GuestMemory, state:
|
||||
|
||||
// ===== Locale =====
|
||||
|
||||
fn xget_avpack(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
ctx.gpr[3] = 0x16; // HDMI
|
||||
/// `XGetAVPack` — reports a fixed AV pack code of 8 in r3.
// NOTE(review): the accompanying unit test is named `..._returns_hdmi`, so 8
// is presumably the HDMI pack id — confirm against the reference.
fn xget_avpack(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
    const AV_PACK: u64 = 8;
    ctx.gpr[3] = AV_PACK;
}
|
||||
|
||||
fn xget_game_region(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
/// `XGetGameRegion` — reports `0xFF` (all regions) in r3.
fn xget_game_region(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
    const ALL_REGIONS: u64 = 0xFF;
    ctx.gpr[3] = ALL_REGIONS;
}
|
||||
|
||||
fn xget_language(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
/// `XGetLanguage` — reports language id 1 (English) in r3.
fn xget_language(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
    const ENGLISH: u64 = 1;
    ctx.gpr[3] = ENGLISH;
}
|
||||
|
||||
fn xget_video_mode(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
|
||||
fn xget_video_mode(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
|
||||
// r3 = video_mode_ptr
|
||||
let ptr = ctx.gpr[3] as u32;
|
||||
if ptr != 0 {
|
||||
@@ -251,3 +508,184 @@ fn xget_video_mode(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut Ker
|
||||
}
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use xenia_memory::page_table::MemoryProtect;

    /// Base of the single read/write page every test uses as guest scratch.
    const SCRATCH_BASE: u32 = 0x4000_0000;
    /// Scratch slot receiving the notification id from `xnotify_get_next`.
    const ID_SLOT: u32 = SCRATCH_BASE + 0x100;
    /// Scratch slot receiving the notification parameter.
    const PARAM_SLOT: u32 = SCRATCH_BASE + 0x104;

    /// Builds a fresh context, guest memory with the scratch page committed,
    /// and a kernel state with its initial thread installed and slot 0
    /// marked as the one being visited.
    fn fresh() -> (PpcContext, GuestMemory, KernelState) {
        let mut mem = GuestMemory::new().expect("memory init");
        mem.alloc(SCRATCH_BASE, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE)
            .expect("scratch page must commit");

        let mut state = KernelState::new();
        state.install_initial_thread(
            PpcContext::default(),
            0x7000_0000,
            0x10_0000,
            SCRATCH_BASE + 0x800,
            SCRATCH_BASE + 0xC00,
            0xF000_0001,
            &mut mem,
        );
        state.scheduler.begin_slot_visit(0);

        (PpcContext::default(), mem, state)
    }

    /// Repeatedly calls `xnotify_get_next` (match id 0 = any) until it
    /// reports an empty queue, collecting every `(id, data)` pair returned.
    fn drain_notifications(state: &mut KernelState, mem: &GuestMemory, handle: u32) -> Vec<(u32, u32)> {
        let mut collected = Vec::new();
        loop {
            let mut call = PpcContext::default();
            call.gpr[3] = handle as u64;
            call.gpr[4] = 0;
            call.gpr[5] = ID_SLOT as u64;
            call.gpr[6] = PARAM_SLOT as u64;
            xnotify_get_next(&mut call, mem, state);
            if call.gpr[3] == 0 {
                return collected;
            }

            collected.push((mem.read_u32(ID_SLOT), mem.read_u32(PARAM_SLOT)));
            if collected.len() > 16 {
                panic!("runaway dequeue");
            }
        }
    }

    /// Invokes `xam_notify_create_listener` and returns the handle from r3.
    fn create_listener(state: &mut KernelState, mem: &GuestMemory, mask: u64, max_version: u32) -> u32 {
        let mut call = PpcContext::default();
        call.gpr[3] = mask;
        call.gpr[4] = max_version as u64;
        xam_notify_create_listener(&mut call, mem, state);
        call.gpr[3] as u32
    }

    #[test]
    fn xam_task_schedule_spawns_real_thread() {
        let (mut ctx, mut mem, mut state) = fresh();

        let callback_pc: u32 = 0x824a_93c8;
        let message_ptr: u32 = SCRATCH_BASE + 0x100;
        let handle_out: u32 = SCRATCH_BASE + 0x200;
        ctx.gpr[3] = callback_pc as u64;
        ctx.gpr[4] = message_ptr as u64;
        ctx.gpr[5] = 0;
        ctx.gpr[6] = handle_out as u64;
        ctx.lr = 0x824a_9a14;

        xam_task_schedule(&mut ctx, &mut mem, &mut state);

        assert_eq!(ctx.gpr[3], 0, "XamTaskSchedule must return STATUS_SUCCESS");

        let handle = mem.read_u32(handle_out);
        assert!(handle >= 0x1000, "handle must be allocated, got {:#x}", handle);

        let thread_ref = state
            .scheduler
            .find_by_handle(handle)
            .expect("spawned thread must be findable by handle");
        let spawned = state.scheduler.ctx_mut_ref(thread_ref);
        assert_eq!(spawned.pc, callback_pc, "entry PC must be the callback");
        assert_eq!(
            spawned.gpr[3] as u32, message_ptr,
            "r3 must hold the message pointer"
        );

        match state.objects.get(&handle) {
            Some(KernelObject::Thread { hw_id: Some(_), .. }) => {}
            other => panic!("expected Thread object with hw_id set, got {:?}", other),
        }
    }

    #[test]
    fn xget_avpack_returns_hdmi() {
        let (mut ctx, mem, mut state) = fresh();
        xget_avpack(&mut ctx, &mem, &mut state);
        assert_eq!(ctx.gpr[3], 8);
    }

    #[test]
    fn xam_user_get_signin_state_user0_signed_in_locally() {
        let (mut ctx, mem, mut state) = fresh();

        // (user index, expected sign-in state)
        for (user_index, expected) in [(0u64, 1u64), (1, 0), (4, 0)] {
            ctx.gpr[3] = user_index;
            xam_user_get_signin_state(&mut ctx, &mem, &mut state);
            assert_eq!(ctx.gpr[3], expected);
        }
    }

    #[test]
    fn first_listener_with_full_mask_gets_4_startup_notifications() {
        let (_ctx, mem, mut state) = fresh();
        let listener = create_listener(&mut state, &mem, 0x2F, 10);
        let expected = vec![
            (K_X_NOTIFICATION_SYSTEM_UI, 0),
            (K_X_NOTIFICATION_SYSTEM_SIGN_IN_CHANGED, 1),
            (K_X_NOTIFICATION_LIVE_CONNECTION_CHANGED, 0x001510F1),
            (K_X_NOTIFICATION_LIVE_LINK_STATE_CHANGED, 0),
        ];
        assert_eq!(drain_notifications(&mut state, &mem, listener), expected);
    }

    #[test]
    fn second_listener_does_not_re_fire_startup() {
        let (_ctx, mem, mut state) = fresh();
        let first = create_listener(&mut state, &mem, 0x2F, 10);
        let _ = drain_notifications(&mut state, &mem, first);
        let second = create_listener(&mut state, &mem, 0x2F, 10);
        assert!(drain_notifications(&mut state, &mem, second).is_empty());
    }

    #[test]
    fn system_only_mask_filters_live() {
        let (_ctx, mem, mut state) = fresh();
        let listener = create_listener(&mut state, &mem, K_X_NOTIFY_SYSTEM, 10);
        let expected = vec![
            (K_X_NOTIFICATION_SYSTEM_UI, 0),
            (K_X_NOTIFICATION_SYSTEM_SIGN_IN_CHANGED, 1),
        ];
        assert_eq!(drain_notifications(&mut state, &mem, listener), expected);
    }

    #[test]
    fn version_filter_drops_too_new() {
        let (_ctx, mem, mut state) = fresh();
        let listener = create_listener(&mut state, &mem, 0x2F, 0);
        let drained = drain_notifications(&mut state, &mem, listener);
        let kept_versions: Vec<u32> = drained
            .iter()
            .map(|&(id, _)| notification_version(id))
            .collect();
        assert!(kept_versions.iter().all(|&version| version == 0));
    }

    #[test]
    fn xnotify_get_next_returns_zero_for_unknown_handle() {
        let (mut ctx, mem, mut state) = fresh();
        ctx.gpr[3] = 0xDEAD_BEEF;
        ctx.gpr[4] = 0;
        ctx.gpr[5] = ID_SLOT as u64;
        ctx.gpr[6] = PARAM_SLOT as u64;
        xnotify_get_next(&mut ctx, &mem, &mut state);
        assert_eq!(ctx.gpr[3], 0);
        assert_eq!(mem.read_u32(ID_SLOT), 0);
        assert_eq!(mem.read_u32(PARAM_SLOT), 0);
    }
}
|
||||
|
||||
333
crates/xenia-kernel/src/xaudio.rs
Normal file
333
crates/xenia-kernel/src/xaudio.rs
Normal file
@@ -0,0 +1,333 @@
|
||||
//! XAudio render-driver-client registration + buffer-complete callback loop
|
||||
//! (canary parity: `xenia/apu/audio_system.cc`).
|
||||
//!
|
||||
//! Replaces the host-thread + per-client-semaphore + XAudio2 driver layer with
|
||||
//! a periodic ticker that enqueues a "buffer complete" fire for each
|
||||
//! registered client at the audio frame rate (256 samples / 48 kHz ≈ 5.33 ms).
|
||||
//! The injection path in `xenia-app` reuses the same [`crate::SavedCallbackCtx`]
|
||||
//! plumbing the graphics-interrupt path uses — only one callback runs at a
|
||||
//! time across either subsystem, gated by `interrupts.is_in_callback()`.
|
||||
//!
|
||||
//! Lockstep mode uses an instruction-count proxy
|
||||
//! ([`XAUDIO_INSTR_PERIOD`]) so `--stable-digest` stays bit-exact;
|
||||
//! `--parallel` uses wall-clock ([`XAUDIO_PERIOD`]) — same dual-mode pattern
|
||||
//! as KRNBUG-D08 v-sync.
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use xenia_cpu::ThreadRef;
|
||||
|
||||
/// Maximum number of simultaneously registered render-driver clients.
/// Mirrors [audio_system.h:30](../../../../xenia-canary/src/xenia/apu/audio_system.h#L30)
/// `kMaximumClientCount = 8`.
pub const XAUDIO_MAX_CLIENTS: usize = 8;

/// AUDIT-032 Plan B: base value for the synthetic kernel handles the
/// dedicated audio worker threads park on (`WaitAny`).
///
/// The range is chosen to sit OUTSIDE what the normal allocator hands out
/// ([`crate::state::KernelState::alloc_handle`] starts at `0x1000` and grows
/// by 4), so a `state.objects` lookup always misses and
/// [`crate::exports::wake_eligible_waiters`] can never spuriously wake a
/// worker. The only legitimate transitions out of
/// `Blocked(WaitAny[SYNTHETIC])` are the audio-callback injection in
/// `try_inject_audio_callback` (state→`ServicingIrq`) and the `LR_HALT`
/// saved-context restore (state→`Blocked` again). Using one handle per client
/// slot keeps wait lists per-worker (defensive — `wake_eligible` is a no-op
/// anyway).
pub const XAUDIO_SYNTHETIC_HANDLE_BASE: u32 = 0xF000_0000;

/// Returns the synthetic park handle for client slot `i`: the slot index
/// OR-ed into [`XAUDIO_SYNTHETIC_HANDLE_BASE`].
pub const fn synthetic_park_handle(i: usize) -> u32 {
    let slot = i as u32;
    XAUDIO_SYNTHETIC_HANDLE_BASE | slot
}

/// Source code stamped into [`crate::SavedCallbackCtx::source`] when an
/// audio callback is injected. Kept distinct from the graphics-interrupt
/// sources (`INTERRUPT_SOURCE_VSYNC = 0`, `INTERRUPT_SOURCE_CP = 1`) so logs
/// and the audit trail can tell them apart.
pub const INTERRUPT_SOURCE_AUDIO: u32 = 0x100;

/// Lockstep period, measured in guest instructions. Chosen so its ratio to
/// [`crate::interrupts::VSYNC_INSTR_PERIOD`] (`150_000`) ≈ 16.67 ms / 5.33 ms,
/// matching canary's 256 samples / 48 kHz audio cadence.
pub const XAUDIO_INSTR_PERIOD: u64 = 48_000;

/// Wall-clock period used under `--parallel`: 256 / 48000 s = 5.333… ms.
pub const XAUDIO_PERIOD: Duration = Duration::from_nanos(5_333_333);

/// Upper bound on the pending-fires FIFO, so a long-running export cannot
/// queue callbacks without limit while injection is starved.
pub const XAUDIO_QUEUE_CAP: usize = 16;
|
||||
|
||||
/// One registered render-driver client: where to call back and with what.
#[derive(Debug, Clone, Copy)]
pub struct XAudioClient {
    /// Guest address of the client's callback routine.
    pub callback_pc: u32,
    /// Raw argument value the client registered alongside the callback.
    pub callback_arg: u32,
    /// Guest pointer to the heap-allocated 4-byte buffer that holds
    /// `callback_arg` in big-endian form. This pointer — not the raw
    /// argument — is what the guest callback receives in r3, matching
    /// canary's
    /// [audio_system.cc:225-228](../../../../xenia-canary/src/xenia/apu/audio_system.cc#L225-L228)
    /// + [audio_system.cc:139-141](../../../../xenia-canary/src/xenia/apu/audio_system.cc#L139-L141).
    pub wrapped_callback_arg: u32,
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct XAudioState {
|
||||
pub clients: [Option<XAudioClient>; XAUDIO_MAX_CLIENTS],
|
||||
pub pending: VecDeque<usize>,
|
||||
pub delivered: u64,
|
||||
pub dropped: u64,
|
||||
pub accumulator: u64,
|
||||
pub last_instr_count: u64,
|
||||
pub last_instant: Option<Instant>,
|
||||
/// AUDIT-032 Plan B: dedicated audio-worker thread per client slot.
|
||||
/// Mirrors xenia-canary's `apu/audio_system.cc:84-159` host worker but
|
||||
/// using a guest-side parked thread instead — registered at
|
||||
/// `XAudioRegisterRenderDriverClient` time and lazily looked up by
|
||||
/// `try_inject_audio_callback` via `scheduler.find_by_handle`. The
|
||||
/// worker is parked in `Blocked(WaitAny[SYNTHETIC_HANDLE])`; injection
|
||||
/// flips it to `ServicingIrq` and the `LR_HALT` restore path puts it
|
||||
/// back to `Blocked`. Each slot also remembers the kernel handle so
|
||||
/// `find_by_handle` can resolve a fresh `ThreadRef` after slot
|
||||
/// pruning/reordering. Phantom-typed for callers that don't link
|
||||
/// `xenia_cpu` (none currently) to keep this self-contained.
|
||||
pub worker_handles: [Option<u32>; XAUDIO_MAX_CLIENTS],
|
||||
pub worker_refs: [Option<ThreadRef>; XAUDIO_MAX_CLIENTS],
|
||||
}
|
||||
|
||||
impl Default for XAudioState {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
clients: [None; XAUDIO_MAX_CLIENTS],
|
||||
pending: VecDeque::new(),
|
||||
delivered: 0,
|
||||
dropped: 0,
|
||||
accumulator: 0,
|
||||
last_instr_count: 0,
|
||||
last_instant: None,
|
||||
worker_handles: [None; XAUDIO_MAX_CLIENTS],
|
||||
worker_refs: [None; XAUDIO_MAX_CLIENTS],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl XAudioState {
|
||||
pub fn register(&mut self, client: XAudioClient) -> Option<usize> {
|
||||
for (i, slot) in self.clients.iter_mut().enumerate() {
|
||||
if slot.is_none() {
|
||||
*slot = Some(client);
|
||||
return Some(i);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub fn unregister(&mut self, index: usize) {
|
||||
if index < XAUDIO_MAX_CLIENTS {
|
||||
self.clients[index] = None;
|
||||
self.pending.retain(|&i| i != index);
|
||||
// Worker thread (if any) stays parked on its synthetic handle
|
||||
// — Sylpheed never re-registers, so leaving it Blocked is
|
||||
// simpler than wiring a clean teardown. Clear our refs so a
|
||||
// future `register` rebuilds them.
|
||||
self.worker_handles[index] = None;
|
||||
self.worker_refs[index] = None;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, index: usize) -> Option<XAudioClient> {
|
||||
self.clients.get(index).copied().flatten()
|
||||
}
|
||||
|
||||
pub fn any_registered(&self) -> bool {
|
||||
self.clients.iter().any(|c| c.is_some())
|
||||
}
|
||||
|
||||
fn enqueue_all_active(&mut self) {
|
||||
for i in 0..XAUDIO_MAX_CLIENTS {
|
||||
if self.clients[i].is_none() {
|
||||
continue;
|
||||
}
|
||||
if self.pending.len() >= XAUDIO_QUEUE_CAP {
|
||||
self.dropped += 1;
|
||||
return;
|
||||
}
|
||||
self.pending.push_back(i);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn peek_next(&self) -> Option<usize> {
|
||||
self.pending.front().copied()
|
||||
}
|
||||
|
||||
pub fn take_next(&mut self) -> Option<usize> {
|
||||
self.pending.pop_front()
|
||||
}
|
||||
|
||||
/// Lockstep instruction-count ticker. Idempotently advances the
|
||||
/// accumulator from `last_instr_count` to `current_instr_count` and
|
||||
/// enqueues one fire-set per full [`XAUDIO_INSTR_PERIOD`] crossed.
|
||||
/// Returns `true` iff at least one fire was queued.
|
||||
pub fn tick_instr(&mut self, current_instr_count: u64) -> bool {
|
||||
if !self.any_registered() {
|
||||
self.last_instr_count = current_instr_count;
|
||||
self.accumulator = 0;
|
||||
return false;
|
||||
}
|
||||
let delta = current_instr_count.saturating_sub(self.last_instr_count);
|
||||
self.last_instr_count = current_instr_count;
|
||||
self.accumulator = self.accumulator.saturating_add(delta);
|
||||
if self.accumulator < XAUDIO_INSTR_PERIOD {
|
||||
return false;
|
||||
}
|
||||
let periods = self.accumulator / XAUDIO_INSTR_PERIOD;
|
||||
self.accumulator %= XAUDIO_INSTR_PERIOD;
|
||||
let to_fire = (periods as usize).min(XAUDIO_QUEUE_CAP);
|
||||
for _ in 0..to_fire {
|
||||
self.enqueue_all_active();
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// Wall-clock ticker for `--parallel`. First call seeds the anchor
|
||||
/// (no fire). Subsequent calls fire `floor(elapsed / XAUDIO_PERIOD)`
|
||||
/// fire-sets and advance the anchor by that many full periods.
|
||||
pub fn tick_wallclock(&mut self) -> bool {
|
||||
if !self.any_registered() {
|
||||
self.last_instant = None;
|
||||
return false;
|
||||
}
|
||||
let now = Instant::now();
|
||||
let anchor = match self.last_instant {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
self.last_instant = Some(now);
|
||||
return false;
|
||||
}
|
||||
};
|
||||
let elapsed = now.saturating_duration_since(anchor);
|
||||
let period_ns = XAUDIO_PERIOD.as_nanos() as u64;
|
||||
let elapsed_ns = elapsed.as_nanos() as u64;
|
||||
let periods = elapsed_ns / period_ns;
|
||||
if periods == 0 {
|
||||
return false;
|
||||
}
|
||||
let advance = Duration::from_nanos(periods * period_ns);
|
||||
self.last_instant = Some(anchor + advance);
|
||||
let to_fire = (periods as usize).min(XAUDIO_QUEUE_CAP);
|
||||
for _ in 0..to_fire {
|
||||
self.enqueue_all_active();
|
||||
}
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Unit tests for `XAudioState`: slot registration / removal and the two
    //! tick paths (`tick_instr` and `tick_wallclock`).

    use super::*;

    /// Builds a client whose fields are all derived from `arg`, so clients
    /// created with different arguments can be told apart in assertions.
    fn dummy_client(arg: u32) -> XAudioClient {
        let callback_pc = 0x8200_0000 + arg;
        let wrapped_callback_arg = 0x4000_0000 + arg;
        XAudioClient {
            callback_pc,
            callback_arg: arg,
            wrapped_callback_arg,
        }
    }

    /// Two registrations on a fresh state are expected to land in slots 0 and 1,
    /// each holding the client that was passed in.
    #[test]
    fn register_assigns_first_free_slot() {
        let mut state = XAudioState::default();

        let first = state.register(dummy_client(1)).unwrap();
        let second = state.register(dummy_client(2)).unwrap();

        assert_eq!(first, 0);
        assert_eq!(second, 1);
        for (slot, expected_arg) in [(0, 1), (1, 2)] {
            assert_eq!(state.get(slot).unwrap().callback_arg, expected_arg);
        }
    }

    /// Unregistering must drop both the slot contents and any queued entry
    /// that refers to that slot.
    #[test]
    fn unregister_clears_slot_and_pending() {
        let mut state = XAudioState::default();
        let slot = state.register(dummy_client(1)).unwrap();
        state.pending.push_back(slot);

        state.unregister(slot);

        assert!(state.get(slot).is_none());
        assert!(state.pending.is_empty());
    }

    /// After `XAUDIO_MAX_CLIENTS` successful registrations the next one must
    /// be refused.
    #[test]
    fn register_returns_none_when_full() {
        let mut state = XAudioState::default();

        for n in 0..XAUDIO_MAX_CLIENTS {
            let client = dummy_client(n as u32);
            assert!(state.register(client).is_some());
        }

        let overflow = state.register(dummy_client(99));
        assert!(overflow.is_none());
    }

    /// With nothing registered, even a large instruction count must not
    /// report a fire or queue anything.
    #[test]
    fn tick_instr_no_clients_does_not_fire() {
        let mut state = XAudioState::default();

        let fired = state.tick_instr(XAUDIO_INSTR_PERIOD * 10);

        assert!(!fired);
        assert!(state.pending.is_empty());
    }

    /// One instruction short of the period stays silent; the full period
    /// queues the registered client.
    #[test]
    fn tick_instr_fires_at_period() {
        let mut state = XAudioState::default();
        let slot = state.register(dummy_client(7)).unwrap();

        let fired_early = state.tick_instr(XAUDIO_INSTR_PERIOD - 1);
        assert!(!fired_early);
        assert!(state.pending.is_empty());

        let fired_on_time = state.tick_instr(XAUDIO_INSTR_PERIOD);
        assert!(fired_on_time);
        assert_eq!(state.peek_next(), Some(slot));
    }

    /// A single call spanning four periods queues the client four times.
    #[test]
    fn tick_instr_drains_multiple_periods_in_one_call() {
        let mut state = XAudioState::default();
        let slot = state.register(dummy_client(7)).unwrap();

        assert!(state.tick_instr(XAUDIO_INSTR_PERIOD * 4));
        assert_eq!(state.pending.len(), 4);

        let mut taken = 0;
        while taken < 4 {
            assert_eq!(state.take_next(), Some(slot));
            taken += 1;
        }
        assert!(state.pending.is_empty());
    }

    /// One period with two registered clients queues both, in registration
    /// order.
    #[test]
    fn tick_instr_fires_for_each_registered_client() {
        let mut state = XAudioState::default();
        let first = state.register(dummy_client(1)).unwrap();
        let second = state.register(dummy_client(2)).unwrap();

        assert!(state.tick_instr(XAUDIO_INSTR_PERIOD));

        assert_eq!(state.pending.len(), 2);
        assert_eq!(state.take_next(), Some(first));
        assert_eq!(state.take_next(), Some(second));
    }

    /// Ticking far more periods than `XAUDIO_QUEUE_CAP` must not grow the
    /// pending queue past that cap.
    #[test]
    fn tick_instr_caps_queue_growth() {
        let mut state = XAudioState::default();
        state.register(dummy_client(1)).unwrap();

        let oversized = XAUDIO_INSTR_PERIOD * (XAUDIO_QUEUE_CAP as u64 + 50);
        state.tick_instr(oversized);

        assert!(state.pending.len() <= XAUDIO_QUEUE_CAP);
    }

    /// The very first wall-clock tick only records the time anchor: it
    /// reports no fire and queues nothing.
    #[test]
    fn tick_wallclock_first_call_seeds_anchor() {
        let mut state = XAudioState::default();
        state.register(dummy_client(1)).unwrap();

        let fired = state.tick_wallclock();

        assert!(!fired);
        assert!(state.pending.is_empty());
        assert!(state.last_instant.is_some());
    }

    /// Once the anchor is seeded, waiting slightly longer than one
    /// `XAUDIO_PERIOD` makes the next wall-clock tick fire.
    #[test]
    fn tick_wallclock_fires_after_period() {
        let mut state = XAudioState::default();
        let slot = state.register(dummy_client(1)).unwrap();

        // First call seeds the anchor; its result is irrelevant here.
        state.tick_wallclock();

        // Real sleep with a 2 ms margin so at least one full period elapses.
        let wait = XAUDIO_PERIOD + Duration::from_millis(2);
        std::thread::sleep(wait);

        assert!(state.tick_wallclock());
        assert!(!state.pending.is_empty());
        assert_eq!(state.peek_next(), Some(slot));
    }
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user