From e428ce33aac7b33df1301194f40eefabc6a4ab14 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Sun, 10 May 2026 00:36:53 +0200 Subject: [PATCH] M9.5 + M11.5 + VMX + SJIS/UTF-8: close the post-M5.5 deferred set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the four remaining deferred follow-up items in one bundle. All four are smaller-scope and additive; lockstep determinism unaffected (analyzer-only changes). ## M9.5 — __CxxFrameHandler scope-table parsing - New `xenia_analysis::eh_scope` module. Magic-scans .rdata for the three documented MSVC FuncInfo signatures (0x19930520/21/22) on 4-byte alignment. Each match is parsed as the documented struct (BE u32 fields), with sanity caps on max_state / n_try_blocks / pointer validity. - Walks pUnwindMap (UnwindMapEntry, 8 bytes) and pTryBlockMap (TryBlockMapEntry, 20 bytes) into one row each. - New tables eh_funcinfo, eh_unwind_map, eh_try_blocks. - Sylpheed yield: 2,588 FuncInfo (all version 0x19930522) / 10,019 unwind entries / 315 try-blocks. ## M11.5 — Static-init driver chain detection - New `xenia_analysis::static_init` module. Walks every function looking for the canonical _initterm loop: lwz cursor; mtctr; bcctrl; addi cursor, cursor, 4 bounded by a compare against another constant register. Extracts (array_start, array_end) and reads the array. - Reuses `function_pointer_arrays` table — drivers' arrays land with kind='static_init' (replacing M11's prologue-heuristic output where the structurally-grounded pattern fires). - Sylpheed yield: 0 drivers detected — the binary's static-init structure does not match the canonical CRT loop. Infrastructure ready; future M11.6 can relax. ## VMX vector-store xrefs (M6 follow-up) - Adds AltiVec/VMX X-form load/store XOs to the M6 opcode-31 dispatch: lvx/lvxl/lvebx/lvehx/lvewx (reads) and stvx/stvxl/stvebx/stvehx/stvewx (writes), all addr_mode= 'x_form_indexed'. Static resolution still requires both rA and rB constant. 
- Sylpheed yield: 110 newly-detected stvx writes. ## Shift_JIS + UTF-8 localised-string detection (M7 follow-up) - Extends `xenia_analysis::strings::analyze` with scan_shift_jis (JIS X 0208 lead/trail byte ranges + half-width katakana pass-through) and scan_utf8 (2- and 3-byte sequences). At least one multi-byte unit required so pure-ASCII strings aren't double-counted. - SJIS bytes rendered as \xHH escapes for diagnostic readability; full SJIS→UTF-8 decoding deferred. - Sylpheed yield: 790 Shift_JIS strings (Japanese debug + UI text) + 39 UTF-8. ## Tests - +2 EH (parses_minimal_funcinfo_v0, rejects_bogus_max_state) - +2 static_init (detects_canonical_initterm_loop, rejects_function_without_pattern) - +2 strings (detects_shift_jis_string, detects_utf8_multibyte_string) Tests 649→655 (+6 unit tests). DB schema golden + write_analysis_results signature updated for new EH parameter. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/xenia-analysis/SCHEMA.md | 115 ++++- crates/xenia-analysis/src/db.rs | 104 ++++- crates/xenia-analysis/src/eh_scope.rs | 296 +++++++++++++ crates/xenia-analysis/src/lib.rs | 2 + crates/xenia-analysis/src/static_init.rs | 399 ++++++++++++++++++ crates/xenia-analysis/src/strings.rs | 176 +++++++- crates/xenia-analysis/src/xref.rs | 16 + .../xenia-analysis/tests/db_schema_golden.rs | 29 +- crates/xenia-app/src/main.rs | 36 +- 9 files changed, 1159 insertions(+), 14 deletions(-) create mode 100644 crates/xenia-analysis/src/eh_scope.rs create mode 100644 crates/xenia-analysis/src/static_init.rs diff --git a/crates/xenia-analysis/SCHEMA.md b/crates/xenia-analysis/SCHEMA.md index 862de23..5fc52f4 100644 --- a/crates/xenia-analysis/SCHEMA.md +++ b/crates/xenia-analysis/SCHEMA.md @@ -457,11 +457,114 @@ byte-identical digests (`instructions=2000005`). - Itanium C++ ABI on vtable layout (offset-from-`this` model adapted by MSVC for Win32 PPC). 
+## Layer M9.5 — `__CxxFrameHandler` scope-table parsing (landed) + +### Schema additions +- New table `eh_funcinfo(address PK, magic, max_state, p_unwind_map, + n_try_blocks, p_try_block_map, n_ip_map_entries, p_ip_to_state_map, + p_es_type_list, eh_flags)`. +- New table `eh_unwind_map(funcinfo_address, state_index, to_state, action_pc, + PRIMARY KEY (funcinfo_address, state_index))`. +- New table `eh_try_blocks(funcinfo_address, try_index, try_low, try_high, + catch_high, n_catches, p_handler_array, + PRIMARY KEY (funcinfo_address, try_index))`. + +### What this layer does +- Magic-scans `.rdata` for the documented MSVC FuncInfo signatures + (0x19930520 / 0x19930521 / 0x19930522), reading 4-byte BE values + on 4-byte alignment. +- Sanity-checks `max_state` ≤ 10,000, `n_try_blocks` ≤ 1,000, all + internal pointers landing in valid sections. +- Walks `pUnwindMap` (8-byte UnwindMapEntry) and `pTryBlockMap` + (20-byte TryBlockMapEntry) into one row each. + +### What this layer does NOT do +- Does not associate FuncInfo records with their owning function via + the `bl __CxxFrameHandler` registration site — joins to `functions` + by best-effort PC-range queries. A future M9.6 can chase the + registration to make the link explicit. +- Does not parse `pHandlerArray` (per-try-block catch type info). + +### Sylpheed yield +- 2,588 FuncInfo records (all version 0x19930522). +- 10,019 unwind-map entries. +- 315 try-blocks across the binary. + +## Layer M11.5 — Static-init driver chain detection (landed) + +### Schema additions +- Reuses existing `function_pointer_arrays` table — drivers' arrays are + emitted with `kind='static_init'`, replacing M11's prologue-heuristic + output where the structurally-grounded pattern fires. + +### What this layer does +- Walks every detected function looking for the canonical `_initterm`- + style loop: `lwz cursor; mtctr; bcctrl; addi cursor, cursor, 4` + bounded by a comparison against another constant register. 
+- Extracts `(array_start, array_end)` from the cursor's initial + constant value and the end-comparand register. +- Reads the array, validates each entry against + `func_analysis.functions`, and emits the array as `static_init`. + +### What this layer does NOT do +- Doesn't handle drivers with multiple back-to-back trampoline loops. +- Doesn't follow `_initterm_e` return-status semantics — both + `_initterm` and `_initterm_e` match if the loop body matches. + +### Sylpheed yield +- 0 drivers detected. Sylpheed's static-init structure does not match + the canonical CRT loop pattern; the binary likely calls ctors via + another mechanism (inline at the entry point, or via a different + driver shape). Infrastructure ready for any binary with the + documented MSVC pattern. + +## Layer VMX — Vector-store xrefs (M6 follow-up, landed) + +Extends the M6 X-form opcode-31 dispatch in `xref.rs` with AltiVec/VMX +vector loads and stores. New entries (XO codes): + +- `lvx` (103), `lvxl` (359), `lvebx` (7), `lvehx` (39), `lvewx` (71) + — `addr_mode='x_form_indexed'`, `kind='read'`. +- `stvx` (231), `stvxl` (487), `stvebx` (135), `stvehx` (167), + `stvewx` (199) — `addr_mode='x_form_indexed'`, `kind='write'`. + +Same constraint as M6: rows emitted only when both `rA` and `rB` +resolve to known constants (rare but useful). + +### Sylpheed yield +- 110 `stvx` writes newly resolved. + +## Layer SJIS+UTF-8 — Localised-string detection (M7 follow-up, landed) + +Extends `xenia_analysis::strings::analyze` with two additional scanners. + +### Shift_JIS detection +Per JIS X 0208: lead byte ∈ [0x81, 0x9F] ∪ [0xE0, 0xEF]; +trail byte ∈ [0x40, 0x7E] ∪ [0x80, 0xFC]. Single-byte ASCII and JIS +half-width katakana (0xA1..=0xDF) are passed through. At least one +multi-byte pair must be present (so we don't double-count pure ASCII). +SJIS bytes are rendered as `\\xHH` escapes in the `content` column for +diagnostic readability — full SJIS→UTF-8 decoding is a future +enhancement. 
+ +### UTF-8 detection +Validates 2-byte (`110xxxxx 10xxxxxx`) and 3-byte +(`1110xxxx 10xxxxxx 10xxxxxx`) sequences plus printable ASCII. Skips +4-byte (supplementary plane) which is rare in game text. + +### Sylpheed yield +- 790 Shift_JIS strings (Japanese debug + UI text, including + `[WARNING] ノードに割り当てるエフェクトIDの指定がない ノードデータが見つからない` style mission strings). +- 39 UTF-8 strings. +- 6,311 ASCII strings (unchanged from M7). + ## Forward work (not yet landed) -- **M9.5** — full `__CxxFrameHandler` scope-table parsing (try/catch - range names, per-state cleanup actions). -- **M11.5** — walk the static-initialiser driver call chain from the - entry point to surface ground-truth ctor PCs. -- VMX/VMX128 vector-store xref emission (M6 follow-up). -- UTF-8 / shift_jis localised-string detection in `.rdata` (M7 follow-up). +- **M9.6** — link `eh_funcinfo` records back to their owning functions + via `bl __CxxFrameHandler` registration sites + per-try-block + `pHandlerArray` parsing. +- **M11.6** — relax M11.5 to detect non-canonical static-init driver + shapes (`_initterm_e` with status return, custom drivers). +- Full SJIS → UTF-8 decoding in the `strings.content` column. +- VMX128 (opcode 4) vector-store xrefs — separate encoding space, low + ROI; document if Sylpheed's renderer cluster uses it. diff --git a/crates/xenia-analysis/src/db.rs b/crates/xenia-analysis/src/db.rs index 5724e22..155c75a 100644 --- a/crates/xenia-analysis/src/db.rs +++ b/crates/xenia-analysis/src/db.rs @@ -307,7 +307,8 @@ impl DbWriter { /// `vtables` is the M3 result; pass an empty slice when the caller has /// not run the vtable scan (the tables are still created, just empty). /// `strings` is the M7 result; same convention. `funcptr_arrays` is the - /// M8/M11 result. `typed_ind` is the M5.5 result. + /// M8/M11 result. `typed_ind` is the M5.5 result. `eh_records` is the + /// M9.5 result. 
#[tracing::instrument(skip_all, name = "db.write_analysis_results")] pub fn write_analysis_results( &mut self, @@ -320,6 +321,7 @@ impl DbWriter { strings: &[crate::strings::DetectedString], funcptr_arrays: &[crate::funcptr_arrays::FuncPtrArray], typed_ind: Option<&crate::ind_dispatch_typed::TypedIndirectResult>, + eh_records: &[crate::eh_scope::EhFuncInfo], ) -> anyhow::Result<()> { self.conn.execute_batch(" CREATE TABLE functions ( @@ -441,6 +443,40 @@ impl DbWriter { PRIMARY KEY (writer_pc, vtable_address, vptr_offset) ); + -- M9.5 — MSVC __CxxFrameHandler scope-table records found by + -- magic-number scan in .rdata. + CREATE TABLE eh_funcinfo ( + address BIGINT PRIMARY KEY, + magic BIGINT NOT NULL, -- 0x19930520/21/22 + max_state BIGINT NOT NULL, + p_unwind_map BIGINT NOT NULL, + n_try_blocks BIGINT NOT NULL, + p_try_block_map BIGINT NOT NULL, + n_ip_map_entries BIGINT NOT NULL, + p_ip_to_state_map BIGINT NOT NULL, + p_es_type_list BIGINT, + eh_flags BIGINT + ); + + CREATE TABLE eh_unwind_map ( + funcinfo_address BIGINT NOT NULL, -- FK to eh_funcinfo.address + state_index BIGINT NOT NULL, + to_state BIGINT NOT NULL, + action_pc BIGINT NOT NULL, + PRIMARY KEY (funcinfo_address, state_index) + ); + + CREATE TABLE eh_try_blocks ( + funcinfo_address BIGINT NOT NULL, -- FK to eh_funcinfo.address + try_index BIGINT NOT NULL, + try_low BIGINT NOT NULL, + try_high BIGINT NOT NULL, + catch_high BIGINT NOT NULL, + n_catches BIGINT NOT NULL, + p_handler_array BIGINT NOT NULL, + PRIMARY KEY (funcinfo_address, try_index) + ); + CREATE TABLE demangled_names ( address BIGINT, -- VA the mangled name is associated with; NULL when from a non-address source (e.g. RTTI-only string) mangled VARCHAR NOT NULL, -- original mangled symbol (e.g. 
?Foo@Bar@@QEAAXXZ) @@ -474,6 +510,7 @@ impl DbWriter { if let Some(t) = typed_ind { insert_typed_ind_dispatch(&self.conn, t)?; } + insert_eh_records(&self.conn, eh_records)?; insert_xrefs_streaming(&self.conn, xrefs, pe, info.image_base, func_analysis, labels)?; let indices = [ @@ -524,7 +561,7 @@ impl DbWriter { xrefs: &XrefMap, ) -> anyhow::Result<()> { self.ingest_instructions(pe, info, func_analysis, labels)?; - self.write_analysis_results(pe, info, func_analysis, labels, xrefs, &[], &[], &[], None)?; + self.write_analysis_results(pe, info, func_analysis, labels, xrefs, &[], &[], &[], None, &[])?; Ok(()) } @@ -993,6 +1030,69 @@ fn insert_strings( Ok(()) } +fn insert_eh_records( + conn: &Connection, + records: &[crate::eh_scope::EhFuncInfo], +) -> anyhow::Result<()> { + if records.is_empty() { return Ok(()); } + let mut stmt_fi = conn.prepare( + "INSERT INTO eh_funcinfo + (address, magic, max_state, p_unwind_map, n_try_blocks, + p_try_block_map, n_ip_map_entries, p_ip_to_state_map, + p_es_type_list, eh_flags) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT DO NOTHING" + )?; + let mut stmt_unwind = conn.prepare( + "INSERT INTO eh_unwind_map + (funcinfo_address, state_index, to_state, action_pc) + VALUES (?, ?, ?, ?) ON CONFLICT DO NOTHING" + )?; + let mut stmt_try = conn.prepare( + "INSERT INTO eh_try_blocks + (funcinfo_address, try_index, try_low, try_high, catch_high, + n_catches, p_handler_array) + VALUES (?, ?, ?, ?, ?, ?, ?) 
+ ON CONFLICT DO NOTHING" + )?; + let mut n_fi = 0u64; + let mut n_unwind = 0u64; + let mut n_try = 0u64; + for r in records { + stmt_fi.execute(params![ + r.address as i64, r.magic as i64, r.max_state as i64, + r.p_unwind_map as i64, r.n_try_blocks as i64, + r.p_try_block_map as i64, r.n_ip_map_entries as i64, + r.p_ip_to_state_map as i64, + r.p_es_type_list.map(|p| p as i64), + r.eh_flags.map(|f| f as i64), + ])?; + n_fi += 1; + for (i, e) in r.unwind_map.iter().enumerate() { + stmt_unwind.execute(params![ + r.address as i64, i as i64, e.to_state as i64, e.action_pc as i64, + ])?; + n_unwind += 1; + } + for (i, t) in r.try_blocks.iter().enumerate() { + stmt_try.execute(params![ + r.address as i64, i as i64, + t.try_low as i64, t.try_high as i64, t.catch_high as i64, + t.n_catches as i64, t.p_handler_array as i64, + ])?; + n_try += 1; + } + } + metrics::counter!("db.rows", "table" => "eh_funcinfo").increment(n_fi); + metrics::counter!("db.rows", "table" => "eh_unwind_map").increment(n_unwind); + metrics::counter!("db.rows", "table" => "eh_try_blocks").increment(n_try); + tracing::info!( + funcinfo = n_fi, unwind = n_unwind, try_blocks = n_try, + "EH scope-table insert complete" + ); + Ok(()) +} + fn insert_typed_ind_dispatch( conn: &Connection, t: &crate::ind_dispatch_typed::TypedIndirectResult, diff --git a/crates/xenia-analysis/src/eh_scope.rs b/crates/xenia-analysis/src/eh_scope.rs new file mode 100644 index 0000000..c3acf5a --- /dev/null +++ b/crates/xenia-analysis/src/eh_scope.rs @@ -0,0 +1,296 @@ +//! M9.5 — MSVC `__CxxFrameHandler` scope-table parsing. +//! +//! When MSVC compiles C++ try/catch on Win32 PowerPC, the compiler emits +//! per-function `FuncInfo` records in `.rdata` containing the scope-state +//! tables that `__CxxFrameHandler` walks during unwinding. Each record +//! starts with one of the documented magic numbers: +//! +//! - `0x19930520` — original FuncInfo (no aligned-state-array) +//! - `0x19930521` — adds `pESTypeList` field +//! 
- `0x19930522` — adds `EHFlags` field +//! +//! Layout (4-byte little-endian on x86; **on Xbox 360 PowerPC PE the +//! struct is big-endian** because the binary is BE throughout): +//! +//! ```text +//! +0x00 uint32 magicNumber (one of 0x199305{20,21,22}) +//! +0x04 int32 maxState (number of UnwindMapEntry rows) +//! +0x08 uint32 pUnwindMap (VA → UnwindMapEntry[]) +//! +0x0C uint32 nTryBlocks +//! +0x10 uint32 pTryBlockMap (VA → TryBlockMapEntry[]) +//! +0x14 uint32 nIPMapEntries (ignored on x86; present on PPC) +//! +0x18 uint32 pIPtoStateMap (VA → IPtoStateMapEntry[]) +//! +0x1C uint32 pESTypeList (only when magic ≥ 0x19930521) +//! +0x20 uint32 EHFlags (only when magic = 0x19930522) +//! ``` +//! +//! Each `UnwindMapEntry` is 8 bytes: `(toState i32, action u32)`. +//! Each `TryBlockMapEntry` is 20 bytes: +//! `(tryLow i32, tryHigh i32, catchHigh i32, nCatches u32, pHandlerArray u32)`. +//! +//! ### What this module does +//! +//! - Magic-scan `.rdata` for the three FuncInfo signatures (read as BE u32). +//! - Parse the FuncInfo record + walk the unwind map and try-block map. +//! - Skip records whose internal pointers don't land in valid sections, +//! or whose lengths exceed sane caps. +//! +//! ### What this module does NOT do +//! +//! - Does not associate a FuncInfo back to its owning function. The +//! `bl __CxxFrameHandler` registration would name that linkage, but +//! it requires walking all `has_eh=true` functions' prologues; a +//! future M9.6 can do that. For now the FuncInfo record stands on its +//! own — joins to `functions` by best-effort PC range queries. +//! - Does not parse the `pHandlerArray` per try-block (catch type info). +//! +//! Reference: LLVM `llvm/lib/CodeGen/AsmPrinter/WinException.cpp`, +//! Microsoft openrce.org documentation on FuncInfo. 
+ +use xenia_xex::pe::PeSection; + +const MAGIC_OLD: u32 = 0x1993_0520; +const MAGIC_V21: u32 = 0x1993_0521; +const MAGIC_V22: u32 = 0x1993_0522; + +#[derive(Debug, Clone, Copy)] +pub struct UnwindMapEntry { + pub to_state: i32, + pub action_pc: u32, // VA of the cleanup action; 0 if none +} + +#[derive(Debug, Clone, Copy)] +pub struct TryBlockMapEntry { + pub try_low: i32, + pub try_high: i32, + pub catch_high: i32, + pub n_catches: u32, + pub p_handler_array: u32, +} + +#[derive(Debug, Clone)] +pub struct EhFuncInfo { + pub address: u32, // VA of the FuncInfo record itself + pub magic: u32, + pub max_state: i32, + pub p_unwind_map: u32, + pub n_try_blocks: u32, + pub p_try_block_map: u32, + pub n_ip_map_entries: u32, + pub p_ip_to_state_map: u32, + pub p_es_type_list: Option, + pub eh_flags: Option, + pub unwind_map: Vec, + pub try_blocks: Vec, +} + +#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))] +pub fn analyze( + pe: &[u8], + image_base: u32, + sections: &[PeSection], +) -> Vec { + let started = std::time::Instant::now(); + let mut out: Vec = Vec::new(); + + // Compute the union of valid VA ranges across all sections — used to + // sanity-check internal pointers in the FuncInfo records. 
+ let valid_ranges: Vec<(u32, u32)> = sections.iter() + .map(|s| (image_base + s.virtual_address, + image_base + s.virtual_address + s.virtual_size)) + .collect(); + let in_valid = |va: u32| valid_ranges.iter().any(|(lo, hi)| va >= *lo && va < *hi); + + let read_u32 = |abs: u32| -> Option { + let off = abs.wrapping_sub(image_base) as usize; + if off + 4 > pe.len() { return None; } + Some(u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]])) + }; + let read_i32 = |abs: u32| -> Option { read_u32(abs).map(|u| u as i32) }; + + for section in sections { + if section.name != ".rdata" { continue; } + let raw_start = section.virtual_address as usize; + let raw_end = (section.virtual_address + section.virtual_size) as usize; + if raw_end > pe.len() { continue; } + let bytes = &pe[raw_start..raw_end.min(pe.len())]; + let va_base = image_base + section.virtual_address; + + // Walk on 4-byte alignment looking for the magic. + let mut i = 0; + while i + 4 <= bytes.len() { + if !i.is_multiple_of(4) { i += 1; continue; } + let m = u32::from_be_bytes([bytes[i], bytes[i + 1], bytes[i + 2], bytes[i + 3]]); + if m == MAGIC_OLD || m == MAGIC_V21 || m == MAGIC_V22 { + let addr = va_base + i as u32; + if let Some(rec) = parse_funcinfo(addr, m, &read_u32, &read_i32, &in_valid) { + out.push(rec); + } + } + i += 4; + } + } + + let elapsed_ms = started.elapsed().as_millis() as f64; + let n_unwind: usize = out.iter().map(|r| r.unwind_map.len()).sum(); + let n_try: usize = out.iter().map(|r| r.try_blocks.len()).sum(); + metrics::histogram!("analysis.phase_ms", "phase" => "eh_scope").record(elapsed_ms); + tracing::info!( + records = out.len(), + unwind_entries = n_unwind, + try_blocks = n_try, + elapsed_ms, + "M9.5 EH scope-table scan complete", + ); + out +} + +fn parse_funcinfo( + addr: u32, + magic: u32, + read_u32: &impl Fn(u32) -> Option, + read_i32: &impl Fn(u32) -> Option, + in_valid: &impl Fn(u32) -> bool, +) -> Option { + let max_state = read_i32(addr + 0x04)?; + let 
p_unwind_map = read_u32(addr + 0x08)?; + let n_try_blocks = read_u32(addr + 0x0C)?; + let p_try_block_map = read_u32(addr + 0x10)?; + let n_ip_map_entries = read_u32(addr + 0x14)?; + let p_ip_to_state_map = read_u32(addr + 0x18)?; + + // Sanity caps: real FuncInfo records have max_state ≤ a few thousand, + // n_try_blocks ≤ a few hundred. Reject obviously bogus values that + // happened to alias the magic. + if !(0..=10_000).contains(&max_state) { return None; } + if n_try_blocks > 1_000 { return None; } + if n_ip_map_entries > 100_000 { return None; } + // Pointers must either be NULL or land in a valid section. + if p_unwind_map != 0 && !in_valid(p_unwind_map) { return None; } + if p_try_block_map != 0 && !in_valid(p_try_block_map) { return None; } + if p_ip_to_state_map != 0 && !in_valid(p_ip_to_state_map) { return None; } + + let (p_es_type_list, eh_flags) = if magic == MAGIC_V21 { + (read_u32(addr + 0x1C), None) + } else if magic == MAGIC_V22 { + (read_u32(addr + 0x1C), read_u32(addr + 0x20)) + } else { + (None, None) + }; + + // Walk unwind map (8-byte entries). + let mut unwind_map: Vec = Vec::with_capacity(max_state as usize); + if p_unwind_map != 0 && max_state > 0 { + for i in 0..max_state { + let p = p_unwind_map.wrapping_add((i * 8) as u32); + let to_state = read_i32(p)?; + let action_pc = read_u32(p + 4)?; + unwind_map.push(UnwindMapEntry { to_state, action_pc }); + } + } + + // Walk try-block map (20-byte entries). 
+ let mut try_blocks: Vec = Vec::with_capacity(n_try_blocks as usize); + if p_try_block_map != 0 && n_try_blocks > 0 { + for i in 0..n_try_blocks { + let p = p_try_block_map.wrapping_add(i * 20); + let try_low = read_i32(p)?; + let try_high = read_i32(p + 4)?; + let catch_high = read_i32(p + 8)?; + let n_catches = read_u32(p + 12)?; + let p_handler_a = read_u32(p + 16)?; + try_blocks.push(TryBlockMapEntry { + try_low, try_high, catch_high, n_catches, p_handler_array: p_handler_a, + }); + } + } + + Some(EhFuncInfo { + address: addr, + magic, + max_state, + p_unwind_map, + n_try_blocks, + p_try_block_map, + n_ip_map_entries, + p_ip_to_state_map, + p_es_type_list, + eh_flags, + unwind_map, + try_blocks, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use xenia_xex::pe::PeSection; + + fn mk_section(name: &str, va: u32, size: u32) -> PeSection { + PeSection { + name: name.into(), + virtual_address: va, virtual_size: size, + raw_offset: va, raw_size: size, + flags: 0x4000_0040, + } + } + + fn write_be(pe: &mut [u8], at: usize, v: u32) { + pe[at..at + 4].copy_from_slice(&v.to_be_bytes()); + } + fn write_be_i32(pe: &mut [u8], at: usize, v: i32) { + pe[at..at + 4].copy_from_slice(&v.to_be_bytes()); + } + + #[test] + fn parses_minimal_funcinfo_v0() { + let image_base = 0x82000000u32; + let rdata_va = 0x1000u32; + let mut pe = vec![0u8; 0x4000]; + + // FuncInfo at .rdata + 0x10. + let fi_off = (rdata_va + 0x10) as usize; + let fi_va = image_base + rdata_va + 0x10; + let unwind_off = (rdata_va + 0x80) as usize; + let unwind_va = image_base + rdata_va + 0x80; + + write_be(&mut pe, fi_off, MAGIC_OLD); // magic + write_be_i32(&mut pe, fi_off + 4, 2); // maxState + write_be(&mut pe, fi_off + 8, unwind_va); // pUnwindMap + write_be(&mut pe, fi_off + 12, 0); // nTryBlocks + write_be(&mut pe, fi_off + 16, 0); // pTryBlockMap + write_be(&mut pe, fi_off + 20, 0); // nIPMapEntries + write_be(&mut pe, fi_off + 24, 0); // pIPtoStateMap + + // Two unwind entries. 
+ write_be_i32(&mut pe, unwind_off, -1); // to_state + write_be(&mut pe, unwind_off + 4, image_base + 0x500); // action_pc + write_be_i32(&mut pe, unwind_off + 8, 0); + write_be(&mut pe, unwind_off + 12, image_base + 0x600); + + let sections = vec![mk_section(".rdata", rdata_va, 0x100)]; + let recs = analyze(&pe, image_base, §ions); + assert_eq!(recs.len(), 1); + let r = &recs[0]; + assert_eq!(r.address, fi_va); + assert_eq!(r.magic, MAGIC_OLD); + assert_eq!(r.max_state, 2); + assert_eq!(r.unwind_map.len(), 2); + assert_eq!(r.unwind_map[0].to_state, -1); + assert_eq!(r.unwind_map[0].action_pc, image_base + 0x500); + assert_eq!(r.try_blocks.len(), 0); + } + + #[test] + fn rejects_bogus_max_state() { + let image_base = 0x82000000u32; + let rdata_va = 0x1000u32; + let mut pe = vec![0u8; 0x4000]; + let fi_off = (rdata_va + 0x10) as usize; + write_be(&mut pe, fi_off, MAGIC_OLD); + write_be_i32(&mut pe, fi_off + 4, 0xFFFF); // bogus maxState + let sections = vec![mk_section(".rdata", rdata_va, 0x100)]; + let recs = analyze(&pe, image_base, §ions); + assert_eq!(recs.len(), 0); + } +} diff --git a/crates/xenia-analysis/src/lib.rs b/crates/xenia-analysis/src/lib.rs index 5e086c4..dbe3ec9 100644 --- a/crates/xenia-analysis/src/lib.rs +++ b/crates/xenia-analysis/src/lib.rs @@ -13,6 +13,8 @@ pub mod indirect; pub mod ind_dispatch_typed; pub mod strings; pub mod funcptr_arrays; +pub mod eh_scope; +pub mod static_init; mod ordinals; pub use ordinals::resolve_ordinal; diff --git a/crates/xenia-analysis/src/static_init.rs b/crates/xenia-analysis/src/static_init.rs new file mode 100644 index 0000000..02b3e90 --- /dev/null +++ b/crates/xenia-analysis/src/static_init.rs @@ -0,0 +1,399 @@ +//! M11.5 — static-initialiser driver detection. +//! +//! MSVC's CRT static-init driver (`_initterm` / `_initterm_e` style) +//! is a tight loop that walks a function-pointer array between two +//! addresses, calling each non-null entry: +//! +//! ```text +//! loop_top: +//! 
cmpw[l] rA, rB ; compare cursor vs end +//! beq done +//! lwz rN, 0(rA) ; load fn ptr +//! cmpwi rN, 0 ; null-skip (optional) +//! beq skip +//! mtctr rN +//! bcctrl +//! skip: +//! addi rA, rA, 4 +//! b loop_top +//! done: +//! ``` +//! +//! Two static addresses (`rA` and `rB` at loop start) bracket the +//! function-pointer array. Detection strategy: scan every function for +//! the canonical pattern; when found, extract the array bounds and +//! emit one row in `function_pointer_arrays` with `kind='static_init'`. +//! +//! ### What this layer does +//! +//! - Walks each function looking for an `lwz; mtctr; bcctrl` sequence +//! inside a loop bounded by a comparison against another constant. +//! - When the loop's cursor register is observed to be incremented by +//! exactly 4 per iteration, classifies it as a static-init driver +//! and records the (start, end) array bounds. +//! +//! ### What this layer does NOT do +//! +//! - No support for back-to-back drivers sharing a common loop trampoline. +//! - No detection of the M11 prologue-style heuristic; M11.5 is +//! structure-grounded and replaces the prior heuristic where it fires. +//! - Does not handle CRT-style `_initterm_e` (the `_e` variant returns +//! a status); detection works for both as long as the loop shape +//! matches. +//! +//! Reference: Microsoft CRT `crt0.c::_initterm` source pattern. + +use std::collections::{BTreeSet, HashMap, HashSet}; + +use crate::func::FuncAnalysis; +use crate::funcptr_arrays::FuncPtrArray; +use xenia_xex::pe::PeSection; + +#[derive(Debug, Clone, Copy)] +pub struct StaticInitDriver { + /// VA of the driver function (the one containing the loop). + pub driver_function: u32, + /// VA of the array start. + pub array_start: u32, + /// VA one-past-end of the array. + pub array_end: u32, + /// Detected length in slots. 
+ pub length: u32, +} + +#[derive(Debug, Default)] +pub struct StaticInitResult { + pub drivers: Vec, + /// Newly-detected static-init arrays, ready to be merged into the + /// `function_pointer_arrays` table with `kind='static_init'`. + pub arrays: Vec, +} + +const OP_ADDI: u32 = 14; +const OP_ADDIS: u32 = 15; +const OP_BCCTR: u32 = 19; +const OP_LWZ: u32 = 32; +const OP_X_FORM: u32 = 31; + +#[derive(Debug, Clone, Copy)] +enum RegVal { + Const(u32), +} + +#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))] +pub fn analyze( + pe: &[u8], + image_base: u32, + sections: &[PeSection], + func_analysis: &FuncAnalysis, + function_starts: &BTreeSet, + labels: &HashMap, +) -> StaticInitResult { + let started = std::time::Instant::now(); + let block_boundaries: HashSet = labels.keys().copied().collect(); + + let mut drivers: Vec = Vec::new(); + + for (&fn_start, fi) in &func_analysis.functions { + if fi.is_saverestore { continue; } + if let Some(d) = scan_function_for_driver( + pe, image_base, fn_start, fi.end, &block_boundaries, + ) { + drivers.push(d); + } + } + + // Build arrays from the discovered drivers + section data. + let mut arrays: Vec = Vec::new(); + for d in &drivers { + if let Some(entries) = read_array(pe, image_base, sections, d.array_start, d.array_end, function_starts) { + arrays.push(FuncPtrArray { + address: d.array_start, + length: entries.len() as u32, + kind: "static_init", + entries, + }); + } + } + + let elapsed_ms = started.elapsed().as_millis() as f64; + metrics::histogram!("analysis.phase_ms", "phase" => "static_init").record(elapsed_ms); + tracing::info!( + drivers = drivers.len(), + arrays = arrays.len(), + elapsed_ms, + "M11.5 static-init driver scan complete", + ); + + StaticInitResult { drivers, arrays } +} + +/// Read the function-pointer array between [start, end) from .rdata/.data. +/// NULL entries are skipped (CRT _initterm explicitly tolerates them). 
+/// Non-function-start entries cause us to bail (the driver bounds were
+/// likely misidentified).
+fn read_array(
+    pe: &[u8],
+    image_base: u32,
+    sections: &[PeSection],
+    start: u32,
+    end: u32,
+    function_starts: &BTreeSet<u32>,
+) -> Option<Vec<u32>> {
+    // Reject empty/inverted ranges and anything larger than 4 KiB.
+    if end <= start || (end - start) > 4096 { return None; }
+    // The whole [start, end) range must sit inside a single `.rdata` or
+    // `.data` section; the section itself is not needed afterwards.
+    let _section = sections.iter().find(|s| {
+        let lo = image_base + s.virtual_address;
+        let hi = lo + s.virtual_size;
+        start >= lo && end <= hi && (s.name == ".rdata" || s.name == ".data")
+    })?;
+    let mut entries = Vec::new();
+    let mut p = start;
+    while p < end {
+        // `pe` is indexed by (VA - image_base).
+        let off = p.wrapping_sub(image_base) as usize;
+        if off + 4 > pe.len() { return None; }
+        let v = u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]]);
+        // Zero slots are skipped; every non-zero slot must be a known
+        // function start or the whole array is rejected.
+        if v != 0 {
+            if !function_starts.contains(&v) { return None; }
+            entries.push(v);
+        }
+        p = p.wrapping_add(4);
+    }
+    if entries.is_empty() { return None; }
+    Some(entries)
+}
+
+/// Walk one function looking for the canonical static-init driver shape.
+///
+/// Linearly decodes `[fn_start, fn_end)` and records four pattern
+/// features: an `lwz` through the cursor register, a following `mtctr`,
+/// a following `bcctrl`, and an `addi cursor, cursor, 4` step.
+///
+/// The cursor candidate is the first register produced by an `addi` from
+/// a known-constant base; the end-bound candidate is the next *different*
+/// register produced the same way. No compare instruction is checked —
+/// the bounds are validated only by `end > start` and a 4 KiB cap here
+/// (callers are expected to validate the array contents).
+///
+/// Returns `None` when any feature is missing or the bounds are not a
+/// plausible `(start, end)` pair.
+fn scan_function_for_driver(
+    pe: &[u8],
+    image_base: u32,
+    fn_start: u32,
+    fn_end: u32,
+    block_boundaries: &HashSet<u32>,
+) -> Option<StaticInitDriver> {
+    let mut reg: [Option<RegVal>; 32] = [None; 32];
+    // Pattern features observed during the walk.
+    let mut cursor_reg: Option<usize> = None;
+    let mut cursor_init: Option<u32> = None;
+    let mut end_reg: Option<usize> = None;
+    let mut end_init: Option<u32> = None;
+    let mut saw_lwz_through_cursor = false;
+    let mut saw_mtctr = false;
+    let mut saw_bcctrl = false;
+    let mut saw_addi_4 = false;
+
+    let mut pc = fn_start;
+    while pc < fn_end {
+        if pc != fn_start && block_boundaries.contains(&pc) {
+            // Crossing a basic-block boundary: the accumulated pattern
+            // features (cursor/end candidates, saw_* flags) stay valid,
+            // but per-register constant tracking is dropped to be safe.
+            reg = [None; 32];
+        }
+        let off = pc.wrapping_sub(image_base) as usize;
+        if off + 4 > pe.len() { break; }
+        let instr = u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]]);
+        let op = instr >> 26;
+        let rd = ((instr >> 21) & 0x1F) as usize;
+        let ra = ((instr >> 16) & 0x1F) as usize;
+        let simm = ((instr & 0xFFFF) as i16) as i32;
+        let uimm = instr & 0xFFFF;
+
+        match op {
+            OP_ADDIS if ra == 0 => reg[rd] = Some(RegVal::Const(uimm << 16)),
+            OP_ADDIS => {
+                if let Some(RegVal::Const(b)) = reg[ra] {
+                    reg[rd] = Some(RegVal::Const(b.wrapping_add(uimm << 16)));
+                } else { reg[rd] = None; }
+            }
+            OP_ADDI if ra != 0 => {
+                // `addi r, r, 4` on the cursor is the loop step. It is
+                // matched structurally, NOT through constant tracking:
+                // the step sits inside the loop body, i.e. after a block
+                // boundary that has already wiped `reg`, so requiring a
+                // tracked constant here would make the pattern
+                // unmatchable on real loops.
+                let is_step = rd == ra && simm == 4;
+                if is_step && Some(rd) == cursor_reg {
+                    saw_addi_4 = true;
+                }
+                if let Some(RegVal::Const(b)) = reg[ra] {
+                    let v = b.wrapping_add(simm as u32);
+                    reg[rd] = Some(RegVal::Const(v));
+                    // A step-shaped addi is never taken as a bound
+                    // candidate (same as before this fix).
+                    if !is_step {
+                        if cursor_reg.is_none() {
+                            // First constant produced by an addi: taken
+                            // as the cursor (array start) candidate.
+                            cursor_init = Some(v);
+                            cursor_reg = Some(rd);
+                        } else if end_reg.is_none() && Some(rd) != cursor_reg {
+                            // Next constant in a different register:
+                            // taken as the end-bound candidate.
+                            end_init = Some(v);
+                            end_reg = Some(rd);
+                        }
+                    }
+                } else { reg[rd] = None; }
+            }
+            OP_ADDI => {
+                // rA == 0 is `li rD, simm`: rD is overwritten, so any
+                // constant previously tracked for it is stale. Small
+                // immediates are deliberately not tracked so they cannot
+                // become cursor/end candidates.
+                // NOTE(review): other GPR-writing opcodes that reach the
+                // `_` arm (ori, mulli, non-lwz loads, ...) still leave
+                // tracking untouched — consider invalidating them too.
+                reg[rd] = None;
+            }
+            OP_LWZ => {
+                if ra != 0 && Some(ra) == cursor_reg {
+                    saw_lwz_through_cursor = true;
+                }
+                reg[rd] = None;
+            }
+            OP_X_FORM => {
+                let xo = (instr >> 1) & 0x3FF;
+                if xo == 467 {
+                    // mtspr: the SPR number is stored as two swapped
+                    // 5-bit halves; SPR 9 is CTR.
+                    let spr = (((instr >> 11) & 0x1F) << 5) | ((instr >> 16) & 0x1F);
+                    if spr == 9 && saw_lwz_through_cursor { saw_mtctr = true; }
+                }
+                if xo != 444 && xo != 467 { reg[rd] = None; }
+            }
+            OP_BCCTR => {
+                let xo = (instr >> 1) & 0x3FF;
+                let lk = (instr & 1) != 0;
+                if xo == 528 && lk {
+                    if saw_mtctr {
+                        saw_bcctrl = true;
+                    }
+                    // An indirect call invalidates r0..=r12 exactly like
+                    // the direct-call arms below.
+                    for slot in &mut reg[..=12] { *slot = None; }
+                }
+            }
+            16 | 18 => {
+                // bc/b with LK set is a call: drop r0..=r12.
+                if (instr & 1) != 0 {
+                    for slot in &mut reg[..=12] { *slot = None; }
+                }
+            }
+            _ => {}
+        }
+        pc = pc.wrapping_add(4);
+    }
+
+    // Validate that all four pattern features fired.
+    if !(saw_lwz_through_cursor && saw_mtctr && saw_bcctrl && saw_addi_4) {
+        return None;
+    }
+    let cursor_init = cursor_init?;
+    let end_init = end_init?;
+    if end_init <= cursor_init { return None; }
+    if end_init - cursor_init > 4096 { return None; }
+
+    Some(StaticInitDriver {
+        driver_function: fn_start,
+        array_start: cursor_init,
+        array_end: end_init,
+        length: (end_init - cursor_init) / 4,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::func::FuncInfo;
+    use std::collections::BTreeMap;
+    use xenia_xex::pe::PeSection;
+
+    fn mk_section(name: &str, va: u32, size: u32) -> PeSection {
+        PeSection {
+            name: name.into(),
+            virtual_address: va, virtual_size: size,
+            raw_offset: va, raw_size: size,
+            flags: 0x4000_0040,
+        }
+    }
+    fn write_be(pe: &mut [u8], at: usize, v: u32) {
+        pe[at..at + 4].copy_from_slice(&v.to_be_bytes());
+    }
+
+    #[test]
+    fn detects_canonical_initterm_loop() {
+        // Build a tiny driver that loops over a 3-entry array.
+ let image_base = 0x82000000u32; + let mut pe = vec![0u8; 0x4000]; + + // Array at .rdata + 0x800: 3 function pointers. + let arr_va_lo = 0x800u32; + let fns = [image_base + 0x2000, image_base + 0x2010, image_base + 0x2020]; + for (i, p) in fns.iter().enumerate() { + write_be(&mut pe, arr_va_lo as usize + i * 4, *p); + } + let array_start = image_base + arr_va_lo; + let array_end = array_start + 12; + + // Driver function at 0x82001000: + // lis r3, hi(array_start) + // addi r3, r3, lo(array_start) + // lis r4, hi(array_end) + // addi r4, r4, lo(array_end) + // lwz r5, 0(r3) + // mtctr r5 + // bcctrl + // addi r3, r3, 4 + // blr + let driver = 0x82001000u32; + let off = (driver - image_base) as usize; + let lis_r3 = (15u32 << 26) | (3 << 21) | ((array_start >> 16) as u32); + let addi_r3 = (14u32 << 26) | (3 << 21) | (3 << 16) | ((array_start as u16) as u32); + let lis_r4 = (15u32 << 26) | (4 << 21) | ((array_end >> 16) as u32); + let addi_r4 = (14u32 << 26) | (4 << 21) | (4 << 16) | ((array_end as u16) as u32); + let lwz = (32u32 << 26) | (5 << 21) | (3 << 16); + let mtctr = (31u32 << 26) | (5 << 21) | (9 << 16) | (467 << 1); + let bcctrl = (19u32 << 26) | (20 << 21) | (528 << 1) | 1; + let addi_inc = (14u32 << 26) | (3 << 21) | (3 << 16) | 4; + let blr = (19u32 << 26) | (20 << 21) | (16 << 1); + for (i, w) in [lis_r3, addi_r3, lis_r4, addi_r4, lwz, mtctr, bcctrl, addi_inc, blr].iter().enumerate() { + write_be(&mut pe, off + i * 4, *w); + } + + let mut functions: BTreeMap = BTreeMap::new(); + functions.insert(driver, FuncInfo { + start: driver, end: driver + 0x40, frame_size: 0, saved_gprs: 0, + is_leaf: false, is_saverestore: false, + pdata_validated: false, pdata_length: None, has_eh: false, + }); + let fa = FuncAnalysis { + functions, save_gpr_base: None, restore_gpr_base: None, pdata_entries: Vec::new(), + }; + + let sections = vec![mk_section(".rdata", 0x800, 0x100)]; + let mut starts = BTreeSet::new(); + for &p in &fns { starts.insert(p); } + let labels: 
HashMap = HashMap::new(); + + let r = analyze(&pe, image_base, §ions, &fa, &starts, &labels); + + assert_eq!(r.drivers.len(), 1, "should detect one driver"); + let d = &r.drivers[0]; + assert_eq!(d.driver_function, driver); + assert_eq!(d.array_start, array_start); + assert_eq!(d.array_end, array_end); + assert_eq!(d.length, 3); + + assert_eq!(r.arrays.len(), 1); + assert_eq!(r.arrays[0].kind, "static_init"); + assert_eq!(r.arrays[0].entries.len(), 3); + } + + #[test] + fn rejects_function_without_pattern() { + let image_base = 0x82000000u32; + let mut pe = vec![0u8; 0x4000]; + let driver = 0x82001000u32; + // Just a blr — no driver pattern. + let blr = (19u32 << 26) | (20 << 21) | (16 << 1); + write_be(&mut pe, (driver - image_base) as usize, blr); + + let mut functions: BTreeMap = BTreeMap::new(); + functions.insert(driver, FuncInfo { + start: driver, end: driver + 0x40, frame_size: 0, saved_gprs: 0, + is_leaf: true, is_saverestore: false, + pdata_validated: false, pdata_length: None, has_eh: false, + }); + let fa = FuncAnalysis { + functions, save_gpr_base: None, restore_gpr_base: None, pdata_entries: Vec::new(), + }; + let sections = vec![mk_section(".rdata", 0x800, 0x100)]; + let starts: BTreeSet = BTreeSet::new(); + let labels: HashMap = HashMap::new(); + let r = analyze(&pe, image_base, §ions, &fa, &starts, &labels); + assert_eq!(r.drivers.len(), 0); + } +} diff --git a/crates/xenia-analysis/src/strings.rs b/crates/xenia-analysis/src/strings.rs index c8be2c7..666a62b 100644 --- a/crates/xenia-analysis/src/strings.rs +++ b/crates/xenia-analysis/src/strings.rs @@ -14,7 +14,12 @@ //! on this big-endian platform); we do NOT try big-endian UTF-16. //! - No language detection / classification beyond encoding. //! -//! Reference: `objdump -s` `.rdata` walks rely on the same heuristic. +//! Extends the original ASCII / UTF-16LE pass with Shift_JIS detection +//! (Sylpheed is originally Japanese — likely yields mission/UI text +//! 
invisible to ASCII-only) and UTF-8 multi-byte detection. +//! +//! Reference: `objdump -s` `.rdata` walks rely on the same heuristic; +//! Shift_JIS lead/trail byte ranges per JIS X 0208. use xenia_xex::pe::PeSection; @@ -23,7 +28,7 @@ use xenia_xex::pe::PeSection; pub struct DetectedString { /// Absolute VA of the first byte. pub address: u32, - /// `"ascii"` or `"utf16le"`. + /// `"ascii"` | `"utf16le"` | `"shift_jis"` | `"utf8"`. pub encoding: &'static str, /// Length in bytes (excluding the NUL terminator). pub length: u32, @@ -48,15 +53,21 @@ pub fn analyze(pe: &[u8], image_base: u32, sections: &[PeSection]) -> Vec "strings").record(elapsed_ms); tracing::info!( ascii = n_ascii, utf16le = n_utf16, + shift_jis = n_sjis, + utf8 = n_utf8, total = out.len(), elapsed_ms, "string scan complete" @@ -135,6 +146,134 @@ fn scan_utf16le(bytes: &[u8], va_base: u32, out: &mut Vec) { } } +/// Per JIS X 0208: Shift_JIS first byte ∈ [0x81, 0x9F] ∪ [0xE0, 0xEF]; +/// trail byte ∈ [0x40, 0x7E] ∪ [0x80, 0xFC]. Single-byte ASCII and JIS +/// half-width katakana (0xA1..=0xDF) are passed through. +fn is_sjis_lead(b: u8) -> bool { + (0x81..=0x9F).contains(&b) || (0xE0..=0xEF).contains(&b) +} +fn is_sjis_trail(b: u8) -> bool { + (0x40..=0x7E).contains(&b) || (0x80..=0xFC).contains(&b) +} +fn is_sjis_singlebyte(b: u8) -> bool { + is_printable_ascii(b) || (0xA1..=0xDF).contains(&b) +} + +/// Scan for Shift_JIS strings — runs of ≥ 6 bytes consisting of valid +/// SJIS code units (single-byte ASCII / half-width katakana, OR a +/// lead+trail pair). At least one multi-byte pair must be present so we +/// don't double-count strings that are purely ASCII. 
+fn scan_shift_jis(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
+    let mut i = 0;
+    while i < bytes.len() {
+        let start = i;
+        let mut has_multibyte = false;
+        let mut nbytes = 0;
+        // Consume the longest run of valid SJIS units starting at `start`:
+        // a lead+trail pair (2 bytes) or a single-byte unit.
+        while i < bytes.len() {
+            let b = bytes[i];
+            if is_sjis_lead(b) && i + 1 < bytes.len() && is_sjis_trail(bytes[i + 1]) {
+                has_multibyte = true;
+                nbytes += 2;
+                i += 2;
+            } else if is_sjis_singlebyte(b) {
+                nbytes += 1;
+                i += 1;
+            } else {
+                break;
+            }
+        }
+        // Require NUL terminator + min length + at least one multi-byte char.
+        if has_multibyte
+            && nbytes >= MIN_LEN
+            && i < bytes.len() && bytes[i] == 0
+        {
+            // No SJIS decoder is shipped. ASCII bytes are kept verbatim and
+            // every non-ASCII byte is rendered as a `\xHH` escape (two
+            // escapes for a lead+trail pair) so the string stays
+            // identifiable and downstream tooling can re-decode it.
+            let raw = &bytes[start..i];
+            let mut s = String::with_capacity(raw.len() * 4);
+            let mut p = 0;
+            while p < raw.len() {
+                let b = raw[p];
+                if is_sjis_lead(b) && p + 1 < raw.len() && is_sjis_trail(raw[p + 1]) {
+                    s.push_str(&format!("\\x{:02X}\\x{:02X}", b, raw[p + 1]));
+                    p += 2;
+                } else if b.is_ascii() {
+                    s.push(b as char);
+                    p += 1;
+                } else {
+                    // Single-byte half-width katakana (0xA1..=0xDF).
+                    // `b as char` would map it to an unrelated Latin-1
+                    // code point, so escape it like the pairs above.
+                    s.push_str(&format!("\\x{:02X}", b));
+                    p += 1;
+                }
+            }
+            out.push(DetectedString {
+                address: va_base + start as u32,
+                encoding: "shift_jis",
+                length: nbytes as u32,
+                content: s,
+            });
+            i += 1; // skip NUL
+        } else {
+            // Advance past whatever didn't match.
+            i = start + 1;
+            if i < bytes.len() && bytes[i] == 0 { i += 1; }
+        }
+    }
+}
+
+/// Scan for UTF-8 strings carrying multi-byte sequences (we already
+/// catch pure-ASCII via `scan_ascii`). Validates 2/3-byte sequences;
+/// 4-byte (supplementary plane) is uncommon in game text and skipped.
+fn scan_utf8(bytes: &[u8], va_base: u32, out: &mut Vec) { + let mut i = 0; + while i < bytes.len() { + let start = i; + let mut has_multibyte = false; + let mut nbytes = 0; + while i < bytes.len() { + let b = bytes[i]; + if b < 0x80 { + if !is_printable_ascii(b) { break; } + nbytes += 1; + i += 1; + } else if (b & 0xE0) == 0xC0 { + // 2-byte: 110xxxxx 10xxxxxx + if i + 1 >= bytes.len() || (bytes[i + 1] & 0xC0) != 0x80 { break; } + has_multibyte = true; + nbytes += 2; + i += 2; + } else if (b & 0xF0) == 0xE0 { + // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx + if i + 2 >= bytes.len() + || (bytes[i + 1] & 0xC0) != 0x80 + || (bytes[i + 2] & 0xC0) != 0x80 { break; } + has_multibyte = true; + nbytes += 3; + i += 3; + } else { + break; + } + } + if has_multibyte + && nbytes >= MIN_LEN + && i < bytes.len() && bytes[i] == 0 + && let Ok(s) = std::str::from_utf8(&bytes[start..i]) + { + out.push(DetectedString { + address: va_base + start as u32, + encoding: "utf8", + length: nbytes as u32, + content: s.to_string(), + }); + i += 1; // skip NUL + } else { + i = start + 1; + if i < bytes.len() && bytes[i] == 0 { i += 1; } + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -195,6 +334,37 @@ mod tests { assert!(utf16.iter().any(|s| s.content == "Hello!")); } + #[test] + fn detects_shift_jis_string() { + let image_base = 0x82000000u32; + let mut pe = vec![0u8; 0x1100]; + let off = 0x1000usize; + // "ABC" + (SJIS hiragana 'a' = 0x82 0xA0) + (SJIS 'i' = 0x82 0xA2) + NUL + let s: &[u8] = b"ABC\x82\xA0\x82\xA2\0"; + pe[off..off + s.len()].copy_from_slice(s); + let sections = vec![mk_section(".rdata", 0x1000, 0x100)]; + let strings = analyze(&pe, image_base, §ions); + let sjis: Vec<_> = strings.iter().filter(|s| s.encoding == "shift_jis").collect(); + assert_eq!(sjis.len(), 1); + assert!(sjis[0].content.contains("ABC")); + assert!(sjis[0].content.contains("\\x82\\xA0")); + } + + #[test] + fn detects_utf8_multibyte_string() { + let image_base = 0x82000000u32; + let mut pe = vec![0u8; 
0x1100]; + let off = 0x1000usize; + // "Café" = 'C', 'a', 'f', 0xC3 0xA9 (é), then more ASCII to reach min length + let s: &[u8] = b"Caf\xC3\xA9eteria\0"; + pe[off..off + s.len()].copy_from_slice(s); + let sections = vec![mk_section(".rdata", 0x1000, 0x100)]; + let strings = analyze(&pe, image_base, §ions); + let u8s: Vec<_> = strings.iter().filter(|s| s.encoding == "utf8").collect(); + assert_eq!(u8s.len(), 1); + assert_eq!(u8s[0].content, "Café".to_string() + "eteria"); + } + #[test] fn requires_nul_terminator() { let image_base = 0x82000000u32; diff --git a/crates/xenia-analysis/src/xref.rs b/crates/xenia-analysis/src/xref.rs index 892e845..666653a 100644 --- a/crates/xenia-analysis/src/xref.rs +++ b/crates/xenia-analysis/src/xref.rs @@ -360,6 +360,22 @@ pub fn analyze_xrefs( 375 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lhaux 21 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // ldx 53 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // ldux + // AltiVec/VMX (opcode 31) loads & stores. Element + // variants store one byte/halfword/word; full + // `stvx` stores 16 bytes. Address resolution still + // requires both rA and rB constant — common only + // in static-table setup loops. + 231 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stvx + 487 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stvxl + 135 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stvebx + 167 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stvehx + 199 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stvewx + // AltiVec/VMX loads — same XO range, kind=read. 
+ 103 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lvx + 359 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lvxl + 7 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lvebx + 39 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lvehx + 71 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lvewx _ => None, } }; diff --git a/crates/xenia-analysis/tests/db_schema_golden.rs b/crates/xenia-analysis/tests/db_schema_golden.rs index 95cb86e..7211b72 100644 --- a/crates/xenia-analysis/tests/db_schema_golden.rs +++ b/crates/xenia-analysis/tests/db_schema_golden.rs @@ -107,7 +107,7 @@ fn db_schema_matches_expected_columns() { w.write_base(&info).expect("write_base"); w.ingest_instructions(&pe, &info, &func_analysis, &labels) .expect("ingest_instructions"); - w.write_analysis_results(&pe, &info, &func_analysis, &labels, &xrefs, &[], &[], &[], None) + w.write_analysis_results(&pe, &info, &func_analysis, &labels, &xrefs, &[], &[], &[], None, &[]) .expect("write_analysis_results"); w.create_sql_views().expect("create_sql_views"); } @@ -249,6 +249,33 @@ fn db_schema_matches_expected_columns() { ("vptr_offset", "BIGINT"), ("writer_function", "BIGINT"), ]), + ("eh_funcinfo", &[ + ("address", "BIGINT"), + ("magic", "BIGINT"), + ("max_state", "BIGINT"), + ("p_unwind_map", "BIGINT"), + ("n_try_blocks", "BIGINT"), + ("p_try_block_map", "BIGINT"), + ("n_ip_map_entries", "BIGINT"), + ("p_ip_to_state_map", "BIGINT"), + ("p_es_type_list", "BIGINT"), + ("eh_flags", "BIGINT"), + ]), + ("eh_unwind_map", &[ + ("funcinfo_address", "BIGINT"), + ("state_index", "BIGINT"), + ("to_state", "BIGINT"), + ("action_pc", "BIGINT"), + ]), + ("eh_try_blocks", &[ + ("funcinfo_address", "BIGINT"), + ("try_index", "BIGINT"), + ("try_low", "BIGINT"), + ("try_high", "BIGINT"), + ("catch_high", "BIGINT"), + ("n_catches", "BIGINT"), + ("p_handler_array", "BIGINT"), + ]), ("xrefs", &[ ("source", "BIGINT"), ("target", "BIGINT"), diff --git a/crates/xenia-app/src/main.rs 
b/crates/xenia-app/src/main.rs index 0dc42a8..782e4bf 100644 --- a/crates/xenia-app/src/main.rs +++ b/crates/xenia-app/src/main.rs @@ -4208,14 +4208,45 @@ fn cmd_dis( // Generic function-pointer-array scan (M8 + M11). Re-emits M3 vtables // plus dispatch tables and static-init tables in `.rdata`. - let fparrays = xenia_analysis::funcptr_arrays::analyze( + let mut fparrays = xenia_analysis::funcptr_arrays::analyze( &pe_image, base, §ions, &function_starts, &vtables, ); + + // M11.5 — static-init driver chain detection. Replaces M11's prologue + // heuristic with a structurally-grounded result where the driver + // function shape matches. + let static_init = xenia_analysis::static_init::analyze( + &pe_image, base, §ions, &func_analysis, &function_starts, + &xref_result.labels, + ); + info!( + static_init_drivers = static_init.drivers.len(), + static_init_arrays = static_init.arrays.len(), + "M11.5 static-init driver scan complete", + ); + // Merge M11.5 results into the funcptr_arrays vector. If an array's + // address already exists from M8/M11, upgrade its kind from + // 'dispatch_table'/'static_init' to a definitive 'static_init'. + let static_init_addrs: std::collections::HashSet = + static_init.arrays.iter().map(|a| a.address).collect(); + fparrays.retain(|a| !static_init_addrs.contains(&a.address)); + for a in &static_init.arrays { + fparrays.push(a.clone()); + } info!( funcptr_arrays = fparrays.len(), dispatch_tables = fparrays.iter().filter(|a| a.kind == "dispatch_table").count(), static_inits = fparrays.iter().filter(|a| a.kind == "static_init").count(), - "function-pointer array scan complete", + "function-pointer array set finalised", + ); + + // M9.5 — MSVC __CxxFrameHandler scope-table magic-scan. 
+ let eh_records = xenia_analysis::eh_scope::analyze(&pe_image, base, §ions); + info!( + eh_funcinfo = eh_records.len(), + eh_unwind_entries = eh_records.iter().map(|r| r.unwind_map.len()).sum::(), + eh_try_blocks = eh_records.iter().map(|r| r.try_blocks.len()).sum::(), + "M9.5 EH scope-table scan complete", ); // M5.5 — typed indirect-dispatch resolution (this->vptr → method). @@ -4274,6 +4305,7 @@ fn cmd_dis( &strings, &fparrays, Some(&typed_ind), + &eh_records, )?; w.write_tls(tls_info.as_ref())?; if matches!(analyze, AnalyzeMode::Sql | AnalyzeMode::Both) {