M5+M7: indirect-dispatch reachability + .rdata string detection
Two MEDIUM milestones bundled (both opportunistic per plan; both small).
## M5 — indirect-dispatch reachability
- `xenia_analysis::indirect`: per-basic-block register tracker over each
detected function. Recognises the canonical static-vtable pattern
`lis+addi → lwz off(rA) → mtctr → bcctrl` where rA holds a known M3
vtable address. Emits one `Xref { kind: IndirectCall }` per resolvable
bcctrl site.
- PowerPC ABI awareness: `bl`-style calls clobber volatile r0..r12 + ctr
but preserve non-volatile r13..r31, so a vtable pointer parked in r30/r31
before a call survives.
- Label-based basic-block boundaries kill register state — bounds
false-positive risk for jump-IN paths.
- New `XrefKind::IndirectCall` variant (DB tag `'ind_call'`).
- New SQL view `v_indirect_reachability_from_entry` — strict superset of
`v_reachability_from_entry`, taking `ind_call` edges in the BFS.
Sylpheed yield: 0 edges detected. The binary's 1,001 static lis+addi
references into vtables are nearly all constructor-side vptr writes, not
dispatches; real method dispatch goes through `this->vptr` which requires
alias analysis we explicitly don't do. Documented in SCHEMA.md as the
expected limitation. Three unit tests cover the synthetic-correctness path.
## M7 — string / constant-pool detection
- `xenia_analysis::strings`: scans `.rdata` for runs of ≥ 6 printable
ASCII bytes (NUL-terminated) and ≥ 6 UTF-16LE code units (basic-plane
printable ASCII, NUL u16 terminator).
- New `strings(address PK, encoding, length, content)` table + encoding index.
- Implicit cross-ref via existing `xrefs.kind='ref'` rows whose target
matches a strings.address.
Sylpheed yield: 6,311 ASCII strings (including embedded HLSL shader source
and AS_CB_SURFACE_SWIZZLE_* assertion strings). 9,132 lis+addi sites
cross-reference detected strings — names source PCs near each string in
one query. Four unit tests cover encoding detection, NUL termination, and
short-run rejection.
Tests 626→633 (+3 indirect, +4 strings).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
212
crates/xenia-analysis/src/strings.rs
Normal file
212
crates/xenia-analysis/src/strings.rs
Normal file
@@ -0,0 +1,212 @@
|
||||
//! String / constant-pool detection in `.rdata`.
|
||||
//!
|
||||
//! Scans the `.rdata` section for NUL-terminated runs of printable ASCII or
|
||||
//! UTF-16LE characters of length ≥ 6, emitting one row per discovered string.
|
||||
//! Cross-references against `xrefs.target` are computed by the caller —
|
||||
//! this module only finds the strings; downstream queries can join.
|
||||
//!
|
||||
//! ### What this layer does NOT do
|
||||
//!
|
||||
//! - No UTF-8 multibyte detection — Xbox 360 game binaries reliably use
|
||||
//! ASCII for debug strings and UTF-16LE for localised text.
|
||||
//! - Strings in `.data` (mutable globals) are not scanned; only sections named `.rdata` are.
|
||||
//! - Wide strings on Xbox 360 are little-endian (compiler convention even
|
||||
//! on this big-endian platform); we do NOT try big-endian UTF-16.
|
||||
//! - No language detection / classification beyond encoding.
|
||||
//!
|
||||
//! Reference: `objdump -s` `.rdata` walks rely on the same heuristic.
|
||||
|
||||
use xenia_xex::pe::PeSection;
|
||||
|
||||
/// One string found by the scanner.
///
/// Values compare field-by-field, so scan results can be checked with
/// `assert_eq!` or deduplicated by callers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectedString {
    /// Absolute virtual address of the string's first byte.
    pub address: u32,
    /// Encoding tag: `"ascii"` or `"utf16le"`.
    pub encoding: &'static str,
    /// Length of the string in bytes, not counting the NUL terminator.
    pub length: u32,
    /// The string's text, converted to UTF-8.
    pub content: String,
}
|
||||
|
||||
/// Scan all `.rdata` sections (and any other read-only data section the user
|
||||
/// configures) for ASCII and UTF-16LE strings.
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
|
||||
pub fn analyze(pe: &[u8], image_base: u32, sections: &[PeSection]) -> Vec<DetectedString> {
|
||||
let started = std::time::Instant::now();
|
||||
let mut out: Vec<DetectedString> = Vec::new();
|
||||
|
||||
for section in sections {
|
||||
if section.name != ".rdata" { continue; }
|
||||
let raw_start = section.virtual_address as usize;
|
||||
let raw_end = (section.virtual_address + section.virtual_size) as usize;
|
||||
if raw_end > pe.len() { continue; }
|
||||
let bytes = &pe[raw_start..raw_end.min(pe.len())];
|
||||
let va_base = image_base + section.virtual_address;
|
||||
|
||||
scan_ascii(bytes, va_base, &mut out);
|
||||
scan_utf16le(bytes, va_base, &mut out);
|
||||
}
|
||||
|
||||
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||
let n_ascii = out.iter().filter(|s| s.encoding == "ascii").count();
|
||||
let n_utf16 = out.len() - n_ascii;
|
||||
metrics::histogram!("analysis.phase_ms", "phase" => "strings").record(elapsed_ms);
|
||||
tracing::info!(
|
||||
ascii = n_ascii,
|
||||
utf16le = n_utf16,
|
||||
total = out.len(),
|
||||
elapsed_ms,
|
||||
"string scan complete"
|
||||
);
|
||||
out
|
||||
}
|
||||
|
||||
/// Minimum run length for a candidate to be reported: counted in bytes by the
/// ASCII scan and in 16-bit code units by the UTF-16LE scan.
const MIN_LEN: usize = 6;
|
||||
|
||||
/// Returns `true` for bytes that may appear inside a string run: space through
/// `~` (0x20..=0x7E) plus tab, line feed and carriage return.
fn is_printable_ascii(b: u8) -> bool {
    // `is_ascii_graphic` covers 0x21..=0x7E; the space and the three
    // whitespace control characters are accepted explicitly.
    b.is_ascii_graphic() || [b' ', b'\t', b'\n', b'\r'].contains(&b)
}
|
||||
|
||||
fn scan_ascii(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
|
||||
let mut i = 0;
|
||||
while i < bytes.len() {
|
||||
if !is_printable_ascii(bytes[i]) {
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
let start = i;
|
||||
while i < bytes.len() && is_printable_ascii(bytes[i]) { i += 1; }
|
||||
let run_len = i - start;
|
||||
// Require NUL termination and minimum length.
|
||||
if run_len >= MIN_LEN && i < bytes.len() && bytes[i] == 0 {
|
||||
let s = std::str::from_utf8(&bytes[start..i]).unwrap_or("");
|
||||
out.push(DetectedString {
|
||||
address: va_base + start as u32,
|
||||
encoding: "ascii",
|
||||
length: run_len as u32,
|
||||
content: s.to_string(),
|
||||
});
|
||||
}
|
||||
// Skip the NUL (if any) before continuing.
|
||||
if i < bytes.len() && bytes[i] == 0 { i += 1; }
|
||||
}
|
||||
}
|
||||
|
||||
fn scan_utf16le(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
|
||||
// UTF-16LE strings are 2-byte aligned in MSVC output. Walk on even
|
||||
// offsets to avoid misaligned hits.
|
||||
let mut i = 0;
|
||||
while i + 2 <= bytes.len() {
|
||||
if !i.is_multiple_of(2) { i += 1; continue; }
|
||||
let lo = bytes[i];
|
||||
let hi = bytes[i + 1];
|
||||
// Restrict scan-start to printable ASCII range with a zero high byte —
|
||||
// this is what real Xbox 360 wide strings look like.
|
||||
if hi != 0 || !is_printable_ascii(lo) {
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
let start = i;
|
||||
let mut codeunits: Vec<u16> = Vec::new();
|
||||
while i + 2 <= bytes.len() {
|
||||
let l = bytes[i];
|
||||
let h = bytes[i + 1];
|
||||
if h != 0 || !is_printable_ascii(l) { break; }
|
||||
codeunits.push((h as u16) << 8 | l as u16);
|
||||
i += 2;
|
||||
}
|
||||
// Require NUL u16 terminator.
|
||||
let nul_terminated = i + 2 <= bytes.len() && bytes[i] == 0 && bytes[i + 1] == 0;
|
||||
if codeunits.len() >= MIN_LEN && nul_terminated {
|
||||
let s: String = String::from_utf16_lossy(&codeunits);
|
||||
out.push(DetectedString {
|
||||
address: va_base + start as u32,
|
||||
encoding: "utf16le",
|
||||
length: ((i - start) as u32),
|
||||
content: s,
|
||||
});
|
||||
}
|
||||
// Skip past the terminator.
|
||||
if nul_terminated { i += 2; }
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a section descriptor whose raw layout mirrors its virtual layout.
    fn mk_section(name: &str, va: u32, size: u32) -> PeSection {
        PeSection {
            name: name.into(),
            virtual_address: va,
            virtual_size: size,
            raw_offset: va,
            raw_size: size,
            flags: 0x4000_0040,
        }
    }

    #[test]
    fn detects_ascii_string() {
        let image_base = 0x82000000u32;
        let mut pe = vec![0u8; 0x1100];
        let off = 0x1000usize;
        let s = b"Hello, world!\0";
        pe[off..off + s.len()].copy_from_slice(s);
        let sections = vec![mk_section(".rdata", 0x1000, 0x100)];
        let strings = analyze(&pe, image_base, &sections);
        assert_eq!(strings.len(), 1);
        assert_eq!(strings[0].encoding, "ascii");
        assert_eq!(strings[0].content, "Hello, world!");
        assert_eq!(strings[0].address, image_base + 0x1000);
    }

    #[test]
    fn rejects_short_runs() {
        let image_base = 0x82000000u32;
        let mut pe = vec![0u8; 0x1100];
        let off = 0x1000usize;
        let s = b"Hi\0longer string here\0";
        pe[off..off + s.len()].copy_from_slice(s);
        let sections = vec![mk_section(".rdata", 0x1000, 0x100)];
        let strings = analyze(&pe, image_base, &sections);
        assert_eq!(strings.len(), 1);
        assert_eq!(strings[0].content, "longer string here");
    }

    #[test]
    fn detects_utf16le_string() {
        let image_base = 0x82000000u32;
        let mut pe = vec![0u8; 0x1100];
        let off = 0x1000usize;
        // "Hello!" in UTF-16LE + NUL u16
        let s: &[u8] = b"H\0e\0l\0l\0o\0!\0\0\0";
        pe[off..off + s.len()].copy_from_slice(s);
        let sections = vec![mk_section(".rdata", 0x1000, 0x100)];
        let strings = analyze(&pe, image_base, &sections);
        // The ASCII pass sees only one-byte runs here because of the
        // interleaved zero bytes, so the wide scan is the one that must
        // report the string.
        let utf16: Vec<_> = strings.iter().filter(|s| s.encoding == "utf16le").collect();
        assert!(utf16.iter().any(|s| {
            s.content == "Hello!" && s.address == image_base + 0x1000 && s.length == 12
        }));
    }

    #[test]
    fn requires_nul_terminator() {
        let image_base = 0x82000000u32;
        let mut pe = vec![0u8; 0x1100];
        // No trailing NUL — should NOT be detected.
        let off = 0x1000usize;
        let s = b"abcdefghij";
        pe[off..off + s.len()].copy_from_slice(s);
        // Fill rest of section with 0xFF so the run terminates cleanly without NUL.
        pe[off + s.len()..off + 0x100].fill(0xFF);
        let sections = vec![mk_section(".rdata", 0x1000, 0x100)];
        let strings = analyze(&pe, image_base, &sections);
        assert_eq!(strings.len(), 0);
    }
}
|
||||
Reference in New Issue
Block a user