Merge analysis-overhaul/m5-indirect-reach (M5 + M7)
This commit is contained in:
@@ -166,11 +166,70 @@ See `crates/xenia-analysis/src/lookup.rs`.
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Forward work (M5–M12, not yet landed)
|
## Layer M5 — Indirect-dispatch reachability (landed)
|
||||||
|
|
||||||
|
### Schema additions
|
||||||
|
- New value `'ind_call'` in the `xrefs.kind` set.
|
||||||
|
- New SQL view `v_indirect_reachability_from_entry` — a superset of
|
||||||
|
`v_reachability_from_entry`, taking `ind_call` edges in the BFS.
|
||||||
|
|
||||||
|
### What this layer does
|
||||||
|
- Walks each `FuncAnalysis.functions` entry with a per-basic-block register
|
||||||
|
tracker. Recognises the canonical static-vtable pattern:
|
||||||
|
`lis+addi → lwz off(rA) → mtctr → bcctrl`, where `rA` ends up holding a
|
||||||
|
known vtable's start address from M3.
|
||||||
|
- Honours the PowerPC ABI: `bl`-style calls (op 18 / 16 with LK=1) clobber
|
||||||
|
volatile r0..r12 + ctr but preserve non-volatile r13..r31, so a vtable
|
||||||
|
pointer parked in r30/r31 before a call survives.
|
||||||
|
- Treats every M3 `loc_*` label as a basic-block boundary (kills register
|
||||||
|
state) so jump-IN paths cannot induce false positives.
|
||||||
|
|
||||||
|
### What this layer does NOT do (and observed impact)
|
||||||
|
- Vtable pointer loaded from a `this`-pointer field
|
||||||
|
(`lwz r_vt, off(rA)` where `rA = this`) — by far the dominant pattern in
|
||||||
|
real C++ — is unresolvable without alias / points-to analysis.
|
||||||
|
- On Sylpheed: the layer detects 0 edges. The binary's 1,001 lis+addi
|
||||||
|
references into vtables are mostly constructor-side **vptr writes**
|
||||||
|
(`stw rVtable, vptr_offset(this)`), not direct dispatches. The renderer
|
||||||
|
hunt's audit-009 cluster therefore needs a future M5.5 with `this`-flow
|
||||||
|
tracking before this layer surfaces it.
|
||||||
|
|
||||||
|
### Reference docs
|
||||||
|
- IBM PowerPC ABI: register-save convention (volatile r0..r12 + ctr,
|
||||||
|
non-volatile r13..r31).
|
||||||
|
|
||||||
|
## Layer M7 — String / constant-pool detection (landed)
|
||||||
|
|
||||||
|
### Schema additions
|
||||||
|
- New table `strings(address PK, encoding, length, content)`.
|
||||||
|
- Index `idx_strings_encoding`.
|
||||||
|
|
||||||
|
### What this layer does
|
||||||
|
- Scans `.rdata` for runs of length ≥ 6 of printable ASCII bytes followed by
|
||||||
|
a NUL terminator.
|
||||||
|
- Scans `.rdata` for UTF-16LE runs of length ≥ 6 code units (printable-ASCII
|
||||||
|
basic plane only) followed by a u16 NUL terminator.
|
||||||
|
- Cross-reference is implicit: any existing `xrefs.kind='ref'` row whose
`target` exactly equals a `strings.address` identifies a PC that references
that string. SQL: `SELECT s.content, x.source FROM xrefs x JOIN strings s
ON s.address = x.target WHERE x.kind='ref'`.
|
||||||
|
|
||||||
|
### What this layer does NOT do
|
||||||
|
- No UTF-8 multibyte sequences, and no characters outside printable ASCII in either encoding.
|
||||||
|
- No `.data` scan (read-only-section bias).
|
||||||
|
- No multi-byte CJK encodings — Japanese text in localised builds may be
|
||||||
|
represented in shift_jis / utf-8 with non-printable bytes that this
|
||||||
|
scanner skips.
|
||||||
|
|
||||||
|
### Sylpheed yield
|
||||||
|
- 6,311 ASCII strings (including full embedded HLSL shader source).
|
||||||
|
- 0 UTF-16LE strings (binary uses ASCII / native CJK encoding).
|
||||||
|
- 9,132 lis+addi sites cross-reference into the detected strings, naming
the source PCs that reference each string.
|
||||||
|
|
||||||
|
## Forward work (M6, M8–M12, not yet landed)
|
||||||
|
|
||||||
- **M5** — indirect-dispatch reachability via vtable+CTR dataflow.
|
|
||||||
- **M6** — extended `xrefs.kind='write'` for indexed/byte-reverse/multiword/VMX/DCBZ/atomic stores with `addr_mode` column.
|
- **M6** — extended `xrefs.kind='write'` for indexed/byte-reverse/multiword/VMX/DCBZ/atomic stores with `addr_mode` column.
|
||||||
- **M7** — `.rdata` ASCII / UTF-16 string pool detection cross-referenced with PCs.
|
|
||||||
- **M8** — dispatch-table heuristics beyond vtables (e.g. function-pointer arrays in `.data`).
|
- **M8** — dispatch-table heuristics beyond vtables (e.g. function-pointer arrays in `.data`).
|
||||||
- **M9** — `__CxxFrameHandler` exception scope-table parsing.
|
- **M9** — `__CxxFrameHandler` exception scope-table parsing.
|
||||||
- **M10** — `.tls` section / TLS slot tracking.
|
- **M10** — `.tls` section / TLS slot tracking.
|
||||||
|
|||||||
@@ -306,6 +306,7 @@ impl DbWriter {
|
|||||||
///
|
///
|
||||||
/// `vtables` is the M3 result; pass an empty slice when the caller has
|
/// `vtables` is the M3 result; pass an empty slice when the caller has
|
||||||
/// not run the vtable scan (the tables are still created, just empty).
|
/// not run the vtable scan (the tables are still created, just empty).
|
||||||
|
/// `strings` is the M7 result; same convention.
|
||||||
#[tracing::instrument(skip_all, name = "db.write_analysis_results")]
|
#[tracing::instrument(skip_all, name = "db.write_analysis_results")]
|
||||||
pub fn write_analysis_results(
|
pub fn write_analysis_results(
|
||||||
&mut self,
|
&mut self,
|
||||||
@@ -315,6 +316,7 @@ impl DbWriter {
|
|||||||
labels: &HashMap<u32, String>,
|
labels: &HashMap<u32, String>,
|
||||||
xrefs: &XrefMap,
|
xrefs: &XrefMap,
|
||||||
vtables: &[crate::vtables::Vtable],
|
vtables: &[crate::vtables::Vtable],
|
||||||
|
strings: &[crate::strings::DetectedString],
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
self.conn.execute_batch("
|
self.conn.execute_batch("
|
||||||
CREATE TABLE functions (
|
CREATE TABLE functions (
|
||||||
@@ -368,6 +370,13 @@ impl DbWriter {
|
|||||||
base_classes_json VARCHAR -- JSON of base class names (NULL when stripped)
|
base_classes_json VARCHAR -- JSON of base class names (NULL when stripped)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
CREATE TABLE strings (
|
||||||
|
address BIGINT PRIMARY KEY, -- absolute VA of first byte
|
||||||
|
encoding VARCHAR NOT NULL, -- 'ascii' or 'utf16le'
|
||||||
|
length BIGINT NOT NULL, -- length in bytes (excluding NUL terminator)
|
||||||
|
content VARCHAR NOT NULL -- UTF-8 representation of the string
|
||||||
|
);
|
||||||
|
|
||||||
CREATE TABLE demangled_names (
|
CREATE TABLE demangled_names (
|
||||||
address BIGINT, -- VA the mangled name is associated with; NULL when from a non-address source (e.g. RTTI-only string)
|
address BIGINT, -- VA the mangled name is associated with; NULL when from a non-address source (e.g. RTTI-only string)
|
||||||
mangled VARCHAR NOT NULL, -- original mangled symbol (e.g. ?Foo@Bar@@QEAAXXZ)
|
mangled VARCHAR NOT NULL, -- original mangled symbol (e.g. ?Foo@Bar@@QEAAXXZ)
|
||||||
@@ -395,6 +404,7 @@ impl DbWriter {
|
|||||||
insert_demangled_from_labels(&self.conn, labels, info.import_libraries)?;
|
insert_demangled_from_labels(&self.conn, labels, info.import_libraries)?;
|
||||||
insert_vtables(&self.conn, vtables, pe, info.image_base)?;
|
insert_vtables(&self.conn, vtables, pe, info.image_base)?;
|
||||||
insert_methods_and_classes(&self.conn, vtables, labels)?;
|
insert_methods_and_classes(&self.conn, vtables, labels)?;
|
||||||
|
insert_strings(&self.conn, strings)?;
|
||||||
insert_xrefs_streaming(&self.conn, xrefs, pe, info.image_base, func_analysis, labels)?;
|
insert_xrefs_streaming(&self.conn, xrefs, pe, info.image_base, func_analysis, labels)?;
|
||||||
|
|
||||||
let indices = [
|
let indices = [
|
||||||
@@ -407,6 +417,7 @@ impl DbWriter {
|
|||||||
("idx_demangled_method", "CREATE INDEX idx_demangled_method ON demangled_names(method_name)"),
|
("idx_demangled_method", "CREATE INDEX idx_demangled_method ON demangled_names(method_name)"),
|
||||||
("idx_methods_function", "CREATE INDEX idx_methods_function ON methods(function_address)"),
|
("idx_methods_function", "CREATE INDEX idx_methods_function ON methods(function_address)"),
|
||||||
("idx_classes_rtti", "CREATE INDEX idx_classes_rtti ON classes(rtti_present)"),
|
("idx_classes_rtti", "CREATE INDEX idx_classes_rtti ON classes(rtti_present)"),
|
||||||
|
("idx_strings_encoding", "CREATE INDEX idx_strings_encoding ON strings(encoding)"),
|
||||||
("idx_xrefs_target", "CREATE INDEX idx_xrefs_target ON xrefs(target)"),
|
("idx_xrefs_target", "CREATE INDEX idx_xrefs_target ON xrefs(target)"),
|
||||||
("idx_xrefs_source", "CREATE INDEX idx_xrefs_source ON xrefs(source)"),
|
("idx_xrefs_source", "CREATE INDEX idx_xrefs_source ON xrefs(source)"),
|
||||||
("idx_xrefs_source_func", "CREATE INDEX idx_xrefs_source_func ON xrefs(source_func)"),
|
("idx_xrefs_source_func", "CREATE INDEX idx_xrefs_source_func ON xrefs(source_func)"),
|
||||||
@@ -423,7 +434,8 @@ impl DbWriter {
|
|||||||
|
|
||||||
/// Back-compat wrapper for callers that want the full pre-Phase-3
|
/// Back-compat wrapper for callers that want the full pre-Phase-3
|
||||||
/// "everything in one shot" behaviour. Equivalent to
|
/// "everything in one shot" behaviour. Equivalent to
|
||||||
/// `ingest_instructions` + `write_analysis_results` with no M3 vtables.
|
/// `ingest_instructions` + `write_analysis_results` with no M3 vtables /
|
||||||
|
/// M7 strings.
|
||||||
#[tracing::instrument(skip_all, name = "db.write_disasm")]
|
#[tracing::instrument(skip_all, name = "db.write_disasm")]
|
||||||
pub fn write_disasm(
|
pub fn write_disasm(
|
||||||
&mut self,
|
&mut self,
|
||||||
@@ -434,7 +446,7 @@ impl DbWriter {
|
|||||||
xrefs: &XrefMap,
|
xrefs: &XrefMap,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
self.ingest_instructions(pe, info, func_analysis, labels)?;
|
self.ingest_instructions(pe, info, func_analysis, labels)?;
|
||||||
self.write_analysis_results(pe, info, func_analysis, labels, xrefs, &[])?;
|
self.write_analysis_results(pe, info, func_analysis, labels, xrefs, &[], &[])?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -846,6 +858,30 @@ fn insert_methods_and_classes(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn insert_strings(
|
||||||
|
conn: &Connection,
|
||||||
|
strings: &[crate::strings::DetectedString],
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
if strings.is_empty() { return Ok(()); }
|
||||||
|
let mut stmt = conn.prepare(
|
||||||
|
"INSERT INTO strings (address, encoding, length, content) VALUES (?, ?, ?, ?)
|
||||||
|
ON CONFLICT DO NOTHING"
|
||||||
|
)?;
|
||||||
|
let mut count = 0u64;
|
||||||
|
for s in strings {
|
||||||
|
stmt.execute(params![
|
||||||
|
s.address as i64,
|
||||||
|
s.encoding,
|
||||||
|
s.length as i64,
|
||||||
|
s.content.as_str(),
|
||||||
|
])?;
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
|
metrics::counter!("db.rows", "table" => "strings").increment(count);
|
||||||
|
tracing::info!(rows = count, table = "strings", "bulk insert complete");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn insert_demangled_from_labels(
|
fn insert_demangled_from_labels(
|
||||||
conn: &Connection,
|
conn: &Connection,
|
||||||
labels: &HashMap<u32, String>,
|
labels: &HashMap<u32, String>,
|
||||||
|
|||||||
468
crates/xenia-analysis/src/indirect.rs
Normal file
468
crates/xenia-analysis/src/indirect.rs
Normal file
@@ -0,0 +1,468 @@
|
|||||||
|
//! Indirect-dispatch reachability for vtable-bound `bcctrl`/`bctrl` sites.
|
||||||
|
//!
|
||||||
|
//! Walks each detected function with a tiny per-basic-block register tracker,
|
||||||
|
//! recognising the canonical MSVC PowerPC pattern that loads a slot from a
|
||||||
|
//! statically-addressed vtable into CTR and indirectly calls it:
|
||||||
|
//!
|
||||||
|
//! ```text
|
||||||
|
//! lis rA, hi
|
||||||
|
//! addi rA, rA, lo ; rA = vtable_address
|
||||||
|
//! lwz rB, slot*4(rA) ; rB = vtable[slot]
|
||||||
|
//! mtctr rB ; CTR = vtable[slot]
|
||||||
|
//! bcctrl ; indirect call → vtable[slot]
|
||||||
|
//! ```
|
||||||
|
//!
|
||||||
|
//! Pattern hits are emitted as `(source_pc, target_pc)` pairs that callers
|
||||||
|
//! insert into the `xrefs` table with `kind='ind_call'`.
|
||||||
|
//!
|
||||||
|
//! ### What this does NOT cover
|
||||||
|
//!
|
||||||
|
//! - Vtable pointer loaded from a `this`-pointer field (`lwz rA, off(this)`)
|
||||||
|
//! is the dominant pattern in real C++ code; resolving it requires
|
||||||
|
//! alias / points-to analysis that's far beyond this layer's scope.
|
||||||
|
//! - Indirect calls via function-pointer fields (callbacks) are similarly
|
||||||
|
//! unresolvable without object-flow analysis.
|
||||||
|
//! - Register state is intentionally killed at every label (basic-block
|
||||||
|
//! boundary) — we don't try to do flow-sensitive merging across joins.
|
||||||
|
//!
|
||||||
|
//! Reference: IBM PowerPC ABI on register-save convention, plus the
|
||||||
|
//! `xenia_analysis::xref` `lis+addi`/`lis+ori` tracker which we mirror
|
||||||
|
//! conceptually.
|
||||||
|
|
||||||
|
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||||
|
|
||||||
|
use crate::func::FuncAnalysis;
|
||||||
|
use crate::vtables::Vtable;
|
||||||
|
|
||||||
|
/// A resolved indirect call: the `bcctrl` located at `source` transfers
/// control to `target`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct IndirectEdge {
    /// PC of the `bcctrl` instruction that performs the call.
    pub source: u32,
    /// Method address read out of the resolved vtable slot.
    pub target: u32,
    /// Start address of the vtable the call was resolved through.
    pub via_vtable: u32,
    /// Zero-based method slot index inside that vtable.
    pub slot: u32,
}
|
||||||
|
|
||||||
|
/// Abstract value the tracker may know for a register (or for CTR).
/// "Unknown" is represented by wrapping this type in `Option` and using `None`.
#[derive(Debug, Clone, Copy)]
enum RegVal {
    /// A statically known 32-bit constant, e.g. the result of `lis` + `addi`.
    Const(u32),
    /// A method pointer that was loaded from a slot of a known vtable.
    MethodPtr {
        /// Start address of the vtable the pointer came from.
        vtable_addr: u32,
        /// Slot index the pointer was loaded from.
        slot: u32,
        /// The method address stored in that slot.
        method_pc: u32,
    },
}
|
||||||
|
|
||||||
|
// PowerPC primary opcodes (top six bits of the instruction word) that the
// tracker interprets, listed in numeric order.
const OP_ADDI: u32 = 14; // addi / li
const OP_ADDIS: u32 = 15; // addis / lis
const OP_BCCTR: u32 = 19; // branch-to-CTR/LR family (bcctr, blr, ...); the XO field tells them apart
const OP_ORI: u32 = 24;
const OP_X_FORM: u32 = 31; // X-form group: mtspr, mr, and friends
const OP_LWZ: u32 = 32;
|
||||||
|
|
||||||
|
/// Run the static indirect-dispatch scan. Returns one edge per resolvable
|
||||||
|
/// `bcctrl` site.
|
||||||
|
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
|
||||||
|
pub fn analyze(
|
||||||
|
pe: &[u8],
|
||||||
|
image_base: u32,
|
||||||
|
func_analysis: &FuncAnalysis,
|
||||||
|
vtables: &[Vtable],
|
||||||
|
labels: &HashMap<u32, String>,
|
||||||
|
) -> Vec<IndirectEdge> {
|
||||||
|
let started = std::time::Instant::now();
|
||||||
|
// Index vtables by their start VA so the lwz handler can decide
|
||||||
|
// whether a given Const(addr) is "really" a vtable.
|
||||||
|
let vtable_by_addr: BTreeMap<u32, &Vtable> =
|
||||||
|
vtables.iter().map(|v| (v.address, v)).collect();
|
||||||
|
|
||||||
|
// Set of all "label"-bearing PCs in the analyzed binary. We treat each
|
||||||
|
// label as a basic-block boundary (anything `loc_*` is a jump target,
|
||||||
|
// so register state arriving at it is unreliable).
|
||||||
|
let mut block_boundaries: HashSet<u32> = HashSet::with_capacity(labels.len());
|
||||||
|
for &addr in labels.keys() {
|
||||||
|
block_boundaries.insert(addr);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut edges: Vec<IndirectEdge> = Vec::new();
|
||||||
|
|
||||||
|
for (&fn_start, fi) in &func_analysis.functions {
|
||||||
|
if fi.is_saverestore { continue; }
|
||||||
|
let mut reg: [Option<RegVal>; 32] = [None; 32];
|
||||||
|
let mut ctr: Option<RegVal> = None;
|
||||||
|
let mut pc = fn_start;
|
||||||
|
while pc < fi.end {
|
||||||
|
// Reset register state on basic-block entry. We don't reset on
|
||||||
|
// the function entry itself (PC == fn_start) because labels and
|
||||||
|
// function-starts coincide; the initial state is already None.
|
||||||
|
if pc != fn_start && block_boundaries.contains(&pc) {
|
||||||
|
reg = [None; 32];
|
||||||
|
ctr = None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let instr = match read_instr(pe, image_base, pc) {
|
||||||
|
Some(i) => i,
|
||||||
|
None => break,
|
||||||
|
};
|
||||||
|
|
||||||
|
let op = instr >> 26;
|
||||||
|
let rd = ((instr >> 21) & 0x1F) as usize;
|
||||||
|
let ra = ((instr >> 16) & 0x1F) as usize;
|
||||||
|
let simm = ((instr & 0xFFFF) as i16) as i32;
|
||||||
|
let uimm = instr & 0xFFFF;
|
||||||
|
|
||||||
|
match op {
|
||||||
|
// lis rD, IMM (== addis rD, r0, IMM)
|
||||||
|
OP_ADDIS if ra == 0 => {
|
||||||
|
reg[rd] = Some(RegVal::Const(uimm << 16));
|
||||||
|
}
|
||||||
|
// addis rD, rA, IMM
|
||||||
|
OP_ADDIS => {
|
||||||
|
if let Some(RegVal::Const(b)) = reg[ra] {
|
||||||
|
reg[rd] = Some(RegVal::Const(b.wrapping_add(uimm << 16)));
|
||||||
|
} else {
|
||||||
|
reg[rd] = None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// addi rD, rA, IMM
|
||||||
|
OP_ADDI if ra != 0 => {
|
||||||
|
if let Some(RegVal::Const(b)) = reg[ra] {
|
||||||
|
reg[rd] = Some(RegVal::Const(b.wrapping_add(simm as u32)));
|
||||||
|
} else {
|
||||||
|
reg[rd] = None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// li rD, IMM (== addi rD, 0, IMM)
|
||||||
|
OP_ADDI => {
|
||||||
|
reg[rd] = Some(RegVal::Const(simm as u32));
|
||||||
|
}
|
||||||
|
// ori rA, rS, IMM — note operand order: bits 21..25 = rS, 16..20 = rA
|
||||||
|
OP_ORI => {
|
||||||
|
let rs = rd; // bits 21..25 = source
|
||||||
|
if let Some(RegVal::Const(b)) = reg[rs] {
|
||||||
|
reg[ra] = Some(RegVal::Const(b | uimm));
|
||||||
|
} else {
|
||||||
|
reg[ra] = None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// lwz rD, off(rA) — try to resolve as vtable slot load.
|
||||||
|
OP_LWZ => {
|
||||||
|
if ra != 0
|
||||||
|
&& let Some(RegVal::Const(base)) = reg[ra]
|
||||||
|
{
|
||||||
|
let target = base.wrapping_add(simm as u32);
|
||||||
|
// Two-step lookup so we accept both:
|
||||||
|
// (a) base = exact vtable head, simm/4 = slot
|
||||||
|
// (b) base + simm = exact vtable head (rare;
|
||||||
|
// compiler hoists the slot offset into addi)
|
||||||
|
let resolved = resolve_vtable_slot(target, &vtable_by_addr)
|
||||||
|
.or_else(|| resolve_vtable_slot_via_off(base, simm, &vtable_by_addr));
|
||||||
|
reg[rd] = resolved.map(|(vt, slot, pc)| RegVal::MethodPtr {
|
||||||
|
vtable_addr: vt, slot, method_pc: pc,
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
reg[rd] = None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// X-form: mtspr/mtctr, bcctrl, mr, etc.
|
||||||
|
OP_X_FORM => {
|
||||||
|
let xo = (instr >> 1) & 0x3FF;
|
||||||
|
match xo {
|
||||||
|
467 => {
|
||||||
|
// mtspr SPR, rS — PPC SPR field is split: high 5 bits
|
||||||
|
// in PPC bits 16:20 (= Rust bits 11..15), low 5 bits
|
||||||
|
// in PPC bits 11:15 (= Rust bits 16..20). Mirrors
|
||||||
|
// the convention in `func.rs::is_mfspr_lr`.
|
||||||
|
let spr = (((instr >> 11) & 0x1F) << 5) | ((instr >> 16) & 0x1F);
|
||||||
|
if spr == 9 {
|
||||||
|
ctr = reg[rd];
|
||||||
|
}
|
||||||
|
// Otherwise no observable effect on tracked state.
|
||||||
|
}
|
||||||
|
// Anything that writes rD (most arithmetic, loads, etc.) clobbers it.
|
||||||
|
// Conservative: invalidate rD on any X-form that has rD in bits 21..25
|
||||||
|
// and is NOT a comparison or branch.
|
||||||
|
_ => {
|
||||||
|
// Heuristic: most X-form ops with non-zero RC encode rD; we
|
||||||
|
// invalidate to avoid stale Const propagation past arithmetic.
|
||||||
|
// This is over-eager but safe (false negatives on edges, never
|
||||||
|
// false positives).
|
||||||
|
reg[rd] = None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// bcctr/bcctrl — opcode 19, XO=528. LK in low bit.
|
||||||
|
OP_BCCTR => {
|
||||||
|
let xo = (instr >> 1) & 0x3FF;
|
||||||
|
if xo == 528 {
|
||||||
|
let lk = (instr & 1) != 0;
|
||||||
|
if lk
|
||||||
|
&& let Some(RegVal::MethodPtr { vtable_addr, slot, method_pc }) = ctr
|
||||||
|
{
|
||||||
|
edges.push(IndirectEdge {
|
||||||
|
source: pc,
|
||||||
|
target: method_pc,
|
||||||
|
via_vtable: vtable_addr,
|
||||||
|
slot,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
// After the call, CTR is preserved but rD register
|
||||||
|
// values across the call boundary are not trustworthy.
|
||||||
|
// Don't touch reg state — most ABIs preserve only
|
||||||
|
// some regs anyway.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// op 18: b / bl / ba / bla. LK=1 is a call; LK=0 is an
|
||||||
|
// unconditional branch with no fall-through (next PC is
|
||||||
|
// reached only via a different basic block, which the
|
||||||
|
// label-based reset already handles). On a call, the
|
||||||
|
// PowerPC ABI marks r0..r12 + ctr as volatile and
|
||||||
|
// r13..r31 as non-volatile (callee-saved); preserve the
|
||||||
|
// non-volatile half so vtable pointers loaded into r30/r31
|
||||||
|
// before a `bl` survive the call.
|
||||||
|
18 => {
|
||||||
|
let lk = (instr & 1) != 0;
|
||||||
|
if lk {
|
||||||
|
for r in 0..=12 { reg[r] = None; }
|
||||||
|
ctr = None;
|
||||||
|
}
|
||||||
|
// LK=0 (`b`) makes fall-through unreachable; nothing to do —
|
||||||
|
// any next reachable PC will hit a label boundary.
|
||||||
|
}
|
||||||
|
// Conditional branches (op 16) fall through; preserve all reg
|
||||||
|
// state for the fall-through path. The label-based join-point
|
||||||
|
// invalidation bounds false-positive risk for jump-IN paths.
|
||||||
|
16 => {
|
||||||
|
let lk = (instr & 1) != 0;
|
||||||
|
if lk {
|
||||||
|
for r in 0..=12 { reg[r] = None; }
|
||||||
|
ctr = None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Stores and loads we don't track explicitly clobber rD only
|
||||||
|
// when rD is on the destination side; the conservative rule
|
||||||
|
// is "any non-recognised opcode that may write rD invalidates it".
|
||||||
|
36..=55 => {
|
||||||
|
// Loads write rD; stores don't. The safe pessimisation is
|
||||||
|
// to invalidate rD for the load family (32..=35, 40..=43, etc.)
|
||||||
|
// and leave it alone for stores. We've already handled lwz
|
||||||
|
// above; for the rest, invalidate rD.
|
||||||
|
if matches!(op, 32..=35 | 40..=43 | 48..=51) {
|
||||||
|
reg[rd] = None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
pc = pc.wrapping_add(4);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||||
|
metrics::histogram!("analysis.phase_ms", "phase" => "indirect").record(elapsed_ms);
|
||||||
|
tracing::info!(
|
||||||
|
edges = edges.len(),
|
||||||
|
elapsed_ms,
|
||||||
|
"indirect-dispatch scan complete"
|
||||||
|
);
|
||||||
|
edges
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read the big-endian 32-bit instruction word located at virtual address
/// `addr`.
///
/// The byte offset into `pe` is computed as `addr - image_base` (wrapping),
/// i.e. `pe` is indexed directly by image-relative address. Returns `None`
/// when the four bytes are not fully inside `pe`. That includes addresses
/// below `image_base`, whose wrapped offset lands beyond any realistic buffer.
///
/// All arithmetic is checked, so the bounds test cannot overflow even where
/// `usize` is only 32 bits wide.
fn read_instr(pe: &[u8], image_base: u32, addr: u32) -> Option<u32> {
    let off = usize::try_from(addr.wrapping_sub(image_base)).ok()?;
    let end = off.checked_add(4)?;
    let word: [u8; 4] = pe.get(off..end)?.try_into().ok()?;
    Some(u32::from_be_bytes(word))
}
|
||||||
|
|
||||||
|
/// `target = base + simm` where `target` is an exact vtable head (rare,
|
||||||
|
/// compiler hoists the slot offset into the addi).
|
||||||
|
fn resolve_vtable_slot_via_off(
|
||||||
|
base: u32,
|
||||||
|
simm: i32,
|
||||||
|
vtable_by_addr: &BTreeMap<u32, &Vtable>,
|
||||||
|
) -> Option<(u32, u32, u32)> {
|
||||||
|
let target = base.wrapping_add(simm as u32);
|
||||||
|
if let Some(v) = vtable_by_addr.get(&target)
|
||||||
|
&& !v.methods.is_empty()
|
||||||
|
{
|
||||||
|
return Some((v.address, 0, v.methods[0]));
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `target` is an absolute address. If it falls inside a known vtable's
|
||||||
|
/// `[address, address + length*4)` range AND is 4-aligned to a slot,
|
||||||
|
/// return `(vtable_addr, slot, method_pc)`.
|
||||||
|
fn resolve_vtable_slot(
|
||||||
|
target: u32,
|
||||||
|
vtable_by_addr: &BTreeMap<u32, &Vtable>,
|
||||||
|
) -> Option<(u32, u32, u32)> {
|
||||||
|
// BTreeMap range search for the largest key ≤ target.
|
||||||
|
let (&vt_addr, vt) = vtable_by_addr.range(..=target).next_back()?;
|
||||||
|
if target < vt_addr { return None; }
|
||||||
|
let off = target - vt_addr;
|
||||||
|
if !off.is_multiple_of(4) { return None; }
|
||||||
|
let slot = off / 4;
|
||||||
|
if slot >= vt.length { return None; }
|
||||||
|
let method_pc = *vt.methods.get(slot as usize)?;
|
||||||
|
Some((vt_addr, slot, method_pc))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::func::FuncInfo;
    use std::collections::BTreeMap;

    /// Image base shared by every fixture.
    const IMAGE_BASE: u32 = 0x8200_0000;
    /// Image-relative offset at which the instruction pattern is written.
    const TEXT_RVA: u32 = 0x1000;
    /// Absolute address of the first pattern instruction.
    const PC_START: u32 = IMAGE_BASE + TEXT_RVA;
    /// Address the fixture vtable claims to live at.
    const VTABLE_ADDR: u32 = 0x8201_0000;
    /// Number of instructions emitted by `encode_pattern`.
    const PATTERN_LEN: u32 = 5;

    /// Build an anonymous, RTTI-less vtable holding `methods`.
    fn mk_vtable(addr: u32, methods: Vec<u32>) -> Vtable {
        Vtable {
            address: addr,
            length: methods.len() as u32,
            col_address: None,
            class_name: "ANON_test".into(),
            rtti_present: false,
            base_classes_json: None,
            methods,
        }
    }

    /// Encode the canonical pattern into `buf` at byte `offset`:
    ///   lis   r3, hi
    ///   addi  r3, r3, lo     ; r3 = vtable_addr
    ///   lwz   r4, slot_off(r3)
    ///   mtctr r4
    ///   bcctrl
    fn encode_pattern(buf: &mut [u8], offset: usize, vtable_addr: u32, slot_off: i32) {
        let lo = (vtable_addr & 0xFFFF) as u16;
        // `addi` sign-extends its immediate, so the high half must be
        // carry-adjusted whenever the low half has bit 15 set.
        let hi = (vtable_addr.wrapping_add(0x8000) >> 16) as u16;
        let lis = (15u32 << 26) | (3 << 21) | (hi as u32);
        let addi = (14u32 << 26) | (3 << 21) | (3 << 16) | (lo as u32);
        let lwz = (32u32 << 26) | (4 << 21) | (3 << 16) | ((slot_off as u16) as u32);
        // mtctr r4 == mtspr 9, r4: low SPR half (9) in bits 16..20, high half
        // (0) in bits 11..15, extended opcode 467, Rc = 0.
        let mtctr = (31u32 << 26) | (4 << 21) | (9 << 16) | (467 << 1);
        // bcctrl with BO = 20 (branch always), LK = 1.
        let bcctrl = (19u32 << 26) | (20 << 21) | (528 << 1) | 1;
        let words = [lis, addi, lwz, mtctr, bcctrl];
        for (chunk, word) in buf[offset..offset + words.len() * 4]
            .chunks_exact_mut(4)
            .zip(words)
        {
            chunk.copy_from_slice(&word.to_be_bytes());
        }
    }

    /// A `FuncAnalysis` containing exactly one ordinary function that covers
    /// the encoded pattern.
    fn single_function_analysis() -> FuncAnalysis {
        let mut functions: BTreeMap<u32, FuncInfo> = BTreeMap::new();
        functions.insert(PC_START, FuncInfo {
            start: PC_START,
            end: PC_START + PATTERN_LEN * 4,
            frame_size: 0,
            saved_gprs: 0,
            is_leaf: false,
            is_saverestore: false,
            pdata_validated: false,
            pdata_length: None,
        });
        FuncAnalysis {
            functions,
            save_gpr_base: None,
            restore_gpr_base: None,
            pdata_entries: Vec::new(),
        }
    }

    /// Encode the pattern with `slot_off`, run `analyze` against one vtable
    /// holding `methods`, and return the detected edges.
    fn scan(slot_off: i32, methods: Vec<u32>, labels: &HashMap<u32, String>) -> Vec<IndirectEdge> {
        let mut pe = vec![0u8; 0x1100];
        encode_pattern(&mut pe, TEXT_RVA as usize, VTABLE_ADDR, slot_off);
        let func_analysis = single_function_analysis();
        let vtables = vec![mk_vtable(VTABLE_ADDR, methods)];
        analyze(&pe, IMAGE_BASE, &func_analysis, &vtables, labels)
    }

    #[test]
    fn detects_canonical_lis_addi_lwz_mtctr_bcctrl() {
        // Byte offset 8 == slot 2.
        let edges = scan(8, vec![0xAA, 0xBB, 0xCC, 0xDD], &HashMap::new());

        assert_eq!(edges.len(), 1);
        assert_eq!(edges[0].source, PC_START + 4 * 4); // bcctrl is the 5th instruction
        assert_eq!(edges[0].target, 0xCC); // slot 2
        assert_eq!(edges[0].via_vtable, VTABLE_ADDR);
        assert_eq!(edges[0].slot, 2);
    }

    #[test]
    fn out_of_range_slot_yields_no_edge() {
        // Byte offset 48 == slot 12, but the vtable only has 4 methods.
        let edges = scan(48, vec![0xAA, 0xBB, 0xCC, 0xDD], &HashMap::new());
        assert_eq!(edges.len(), 0);
    }

    #[test]
    fn label_in_middle_kills_state() {
        // Label between addi and lwz — must kill the Const tracking.
        let mut labels: HashMap<u32, String> = HashMap::new();
        labels.insert(PC_START + 8, "loc_mid".to_string());

        let edges = scan(0, vec![0xAA, 0xBB], &labels);
        assert_eq!(edges.len(), 0, "label in middle of pattern must kill register state");
    }
}
|
||||||
@@ -9,6 +9,8 @@ pub mod sql_views;
|
|||||||
pub mod demangle;
|
pub mod demangle;
|
||||||
pub mod vtables;
|
pub mod vtables;
|
||||||
pub mod lookup;
|
pub mod lookup;
|
||||||
|
pub mod indirect;
|
||||||
|
pub mod strings;
|
||||||
|
|
||||||
mod ordinals;
|
mod ordinals;
|
||||||
pub use ordinals::resolve_ordinal;
|
pub use ordinals::resolve_ordinal;
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ pub const ALL_VIEWS: &[(&str, &str)] = &[
|
|||||||
("v_branch_xrefs", V_BRANCH_XREFS),
|
("v_branch_xrefs", V_BRANCH_XREFS),
|
||||||
("v_call_graph", V_CALL_GRAPH),
|
("v_call_graph", V_CALL_GRAPH),
|
||||||
("v_reachability_from_entry", V_REACHABILITY_FROM_ENTRY),
|
("v_reachability_from_entry", V_REACHABILITY_FROM_ENTRY),
|
||||||
|
("v_indirect_reachability_from_entry", V_INDIRECT_REACHABILITY_FROM_ENTRY),
|
||||||
("v_function_first_instruction", V_FUNCTION_FIRST_INSTRUCTION),
|
("v_function_first_instruction", V_FUNCTION_FIRST_INSTRUCTION),
|
||||||
("v_imports_called", V_IMPORTS_CALLED),
|
("v_imports_called", V_IMPORTS_CALLED),
|
||||||
];
|
];
|
||||||
@@ -110,6 +111,29 @@ WITH RECURSIVE reach(fn) AS (
|
|||||||
SELECT fn AS addr FROM reach;
|
SELECT fn AS addr FROM reach;
|
||||||
";
|
";
|
||||||
|
|
||||||
|
/// Reachability extended over `kind='ind_call'` edges from M5. A (non-strict)
|
||||||
|
/// superset of `v_reachability_from_entry` — every fn there is also here,
|
||||||
|
/// plus any function reached only via a vtable bcctrl whose vtable+slot
|
||||||
|
/// the M5 dataflow could resolve. Sample 5 newly-reachable PCs in canary
|
||||||
|
/// before trusting widely; the analysis intentionally leaves out alias-
|
||||||
|
/// dependent indirect calls (vtable loaded from a `this` field).
|
||||||
|
const V_INDIRECT_REACHABILITY_FROM_ENTRY: &str = "
|
||||||
|
CREATE OR REPLACE VIEW v_indirect_reachability_from_entry AS
|
||||||
|
WITH RECURSIVE reach(fn) AS (
|
||||||
|
SELECT i.function FROM instructions i
|
||||||
|
JOIN labels l ON l.address = i.address
|
||||||
|
WHERE l.name = 'entry_point' AND i.function IS NOT NULL
|
||||||
|
UNION
|
||||||
|
SELECT tgt.function FROM xrefs x
|
||||||
|
JOIN instructions src ON src.address = x.source
|
||||||
|
JOIN instructions tgt ON tgt.address = x.target
|
||||||
|
JOIN reach r ON src.function = r.fn
|
||||||
|
WHERE x.kind IN ('call', 'ind_call', 'j', 'br')
|
||||||
|
AND tgt.function IS NOT NULL
|
||||||
|
)
|
||||||
|
SELECT fn AS addr FROM reach;
|
||||||
|
";
|
||||||
|
|
||||||
/// Convenience join: each function's first decoded instruction. Useful for
|
/// Convenience join: each function's first decoded instruction. Useful for
|
||||||
/// quickly inspecting prologue patterns without computing offsets manually.
|
/// quickly inspecting prologue patterns without computing offsets manually.
|
||||||
const V_FUNCTION_FIRST_INSTRUCTION: &str = "
|
const V_FUNCTION_FIRST_INSTRUCTION: &str = "
|
||||||
|
|||||||
212
crates/xenia-analysis/src/strings.rs
Normal file
212
crates/xenia-analysis/src/strings.rs
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
//! String / constant-pool detection in `.rdata`.
|
||||||
|
//!
|
||||||
|
//! Scans the `.rdata` section for NUL-terminated runs of printable ASCII or
//! UTF-16LE characters of length ≥ 6, emitting one row per discovered string.
|
||||||
|
//! Cross-references against `xrefs.target` are computed by the caller —
|
||||||
|
//! this module only finds the strings; downstream queries can join.
|
||||||
|
//!
|
||||||
|
//! ### What this layer does NOT do
|
||||||
|
//!
|
||||||
|
//! - No UTF-8 multibyte detection — Xbox 360 game binaries reliably use
|
||||||
|
//! ASCII for debug strings and UTF-16LE for localised text.
|
||||||
|
//! - Strings in `.data` (mutable globals) are not scanned by default.
|
||||||
|
//! - Wide strings on Xbox 360 are little-endian (compiler convention even
|
||||||
|
//! on this big-endian platform); we do NOT try big-endian UTF-16.
|
||||||
|
//! - No language detection / classification beyond encoding.
|
||||||
|
//!
|
||||||
|
//! Reference: `objdump -s` `.rdata` walks rely on the same heuristic.
|
||||||
|
|
||||||
|
use xenia_xex::pe::PeSection;
|
||||||
|
|
||||||
|
/// One string discovered by the scan.
///
/// Derives `PartialEq`/`Eq` so that scan results can be compared directly
/// (for example in tests or when de-duplicating rows).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectedString {
    /// Absolute VA of the first byte.
    pub address: u32,
    /// `"ascii"` or `"utf16le"`.
    pub encoding: &'static str,
    /// Length in bytes of the string as stored in the image, excluding the
    /// NUL terminator. For `"utf16le"` this is twice the number of code
    /// units, not the length of `content`.
    pub length: u32,
    /// UTF-8 representation of the string content.
    pub content: String,
}
|
||||||
|
|
||||||
|
/// Scan all `.rdata` sections (and any other read-only data section the user
|
||||||
|
/// configures) for ASCII and UTF-16LE strings.
|
||||||
|
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
|
||||||
|
pub fn analyze(pe: &[u8], image_base: u32, sections: &[PeSection]) -> Vec<DetectedString> {
|
||||||
|
let started = std::time::Instant::now();
|
||||||
|
let mut out: Vec<DetectedString> = Vec::new();
|
||||||
|
|
||||||
|
for section in sections {
|
||||||
|
if section.name != ".rdata" { continue; }
|
||||||
|
let raw_start = section.virtual_address as usize;
|
||||||
|
let raw_end = (section.virtual_address + section.virtual_size) as usize;
|
||||||
|
if raw_end > pe.len() { continue; }
|
||||||
|
let bytes = &pe[raw_start..raw_end.min(pe.len())];
|
||||||
|
let va_base = image_base + section.virtual_address;
|
||||||
|
|
||||||
|
scan_ascii(bytes, va_base, &mut out);
|
||||||
|
scan_utf16le(bytes, va_base, &mut out);
|
||||||
|
}
|
||||||
|
|
||||||
|
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||||
|
let n_ascii = out.iter().filter(|s| s.encoding == "ascii").count();
|
||||||
|
let n_utf16 = out.len() - n_ascii;
|
||||||
|
metrics::histogram!("analysis.phase_ms", "phase" => "strings").record(elapsed_ms);
|
||||||
|
tracing::info!(
|
||||||
|
ascii = n_ascii,
|
||||||
|
utf16le = n_utf16,
|
||||||
|
total = out.len(),
|
||||||
|
elapsed_ms,
|
||||||
|
"string scan complete"
|
||||||
|
);
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Minimum run length for a candidate to be reported: counted in bytes by
/// the ASCII scan and in 16-bit code units by the UTF-16LE scan.
const MIN_LEN: usize = 6;
|
||||||
|
|
||||||
|
/// Returns `true` for bytes that may appear inside a detected string:
/// space, the visible ASCII glyphs, and tab / line feed / carriage return.
fn is_printable_ascii(b: u8) -> bool {
    // `is_ascii_graphic` covers 0x21..=0x7E; space and the three common
    // whitespace controls are admitted explicitly.
    let is_text_whitespace = b == b' ' || b == b'\t' || b == b'\n' || b == b'\r';
    is_text_whitespace || b.is_ascii_graphic()
}
|
||||||
|
|
||||||
|
fn scan_ascii(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
|
||||||
|
let mut i = 0;
|
||||||
|
while i < bytes.len() {
|
||||||
|
if !is_printable_ascii(bytes[i]) {
|
||||||
|
i += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let start = i;
|
||||||
|
while i < bytes.len() && is_printable_ascii(bytes[i]) { i += 1; }
|
||||||
|
let run_len = i - start;
|
||||||
|
// Require NUL termination and minimum length.
|
||||||
|
if run_len >= MIN_LEN && i < bytes.len() && bytes[i] == 0 {
|
||||||
|
let s = std::str::from_utf8(&bytes[start..i]).unwrap_or("");
|
||||||
|
out.push(DetectedString {
|
||||||
|
address: va_base + start as u32,
|
||||||
|
encoding: "ascii",
|
||||||
|
length: run_len as u32,
|
||||||
|
content: s.to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
// Skip the NUL (if any) before continuing.
|
||||||
|
if i < bytes.len() && bytes[i] == 0 { i += 1; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn scan_utf16le(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
|
||||||
|
// UTF-16LE strings are 2-byte aligned in MSVC output. Walk on even
|
||||||
|
// offsets to avoid misaligned hits.
|
||||||
|
let mut i = 0;
|
||||||
|
while i + 2 <= bytes.len() {
|
||||||
|
if !i.is_multiple_of(2) { i += 1; continue; }
|
||||||
|
let lo = bytes[i];
|
||||||
|
let hi = bytes[i + 1];
|
||||||
|
// Restrict scan-start to printable ASCII range with a zero high byte —
|
||||||
|
// this is what real Xbox 360 wide strings look like.
|
||||||
|
if hi != 0 || !is_printable_ascii(lo) {
|
||||||
|
i += 2;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let start = i;
|
||||||
|
let mut codeunits: Vec<u16> = Vec::new();
|
||||||
|
while i + 2 <= bytes.len() {
|
||||||
|
let l = bytes[i];
|
||||||
|
let h = bytes[i + 1];
|
||||||
|
if h != 0 || !is_printable_ascii(l) { break; }
|
||||||
|
codeunits.push((h as u16) << 8 | l as u16);
|
||||||
|
i += 2;
|
||||||
|
}
|
||||||
|
// Require NUL u16 terminator.
|
||||||
|
let nul_terminated = i + 2 <= bytes.len() && bytes[i] == 0 && bytes[i + 1] == 0;
|
||||||
|
if codeunits.len() >= MIN_LEN && nul_terminated {
|
||||||
|
let s: String = String::from_utf16_lossy(&codeunits);
|
||||||
|
out.push(DetectedString {
|
||||||
|
address: va_base + start as u32,
|
||||||
|
encoding: "utf16le",
|
||||||
|
length: ((i - start) as u32),
|
||||||
|
content: s,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
// Skip past the terminator.
|
||||||
|
if nul_terminated { i += 2; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a section descriptor whose raw layout mirrors its virtual
    /// layout, so a flat test buffer can be indexed by virtual address.
    fn mk_section(name: &str, va: u32, size: u32) -> PeSection {
        PeSection {
            name: name.into(),
            virtual_address: va,
            virtual_size: size,
            raw_offset: va,
            raw_size: size,
            flags: 0x4000_0040,
        }
    }

    #[test]
    fn detects_ascii_string() {
        let image_base = 0x82000000u32;
        let mut pe = vec![0u8; 0x1100];
        let off = 0x1000usize;
        let s = b"Hello, world!\0";
        pe[off..off + s.len()].copy_from_slice(s);
        let sections = vec![mk_section(".rdata", 0x1000, 0x100)];
        let strings = analyze(&pe, image_base, &sections);
        assert_eq!(strings.len(), 1);
        assert_eq!(strings[0].encoding, "ascii");
        assert_eq!(strings[0].content, "Hello, world!");
        assert_eq!(strings[0].address, image_base + 0x1000);
    }

    #[test]
    fn rejects_short_runs() {
        let image_base = 0x82000000u32;
        let mut pe = vec![0u8; 0x1100];
        let off = 0x1000usize;
        let s = b"Hi\0longer string here\0";
        pe[off..off + s.len()].copy_from_slice(s);
        let sections = vec![mk_section(".rdata", 0x1000, 0x100)];
        let strings = analyze(&pe, image_base, &sections);
        assert_eq!(strings.len(), 1);
        assert_eq!(strings[0].content, "longer string here");
    }

    #[test]
    fn detects_utf16le_string() {
        let image_base = 0x82000000u32;
        let mut pe = vec![0u8; 0x1100];
        let off = 0x1000usize;
        // "Hello!" in UTF-16LE + NUL u16
        let s: &[u8] = b"H\0e\0l\0l\0o\0!\0\0\0";
        pe[off..off + s.len()].copy_from_slice(s);
        let sections = vec![mk_section(".rdata", 0x1000, 0x100)];
        let strings = analyze(&pe, image_base, &sections);
        // Both ASCII and UTF-16 may detect — UTF-16 should find it as wide;
        // ASCII pass scans bytes and won't see this as a contiguous run
        // because of the interleaved 0 bytes (non-printable).
        let utf16: Vec<_> = strings.iter().filter(|s| s.encoding == "utf16le").collect();
        assert!(utf16.iter().any(|s| s.content == "Hello!"));
    }

    #[test]
    fn requires_nul_terminator() {
        let image_base = 0x82000000u32;
        let mut pe = vec![0u8; 0x1100];
        // No trailing NUL — should NOT be detected.
        let off = 0x1000usize;
        let s = b"abcdefghij";
        pe[off..off + s.len()].copy_from_slice(s);
        // Fill rest of section with 0xFF so the run terminates cleanly without NUL.
        pe[off + s.len()..off + 0x100].fill(0xFF);
        let sections = vec![mk_section(".rdata", 0x1000, 0x100)];
        let strings = analyze(&pe, image_base, &sections);
        assert_eq!(strings.len(), 0);
    }
}
|
||||||
@@ -8,23 +8,25 @@ use crate::func::FuncAnalysis;
|
|||||||
|
|
||||||
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||||
pub enum XrefKind {
|
pub enum XrefKind {
|
||||||
Call, // bl
|
Call, // bl
|
||||||
Jump, // b (unconditional)
|
IndirectCall, // bcctrl through a statically-resolvable vtable slot (M5)
|
||||||
Branch, // bc / bXX (conditional)
|
Jump, // b (unconditional)
|
||||||
DataRead, // lwz, lbz, lhz, lha, lfs, lfd, etc. from resolved address
|
Branch, // bc / bXX (conditional)
|
||||||
DataWrite, // stw, stb, sth, stfs, stfd, etc. to resolved address
|
DataRead, // lwz, lbz, lhz, lha, lfs, lfd, etc. from resolved address
|
||||||
DataRef, // address computed via lis+addi/ori but not directly loaded/stored
|
DataWrite, // stw, stb, sth, stfs, stfd, etc. to resolved address
|
||||||
|
DataRef, // address computed via lis+addi/ori but not directly loaded/stored
|
||||||
}
|
}
|
||||||
|
|
||||||
impl XrefKind {
|
impl XrefKind {
|
||||||
pub fn tag(self) -> &'static str {
|
pub fn tag(self) -> &'static str {
|
||||||
match self {
|
match self {
|
||||||
XrefKind::Call => "call",
|
XrefKind::Call => "call",
|
||||||
XrefKind::Jump => "j",
|
XrefKind::IndirectCall => "ind_call",
|
||||||
XrefKind::Branch => "br",
|
XrefKind::Jump => "j",
|
||||||
XrefKind::DataRead => "read",
|
XrefKind::Branch => "br",
|
||||||
XrefKind::DataWrite => "write",
|
XrefKind::DataRead => "read",
|
||||||
XrefKind::DataRef => "ref",
|
XrefKind::DataWrite => "write",
|
||||||
|
XrefKind::DataRef => "ref",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -106,7 +106,7 @@ fn db_schema_matches_expected_columns() {
|
|||||||
w.write_base(&info).expect("write_base");
|
w.write_base(&info).expect("write_base");
|
||||||
w.ingest_instructions(&pe, &info, &func_analysis, &labels)
|
w.ingest_instructions(&pe, &info, &func_analysis, &labels)
|
||||||
.expect("ingest_instructions");
|
.expect("ingest_instructions");
|
||||||
w.write_analysis_results(&pe, &info, &func_analysis, &labels, &xrefs, &[])
|
w.write_analysis_results(&pe, &info, &func_analysis, &labels, &xrefs, &[], &[])
|
||||||
.expect("write_analysis_results");
|
.expect("write_analysis_results");
|
||||||
w.create_sql_views().expect("create_sql_views");
|
w.create_sql_views().expect("create_sql_views");
|
||||||
}
|
}
|
||||||
@@ -202,6 +202,12 @@ fn db_schema_matches_expected_columns() {
|
|||||||
("rtti_present", "BOOLEAN"),
|
("rtti_present", "BOOLEAN"),
|
||||||
("base_classes_json", "VARCHAR"),
|
("base_classes_json", "VARCHAR"),
|
||||||
]),
|
]),
|
||||||
|
("strings", &[
|
||||||
|
("address", "BIGINT"),
|
||||||
|
("encoding", "VARCHAR"),
|
||||||
|
("length", "BIGINT"),
|
||||||
|
("content", "VARCHAR"),
|
||||||
|
]),
|
||||||
("xrefs", &[
|
("xrefs", &[
|
||||||
("source", "BIGINT"),
|
("source", "BIGINT"),
|
||||||
("target", "BIGINT"),
|
("target", "BIGINT"),
|
||||||
@@ -269,6 +275,7 @@ fn db_schema_matches_expected_columns() {
|
|||||||
"v_call_graph",
|
"v_call_graph",
|
||||||
"v_function_first_instruction",
|
"v_function_first_instruction",
|
||||||
"v_imports_called",
|
"v_imports_called",
|
||||||
|
"v_indirect_reachability_from_entry",
|
||||||
"v_reachability_from_entry",
|
"v_reachability_from_entry",
|
||||||
];
|
];
|
||||||
for v in expected_views {
|
for v in expected_views {
|
||||||
|
|||||||
@@ -4081,7 +4081,7 @@ fn cmd_dis(
|
|||||||
);
|
);
|
||||||
|
|
||||||
// Cross-reference analysis
|
// Cross-reference analysis
|
||||||
let xref_result = xenia_analysis::xref::analyze_xrefs(
|
let mut xref_result = xenia_analysis::xref::analyze_xrefs(
|
||||||
&pe_image, base, entry, &sections, &func_analysis, &import_map,
|
&pe_image, base, entry, &sections, &func_analysis, &import_map,
|
||||||
);
|
);
|
||||||
let total_xrefs: usize = xref_result.xrefs.values().map(|v| v.len()).sum();
|
let total_xrefs: usize = xref_result.xrefs.values().map(|v| v.len()).sum();
|
||||||
@@ -4106,6 +4106,28 @@ fn cmd_dis(
|
|||||||
"vtable scan complete",
|
"vtable scan complete",
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Indirect-dispatch reachability (M5). Walks each function looking for
|
||||||
|
// the canonical lis+addi → lwz off(vtable) → mtctr → bcctrl pattern and
|
||||||
|
// emits one xref edge per resolvable site. Inserted into xrefs as
|
||||||
|
// kind='ind_call'.
|
||||||
|
let indirect_edges = xenia_analysis::indirect::analyze(
|
||||||
|
&pe_image, base, &func_analysis, &vtables, &xref_result.labels,
|
||||||
|
);
|
||||||
|
info!(indirect_edges = indirect_edges.len(), "indirect-dispatch scan complete");
|
||||||
|
for edge in &indirect_edges {
|
||||||
|
xref_result.xrefs
|
||||||
|
.entry(edge.target)
|
||||||
|
.or_default()
|
||||||
|
.push(xenia_analysis::xref::Xref {
|
||||||
|
source: edge.source,
|
||||||
|
kind: xenia_analysis::xref::XrefKind::IndirectCall,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// String / constant-pool detection (M7).
|
||||||
|
let strings = xenia_analysis::strings::analyze(&pe_image, base, &sections);
|
||||||
|
info!(strings = strings.len(), "string scan complete");
|
||||||
|
|
||||||
// Build DisasmInfo
|
// Build DisasmInfo
|
||||||
let disasm_info = xenia_analysis::formatter::DisasmInfo {
|
let disasm_info = xenia_analysis::formatter::DisasmInfo {
|
||||||
image_base: base,
|
image_base: base,
|
||||||
@@ -4130,6 +4152,7 @@ fn cmd_dis(
|
|||||||
&xref_result.labels,
|
&xref_result.labels,
|
||||||
&xref_result.xrefs,
|
&xref_result.xrefs,
|
||||||
&vtables,
|
&vtables,
|
||||||
|
&strings,
|
||||||
)?;
|
)?;
|
||||||
if matches!(analyze, AnalyzeMode::Sql | AnalyzeMode::Both) {
|
if matches!(analyze, AnalyzeMode::Sql | AnalyzeMode::Both) {
|
||||||
w.create_sql_views()?;
|
w.create_sql_views()?;
|
||||||
|
|||||||
Reference in New Issue
Block a user