M3: vtable scan + MSVC RTTI walk + 3 new tables
Adds detection of statically-allocated MSVC vtables in .rdata/.data: - New `xenia_analysis::vtables` walks read-only sections looking for runs of ≥3 contiguous big-endian u32 values where each value lands on a known function start (from M1's corrected functions table). 2-slot runs are rejected to keep false-positive rate down. - For each candidate the MSVC RTTI walk vtable[-1] → CompleteObjectLocator → TypeDescriptor → mangled name is attempted; on success the demangled class name is recorded along with a best-effort RTTIClassHierarchyDescriptor walk to fill base_classes_json. On failure (RTTI stripped — common for shipped game binaries) the class is named ANON_Class_<fnv1a-hash> keyed by sorted method-PC list, so identical vtables collapse to one entry. - DB: new tables `vtables`, `methods`, `classes` with indices on function_address and rtti_present. `write_analysis_results` takes a `&[Vtable]` slice; `write_disasm` (back-compat) passes empty. - cmd_dis wires the scan after xref analysis using `func_analysis.functions.keys()` as the function-start oracle. Validation on Sylpheed (RTTI stripped, as expected): 722 vtables / 499 unique classes / 5571 methods. Sanity invariant: every methods.function_address joins to functions.address (0 broken refs). Largest vtable: 131 slots. Tests 617→621 (+4 vtable unit tests covering 3-slot detect, 2-slot reject, synth name stability, and synth name divergence). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
424
crates/xenia-analysis/src/vtables.rs
Normal file
424
crates/xenia-analysis/src/vtables.rs
Normal file
@@ -0,0 +1,424 @@
|
||||
//! MSVC vtable + RTTI detection.
//!
//! Heuristic two-pass scan over the binary's static data sections (`.rdata`
//! and `.data`). Pass 1 finds candidate vtables — runs of ≥3 contiguous
//! big-endian u32 values that all land on known function entries. Pass 2
//! attempts the MSVC RTTI walk
//! `vtable[-1] → CompleteObjectLocator → TypeDescriptor → mangled name`. When
//! RTTI is stripped (typical for shipped game binaries), each anonymous vtable
//! gets a deterministic name `ANON_Class_<hex>` keyed by a hash of its
//! sorted method PCs (so vtables holding the same set of methods, e.g. the
//! same class emitted more than once, collapse to one entry).
//!
//! What this module does NOT do:
//! - Vtables in heap-allocated memory (built at runtime by ctors) are out of
//!   scope — only vtables present statically in `.rdata` / `.data`.
//! - RTTI inheritance (`BaseClassDescriptor` walk) is best-effort; we record
//!   the first-level base list when present and leave it NULL otherwise.
//! - Multiple-inheritance "extra" vftables (one per base subobject) are
//!   detected as independent vtables; we don't link them.
//!
//! Reference: openrce.org "Reversing Microsoft Visual C++" RTTI articles
//! (CompleteObjectLocator / TypeDescriptor / BaseClassDescriptor layout).
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use xenia_xex::pe::PeSection;
|
||||
|
||||
use crate::demangle;
|
||||
|
||||
/// One detected vtable.
///
/// Produced by [`analyze`]; every field is plain data so values can be cloned
/// and compared freely.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Vtable {
    /// Absolute VA of `vtable[0]` (first method slot).
    pub address: u32,
    /// Number of methods in the vtable (equals `methods.len()`).
    pub length: u32,
    /// Absolute VA of the `CompleteObjectLocator` read from `vtable[-1]`.
    /// `Some` only when the whole RTTI walk succeeded (i.e. `rtti_present`
    /// is true); `None` when RTTI is absent, stripped, or unparseable.
    pub col_address: Option<u32>,
    /// Class name. Demangled from RTTI when available, otherwise the synthetic
    /// `ANON_Class_<hex>` form.
    pub class_name: String,
    /// True when the COL → TypeDescriptor walk succeeded.
    pub rtti_present: bool,
    /// Base class names read via the `RTTIClassHierarchyDescriptor`'s
    /// base-class array, encoded as a JSON array of strings.
    /// `None` when RTTI is absent or the hierarchy is not parseable.
    pub base_classes_json: Option<String>,
    /// One entry per slot, in slot order: the function entry VA stored there.
    pub methods: Vec<u32>,
}
|
||||
|
||||
/// Run the vtable scan + RTTI walk. `function_starts` is the set of valid
|
||||
/// `.text` function entry VAs from M1's corrected `functions` table.
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
|
||||
pub fn analyze(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
sections: &[PeSection],
|
||||
function_starts: &std::collections::BTreeSet<u32>,
|
||||
) -> Vec<Vtable> {
|
||||
let started = std::time::Instant::now();
|
||||
// Sections we'll scan for vtable bodies.
|
||||
let scan_targets: Vec<&PeSection> = sections
|
||||
.iter()
|
||||
.filter(|s| matches!(s.name.as_str(), ".rdata" | ".data"))
|
||||
.collect();
|
||||
|
||||
// Range table for "is this VA in .rdata or .data?"
|
||||
let rdata_ranges: Vec<(u32, u32)> = sections
|
||||
.iter()
|
||||
.filter(|s| s.name == ".rdata")
|
||||
.map(|s| (image_base + s.virtual_address, image_base + s.virtual_address + s.virtual_size))
|
||||
.collect();
|
||||
|
||||
let mut candidates: Vec<Vtable> = Vec::new();
|
||||
|
||||
for section in scan_targets {
|
||||
let va_start = image_base + section.virtual_address;
|
||||
let va_end = va_start + section.virtual_size;
|
||||
let raw_start = section.virtual_address as usize;
|
||||
let raw_end = (section.virtual_address + section.virtual_size) as usize;
|
||||
if raw_end > pe.len() { continue; }
|
||||
let bytes = &pe[raw_start..raw_end.min(pe.len())];
|
||||
|
||||
let mut i = 0usize;
|
||||
while i + 12 <= bytes.len() {
|
||||
// Try to start a run at this 4-aligned offset.
|
||||
if !i.is_multiple_of(4) { i += 1; continue; }
|
||||
let mut run_len = 0usize;
|
||||
let mut methods: Vec<u32> = Vec::new();
|
||||
let mut j = i;
|
||||
while j + 4 <= bytes.len() {
|
||||
let val = u32::from_be_bytes([bytes[j], bytes[j + 1], bytes[j + 2], bytes[j + 3]]);
|
||||
if function_starts.contains(&val) {
|
||||
methods.push(val);
|
||||
run_len += 1;
|
||||
j += 4;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if run_len >= 3 {
|
||||
let address = va_start + (i as u32);
|
||||
candidates.push(Vtable {
|
||||
address,
|
||||
length: run_len as u32,
|
||||
col_address: None,
|
||||
class_name: synth_anon_name(&methods),
|
||||
rtti_present: false,
|
||||
base_classes_json: None,
|
||||
methods,
|
||||
});
|
||||
i += run_len * 4;
|
||||
} else {
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
let _ = (va_start, va_end);
|
||||
}
|
||||
|
||||
// RTTI walk: for each candidate, look at vtable[-1].
|
||||
let pe_image_base = image_base;
|
||||
for v in &mut candidates {
|
||||
if v.address < 4 { continue; }
|
||||
let col_off = (v.address - pe_image_base - 4) as usize;
|
||||
if col_off + 4 > pe.len() { continue; }
|
||||
let col_ptr = u32::from_be_bytes([pe[col_off], pe[col_off + 1], pe[col_off + 2], pe[col_off + 3]]);
|
||||
if col_ptr == 0 { continue; }
|
||||
if !is_in_ranges(col_ptr, &rdata_ranges) { continue; }
|
||||
|
||||
// Try to extract the TypeDescriptor mangled-name string.
|
||||
if let Some((td_ptr, hierarchy_ptr)) = read_col(pe, image_base, col_ptr)
|
||||
&& let Some(mangled) = read_typedescriptor_name(pe, image_base, td_ptr, &rdata_ranges)
|
||||
&& let Some(class) = demangle_rtti_typename(&mangled)
|
||||
{
|
||||
v.col_address = Some(col_ptr);
|
||||
v.class_name = class;
|
||||
v.rtti_present = true;
|
||||
v.base_classes_json = read_class_hierarchy(pe, image_base, hierarchy_ptr, &rdata_ranges);
|
||||
}
|
||||
}
|
||||
|
||||
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||
let rtti_count = candidates.iter().filter(|v| v.rtti_present).count();
|
||||
metrics::histogram!("analysis.phase_ms", "phase" => "vtables").record(elapsed_ms);
|
||||
tracing::info!(
|
||||
vtables = candidates.len(),
|
||||
rtti = rtti_count,
|
||||
anon = candidates.len() - rtti_count,
|
||||
elapsed_ms,
|
||||
"vtable scan complete"
|
||||
);
|
||||
candidates
|
||||
}
|
||||
|
||||
/// Report whether `addr` falls inside any of the half-open `[start, end)`
/// address ranges.
fn is_in_ranges(addr: u32, ranges: &[(u32, u32)]) -> bool {
    for &(start, end) in ranges {
        if (start..end).contains(&addr) {
            return true;
        }
    }
    false
}
|
||||
|
||||
/// Read 4 big-endian bytes at absolute VA `addr` from the PE image.
///
/// `pe` is indexed by `addr - image_base`. Returns `None` when `addr` lies
/// below `image_base` or when fewer than four bytes are available at that
/// offset; never panics.
fn read_be_u32(pe: &[u8], image_base: u32, addr: u32) -> Option<u32> {
    // `checked_sub` rejects addresses below the image base outright instead
    // of letting them wrap around into a huge offset.
    let start = addr.checked_sub(image_base)? as usize;
    let end = start.checked_add(4)?;
    let word = pe.get(start..end)?;
    Some(u32::from_be_bytes([word[0], word[1], word[2], word[3]]))
}
|
||||
|
||||
/// Parse a `CompleteObjectLocator` at VA `col`. Returns
|
||||
/// `(type_descriptor_ptr, class_hierarchy_descriptor_ptr)` on success.
|
||||
///
|
||||
/// Layout (32-bit MSVC):
|
||||
/// ```text
|
||||
/// +0x00 signature (0 for x86 without /GR-, can be 1)
|
||||
/// +0x04 offset within complete object
|
||||
/// +0x08 cdOffset (this-pointer adjuster)
|
||||
/// +0x0C TypeDescriptor *
|
||||
/// +0x10 RTTIClassHierarchyDescriptor *
|
||||
/// ```
|
||||
fn read_col(pe: &[u8], image_base: u32, col: u32) -> Option<(u32, u32)> {
|
||||
let td = read_be_u32(pe, image_base, col + 0x0C)?;
|
||||
let chd = read_be_u32(pe, image_base, col + 0x10)?;
|
||||
if td == 0 { return None; }
|
||||
Some((td, chd))
|
||||
}
|
||||
|
||||
/// Read a TypeDescriptor's mangled-name string at VA `td`.
|
||||
///
|
||||
/// Layout: `+0x00` vftable ptr, `+0x04` "spare", `+0x08` zero-terminated
|
||||
/// mangled name (e.g. `.?AVClassName@@`).
|
||||
fn read_typedescriptor_name(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
td: u32,
|
||||
rdata_ranges: &[(u32, u32)],
|
||||
) -> Option<String> {
|
||||
if !is_in_ranges(td, rdata_ranges) { return None; }
|
||||
let name_va = td + 0x08;
|
||||
let off = name_va.wrapping_sub(image_base) as usize;
|
||||
if off + 1 > pe.len() { return None; }
|
||||
// Read up to 256 bytes or until NUL.
|
||||
let mut end = off;
|
||||
while end < pe.len().min(off + 256) && pe[end] != 0 { end += 1; }
|
||||
if end == off { return None; }
|
||||
let s = std::str::from_utf8(&pe[off..end]).ok()?;
|
||||
// Sanity: MSVC RTTI names always start with `.?A`.
|
||||
if !s.starts_with(".?A") { return None; }
|
||||
Some(s.to_string())
|
||||
}
|
||||
|
||||
/// Demangle an RTTI type-name string of the form `.?AVClassName@ns@@`.
|
||||
/// MSVC convention: leading `.` is the marker for an RTTI string; strip it
|
||||
/// before passing to the demangler.
|
||||
fn demangle_rtti_typename(rtti_name: &str) -> Option<String> {
|
||||
let stripped = rtti_name.strip_prefix('.')?;
|
||||
let raw = msvc_demangler::demangle(stripped, msvc_demangler::DemangleFlags::llvm()).ok()?;
|
||||
// Output looks like `class xe::apu::AudioSystem` or `struct foo::Bar`.
|
||||
let cls = raw
|
||||
.strip_prefix("class ")
|
||||
.or_else(|| raw.strip_prefix("struct "))
|
||||
.or_else(|| raw.strip_prefix("union "))
|
||||
.unwrap_or(&raw);
|
||||
Some(cls.to_string())
|
||||
}
|
||||
|
||||
/// Best-effort `RTTIClassHierarchyDescriptor` walk: read the
|
||||
/// `BaseClassArray` entries and demangle each base's TypeDescriptor name.
|
||||
/// Returns a JSON array string on success.
|
||||
///
|
||||
/// Layout:
|
||||
/// ```text
|
||||
/// RTTIClassHierarchyDescriptor:
|
||||
/// +0x00 signature
|
||||
/// +0x04 attributes
|
||||
/// +0x08 numBaseClasses
|
||||
/// +0x0C BaseClassArray * (-> array of BaseClassDescriptor *)
|
||||
/// BaseClassDescriptor:
|
||||
/// +0x00 TypeDescriptor *
|
||||
/// +0x04 numContainedBases
|
||||
/// ...
|
||||
/// ```
|
||||
fn read_class_hierarchy(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
chd: u32,
|
||||
rdata_ranges: &[(u32, u32)],
|
||||
) -> Option<String> {
|
||||
if !is_in_ranges(chd, rdata_ranges) { return None; }
|
||||
let num_bases = read_be_u32(pe, image_base, chd + 0x08)?;
|
||||
if num_bases == 0 || num_bases > 256 { return None; } // sanity cap
|
||||
let bca_ptr = read_be_u32(pe, image_base, chd + 0x0C)?;
|
||||
if !is_in_ranges(bca_ptr, rdata_ranges) { return None; }
|
||||
|
||||
let mut names: Vec<String> = Vec::new();
|
||||
for i in 0..num_bases {
|
||||
let bcd_ptr = match read_be_u32(pe, image_base, bca_ptr + i * 4) {
|
||||
Some(p) if is_in_ranges(p, rdata_ranges) => p,
|
||||
_ => return None,
|
||||
};
|
||||
let td_ptr = match read_be_u32(pe, image_base, bcd_ptr) {
|
||||
Some(p) if is_in_ranges(p, rdata_ranges) => p,
|
||||
_ => return None,
|
||||
};
|
||||
let mangled = match read_typedescriptor_name(pe, image_base, td_ptr, rdata_ranges) {
|
||||
Some(s) => s,
|
||||
None => return None,
|
||||
};
|
||||
let cls = demangle_rtti_typename(&mangled).unwrap_or(mangled);
|
||||
names.push(cls);
|
||||
}
|
||||
serde_json::to_string(&names).ok()
|
||||
}
|
||||
|
||||
/// Build the synthetic `ANON_Class_<hex>` name for a vtable without RTTI.
///
/// The name is derived from a 64-bit FNV-1a hash over the little-endian
/// bytes of the method PCs in ascending order, truncated to its low 32 bits
/// and printed as 8 upper-case hex digits. Because the PCs are sorted first,
/// the result does not depend on slot order: any two vtables holding the
/// same PCs get the same name.
fn synth_anon_name(methods: &[u32]) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;

    let mut ordered_pcs = methods.to_vec();
    ordered_pcs.sort_unstable();

    let digest = ordered_pcs
        .iter()
        .flat_map(|pc| pc.to_le_bytes())
        .fold(FNV_OFFSET_BASIS, |acc, byte| {
            (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
        });

    // Only the low 32 bits are kept, for brevity.
    format!("ANON_Class_{:08X}", digest as u32)
}
|
||||
|
||||
/// Build the per-method `(vtable_address, slot, function_address)` list for
|
||||
/// DB insertion, with optional demangled-name lookup for any function that
|
||||
/// has a matching `?…` label. Skips slots whose function isn't in the
|
||||
/// supplied label map.
|
||||
pub fn methods_table(
|
||||
vtables: &[Vtable],
|
||||
labels: &std::collections::HashMap<u32, String>,
|
||||
) -> Vec<(u32, u32, u32, Option<String>, Option<String>)> {
|
||||
let mut out = Vec::new();
|
||||
for v in vtables {
|
||||
for (slot, &fn_va) in v.methods.iter().enumerate() {
|
||||
let label = labels.get(&fn_va).cloned();
|
||||
let demangled = label.as_ref()
|
||||
.and_then(|l| demangle::demangle(l).map(|d| d.raw_demangled));
|
||||
out.push((v.address, slot as u32, fn_va, label, demangled));
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// Build a `class_name → Vtable` summary for the `classes` table. Multiple
|
||||
/// vtables sharing the same class name (multiple instances at link time)
|
||||
/// collapse via `BTreeMap` — the first detected vtable wins.
|
||||
pub fn classes_table(vtables: &[Vtable]) -> Vec<(String, u32, bool, Option<String>)> {
|
||||
let mut by_name: BTreeMap<String, &Vtable> = BTreeMap::new();
|
||||
for v in vtables {
|
||||
by_name.entry(v.class_name.clone()).or_insert(v);
|
||||
}
|
||||
by_name
|
||||
.into_iter()
|
||||
.map(|(name, v)| (name, v.address, v.rtti_present, v.base_classes_json.clone()))
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    const IMAGE_BASE: u32 = 0x8200_0000;
    const RDATA_RVA: u32 = 0x1000;
    const TEXT_RVA: u32 = 0x2000;
    const TEXT_SIZE: u32 = 0x100;

    /// Build a zero-filled image with `table` written big-endian at the
    /// start of a `.rdata` section of `rdata_size` bytes, register every
    /// entry of `table` as a function start, and run the scan.
    fn scan_rdata_table(table: &[u32], rdata_size: u32) -> Vec<Vtable> {
        // PE buffer big enough for both sections.
        let mut pe = vec![0u8; (TEXT_RVA + TEXT_SIZE) as usize];
        for (slot, va) in table.iter().enumerate() {
            let off = RDATA_RVA as usize + slot * 4;
            pe[off..off + 4].copy_from_slice(&va.to_be_bytes());
        }

        let sections = vec![
            PeSection {
                name: ".rdata".into(),
                virtual_address: RDATA_RVA,
                virtual_size: rdata_size,
                raw_offset: RDATA_RVA,
                raw_size: rdata_size,
                flags: 0x4000_0040,
            },
            PeSection {
                name: ".text".into(),
                virtual_address: TEXT_RVA,
                virtual_size: TEXT_SIZE,
                raw_offset: TEXT_RVA,
                raw_size: TEXT_SIZE,
                flags: 0x6000_0020,
            },
        ];
        let function_starts: std::collections::BTreeSet<u32> = table.iter().copied().collect();

        analyze(&pe, IMAGE_BASE, &sections, &function_starts)
    }

    #[test]
    fn synth_anon_name_is_stable() {
        let a = synth_anon_name(&[0x82001000, 0x82001100, 0x82001200]);
        let b = synth_anon_name(&[0x82001200, 0x82001000, 0x82001100]);
        assert_eq!(a, b, "anon name must be order-independent");
    }

    #[test]
    fn synth_anon_name_differs_for_different_methods() {
        let a = synth_anon_name(&[0x82001000, 0x82001100]);
        let b = synth_anon_name(&[0x82002000, 0x82002100]);
        assert_ne!(a, b);
    }

    #[test]
    fn detects_3_method_vtable_in_rdata() {
        // Vtable: 3 method PCs at .rdata start, all valid function entries,
        // followed by one zero word inside the 16-byte section.
        let text = IMAGE_BASE + TEXT_RVA;
        let vtables = scan_rdata_table(&[text, text + 0x10, text + 0x20], 16);

        assert_eq!(vtables.len(), 1);
        assert_eq!(vtables[0].length, 3);
        assert_eq!(vtables[0].address, IMAGE_BASE + RDATA_RVA);
        assert!(vtables[0].class_name.starts_with("ANON_Class_"));
        assert!(!vtables[0].rtti_present);
    }

    #[test]
    fn rejects_2_method_run() {
        let text = IMAGE_BASE + TEXT_RVA;
        let vtables = scan_rdata_table(&[text, text + 0x10], 8);

        assert_eq!(vtables.len(), 0, "runs of 2 must be rejected to keep false-positive rate down");
    }
}
|
||||
Reference in New Issue
Block a user