M3: vtable scan + MSVC RTTI walk + 3 new tables

Adds detection of statically-allocated MSVC vtables in .rdata/.data:
- New `xenia_analysis::vtables` walks read-only sections looking for runs of
  ≥3 contiguous big-endian u32 values where each value lands on a known
  function start (from M1's corrected functions table). 2-slot runs are
  rejected to keep false-positive rate down.
- For each candidate the MSVC RTTI walk vtable[-1] → CompleteObjectLocator
  → TypeDescriptor → mangled name is attempted; on success the demangled
  class name is recorded along with a best-effort RTTIClassHierarchyDescriptor
  walk to fill base_classes_json. On failure (RTTI stripped — common for
  shipped game binaries) the class is named ANON_Class_<fnv1a-hash> keyed
  by sorted method-PC list, so identical vtables collapse to one entry.
- DB: new tables `vtables`, `methods`, `classes` with indices on
  function_address and rtti_present. `write_analysis_results` takes a
  `&[Vtable]` slice; `write_disasm` (kept for back-compat) passes an empty slice.
- cmd_dis wires the scan after xref analysis using
  `func_analysis.functions.keys()` as the function-start oracle.

Validation on Sylpheed (RTTI stripped, as expected): 722 vtables / 499
unique classes / 5571 methods. Sanity invariant: every methods.function_address
joins to functions.address (0 broken refs). Largest vtable: 131 slots.

Tests 617→621 (+4 vtable unit tests covering 3-slot detect, 2-slot reject,
synth name stability, and synth name divergence).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-08 20:17:45 +02:00
parent bd5753311e
commit 1d6c51fbf8
6 changed files with 620 additions and 8 deletions

View File

@@ -0,0 +1,424 @@
//! MSVC vtable + RTTI detection.
//!
//! Heuristic two-pass scan over the binary's `.rdata` and `.data` sections. Pass 1
//! finds candidate vtables — runs of ≥3 contiguous big-endian u32 values that
//! all land on known function entries. Pass 2 attempts the MSVC RTTI walk
//! `vtable[-1] → CompleteObjectLocator → TypeDescriptor → mangled name`. When
//! RTTI is stripped (typical for shipped game binaries), each anonymous vtable
//! gets a deterministic name `ANON_Class_<hex>` keyed by a hash of its
//! sorted method PCs (so identical vtables across multiple class instances
//! collapse to one entry).
//!
//! What this module does NOT do:
//! - Vtables in heap-allocated memory (built at runtime by ctors) are out of
//! scope — only vtables present statically in `.rdata` / `.data`.
//! - RTTI inheritance (`BaseClassDescriptor` walk) is best-effort; we record
//! the names listed in the `BaseClassArray` when it parses cleanly and leave it NULL otherwise.
//! - Multiple-inheritance "extra" vftables (one per base subobject) are
//! detected as independent vtables; we don't link them.
//!
//! Reference: openrce.org "Reversing Microsoft Visual C++" RTTI articles
//! (CompleteObjectLocator / TypeDescriptor / BaseClassDescriptor layout).
use std::collections::BTreeMap;
use xenia_xex::pe::PeSection;
use crate::demangle;
/// One detected vtable.
///
/// Built by [`analyze`]: the scan fills `address`, `length`, `methods` and a
/// synthetic `class_name`; the RTTI walk then overwrites `class_name` and sets
/// the remaining fields only when it succeeds.
#[derive(Debug, Clone)]
pub struct Vtable {
/// Absolute VA of `vtable[0]` (first method slot).
pub address: u32,
/// Number of methods in the vtable (equals `methods.len()` as produced by
/// [`analyze`]).
pub length: u32,
/// Absolute VA of the `CompleteObjectLocator` read from `vtable[-1]`.
/// Only set when the full COL → TypeDescriptor → demangle walk succeeded;
/// `None` otherwise (no RTTI / stripped / pointer did not validate).
pub col_address: Option<u32>,
/// Class name. Demangled from RTTI when available, otherwise the synthetic
/// `ANON_Class_<hex>` form.
pub class_name: String,
/// True when the COL → TypeDescriptor walk succeeded.
pub rtti_present: bool,
/// Class names read from the `RTTIClassHierarchyDescriptor`'s base-class
/// array, JSON-encoded. `None` when RTTI is absent or the array is not
/// parseable.
// NOTE(review): every array entry is recorded; in MSVC RTTI the array
// presumably also lists the class itself and indirect bases — confirm
// before treating this as "direct bases only".
pub base_classes_json: Option<String>,
/// One entry per slot, in slot order: a VA that was present in the
/// function-start set handed to [`analyze`].
pub methods: Vec<u32>,
}
/// Run the vtable scan + RTTI walk.
///
/// * `pe` — image bytes indexed by RVA: each section is sliced at
///   `virtual_address .. virtual_address + virtual_size`, so the buffer must
///   be in memory layout, not raw file layout.
/// * `image_base` — VA that corresponds to offset 0 of `pe`.
/// * `sections` — section table; only sections named `.rdata` or `.data` are
///   scanned, and the same two sections form the "valid data pointer" range
///   table used by the RTTI walk.
/// * `function_starts` — the set of valid `.text` function entry VAs from
///   M1's corrected `functions` table.
///
/// Returns one [`Vtable`] per run of at least three consecutive big-endian
/// u32 values that are all members of `function_starts`, in scan order.
/// Sections that do not fit inside `pe`, or whose VA range would overflow
/// `u32`, are skipped rather than causing a panic.
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
pub fn analyze(
    pe: &[u8],
    image_base: u32,
    sections: &[PeSection],
    function_starts: &std::collections::BTreeSet<u32>,
) -> Vec<Vtable> {
    /// Shortest run of consecutive function pointers accepted as a vtable.
    /// Two-slot runs are rejected to keep the false-positive rate down.
    const MIN_SLOTS: usize = 3;

    let started = std::time::Instant::now();

    // Data sections paired with their absolute VA range. Checked arithmetic
    // drops any section whose header would overflow the 32-bit address space
    // instead of panicking on a malformed image.
    let data_sections: Vec<(&PeSection, u32, u32)> = sections
        .iter()
        .filter(|s| matches!(s.name.as_str(), ".rdata" | ".data"))
        .filter_map(|s| {
            let start = image_base.checked_add(s.virtual_address)?;
            let end = start.checked_add(s.virtual_size)?;
            Some((s, start, end))
        })
        .collect();

    // Range table for "is this VA in .rdata or .data?". `.data` must be
    // included: MSVC emits TypeDescriptor objects into writable data, so a
    // `.rdata`-only table rejects every genuine RTTI chain.
    let data_ranges: Vec<(u32, u32)> = data_sections
        .iter()
        .map(|&(_, start, end)| (start, end))
        .collect();

    // Pass 1: find runs of function pointers.
    let mut candidates: Vec<Vtable> = Vec::new();
    for &(section, va_start, _va_end) in &data_sections {
        let raw_start = section.virtual_address as usize;
        // Cannot overflow: the VA-range check above bounds the sum by u32::MAX.
        let raw_end = raw_start + section.virtual_size as usize;
        let Some(bytes) = pe.get(raw_start..raw_end) else {
            continue;
        };

        // `i` starts at 0 and only ever advances by multiples of 4, so every
        // probe is 4-aligned relative to the section start.
        let mut i = 0usize;
        while i + MIN_SLOTS * 4 <= bytes.len() {
            let methods: Vec<u32> = bytes[i..]
                .chunks_exact(4)
                .map(|w| u32::from_be_bytes([w[0], w[1], w[2], w[3]]))
                .take_while(|pc| function_starts.contains(pc))
                .collect();
            let run_len = methods.len();
            if run_len < MIN_SLOTS {
                i += 4;
                continue;
            }
            candidates.push(Vtable {
                // `i` < section.virtual_size, which is a u32, and
                // va_start + virtual_size was checked not to overflow.
                address: va_start + i as u32,
                length: run_len as u32,
                col_address: None,
                class_name: synth_anon_name(&methods),
                rtti_present: false,
                base_classes_json: None,
                methods,
            });
            // Resume after the run so its tail is not reported again.
            i += run_len * 4;
        }
    }

    // Pass 2: RTTI walk. For each candidate, look at vtable[-1].
    for v in &mut candidates {
        // Offset of the slot just before vtable[0]. Both subtractions are
        // checked: a vtable in the first four bytes of the image has no
        // preceding slot.
        let Some(col_rva) = v
            .address
            .checked_sub(image_base)
            .and_then(|rva| rva.checked_sub(4))
        else {
            continue;
        };
        let col_off = col_rva as usize;
        let Some(raw) = pe.get(col_off..col_off + 4) else {
            continue;
        };
        let col_ptr = u32::from_be_bytes([raw[0], raw[1], raw[2], raw[3]]);
        if col_ptr == 0 || !is_in_ranges(col_ptr, &data_ranges) {
            continue;
        }

        // COL → TypeDescriptor → mangled name → demangled class name. Any
        // failing step leaves the candidate with its anonymous name.
        let resolved = read_col(pe, image_base, col_ptr).and_then(|(td_ptr, hierarchy_ptr)| {
            let mangled = read_typedescriptor_name(pe, image_base, td_ptr, &data_ranges)?;
            let class = demangle_rtti_typename(&mangled)?;
            Some((class, hierarchy_ptr))
        });
        if let Some((class, hierarchy_ptr)) = resolved {
            v.col_address = Some(col_ptr);
            v.class_name = class;
            v.rtti_present = true;
            v.base_classes_json =
                read_class_hierarchy(pe, image_base, hierarchy_ptr, &data_ranges);
        }
    }

    let elapsed_ms = started.elapsed().as_millis() as f64;
    let rtti_count = candidates.iter().filter(|v| v.rtti_present).count();
    metrics::histogram!("analysis.phase_ms", "phase" => "vtables").record(elapsed_ms);
    tracing::info!(
        vtables = candidates.len(),
        rtti = rtti_count,
        anon = candidates.len() - rtti_count,
        elapsed_ms,
        "vtable scan complete"
    );
    candidates
}
/// Report whether `addr` falls inside any of the half-open `[start, end)`
/// intervals in `ranges`. An empty table never matches.
fn is_in_ranges(addr: u32, ranges: &[(u32, u32)]) -> bool {
    for &(lo, hi) in ranges {
        if (lo..hi).contains(&addr) {
            return true;
        }
    }
    false
}
/// Read 4 big-endian bytes at absolute VA `addr` from the PE image.
///
/// `pe` is indexed by `addr - image_base`. Returns `None` when `addr` lies
/// below `image_base` or when the four bytes are not fully inside `pe`.
fn read_be_u32(pe: &[u8], image_base: u32, addr: u32) -> Option<u32> {
    // Checked subtraction: an address below the image base is simply "not in
    // the image", rather than wrapping around to a huge offset.
    let start = addr.checked_sub(image_base)? as usize;
    // Checked addition keeps this panic-free on hosts where usize is 32 bits.
    let end = start.checked_add(4)?;
    let b = pe.get(start..end)?;
    Some(u32::from_be_bytes([b[0], b[1], b[2], b[3]]))
}
/// Parse the `CompleteObjectLocator` located at VA `col`.
///
/// Yields `(type_descriptor_ptr, class_hierarchy_descriptor_ptr)`. The result
/// is `None` when either pointer field cannot be read from `pe`, or when the
/// TypeDescriptor pointer is zero. The hierarchy pointer is returned
/// unvalidated.
///
/// Fields as laid out by 32-bit MSVC (only the last two are read here):
/// ```text
/// +0x00 signature
/// +0x04 offset within complete object
/// +0x08 cdOffset (this-pointer adjuster)
/// +0x0C TypeDescriptor *
/// +0x10 RTTIClassHierarchyDescriptor *
/// ```
fn read_col(pe: &[u8], image_base: u32, col: u32) -> Option<(u32, u32)> {
    let type_descriptor = read_be_u32(pe, image_base, col + 0x0C)?;
    let hierarchy = read_be_u32(pe, image_base, col + 0x10)?;
    // Both reads must succeed before the null check, so a truncated COL is
    // rejected even if its TypeDescriptor field happened to be readable.
    (type_descriptor != 0).then_some((type_descriptor, hierarchy))
}
/// Extract the mangled-name string embedded in the TypeDescriptor at VA `td`.
///
/// TypeDescriptor layout: `+0x00` vftable ptr, `+0x04` "spare", `+0x08`
/// zero-terminated mangled name (e.g. `.?AVClassName@@`).
///
/// Returns `None` when `td` is outside `rdata_ranges`, when the name offset
/// is outside `pe`, when the name is empty or not valid UTF-8, or when it
/// does not begin with `.?A`. At most 256 bytes are examined; if no NUL is
/// found within that window the bytes seen so far are used as the name.
fn read_typedescriptor_name(
    pe: &[u8],
    image_base: u32,
    td: u32,
    rdata_ranges: &[(u32, u32)],
) -> Option<String> {
    if !is_in_ranges(td, rdata_ranges) {
        return None;
    }
    let start = (td + 0x08).wrapping_sub(image_base) as usize;
    if start + 1 > pe.len() {
        return None;
    }
    // Bounded window: up to 256 bytes, clipped to the end of the image.
    let window = &pe[start..pe.len().min(start + 256)];
    let name_len = window
        .iter()
        .position(|&byte| byte == 0)
        .unwrap_or(window.len());
    if name_len == 0 {
        return None;
    }
    let name = std::str::from_utf8(&window[..name_len]).ok()?;
    // Sanity: MSVC RTTI names always start with `.?A`.
    if name.starts_with(".?A") {
        Some(name.to_string())
    } else {
        None
    }
}
/// Turn an RTTI type-name string such as `.?AVClassName@ns@@` into a bare
/// class name.
///
/// The leading `.` marks an RTTI string and is removed before the remainder
/// is handed to `msvc_demangler`. A leading `class `, `struct ` or `union `
/// keyword in the demangled text is dropped. Returns `None` when the input
/// lacks the `.` prefix or the demangler rejects it.
fn demangle_rtti_typename(rtti_name: &str) -> Option<String> {
    let symbol = rtti_name.strip_prefix('.')?;
    let flags = msvc_demangler::DemangleFlags::llvm();
    let demangled = msvc_demangler::demangle(symbol, flags).ok()?;
    // Demangler output looks like `class xe::apu::AudioSystem` or
    // `struct foo::Bar`; keep only the qualified name.
    let bare = ["class ", "struct ", "union "]
        .iter()
        .find_map(|&keyword| demangled.strip_prefix(keyword))
        .unwrap_or(demangled.as_str());
    Some(bare.to_string())
}
/// Best-effort walk of the `RTTIClassHierarchyDescriptor` at VA `chd`.
///
/// Follows each `BaseClassArray` entry to its TypeDescriptor and collects the
/// class name (demangled when possible, otherwise the raw mangled string).
/// The names are returned as a JSON array string.
///
/// Returns `None` as soon as anything fails validation: `chd`, the array
/// pointer, any descriptor pointer or any TypeDescriptor pointer outside
/// `rdata_ranges`; an unreadable field; a base count of 0 or above 256; or
/// an unreadable type name. No partial list is ever produced.
///
/// Layout:
/// ```text
/// RTTIClassHierarchyDescriptor:
/// +0x00 signature
/// +0x04 attributes
/// +0x08 numBaseClasses
/// +0x0C BaseClassArray * (-> array of BaseClassDescriptor *)
/// BaseClassDescriptor:
/// +0x00 TypeDescriptor *
/// +0x04 numContainedBases
/// ...
/// ```
// NOTE(review): every array entry is emitted, index 0 included — confirm
// whether the first entry is the class itself before treating the output as
// "bases only".
fn read_class_hierarchy(
    pe: &[u8],
    image_base: u32,
    chd: u32,
    rdata_ranges: &[(u32, u32)],
) -> Option<String> {
    let in_rdata = |va: u32| is_in_ranges(va, rdata_ranges);
    // Read a pointer field and accept it only if it lands in the range table.
    let read_ptr = |va: u32| read_be_u32(pe, image_base, va).filter(|&p| in_rdata(p));

    if !in_rdata(chd) {
        return None;
    }
    let num_bases = read_be_u32(pe, image_base, chd + 0x08)?;
    // Sanity cap: reject empty and implausibly large hierarchies.
    if !(1..=256).contains(&num_bases) {
        return None;
    }
    let bca_ptr = read_ptr(chd + 0x0C)?;

    let names = (0..num_bases)
        .map(|index| {
            let bcd_ptr = read_ptr(bca_ptr + index * 4)?;
            let td_ptr = read_ptr(bcd_ptr)?;
            let mangled = read_typedescriptor_name(pe, image_base, td_ptr, rdata_ranges)?;
            Some(demangle_rtti_typename(&mangled).unwrap_or(mangled))
        })
        .collect::<Option<Vec<String>>>()?;

    serde_json::to_string(&names).ok()
}
/// Build the `ANON_Class_<hex>` name used for a vtable without RTTI.
///
/// The name is the low 32 bits of a 64-bit FNV-1a hash over the method PCs
/// after sorting them, each PC fed in as four little-endian bytes. Because of
/// the sort, the name depends only on the multiset of PCs, not on slot
/// order, so vtables holding the same methods share one anonymous name.
fn synth_anon_name(methods: &[u32]) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;

    let mut pcs: Vec<u32> = methods.to_vec();
    pcs.sort_unstable();

    let digest = pcs
        .iter()
        .flat_map(|pc| pc.to_le_bytes())
        .fold(FNV_OFFSET_BASIS, |acc, byte| {
            (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
        });

    // Only the low 32 bits are kept, for brevity.
    format!("ANON_Class_{:08X}", digest as u32)
}
/// Flatten `vtables` into one row per slot for DB insertion.
///
/// Each row is `(vtable_address, slot, function_address, label, demangled)`.
/// `label` is the entry for the function in `labels`, if any; `demangled` is
/// the `raw_demangled` text from `demangle::demangle` on that label, if it
/// demangles. Every slot produces a row: a function without a label still
/// appears, with both optional columns `None`. Rows follow the order of
/// `vtables`, then slot order.
pub fn methods_table(
    vtables: &[Vtable],
    labels: &std::collections::HashMap<u32, String>,
) -> Vec<(u32, u32, u32, Option<String>, Option<String>)> {
    vtables
        .iter()
        .flat_map(|vt| {
            vt.methods.iter().enumerate().map(move |(slot, &target)| {
                let symbol = labels.get(&target).cloned();
                let pretty = symbol
                    .as_ref()
                    .and_then(|sym| demangle::demangle(sym).map(|d| d.raw_demangled));
                (vt.address, slot as u32, target, symbol, pretty)
            })
        })
        .collect()
}
/// Summarise `vtables` as one row per distinct class name for the `classes`
/// table.
///
/// Each row is `(class_name, vtable_address, rtti_present, base_classes_json)`.
/// When several vtables carry the same class name, the one that appears first
/// in `vtables` supplies the row. Rows come out sorted by class name.
pub fn classes_table(vtables: &[Vtable]) -> Vec<(String, u32, bool, Option<String>)> {
    let mut first_seen: BTreeMap<&str, &Vtable> = BTreeMap::new();
    for vt in vtables {
        // `or_insert` keeps an existing entry, so the earliest vtable wins.
        first_seen.entry(vt.class_name.as_str()).or_insert(vt);
    }

    let mut rows = Vec::with_capacity(first_seen.len());
    for (class_name, vt) in first_seen {
        rows.push((
            class_name.to_owned(),
            vt.address,
            vt.rtti_present,
            vt.base_classes_json.clone(),
        ));
    }
    rows
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Load address of the synthetic image.
    const IMAGE_BASE: u32 = 0x8200_0000;
    /// RVA of the synthetic `.rdata` section holding the pointer run.
    const RDATA_RVA: u32 = 0x1000;
    /// RVA of the synthetic `.text` section the pointers refer to.
    const TEXT_RVA: u32 = 0x2000;
    /// Size of the synthetic `.text` section.
    const TEXT_SIZE: u32 = 0x100;

    /// Build an RVA-indexed image whose `.rdata` begins with `slot_count`
    /// big-endian function pointers (spaced 0x10 apart in `.text`), register
    /// those targets as function starts, and run the scan over it.
    fn scan_fixture(slot_count: u32, rdata_size: u32) -> Vec<Vtable> {
        let slots: Vec<u32> = (0..slot_count)
            .map(|k| IMAGE_BASE + TEXT_RVA + k * 0x10)
            .collect();

        // PE buffer big enough for both sections.
        let mut pe = vec![0u8; (TEXT_RVA + TEXT_SIZE) as usize];
        let table = &mut pe[RDATA_RVA as usize..];
        for (cell, pc) in table.chunks_exact_mut(4).zip(&slots) {
            cell.copy_from_slice(&pc.to_be_bytes());
        }

        let section = |name: &str, rva: u32, size: u32, flags: u32| PeSection {
            name: name.into(),
            virtual_address: rva,
            virtual_size: size,
            raw_offset: rva,
            raw_size: size,
            flags,
        };
        let sections = vec![
            section(".rdata", RDATA_RVA, rdata_size, 0x4000_0040),
            section(".text", TEXT_RVA, TEXT_SIZE, 0x6000_0020),
        ];

        let function_starts: std::collections::BTreeSet<u32> = slots.iter().copied().collect();
        analyze(&pe, IMAGE_BASE, &sections, &function_starts)
    }

    #[test]
    fn synth_anon_name_is_stable() {
        let forward = synth_anon_name(&[0x82001000, 0x82001100, 0x82001200]);
        let shuffled = synth_anon_name(&[0x82001200, 0x82001000, 0x82001100]);
        assert_eq!(forward, shuffled, "anon name must be order-independent");
    }

    #[test]
    fn synth_anon_name_differs_for_different_methods() {
        let first = synth_anon_name(&[0x82001000, 0x82001100]);
        let second = synth_anon_name(&[0x82002000, 0x82002100]);
        assert_ne!(first, second);
    }

    #[test]
    fn detects_3_method_vtable_in_rdata() {
        // Three valid entries followed by one zero word inside a 16-byte section.
        let vtables = scan_fixture(3, 16);
        assert_eq!(vtables.len(), 1);
        let found = &vtables[0];
        assert_eq!(found.length, 3);
        assert_eq!(found.address, IMAGE_BASE + RDATA_RVA);
        assert!(found.class_name.starts_with("ANON_Class_"));
        assert!(!found.rtti_present);
    }

    #[test]
    fn rejects_2_method_run() {
        let vtables = scan_fixture(2, 8);
        assert_eq!(vtables.len(), 0, "runs of 2 must be rejected to keep false-positive rate down");
    }
}