//! MSVC vtable + RTTI detection. //! //! Heuristic two-pass scan over the binary's read-only data sections. Pass 1 //! finds candidate vtables — runs of ≥3 contiguous big-endian u32 values that //! all land on known function entries. Pass 2 attempts the MSVC RTTI walk //! `vtable[-1] → CompleteObjectLocator → TypeDescriptor → mangled name`. When //! RTTI is stripped (typical for shipped game binaries), each anonymous vtable //! gets a deterministic name `ANON_Class_` keyed by a hash of its //! sorted method PCs (so identical vtables across multiple class instances //! collapse to one entry). //! //! What this module does NOT do: //! - Vtables in heap-allocated memory (built at runtime by ctors) are out of //! scope — only vtables present statically in `.rdata` / `.data`. //! - RTTI inheritance (`BaseClassDescriptor` walk) is best-effort; we record //! the first-level base list when present and leave it NULL otherwise. //! - Multiple-inheritance "extra" vftables (one per base subobject) are //! detected as independent vtables; we don't link them. //! //! Reference: openrce.org "Reversing Microsoft Visual C++" RTTI articles //! (CompleteObjectLocator / TypeDescriptor / BaseClassDescriptor layout). use std::collections::BTreeMap; use xenia_xex::pe::PeSection; use crate::demangle; /// One detected vtable. #[derive(Debug, Clone)] pub struct Vtable { /// Absolute VA of `vtable[0]` (first method slot). pub address: u32, /// Number of methods in the vtable. pub length: u32, /// Absolute VA of the `CompleteObjectLocator` from `vtable[-1]`, if it /// looked like a valid pointer into `.rdata`. NULL when no RTTI / stripped. pub col_address: Option, /// Class name. Demangled from RTTI when available, otherwise the synthetic /// `ANON_Class_` form. pub class_name: String, /// True when the COL → TypeDescriptor walk succeeded. pub rtti_present: bool, /// First-level base class names from `RTTIClassHierarchyDescriptor`, JSON-encoded. /// `None` when not parseable. pub base_classes_json: Option, /// One entry per slot: function VA in `.text`. pub methods: Vec, } /// Run the vtable scan + RTTI walk. `function_starts` is the set of valid /// `.text` function entry VAs from M1's corrected `functions` table. #[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))] pub fn analyze( pe: &[u8], image_base: u32, sections: &[PeSection], function_starts: &std::collections::BTreeSet, ) -> Vec { let started = std::time::Instant::now(); // Sections we'll scan for vtable bodies. let scan_targets: Vec<&PeSection> = sections .iter() .filter(|s| matches!(s.name.as_str(), ".rdata" | ".data")) .collect(); // Range table for "is this VA in .rdata or .data?" let rdata_ranges: Vec<(u32, u32)> = sections .iter() .filter(|s| s.name == ".rdata") .map(|s| (image_base + s.virtual_address, image_base + s.virtual_address + s.virtual_size)) .collect(); let mut candidates: Vec = Vec::new(); for section in scan_targets { let va_start = image_base + section.virtual_address; let va_end = va_start + section.virtual_size; let raw_start = section.virtual_address as usize; let raw_end = (section.virtual_address + section.virtual_size) as usize; if raw_end > pe.len() { continue; } let bytes = &pe[raw_start..raw_end.min(pe.len())]; let mut i = 0usize; while i + 12 <= bytes.len() { // Try to start a run at this 4-aligned offset. if !i.is_multiple_of(4) { i += 1; continue; } let mut run_len = 0usize; let mut methods: Vec = Vec::new(); let mut j = i; while j + 4 <= bytes.len() { let val = u32::from_be_bytes([bytes[j], bytes[j + 1], bytes[j + 2], bytes[j + 3]]); if function_starts.contains(&val) { methods.push(val); run_len += 1; j += 4; } else { break; } } if run_len >= 3 { let address = va_start + (i as u32); candidates.push(Vtable { address, length: run_len as u32, col_address: None, class_name: synth_anon_name(&methods), rtti_present: false, base_classes_json: None, methods, }); i += run_len * 4; } else { i += 4; } } let _ = (va_start, va_end); } // RTTI walk: for each candidate, look at vtable[-1]. let pe_image_base = image_base; for v in &mut candidates { if v.address < 4 { continue; } let col_off = (v.address - pe_image_base - 4) as usize; if col_off + 4 > pe.len() { continue; } let col_ptr = u32::from_be_bytes([pe[col_off], pe[col_off + 1], pe[col_off + 2], pe[col_off + 3]]); if col_ptr == 0 { continue; } if !is_in_ranges(col_ptr, &rdata_ranges) { continue; } // Try to extract the TypeDescriptor mangled-name string. if let Some((td_ptr, hierarchy_ptr)) = read_col(pe, image_base, col_ptr) && let Some(mangled) = read_typedescriptor_name(pe, image_base, td_ptr, &rdata_ranges) && let Some(class) = demangle_rtti_typename(&mangled) { v.col_address = Some(col_ptr); v.class_name = class; v.rtti_present = true; v.base_classes_json = read_class_hierarchy(pe, image_base, hierarchy_ptr, &rdata_ranges); } } let elapsed_ms = started.elapsed().as_millis() as f64; let rtti_count = candidates.iter().filter(|v| v.rtti_present).count(); metrics::histogram!("analysis.phase_ms", "phase" => "vtables").record(elapsed_ms); tracing::info!( vtables = candidates.len(), rtti = rtti_count, anon = candidates.len() - rtti_count, elapsed_ms, "vtable scan complete" ); candidates } fn is_in_ranges(addr: u32, ranges: &[(u32, u32)]) -> bool { ranges.iter().any(|&(s, e)| addr >= s && addr < e) } /// Read 4 big-endian bytes at absolute VA `addr` from the PE image. fn read_be_u32(pe: &[u8], image_base: u32, addr: u32) -> Option { let off = addr.wrapping_sub(image_base) as usize; if off + 4 > pe.len() { return None; } Some(u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]])) } /// Parse a `CompleteObjectLocator` at VA `col`. Returns /// `(type_descriptor_ptr, class_hierarchy_descriptor_ptr)` on success. /// /// Layout (32-bit MSVC): /// ```text /// +0x00 signature (0 for x86 without /GR-, can be 1) /// +0x04 offset within complete object /// +0x08 cdOffset (this-pointer adjuster) /// +0x0C TypeDescriptor * /// +0x10 RTTIClassHierarchyDescriptor * /// ``` fn read_col(pe: &[u8], image_base: u32, col: u32) -> Option<(u32, u32)> { let td = read_be_u32(pe, image_base, col + 0x0C)?; let chd = read_be_u32(pe, image_base, col + 0x10)?; if td == 0 { return None; } Some((td, chd)) } /// Read a TypeDescriptor's mangled-name string at VA `td`. /// /// Layout: `+0x00` vftable ptr, `+0x04` "spare", `+0x08` zero-terminated /// mangled name (e.g. `.?AVClassName@@`). fn read_typedescriptor_name( pe: &[u8], image_base: u32, td: u32, rdata_ranges: &[(u32, u32)], ) -> Option { if !is_in_ranges(td, rdata_ranges) { return None; } let name_va = td + 0x08; let off = name_va.wrapping_sub(image_base) as usize; if off + 1 > pe.len() { return None; } // Read up to 256 bytes or until NUL. let mut end = off; while end < pe.len().min(off + 256) && pe[end] != 0 { end += 1; } if end == off { return None; } let s = std::str::from_utf8(&pe[off..end]).ok()?; // Sanity: MSVC RTTI names always start with `.?A`. if !s.starts_with(".?A") { return None; } Some(s.to_string()) } /// Demangle an RTTI type-name string of the form `.?AVClassName@ns@@`. /// MSVC convention: leading `.` is the marker for an RTTI string; strip it /// before passing to the demangler. fn demangle_rtti_typename(rtti_name: &str) -> Option { let stripped = rtti_name.strip_prefix('.')?; let raw = msvc_demangler::demangle(stripped, msvc_demangler::DemangleFlags::llvm()).ok()?; // Output looks like `class xe::apu::AudioSystem` or `struct foo::Bar`. let cls = raw .strip_prefix("class ") .or_else(|| raw.strip_prefix("struct ")) .or_else(|| raw.strip_prefix("union ")) .unwrap_or(&raw); Some(cls.to_string()) } /// Best-effort `RTTIClassHierarchyDescriptor` walk: read the /// `BaseClassArray` entries and demangle each base's TypeDescriptor name. /// Returns a JSON array string on success. /// /// Layout: /// ```text /// RTTIClassHierarchyDescriptor: /// +0x00 signature /// +0x04 attributes /// +0x08 numBaseClasses /// +0x0C BaseClassArray * (-> array of BaseClassDescriptor *) /// BaseClassDescriptor: /// +0x00 TypeDescriptor * /// +0x04 numContainedBases /// ... /// ``` fn read_class_hierarchy( pe: &[u8], image_base: u32, chd: u32, rdata_ranges: &[(u32, u32)], ) -> Option { if !is_in_ranges(chd, rdata_ranges) { return None; } let num_bases = read_be_u32(pe, image_base, chd + 0x08)?; if num_bases == 0 || num_bases > 256 { return None; } // sanity cap let bca_ptr = read_be_u32(pe, image_base, chd + 0x0C)?; if !is_in_ranges(bca_ptr, rdata_ranges) { return None; } let mut names: Vec = Vec::new(); for i in 0..num_bases { let bcd_ptr = match read_be_u32(pe, image_base, bca_ptr + i * 4) { Some(p) if is_in_ranges(p, rdata_ranges) => p, _ => return None, }; let td_ptr = match read_be_u32(pe, image_base, bcd_ptr) { Some(p) if is_in_ranges(p, rdata_ranges) => p, _ => return None, }; let mangled = match read_typedescriptor_name(pe, image_base, td_ptr, rdata_ranges) { Some(s) => s, None => return None, }; let cls = demangle_rtti_typename(&mangled).unwrap_or(mangled); names.push(cls); } serde_json::to_string(&names).ok() } /// Synthetic name for an RTTI-stripped vtable, derived from a stable hash of /// the sorted method-PC list. Two vtables with identical method ordering /// collapse to the same anonymous name. fn synth_anon_name(methods: &[u32]) -> String { // FNV-1a 64-bit on the sorted PC list; we only use 32 bits for brevity. let mut sorted = methods.to_vec(); sorted.sort_unstable(); let mut h: u64 = 0xcbf29ce484222325; for pc in &sorted { for b in pc.to_le_bytes() { h ^= b as u64; h = h.wrapping_mul(0x100000001b3); } } format!("ANON_Class_{:08X}", (h as u32)) } /// Build the per-method `(vtable_address, slot, function_address)` list for /// DB insertion, with optional demangled-name lookup for any function that /// has a matching `?…` label. Skips slots whose function isn't in the /// supplied label map. pub fn methods_table( vtables: &[Vtable], labels: &std::collections::HashMap, ) -> Vec<(u32, u32, u32, Option, Option)> { let mut out = Vec::new(); for v in vtables { for (slot, &fn_va) in v.methods.iter().enumerate() { let label = labels.get(&fn_va).cloned(); let demangled = label.as_ref() .and_then(|l| demangle::demangle(l).map(|d| d.raw_demangled)); out.push((v.address, slot as u32, fn_va, label, demangled)); } } out } /// Build a `class_name → Vtable` summary for the `classes` table. Multiple /// vtables sharing the same class name (multiple instances at link time) /// collapse via `BTreeMap` — the first detected vtable wins. pub fn classes_table(vtables: &[Vtable]) -> Vec<(String, u32, bool, Option)> { let mut by_name: BTreeMap = BTreeMap::new(); for v in vtables { by_name.entry(v.class_name.clone()).or_insert(v); } by_name .into_iter() .map(|(name, v)| (name, v.address, v.rtti_present, v.base_classes_json.clone())) .collect() } #[cfg(test)] mod tests { use super::*; #[test] fn synth_anon_name_is_stable() { let a = synth_anon_name(&[0x82001000, 0x82001100, 0x82001200]); let b = synth_anon_name(&[0x82001200, 0x82001000, 0x82001100]); assert_eq!(a, b, "anon name must be order-independent"); } #[test] fn synth_anon_name_differs_for_different_methods() { let a = synth_anon_name(&[0x82001000, 0x82001100]); let b = synth_anon_name(&[0x82002000, 0x82002100]); assert_ne!(a, b); } #[test] fn detects_3_method_vtable_in_rdata() { let image_base = 0x82000000u32; let rdata_va = 0x1000u32; let text_va = 0x2000u32; let rdata_size = 16u32; let text_size = 0x100u32; // PE buffer big enough for both sections. let total = (text_va + text_size) as usize; let mut pe = vec![0u8; total]; // Vtable: 3 method PCs at .rdata start, all valid function entries. let m: [u32; 3] = [image_base + text_va, image_base + text_va + 0x10, image_base + text_va + 0x20]; for (i, val) in m.iter().enumerate() { pe[rdata_va as usize + i * 4..rdata_va as usize + (i + 1) * 4] .copy_from_slice(&val.to_be_bytes()); } let sections = vec![ PeSection { name: ".rdata".into(), virtual_address: rdata_va, virtual_size: rdata_size, raw_offset: rdata_va, raw_size: rdata_size, flags: 0x4000_0040, }, PeSection { name: ".text".into(), virtual_address: text_va, virtual_size: text_size, raw_offset: text_va, raw_size: text_size, flags: 0x6000_0020, }, ]; let mut function_starts = std::collections::BTreeSet::new(); for &pc in &m { function_starts.insert(pc); } let vtables = analyze(&pe, image_base, §ions, &function_starts); assert_eq!(vtables.len(), 1); assert_eq!(vtables[0].length, 3); assert_eq!(vtables[0].address, image_base + rdata_va); assert!(vtables[0].class_name.starts_with("ANON_Class_")); assert!(!vtables[0].rtti_present); } #[test] fn rejects_2_method_run() { let image_base = 0x82000000u32; let rdata_va = 0x1000u32; let text_va = 0x2000u32; let total = (text_va + 0x100) as usize; let mut pe = vec![0u8; total]; let m: [u32; 2] = [image_base + text_va, image_base + text_va + 0x10]; for (i, val) in m.iter().enumerate() { pe[rdata_va as usize + i * 4..rdata_va as usize + (i + 1) * 4] .copy_from_slice(&val.to_be_bytes()); } let sections = vec![ PeSection { name: ".rdata".into(), virtual_address: rdata_va, virtual_size: 8, raw_offset: rdata_va, raw_size: 8, flags: 0x4000_0040, }, PeSection { name: ".text".into(), virtual_address: text_va, virtual_size: 0x100, raw_offset: text_va, raw_size: 0x100, flags: 0x6000_0020, }, ]; let mut function_starts = std::collections::BTreeSet::new(); for &pc in &m { function_starts.insert(pc); } let vtables = analyze(&pe, image_base, §ions, &function_starts); assert_eq!(vtables.len(), 0, "runs of 2 must be rejected to keep false-positive rate down"); } }