M2: MSVC C++ demangler + demangled_names DB table
Adds an MSVC name-demangling layer in front of M3's vtable / RTTI work: - New `xenia_analysis::demangle` wraps the `msvc-demangler` crate (a Rust port of LLVM's `MicrosoftDemangle.cpp`). `demangle()` short-circuits on non-mangled inputs (`?` prefix check); `demangle_or_raw()` always returns a record (raw passthrough on parse failure). - Heuristic split of the formatted demangled string into structured fields `(namespace_path, class_name, method_name, params_signature)`. Top-level paren / template-bracket aware, so `a::b<c::d>::e` and signatures with templated arg types parse correctly. - DB: new `demangled_names(address, mangled, raw_demangled, namespace_path, class_name, method_name, params_signature)` with indices on address / class_name / method_name. Populated from any label whose name starts with `?` plus any import name that happens to be mangled. For Sylpheed (a fully stripped binary) this table is empty out-of-the-box; the layer's value lands in M3, which will append rows for every RTTI TypeDescriptor name found in `.rdata`. Tests 610→617 (+7 demangler unit tests covering early-out, raw fallback, member function form, RTTI form, qname split, paren-template safety, and top-level `::` splitting). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
277
crates/xenia-analysis/src/demangle.rs
Normal file
277
crates/xenia-analysis/src/demangle.rs
Normal file
@@ -0,0 +1,277 @@
|
||||
//! MSVC C++ name demangling for Xbox 360 binaries.
|
||||
//!
|
||||
//! Wraps [`msvc_demangler::demangle`] (a Rust port of LLVM's
|
||||
//! `MicrosoftDemangle.cpp`) and splits the resulting human-readable string
|
||||
//! into structured fields (namespace path, class name, method name, params
|
||||
//! signature) for storage in the `demangled_names` DB table.
|
||||
//!
|
||||
//! The structured split is heuristic — it operates on the formatted output,
|
||||
//! not the parsed AST. This is good enough for typical RTTI strings of the
|
||||
//! form `?AVClassName@Namespace@@` and standard member functions; exotic
|
||||
//! template / lambda forms degrade gracefully (the structured fields end up
|
||||
//! `None` while `raw_demangled` retains the full LLVM-style output).
|
||||
//!
|
||||
//! Reference: <https://docs.rs/msvc-demangler> (LLVM `MicrosoftDemangle.cpp` port).
|
||||
|
||||
use msvc_demangler::DemangleFlags;
|
||||
|
||||
/// One MSVC symbol after demangling, together with the pieces the heuristic
/// splitter managed to pull out of the formatted text.
///
/// Only `mangled` and `raw_demangled` are always populated; every other field
/// is optional because the split works on the printed string, not on a parsed
/// AST, and may not find the corresponding piece.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Demangled {
    /// The input symbol exactly as it was handed to the demangler.
    pub mangled: String,
    /// Complete demangler output, e.g. `xe::apu::AudioSystem::Setup(void)`.
    /// On the raw-fallback path this is a copy of `mangled`.
    pub raw_demangled: String,
    /// Enclosing namespaces joined with `::` (e.g. `xe::apu`). `None` for
    /// symbols with no namespace component in front of the class.
    pub namespace_path: Option<String>,
    /// Owning class of a member function (e.g. `AudioSystem`). `None` for
    /// free functions.
    pub class_name: Option<String>,
    /// Final name segment — the method or free-function name (e.g. `Setup`).
    /// `None` when no name could be isolated.
    pub method_name: Option<String>,
    /// Text between the outermost parameter parentheses (e.g. `void` or
    /// `int, char *`). `None` when the demangled text has no `(...)` group.
    pub params_signature: Option<String>,
}
|
||||
|
||||
/// Demangle one mangled MSVC C++ symbol. Returns `None` if the input does not
|
||||
/// start with `?` (early-out for non-mangled names) OR if the underlying
|
||||
/// demangler fails to parse it. Callers that want a "best effort" record
|
||||
/// (NULL fields + raw=mangled) should use [`demangle_or_raw`] instead.
|
||||
pub fn demangle(mangled: &str) -> Option<Demangled> {
|
||||
if !mangled.starts_with('?') {
|
||||
return None;
|
||||
}
|
||||
let raw = msvc_demangler::demangle(mangled, DemangleFlags::llvm()).ok()?;
|
||||
Some(split_structured(mangled.to_string(), raw))
|
||||
}
|
||||
|
||||
/// Demangle, or fall back to a record that just carries the original mangled
|
||||
/// string in `raw_demangled` and leaves all structured fields `None`. Useful
|
||||
/// for DB insert paths that want one row per mangled input regardless of
|
||||
/// parser success.
|
||||
pub fn demangle_or_raw(mangled: &str) -> Demangled {
|
||||
if let Some(d) = demangle(mangled) {
|
||||
return d;
|
||||
}
|
||||
Demangled {
|
||||
mangled: mangled.to_string(),
|
||||
raw_demangled: mangled.to_string(),
|
||||
namespace_path: None,
|
||||
class_name: None,
|
||||
method_name: None,
|
||||
params_signature: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Split a fully-formatted demangled string into structured fields.
|
||||
///
|
||||
/// Strategy:
|
||||
/// 1. Find the first un-nested `(` — everything before it is the qualified
|
||||
/// name; everything inside the matching parens is `params_signature`.
|
||||
/// 2. Strip leading return-type tokens before the qualified name (everything
|
||||
/// up to the LAST whitespace not inside `<...>` or `(...)` brackets).
|
||||
/// 3. Split the qualified name on `::` (top-level only) — last segment is
|
||||
/// `method_name`, second-to-last is `class_name`, the rest joined back
|
||||
/// with `::` is `namespace_path`.
|
||||
fn split_structured(mangled: String, raw: String) -> Demangled {
|
||||
let raw_view = raw.as_str();
|
||||
|
||||
let (qualified_name, params) = match find_paren_split(raw_view) {
|
||||
Some((before, inside)) => (before.trim_end().to_string(), Some(inside.to_string())),
|
||||
None => (raw_view.to_string(), None),
|
||||
};
|
||||
|
||||
// Drop any return-type prefix: keep everything after the last top-level
|
||||
// whitespace boundary (where "top-level" means depth-0 in <...>/(...)).
|
||||
let qname_clean = strip_return_type_prefix(&qualified_name);
|
||||
|
||||
let (namespace_path, class_name, method_name) = split_qname(&qname_clean);
|
||||
|
||||
Demangled {
|
||||
mangled,
|
||||
raw_demangled: raw,
|
||||
namespace_path,
|
||||
class_name,
|
||||
method_name,
|
||||
params_signature: params,
|
||||
}
|
||||
}
|
||||
|
||||
/// Locate the parameter list of a formatted demangled symbol.
///
/// Returns `(text_before_paren, text_inside_outer_parens)` for the first `(`
/// that is outside every `<...>` group, paired with its matching `)`.
/// Returns `None` when there is no such `(` or when it is never closed.
///
/// Operator names are skipped as a unit while scanning at angle depth 0, so
/// the brackets that belong to the operator itself are not mistaken for
/// structure:
/// * `Foo::operator()(int)` → name `Foo::operator()`, params `int`
/// * `Foo::operator<(Foo const &)` → name `Foo::operator<`, params `Foo const &`
///
/// Operator names nested inside template arguments are not special-cased.
fn find_paren_split(s: &str) -> Option<(&str, &str)> {
    let bytes = s.as_bytes();
    let open = first_top_level_open_paren(bytes)?;
    let close = matching_close_paren(bytes, open)?;
    // Both indices point at ASCII bytes, so they are valid UTF-8 boundaries.
    Some((&s[..open], &s[open + 1..close]))
}

/// Index of the first `(` that is not nested inside `<...>`, ignoring any
/// brackets that are part of an `operator…` name at angle depth 0.
fn first_top_level_open_paren(bytes: &[u8]) -> Option<usize> {
    let mut depth_angle: i32 = 0;
    let mut i = 0usize;
    while i < bytes.len() {
        if depth_angle == 0 {
            if let Some(len) = operator_token_len(bytes, i) {
                i += len;
                continue;
            }
        }
        match bytes[i] {
            b'<' => depth_angle += 1,
            b'>' if depth_angle > 0 => depth_angle -= 1,
            b'(' if depth_angle == 0 => return Some(i),
            _ => {}
        }
        i += 1;
    }
    None
}

/// Index of the `)` that closes the `(` at `open`, counting nested parens
/// (e.g. function-pointer parameter types). `None` if it is never closed.
fn matching_close_paren(bytes: &[u8], open: usize) -> Option<usize> {
    let mut depth: i32 = 0;
    for (i, &b) in bytes.iter().enumerate().skip(open) {
        match b {
            b'(' => depth += 1,
            b')' => {
                depth -= 1;
                if depth == 0 {
                    return Some(i);
                }
            }
            _ => {}
        }
    }
    None
}

/// If an `operator` keyword starts at `at`, return the byte length of the
/// keyword plus the symbolic operator that directly follows it (`()`, `<`,
/// `<<`, `->`, `[]`, ...). Returns `None` when `at` is not the start of the
/// keyword (for example inside `my_operator` or `operators`).
///
/// Word-like operators (`operator new`, conversion operators) have no
/// symbolic part; only the keyword length is returned for them.
fn operator_token_len(bytes: &[u8], at: usize) -> Option<usize> {
    const KEYWORD: &[u8] = b"operator";
    const SYMBOL_BYTES: &[u8] = b"<>=!+-*/%&|^~,[]";

    fn is_ident_byte(b: u8) -> bool {
        b.is_ascii_alphanumeric() || b == b'_' || b >= 0x80
    }

    if !bytes[at..].starts_with(KEYWORD) {
        return None;
    }
    if at > 0 && is_ident_byte(bytes[at - 1]) {
        return None;
    }
    let rest = &bytes[at + KEYWORD.len()..];
    if matches!(rest.first(), Some(&b) if is_ident_byte(b)) {
        return None;
    }
    let symbol_len = if rest.starts_with(b"()") {
        2
    } else {
        rest.iter()
            .take_while(|&&b| SYMBOL_BYTES.contains(&b))
            .count()
    };
    Some(KEYWORD.len() + symbol_len)
}
|
||||
|
||||
/// Drop everything that precedes the qualified name (access specifier, return
/// type, calling convention) from the text in front of a parameter list.
///
/// The name is taken to start right after the last space that is at top
/// level, i.e. not inside `<...>`, `(...)` or an MSVC back-quoted name such
/// as `` `scalar deleting destructor' `` or `` `anonymous namespace' ``.
/// Scanning stops at a top-level `operator ` keyword so that word-like
/// operators keep their full spelling.
///
/// Examples:
/// * `void __cdecl Foo::Bar` → `Foo::Bar`
/// * ``void * __cdecl Foo::`scalar deleting destructor'`` →
///   ``Foo::`scalar deleting destructor'``
/// * `void * __cdecl operator new` → `operator new`
///
/// When no top-level space is found the input is returned unchanged.
fn strip_return_type_prefix(s: &str) -> String {
    const OPERATOR_KEYWORD: &[u8] = b"operator ";

    let bytes = s.as_bytes();
    let is_ident = |b: u8| b.is_ascii_alphanumeric() || b == b'_' || b >= 0x80;

    let mut depth_angle: i32 = 0;
    let mut depth_paren: i32 = 0;
    let mut depth_quote: i32 = 0;
    let mut last_ws_at: Option<usize> = None;

    for (i, &b) in bytes.iter().enumerate() {
        let top_level = depth_angle == 0 && depth_paren == 0 && depth_quote == 0;
        // Spaces after the `operator` keyword (`operator new`, conversion
        // operators like `operator char const *`) belong to the name itself.
        if top_level
            && bytes[i..].starts_with(OPERATOR_KEYWORD)
            && (i == 0 || !is_ident(bytes[i - 1]))
        {
            break;
        }
        match b {
            // MSVC special names open with a backtick and close with an
            // apostrophe; they may nest and may contain spaces.
            b'`' => depth_quote += 1,
            b'\'' if depth_quote > 0 => depth_quote -= 1,
            _ if depth_quote > 0 => {}
            b'<' => depth_angle += 1,
            b'>' if depth_angle > 0 => depth_angle -= 1,
            b'(' => depth_paren += 1,
            b')' if depth_paren > 0 => depth_paren -= 1,
            b' ' if depth_angle == 0 && depth_paren == 0 => last_ws_at = Some(i),
            _ => {}
        }
    }

    match last_ws_at {
        // The byte at `i` is an ASCII space, so `i + 1` is a char boundary.
        Some(i) => s[i + 1..].to_string(),
        None => s.to_string(),
    }
}
|
||||
|
||||
/// Split a fully-qualified name on top-level `::` and tag the parts.
|
||||
fn split_qname(qname: &str) -> (Option<String>, Option<String>, Option<String>) {
|
||||
if qname.is_empty() {
|
||||
return (None, None, None);
|
||||
}
|
||||
let parts = top_level_split_colon_colon(qname);
|
||||
match parts.len() {
|
||||
0 => (None, None, None),
|
||||
1 => (None, None, Some(parts[0].clone())),
|
||||
2 => (None, Some(parts[0].clone()), Some(parts[1].clone())),
|
||||
_ => {
|
||||
let n = parts.len();
|
||||
let method = parts[n - 1].clone();
|
||||
let class = parts[n - 2].clone();
|
||||
let ns = parts[..n - 2].join("::");
|
||||
(Some(ns), Some(class), Some(method))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Cut `s` at every `::` that sits outside all `<...>` and `(...)` groups.
///
/// Separators nested inside template arguments or parameter lists are left
/// untouched, so `a::b<c::d>::e` yields `a`, `b<c::d>`, `e`. Empty segments
/// (for example from a leading `::`) are discarded.
fn top_level_split_colon_colon(s: &str) -> Vec<String> {
    let raw = s.as_bytes();
    let mut angles: i32 = 0;
    let mut parens: i32 = 0;
    let mut pieces: Vec<&str> = Vec::new();
    let mut seg_start = 0usize;
    // Bytes before this index were already consumed as part of a separator.
    let mut resume_at = 0usize;

    for (pos, &byte) in raw.iter().enumerate() {
        if pos < resume_at {
            continue;
        }
        match byte {
            b'<' => angles += 1,
            b'>' => {
                if angles > 0 {
                    angles -= 1;
                }
            }
            b'(' => parens += 1,
            b')' => {
                if parens > 0 {
                    parens -= 1;
                }
            }
            b':' => {
                let at_top_level = angles == 0 && parens == 0;
                if at_top_level && raw.get(pos + 1) == Some(&b':') {
                    pieces.push(&s[seg_start..pos]);
                    seg_start = pos + 2;
                    resume_at = pos + 2;
                }
            }
            _ => {}
        }
    }
    pieces.push(&s[seg_start..]);

    pieces
        .into_iter()
        .filter(|piece| !piece.is_empty())
        .map(String::from)
        .collect()
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Names without a leading `?` never reach the demangler.
    #[test]
    fn early_out_on_non_mangled() {
        let plain = demangle("plain_c_name");
        let qualified = demangle("Foo::Bar");
        assert!(plain.is_none());
        assert!(qualified.is_none());
    }

    /// The fallback record echoes the input and leaves structure empty.
    #[test]
    fn demangle_or_raw_records_failures() {
        let record = demangle_or_raw("not_mangled");
        assert_eq!(record.mangled, "not_mangled");
        assert_eq!(record.raw_demangled, "not_mangled");
        assert!(record.method_name.is_none());
    }

    /// A decorated member function is split into all four structured fields.
    #[test]
    fn simple_member_function() {
        // ?Setup@AudioSystem@apu@xe@@QEAAXXZ → public: __cdecl xe::apu::AudioSystem::Setup(void)
        let parsed = demangle("?Setup@AudioSystem@apu@xe@@QEAAXXZ").expect("should parse");
        let Demangled {
            namespace_path,
            class_name,
            method_name,
            params_signature,
            ..
        } = parsed;
        assert_eq!(method_name.as_deref(), Some("Setup"));
        assert_eq!(class_name.as_deref(), Some("AudioSystem"));
        assert_eq!(namespace_path.as_deref(), Some("xe::apu"));
        assert_eq!(params_signature.as_deref(), Some("void"));
    }

    /// The bare class form used by RTTI TypeDescriptor names is accepted.
    #[test]
    fn rtti_type_descriptor_string() {
        // RTTI TypeDescriptor names look like ".?AVClassName@@"; the leading
        // "." is expected to be stripped by the caller before demangling.
        // Here we only confirm the minimal "?AV…" class form parses.
        let parsed = demangle("?AVAudioSystem@apu@xe@@").expect("should parse");
        let raw = &parsed.raw_demangled;
        assert!(raw.contains("AudioSystem"), "raw='{}'", raw);
    }

    /// Leading segments are folded back into the namespace path.
    #[test]
    fn split_qname_handles_namespace_chain() {
        let (namespace, class, method) = split_qname("a::b::c::Klass::method");
        assert_eq!(namespace.as_deref(), Some("a::b::c"));
        assert_eq!(class.as_deref(), Some("Klass"));
        assert_eq!(method.as_deref(), Some("method"));
    }

    /// Template brackets in the parameter list do not disturb paren matching.
    #[test]
    fn paren_split_handles_template_in_args() {
        let input = "void __cdecl Foo::Bar(std::vector<int>, std::map<a, b>)";
        let (head, args) = find_paren_split(input).expect("paren found");
        assert_eq!(head, "void __cdecl Foo::Bar");
        assert_eq!(args, "std::vector<int>, std::map<a, b>");
    }

    /// `::` inside template arguments is not a split point.
    #[test]
    fn double_colon_inside_template_not_split() {
        let segments = top_level_split_colon_colon("a::b<c::d>::e");
        assert_eq!(segments, vec!["a", "b<c::d>", "e"]);
    }
}
|
||||
Reference in New Issue
Block a user