M2: MSVC C++ demangler + demangled_names DB table

Adds an MSVC name-demangling layer in front of M3's vtable / RTTI work:
- New `xenia_analysis::demangle` wraps the `msvc-demangler` crate (a Rust
  port of LLVM's `MicrosoftDemangle.cpp`). `demangle()` short-circuits on
  non-mangled inputs (`?` prefix check); `demangle_or_raw()` always returns
  a record (raw passthrough on parse failure).
- Heuristic split of the formatted demangled string into structured fields
  `(namespace_path, class_name, method_name, params_signature)`. Top-level
  paren / template-bracket aware, so `a::b<c::d>::e` and signatures with
  templated arg types parse correctly.
- DB: new `demangled_names(address, mangled, raw_demangled, namespace_path,
  class_name, method_name, params_signature)` with indices on address /
  class_name / method_name. Populated from any label whose name starts with
  `?` plus any import name that happens to be mangled.

For Sylpheed (a fully stripped binary) this table is empty out-of-the-box;
the layer's value lands in M3, which will append rows for every RTTI
TypeDescriptor name found in `.rdata`.

Tests 610→617 (+7 demangler unit tests covering early-out, raw fallback,
member function form, RTTI form, qname split, paren-template safety, and
top-level `::` splitting).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-08 20:02:21 +02:00
parent fd68285210
commit 89f5f7e4a9
7 changed files with 405 additions and 6 deletions

View File

@@ -0,0 +1,277 @@
//! MSVC C++ name demangling for Xbox 360 binaries.
//!
//! Wraps [`msvc_demangler::demangle`] (a Rust port of LLVM's
//! `MicrosoftDemangle.cpp`) and splits the resulting human-readable string
//! into structured fields (namespace path, class name, method name, params
//! signature) for storage in the `demangled_names` DB table.
//!
//! The structured split is heuristic — it operates on the formatted output,
//! not the parsed AST. This is good enough for typical RTTI strings of the
//! form `?AVClassName@Namespace@@` and standard member functions; exotic
//! template / lambda forms degrade gracefully (the structured fields end up
//! `None` while `raw_demangled` retains the full LLVM-style output).
//!
//! Reference: <https://docs.rs/msvc-demangler> (LLVM `MicrosoftDemangle.cpp` port).
use msvc_demangler::DemangleFlags;
/// A single MSVC symbol after demangling, broken into its structured parts.
///
/// `mangled` and `raw_demangled` are always populated; the four `Option`
/// fields hold whatever the heuristic split managed to extract and stay
/// `None` otherwise.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Demangled {
    /// The mangled input string, unchanged.
    pub mangled: String,
    /// Complete LLVM-style demangled text, e.g.
    /// `xe::apu::AudioSystem::Setup(void)`.
    pub raw_demangled: String,
    /// Namespaces enclosing the class, joined with `::` (e.g. `xe::apu`).
    /// `None` for symbols at global scope.
    pub namespace_path: Option<String>,
    /// Owning class of a member function (e.g. `AudioSystem`). `None` for
    /// free functions.
    pub class_name: Option<String>,
    /// Name of the method or free function (e.g. `Setup`). `None` when the
    /// heuristic could not isolate a name in the demangled text.
    pub method_name: Option<String>,
    /// Parameter list with the enclosing parentheses removed (e.g. `void`,
    /// `int, char *`). `None` when the symbol is not a function or no
    /// `(...)` was found.
    pub params_signature: Option<String>,
}
/// Demangle a single MSVC-mangled C++ symbol into a [`Demangled`] record.
///
/// Yields `None` in two situations:
/// - `mangled` does not begin with `?`, in which case the underlying
///   demangler is never invoked;
/// - [`msvc_demangler::demangle`] (run with `DemangleFlags::llvm()`) reports
///   an error.
///
/// Use [`demangle_or_raw`] when a record is wanted even for inputs that do
/// not demangle.
pub fn demangle(mangled: &str) -> Option<Demangled> {
    if mangled.starts_with('?') {
        msvc_demangler::demangle(mangled, DemangleFlags::llvm())
            .ok()
            .map(|formatted| split_structured(mangled.to_owned(), formatted))
    } else {
        // Not an MSVC-mangled name: skip the parser entirely.
        None
    }
}
/// Demangle `mangled`, always producing a record.
///
/// When [`demangle`] succeeds its result is returned as-is. Otherwise the
/// returned record stores the input verbatim in both `mangled` and
/// `raw_demangled`, with every structured field set to `None` — handy for
/// DB insert paths that want exactly one row per input whether or not it
/// parsed.
pub fn demangle_or_raw(mangled: &str) -> Demangled {
    demangle(mangled).unwrap_or_else(|| {
        let passthrough = mangled.to_owned();
        Demangled {
            raw_demangled: passthrough.clone(),
            mangled: passthrough,
            namespace_path: None,
            class_name: None,
            method_name: None,
            params_signature: None,
        }
    })
}
/// Break a formatted demangled string into the fields of [`Demangled`].
///
/// The split runs in three stages:
/// 1. `find_paren_split` separates the text in front of the first top-level
///    `(` from the text inside the matching parentheses; the latter becomes
///    `params_signature`. Without a top-level paren the whole string is the
///    name part and `params_signature` is `None`.
/// 2. `strip_return_type_prefix` discards everything up to the last
///    top-level space of the (right-trimmed) name part, i.e. return type,
///    calling convention and similar leading tokens.
/// 3. `split_qname` cuts what is left on top-level `::` into
///    `namespace_path`, `class_name` and `method_name`.
///
/// `mangled` and `raw` are moved into the result unchanged.
fn split_structured(mangled: String, raw: String) -> Demangled {
    let (name_part, params_signature) = match find_paren_split(&raw) {
        Some((head, args)) => (head.trim_end(), Some(args.to_owned())),
        None => (raw.as_str(), None),
    };

    let bare_name = strip_return_type_prefix(name_part);
    let (namespace_path, class_name, method_name) = split_qname(&bare_name);

    Demangled {
        mangled,
        raw_demangled: raw,
        namespace_path,
        class_name,
        method_name,
        params_signature,
    }
}
/// Locate the parameter list of a formatted demangled symbol.
///
/// Returns `(text_before_paren, text_inside_outer_parens)` for the first `(`
/// in `s` that sits outside every `<...>` template bracket. Returns `None`
/// when no such paren exists or when it is never closed.
///
/// The symbol of an overloaded operator is treated as part of the name
/// rather than as bracket syntax:
/// - a run of `<`, `>`, `=` directly after the `operator` keyword
///   (`operator<`, `operator<<`, `operator<=`, ...) does not open a template
///   bracket, so the parameter list that follows is still found;
/// - the `()` of `operator()` is skipped, so the split happens at the real
///   parameter list (`Foo::operator()(int)` → `("Foo::operator()", "int")`).
///
/// `operator` only counts as the keyword when it is not the tail of a longer
/// identifier (`cooperator<int>` is an ordinary template name).
fn find_paren_split(s: &str) -> Option<(&str, &str)> {
    const OPERATOR: &[u8] = b"operator";

    /// True for bytes that may appear inside a C++ identifier.
    fn is_ident_byte(b: u8) -> bool {
        b.is_ascii_alphanumeric() || b == b'_'
    }

    /// If the `operator` keyword starts at `at`, returns the index just past
    /// the keyword and any part of its symbol that would confuse bracket
    /// tracking (`()` or a run of `<`, `>`, `=`). Returns `None` otherwise.
    fn skip_operator_symbol(bytes: &[u8], at: usize) -> Option<usize> {
        if !bytes[at..].starts_with(OPERATOR) {
            return None;
        }
        if at > 0 && is_ident_byte(bytes[at - 1]) {
            return None;
        }
        let mut end = at + OPERATOR.len();
        if bytes[end..].starts_with(b"()") {
            return Some(end + 2);
        }
        while end < bytes.len() && matches!(bytes[end], b'<' | b'>' | b'=') {
            end += 1;
        }
        Some(end)
    }

    let bytes = s.as_bytes();
    let mut depth_angle: i32 = 0;
    let mut i = 0usize;
    while i < bytes.len() {
        if depth_angle == 0 && bytes[i] == b'o' {
            if let Some(next) = skip_operator_symbol(bytes, i) {
                i = next;
                continue;
            }
        }
        match bytes[i] {
            b'<' => depth_angle += 1,
            b'>' if depth_angle > 0 => depth_angle -= 1,
            b'(' if depth_angle == 0 => {
                // Walk forward to the `)` that balances this `(`. Only paren
                // nesting matters here; angle brackets inside the parameter
                // list never change which `)` closes it.
                let mut depth_paren = 1i32;
                for (j, &b) in bytes.iter().enumerate().skip(i + 1) {
                    match b {
                        b'(' => depth_paren += 1,
                        b')' => {
                            depth_paren -= 1;
                            if depth_paren == 0 {
                                return Some((&s[..i], &s[i + 1..j]));
                            }
                        }
                        _ => {}
                    }
                }
                // Unbalanced: the opening paren is never closed.
                return None;
            }
            _ => {}
        }
        i += 1;
    }
    None
}
/// Strip leading return-type / qualifier tokens from a qualified name.
///
/// Everything up to and including the last *top-level* space is removed,
/// e.g. `void __cdecl Foo::Bar` → `Foo::Bar`. A space is top-level when it is
/// outside `<...>`, `(...)` and backtick-quoted special names.
///
/// Two kinds of names legitimately contain spaces and are kept whole:
/// - backtick-quoted names closed by `'`, such as
///   `` Foo::`scalar deleting destructor' `` or `` `anonymous namespace'::f ``
///   (quotes may nest);
/// - operator names: once the `operator` keyword is seen at top level, the
///   rest of the string belongs to the name (`operator new`,
///   `Foo::operator int`).
///
/// Returns `s` unchanged when it has no top-level space before the name.
fn strip_return_type_prefix(s: &str) -> String {
    const OPERATOR: &[u8] = b"operator";

    /// True for bytes that may appear inside a C++ identifier.
    fn is_ident_byte(b: u8) -> bool {
        b.is_ascii_alphanumeric() || b == b'_'
    }

    /// True when the `operator` keyword starts at `at` as a whole word, i.e.
    /// it is neither the tail nor the head of a longer identifier.
    fn operator_keyword_at(bytes: &[u8], at: usize) -> bool {
        bytes[at..].starts_with(OPERATOR)
            && (at == 0 || !is_ident_byte(bytes[at - 1]))
            && bytes
                .get(at + OPERATOR.len())
                .map_or(true, |&b| !is_ident_byte(b))
    }

    let bytes = s.as_bytes();
    let mut depth_angle: i32 = 0;
    let mut depth_paren: i32 = 0;
    let mut depth_quote: i32 = 0;
    let mut last_ws_at: Option<usize> = None;
    for (i, &b) in bytes.iter().enumerate() {
        match b {
            b'`' => depth_quote += 1,
            b'\'' if depth_quote > 0 => depth_quote -= 1,
            // Inside a backtick-quoted name everything is opaque.
            _ if depth_quote > 0 => {}
            b'<' => depth_angle += 1,
            b'>' if depth_angle > 0 => depth_angle -= 1,
            b'(' => depth_paren += 1,
            b')' if depth_paren > 0 => depth_paren -= 1,
            b' ' if depth_angle == 0 && depth_paren == 0 => last_ws_at = Some(i),
            // Everything from the `operator` keyword onward is the function
            // name, so later spaces must not move the boundary.
            b'o' if depth_angle == 0 && depth_paren == 0 && operator_keyword_at(bytes, i) => break,
            _ => {}
        }
    }
    match last_ws_at {
        // The boundary is an ASCII space, so `i + 1` is a char boundary.
        Some(i) => s[i + 1..].to_string(),
        None => s.to_string(),
    }
}
/// Tag the `::`-separated segments of a qualified name.
///
/// Returns `(namespace_path, class_name, method_name)`, assigned from the
/// right after splitting on top-level `::`:
/// - no segments (empty input): all three are `None`;
/// - one segment: it is the `method_name`;
/// - two segments: `class_name`, then `method_name`;
/// - three or more: the last is `method_name`, the one before it is
///   `class_name`, and the remaining leading segments are re-joined with
///   `::` as `namespace_path`.
fn split_qname(qname: &str) -> (Option<String>, Option<String>, Option<String>) {
    let mut segments = top_level_split_colon_colon(qname);

    // Peel segments off the tail: method first, then its class.
    let method = match segments.pop() {
        Some(name) => name,
        None => return (None, None, None),
    };
    let class = match segments.pop() {
        Some(name) => name,
        None => return (None, None, Some(method)),
    };

    // Whatever remains in front of the class is the namespace chain.
    let namespace = if segments.is_empty() {
        None
    } else {
        Some(segments.join("::"))
    };
    (namespace, Some(class), Some(method))
}
/// Cut `s` at every `::` that is outside `<...>` and `(...)`.
///
/// A `::` nested in template brackets or parentheses stays inside its
/// segment. Empty segments (from a leading, trailing or doubled separator)
/// are dropped, so an empty input yields an empty vector.
fn top_level_split_colon_colon(s: &str) -> Vec<String> {
    let raw = s.as_bytes();
    let mut segments: Vec<String> = Vec::new();
    let mut angle_depth = 0i32;
    let mut paren_depth = 0i32;
    let mut seg_start = 0usize;
    let mut pos = 0usize;

    while let Some(&byte) = raw.get(pos) {
        let nested = angle_depth != 0 || paren_depth != 0;
        if byte == b':' && !nested && raw.get(pos + 1) == Some(&b':') {
            // Close the current segment (unless empty) and hop over both colons.
            if seg_start < pos {
                segments.push(s[seg_start..pos].to_string());
            }
            pos += 2;
            seg_start = pos;
            continue;
        }
        match byte {
            b'<' => angle_depth += 1,
            b'(' => paren_depth += 1,
            // Unmatched closers are ignored so the depths never go negative.
            b'>' if angle_depth > 0 => angle_depth -= 1,
            b')' if paren_depth > 0 => paren_depth -= 1,
            _ => {}
        }
        pos += 1;
    }

    // Whatever follows the last separator is the final segment.
    if seg_start < raw.len() {
        segments.push(s[seg_start..].to_string());
    }
    segments
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Inputs that do not start with `?` are rejected by `demangle`.
    #[test]
    fn early_out_on_non_mangled() {
        for plain in ["plain_c_name", "Foo::Bar"].iter() {
            assert!(demangle(plain).is_none());
        }
    }

    /// The fallback record carries the input verbatim and no structured data.
    #[test]
    fn demangle_or_raw_records_failures() {
        let input = "not_mangled";
        let record = demangle_or_raw(input);
        assert_eq!(record.mangled, "not_mangled");
        assert_eq!(record.raw_demangled, "not_mangled");
        assert!(record.method_name.is_none());
    }

    /// A plain member function is split into namespace, class, method, params.
    #[test]
    fn simple_member_function() {
        // ?Setup@AudioSystem@apu@xe@@QEAAXXZ
        //   → public: __cdecl xe::apu::AudioSystem::Setup(void)
        let parsed = demangle("?Setup@AudioSystem@apu@xe@@QEAAXXZ").expect("should parse");
        let Demangled {
            namespace_path,
            class_name,
            method_name,
            params_signature,
            ..
        } = parsed;
        assert_eq!(method_name.as_deref(), Some("Setup"));
        assert_eq!(class_name.as_deref(), Some("AudioSystem"));
        assert_eq!(namespace_path.as_deref(), Some("xe::apu"));
        assert_eq!(params_signature.as_deref(), Some("void"));
    }

    /// The minimal class form used by RTTI TypeDescriptor names parses.
    #[test]
    fn rtti_type_descriptor_string() {
        // An RTTI TypeDescriptor stores its name as ".?AVClassName@@"
        // ("class ClassName"). Stripping the leading "." and demangling the
        // "?AV…" remainder is left to M3; here we only check that the
        // demangler accepts the bare class form.
        let parsed = demangle("?AVAudioSystem@apu@xe@@").expect("should parse");
        let mentions_class = parsed.raw_demangled.contains("AudioSystem");
        assert!(mentions_class, "raw='{}'", parsed.raw_demangled);
    }

    /// Leading segments beyond class and method collapse into the namespace.
    #[test]
    fn split_qname_handles_namespace_chain() {
        let (namespace, class, method) = split_qname("a::b::c::Klass::method");
        assert_eq!(namespace.as_deref(), Some("a::b::c"));
        assert_eq!(class.as_deref(), Some("Klass"));
        assert_eq!(method.as_deref(), Some("method"));
    }

    /// Template brackets in the argument list must not disturb paren matching.
    #[test]
    fn paren_split_handles_template_in_args() {
        let formatted = "void __cdecl Foo::Bar(std::vector<int>, std::map<a, b>)";
        let (before, inside) = find_paren_split(formatted).expect("paren found");
        assert_eq!(before, "void __cdecl Foo::Bar");
        assert_eq!(inside, "std::vector<int>, std::map<a, b>");
    }

    /// A `::` nested inside `<...>` stays within its segment.
    #[test]
    fn double_colon_inside_template_not_split() {
        let segments = top_level_split_colon_colon("a::b<c::d>::e");
        assert_eq!(segments, vec!["a", "b<c::d>", "e"]);
    }
}