M2: MSVC C++ demangler + demangled_names DB table

Adds an MSVC name-demangling layer in front of M3's vtable / RTTI work:
- New `xenia_analysis::demangle` wraps the `msvc-demangler` crate (a Rust
  port of LLVM's `MicrosoftDemangle.cpp`). `demangle()` short-circuits on
  non-mangled inputs (`?` prefix check); `demangle_or_raw()` always returns
  a record (raw passthrough on parse failure).
- Heuristic split of the formatted demangled string into structured fields
  `(namespace_path, class_name, method_name, params_signature)`. Top-level
  paren / template-bracket aware, so `a::b<c::d>::e` and signatures with
  templated arg types parse correctly.
- DB: new `demangled_names(address, mangled, raw_demangled, namespace_path,
  class_name, method_name, params_signature)` with indices on address /
  class_name / method_name. Populated from any label whose name starts with
  `?` plus any import name that happens to be mangled.

For Sylpheed (a fully stripped binary) this table is empty out-of-the-box;
the layer's value lands in M3, which will append rows for every RTTI
TypeDescriptor name found in `.rdata`.

Tests 610→617 (+7 demangler unit tests covering early-out, raw fallback,
member function form, RTTI form, qname split, paren-template safety, and
top-level `::` splitting).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-08 20:02:21 +02:00
parent fd68285210
commit 89f5f7e4a9
7 changed files with 405 additions and 6 deletions

11
Cargo.lock generated
View File

@@ -2160,6 +2160,16 @@ dependencies = [
"windows-sys 0.61.2", "windows-sys 0.61.2",
] ]
[[package]]
name = "msvc-demangler"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fbeff6bd154a309b2ada5639b2661ca6ae4599b34e8487dc276d2cd637da2d76"
dependencies = [
"bitflags 2.11.0",
"itoa",
]
[[package]] [[package]]
name = "naga" name = "naga"
version = "22.1.0" version = "22.1.0"
@@ -4908,6 +4918,7 @@ dependencies = [
"anyhow", "anyhow",
"duckdb", "duckdb",
"metrics", "metrics",
"msvc-demangler",
"serde", "serde",
"serde_json", "serde_json",
"tracing", "tracing",

View File

@@ -14,3 +14,4 @@ anyhow = { workspace = true }
tracing = { workspace = true } tracing = { workspace = true }
metrics = { workspace = true } metrics = { workspace = true }
duckdb = { workspace = true } duckdb = { workspace = true }
msvc-demangler = "0.11"

View File

@@ -71,13 +71,36 @@ SELECT name FROM functions WHERE address = 2186674160; -- 0x824D29F0
--- ---
## Layer M2 — MSVC C++ name demangler (planned) ## Layer M2 — MSVC C++ name demangler (landed)
Adds `demangled_names(address, mangled, namespace_path, class_name, ### Schema additions
method_name, params_signature, raw_demangled)`. Populates from any label / - New table `demangled_names(address BIGINT NULL, mangled VARCHAR NOT NULL,
import / RTTI string starting with `?`. Falls back to `raw_demangled = mangled` raw_demangled VARCHAR NOT NULL, namespace_path VARCHAR NULL,
when the parser cannot decode (e.g. exotic templates). See class_name VARCHAR NULL, method_name VARCHAR NULL,
`crates/xenia-analysis/src/demangle.rs` (when landed). params_signature VARCHAR NULL)`.
- Indices on `address`, `class_name`, `method_name`.
### What this layer does
- Wraps `msvc_demangler::demangle` (a Rust port of LLVM's
`MicrosoftDemangle.cpp`) and splits the formatted output into structured
fields via a heuristic top-level parser (handles templates and nested parens
correctly).
- Populates `demangled_names` from any label whose name starts with `?` plus
any import name that happens to be mangled (defensive — typical kernel
imports use C names).
### What this layer does NOT do
- Does not parse the AST returned by `msvc_demangler::parse` — uses the formatted
string and a heuristic split. Adequate for typical class member functions
and RTTI strings; exotic template / lambda forms still get `raw_demangled`
populated but may have NULL structured fields.
- Does not yet ingest RTTI strings discovered in `.rdata` — that's M3's job;
M3 will append rows to this table at the addresses where it finds RTTI
TypeDescriptors.
### Reference docs
- `msvc-demangler` crate (`https://docs.rs/msvc-demangler/0.11`).
- LLVM `MicrosoftDemangle.cpp` (the parser this crate ports).
## Layer M3 — Vtable + RTTI detection (planned) ## Layer M3 — Vtable + RTTI detection (planned)

View File

@@ -339,6 +339,16 @@ impl DbWriter {
kind VARCHAR NOT NULL -- function | import | saverestore | local | data | other kind VARCHAR NOT NULL -- function | import | saverestore | local | data | other
); );
CREATE TABLE demangled_names (
address BIGINT, -- VA the mangled name is associated with; NULL when from a non-address source (e.g. RTTI-only string)
mangled VARCHAR NOT NULL, -- original mangled symbol (e.g. ?Foo@Bar@@QEAAXXZ)
raw_demangled VARCHAR NOT NULL, -- LLVM-style demangled output (or mangled string on parse failure)
namespace_path VARCHAR, -- e.g. xe::apu (NULL = global / parser failure)
class_name VARCHAR, -- e.g. AudioSystem (NULL = free function / parser failure)
method_name VARCHAR, -- e.g. Setup (NULL on parser failure)
params_signature VARCHAR -- contents of the outermost (...) (NULL = not a function)
);
CREATE TABLE xrefs ( CREATE TABLE xrefs (
source BIGINT NOT NULL, -- VA of the referencing instruction source BIGINT NOT NULL, -- VA of the referencing instruction
target BIGINT NOT NULL, -- VA of the referenced destination target BIGINT NOT NULL, -- VA of the referenced destination
@@ -353,6 +363,7 @@ impl DbWriter {
insert_functions(&self.conn, func_analysis, labels)?; insert_functions(&self.conn, func_analysis, labels)?;
insert_pdata_entries(&self.conn, &func_analysis.pdata_entries)?; insert_pdata_entries(&self.conn, &func_analysis.pdata_entries)?;
insert_labels(&self.conn, labels)?; insert_labels(&self.conn, labels)?;
insert_demangled_from_labels(&self.conn, labels, info.import_libraries)?;
insert_xrefs_streaming(&self.conn, xrefs, pe, info.image_base, func_analysis, labels)?; insert_xrefs_streaming(&self.conn, xrefs, pe, info.image_base, func_analysis, labels)?;
let indices = [ let indices = [
@@ -360,6 +371,9 @@ impl DbWriter {
("idx_functions_pdata_validated", "CREATE INDEX idx_functions_pdata_validated ON functions(pdata_validated)"), ("idx_functions_pdata_validated", "CREATE INDEX idx_functions_pdata_validated ON functions(pdata_validated)"),
("idx_labels_kind", "CREATE INDEX idx_labels_kind ON labels(kind)"), ("idx_labels_kind", "CREATE INDEX idx_labels_kind ON labels(kind)"),
("idx_labels_name", "CREATE INDEX idx_labels_name ON labels(name)"), ("idx_labels_name", "CREATE INDEX idx_labels_name ON labels(name)"),
("idx_demangled_address", "CREATE INDEX idx_demangled_address ON demangled_names(address)"),
("idx_demangled_class", "CREATE INDEX idx_demangled_class ON demangled_names(class_name)"),
("idx_demangled_method", "CREATE INDEX idx_demangled_method ON demangled_names(method_name)"),
("idx_xrefs_target", "CREATE INDEX idx_xrefs_target ON xrefs(target)"), ("idx_xrefs_target", "CREATE INDEX idx_xrefs_target ON xrefs(target)"),
("idx_xrefs_source", "CREATE INDEX idx_xrefs_source ON xrefs(source)"), ("idx_xrefs_source", "CREATE INDEX idx_xrefs_source ON xrefs(source)"),
("idx_xrefs_source_func", "CREATE INDEX idx_xrefs_source_func ON xrefs(source_func)"), ("idx_xrefs_source_func", "CREATE INDEX idx_xrefs_source_func ON xrefs(source_func)"),
@@ -716,6 +730,69 @@ fn insert_functions(
Ok(()) Ok(())
} }
/// Fill the `demangled_names` table from the label map and the import tables.
///
/// One row is written for every name that [`crate::demangle::demangle`]
/// accepts (it returns `None` for anything that does not start with `?` or
/// that the demangler cannot parse, so such names produce no row).
///
/// * `conn` — open database connection; the `demangled_names` table must
///   already exist.
/// * `labels` — address → symbol name. Entries whose name starts with
///   `__imp_` are skipped here; imports are covered by `import_libraries`.
/// * `import_libraries` — every import whose ordinal resolves to a name is
///   also run through the demangler and stored at the import's address.
///
/// Returns an error as soon as preparing the statement or inserting a row
/// fails. On success the number of inserted rows is reported through the
/// `db.rows` counter and an info-level trace event.
fn insert_demangled_from_labels(
    conn: &Connection,
    labels: &HashMap<u32, String>,
    import_libraries: &[xenia_xex::header::ImportLibrary],
) -> anyhow::Result<()> {
    let mut insert = conn.prepare(
        "INSERT INTO demangled_names
(address, mangled, raw_demangled, namespace_path, class_name,
method_name, params_signature)
VALUES (?, ?, ?, ?, ?, ?, ?)"
    )?;

    // Rows derived from ordinary labels. `__imp_`-prefixed labels are left
    // out on purpose: the import pass below owns those symbols.
    let from_labels = labels
        .iter()
        .filter(|(_, name)| !name.starts_with("__imp_"))
        .filter_map(|(&addr, name)| {
            crate::demangle::demangle(name).map(|parsed| (addr as i64, parsed))
        });

    // Defensive: rows derived from any import name that happens to be mangled.
    let from_imports = import_libraries.iter().flat_map(|lib| {
        lib.imports.iter().filter_map(move |imp| {
            let name = crate::resolve_ordinal(&lib.name, imp.ordinal)?;
            let parsed = crate::demangle::demangle(name)?;
            Some((imp.address as i64, parsed))
        })
    });

    // Labels first, then imports; both iterators are lazy, so each symbol is
    // demangled right before its row is written.
    let mut count = 0u64;
    for (address, parsed) in from_labels.chain(from_imports) {
        insert.execute(params![
            address,
            parsed.mangled,
            parsed.raw_demangled,
            parsed.namespace_path,
            parsed.class_name,
            parsed.method_name,
            parsed.params_signature,
        ])?;
        count += 1;
    }

    metrics::counter!("db.rows", "table" => "demangled_names").increment(count);
    tracing::info!(rows = count, table = "demangled_names", "demangler complete");
    Ok(())
}
fn insert_pdata_entries( fn insert_pdata_entries(
conn: &Connection, conn: &Connection,
entries: &[xenia_xex::pdata::PdataEntry], entries: &[xenia_xex::pdata::PdataEntry],

View File

@@ -0,0 +1,277 @@
//! MSVC C++ name demangling for Xbox 360 binaries.
//!
//! Wraps [`msvc_demangler::demangle`] (a Rust port of LLVM's
//! `MicrosoftDemangle.cpp`) and splits the resulting human-readable string
//! into structured fields (namespace path, class name, method name, params
//! signature) for storage in the `demangled_names` DB table.
//!
//! The structured split is heuristic — it operates on the formatted output,
//! not the parsed AST. This is good enough for typical RTTI strings of the
//! form `?AVClassName@Namespace@@` and standard member functions; exotic
//! template / lambda forms degrade gracefully (the structured fields end up
//! `None` while `raw_demangled` retains the full LLVM-style output).
//!
//! Reference: <https://docs.rs/msvc-demangler> (LLVM `MicrosoftDemangle.cpp` port).
use msvc_demangler::DemangleFlags;
/// Structured view of one demangled MSVC symbol.
///
/// `mangled` and `raw_demangled` are always populated. The remaining fields
/// come from a heuristic split of `raw_demangled` and are `None` whenever
/// that split finds nothing to put in them.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Demangled {
    /// Original mangled string, exactly as passed in.
    pub mangled: String,
    /// Full LLVM-style demangled output (e.g. `xe::apu::AudioSystem::Setup(void)`).
    /// Holds a copy of `mangled` when the record was built by the raw fallback.
    pub raw_demangled: String,
    /// `::`-joined namespace path leading up to the class, e.g. `xe::apu`. None
    /// when the qualified name has fewer than three `::`-separated segments.
    pub namespace_path: Option<String>,
    /// Second-to-last segment of the qualified name, e.g. `AudioSystem`. None
    /// when the qualified name has a single segment (free function at global
    /// scope).
    pub class_name: Option<String>,
    /// Last segment of the qualified name, e.g. `Setup`. None when the
    /// heuristic could not extract any segment from the demangled string.
    pub method_name: Option<String>,
    /// Parameter signature without the surrounding parens, e.g. `void` or
    /// `int, char *`. None when not a function or no `(...)` was found.
    pub params_signature: Option<String>,
}
/// Demangle a single MSVC-mangled C++ symbol into a [`Demangled`] record.
///
/// Yields `None` in two situations:
/// * `mangled` does not begin with `?` — the demangler is not invoked at all;
/// * the underlying `msvc_demangler` call (LLVM-style flags) reports an error.
///
/// Use [`demangle_or_raw`] when a record is wanted even for inputs that
/// cannot be decoded.
pub fn demangle(mangled: &str) -> Option<Demangled> {
    if mangled.starts_with('?') {
        msvc_demangler::demangle(mangled, DemangleFlags::llvm())
            .ok()
            .map(|formatted| split_structured(mangled.to_owned(), formatted))
    } else {
        None
    }
}
/// Best-effort variant of [`demangle`] that always produces a record.
///
/// When [`demangle`] succeeds its result is returned unchanged. Otherwise the
/// record carries the input verbatim in both `mangled` and `raw_demangled`,
/// with every structured field set to `None`. Intended for DB insert paths
/// that want exactly one row per input regardless of parser success.
pub fn demangle_or_raw(mangled: &str) -> Demangled {
    demangle(mangled).unwrap_or_else(|| Demangled {
        mangled: mangled.to_owned(),
        raw_demangled: mangled.to_owned(),
        namespace_path: None,
        class_name: None,
        method_name: None,
        params_signature: None,
    })
}
/// Break a formatted demangled string into the fields of [`Demangled`].
///
/// Pipeline:
/// 1. `find_paren_split` locates the first top-level `(`; the text before it
///    (right-trimmed) is the qualified-name part and the text inside the
///    matching parens becomes `params_signature`. Without such a paren the
///    whole string is the qualified-name part and `params_signature` is `None`.
/// 2. `strip_return_type_prefix` drops everything up to the last top-level
///    space, removing access specifiers, return types and calling conventions.
/// 3. `split_qname` cuts the remainder on top-level `::`: last segment →
///    `method_name`, the one before → `class_name`, the rest →
///    `namespace_path`.
///
/// Note that a paren-less input such as `class a::B` goes through the same
/// steps, so its final segment (`B`) is reported as `method_name`.
fn split_structured(mangled: String, raw: String) -> Demangled {
    let (name_part, params_signature) = if let Some((before, inside)) = find_paren_split(&raw) {
        (before.trim_end(), Some(inside.to_owned()))
    } else {
        (raw.as_str(), None)
    };

    // Keep only what follows the last depth-0 space in the name part.
    let bare_name = strip_return_type_prefix(name_part);
    let (namespace_path, class_name, method_name) = split_qname(&bare_name);

    Demangled {
        mangled,
        raw_demangled: raw,
        namespace_path,
        class_name,
        method_name,
        params_signature,
    }
}
/// Returns `(text_before_paren, text_inside_outer_parens)` for the first
/// top-level `(` in `s` that opens a parameter list.
///
/// "Top-level" means outside any `<...>` template bracket. The closing paren
/// is found by counting nested `(`/`)` pairs, so parameter lists containing
/// function-pointer types or templated types are returned whole.
///
/// Operator names are recognised so their punctuation is not mistaken for
/// structure:
/// * the `()` of `operator()` is part of the name, not an (empty) parameter
///   list — the split happens at the paren that follows it;
/// * the `<` characters of `operator<`, `operator<<`, `operator<=`, … do not
///   open a template bracket.
///
/// Returns `None` when there is no top-level `(` or when the opening paren is
/// never closed.
fn find_paren_split(s: &str) -> Option<(&str, &str)> {
    /// True when `head` ends with the keyword `operator` as a whole word
    /// (i.e. not as the tail of a longer identifier such as `myoperator`).
    fn ends_with_operator_keyword(head: &str) -> bool {
        match head.strip_suffix("operator") {
            Some(prefix) => !matches!(
                prefix.as_bytes().last(),
                Some(c) if c.is_ascii_alphanumeric() || *c == b'_'
            ),
            None => false,
        }
    }

    let bytes = s.as_bytes();
    let mut depth_angle: i32 = 0;
    let mut i = 0usize;
    while i < bytes.len() {
        match bytes[i] {
            b'<' => {
                // Look past any `<` immediately to the left (second char of
                // `<<`) to see whether this one belongs to an operator name.
                // Slicing at `i` is safe: `bytes[i]` is ASCII, hence a char
                // boundary.
                let before_run = s[..i].trim_end_matches('<');
                if !ends_with_operator_keyword(before_run) {
                    depth_angle += 1;
                }
            }
            b'>' if depth_angle > 0 => depth_angle -= 1,
            b'(' if depth_angle == 0 => {
                if bytes.get(i + 1) == Some(&b')') && ends_with_operator_keyword(&s[..i]) {
                    // `operator()` — skip both characters of the name and
                    // keep looking for the real parameter list.
                    i += 2;
                    continue;
                }
                // Scan forward for the paren that balances this one.
                let mut depth_paren = 1i32;
                for (offset, &b) in bytes[i + 1..].iter().enumerate() {
                    match b {
                        b'(' => depth_paren += 1,
                        b')' => {
                            depth_paren -= 1;
                            if depth_paren == 0 {
                                let close = i + 1 + offset;
                                return Some((&s[..i], &s[i + 1..close]));
                            }
                        }
                        _ => {}
                    }
                }
                // Opening paren without a matching close.
                return None;
            }
            _ => {}
        }
        i += 1;
    }
    None
}
/// Strip the leading access-specifier / return-type / calling-convention
/// tokens from the text preceding a parameter list, leaving only the
/// qualified name. E.g. `void __cdecl Foo::Bar` → `Foo::Bar`.
///
/// The cut is made after the last *top-level* space, where top-level means
/// outside `<...>`, outside `(...)` and outside an MSVC backtick-quoted name
/// (`` `anonymous namespace' ``, `` `scalar deleting destructor' ``). Spaces
/// inside such quoted names belong to the name and are never used as the cut.
///
/// A space that directly follows the keyword `operator` also belongs to the
/// name (`operator new`, `operator delete`, conversion operators); scanning
/// stops there so the name is kept whole.
///
/// Returns `s` unchanged (as an owned `String`) when it contains no top-level
/// space.
fn strip_return_type_prefix(s: &str) -> String {
    /// True when `head` ends with the keyword `operator` as a whole word
    /// (i.e. not as the tail of a longer identifier such as `cooperator`).
    fn ends_with_operator_keyword(head: &str) -> bool {
        match head.strip_suffix("operator") {
            Some(prefix) => !matches!(
                prefix.as_bytes().last(),
                Some(c) if c.is_ascii_alphanumeric() || *c == b'_'
            ),
            None => false,
        }
    }

    let mut depth_angle: i32 = 0;
    let mut depth_paren: i32 = 0;
    let mut depth_quote: i32 = 0;
    let mut last_ws_at: Option<usize> = None;
    for (i, b) in s.bytes().enumerate() {
        match b {
            // MSVC special names open with a backtick and close with `'`.
            b'`' => depth_quote += 1,
            b'\'' if depth_quote > 0 => depth_quote -= 1,
            b'<' => depth_angle += 1,
            b'>' if depth_angle > 0 => depth_angle -= 1,
            b'(' => depth_paren += 1,
            b')' if depth_paren > 0 => depth_paren -= 1,
            b' ' if depth_angle == 0 && depth_paren == 0 && depth_quote == 0 => {
                // `operator <something>`: everything from here on is part of
                // the name. `i` indexes an ASCII byte, so slicing is safe.
                if ends_with_operator_keyword(&s[..i]) {
                    break;
                }
                last_ws_at = Some(i);
            }
            _ => {}
        }
    }
    match last_ws_at {
        Some(i) => s[i + 1..].to_string(),
        None => s.to_string(),
    }
}
/// Tag the top-level `::` segments of a qualified name.
///
/// Returns `(namespace_path, class_name, method_name)`:
/// * no segments (empty input) → all `None`;
/// * one segment → only `method_name`;
/// * two segments → `class_name` and `method_name`;
/// * three or more → the leading segments are re-joined with `::` as
///   `namespace_path`, followed by `class_name` and `method_name`.
fn split_qname(qname: &str) -> (Option<String>, Option<String>, Option<String>) {
    let mut segments = top_level_split_colon_colon(qname);

    // Peel segments off the back: method first, then class.
    let Some(method) = segments.pop() else {
        return (None, None, None);
    };
    let Some(class) = segments.pop() else {
        return (None, None, Some(method));
    };

    // Whatever is left in front (if anything) forms the namespace path.
    let namespace = if segments.is_empty() {
        None
    } else {
        Some(segments.join("::"))
    };
    (namespace, Some(class), Some(method))
}
/// Cut `s` at every `::` that sits outside `<...>` and `(...)`.
///
/// Separators nested inside template brackets or parentheses are left in
/// place, so `a::b<c::d>::e` yields `a`, `b<c::d>`, `e`. Empty segments (from
/// a leading, trailing or doubled separator) are dropped; an empty input
/// yields an empty vector.
fn top_level_split_colon_colon(s: &str) -> Vec<String> {
    let raw = s.as_bytes();
    let mut angle: i32 = 0;
    let mut paren: i32 = 0;
    let mut segments: Vec<String> = Vec::new();
    let mut seg_start = 0usize;
    let mut cursor = 0usize;

    while let Some(&byte) = raw.get(cursor) {
        let at_top_level = angle == 0 && paren == 0;
        if byte == b':' && at_top_level && raw.get(cursor + 1) == Some(&b':') {
            // Close the current segment, skipping it when empty, and jump
            // over both colons.
            if seg_start < cursor {
                segments.push(s[seg_start..cursor].to_string());
            }
            cursor += 2;
            seg_start = cursor;
            continue;
        }
        match byte {
            b'<' => angle += 1,
            b'>' if angle > 0 => angle -= 1,
            b'(' => paren += 1,
            b')' if paren > 0 => paren -= 1,
            _ => {}
        }
        cursor += 1;
    }

    // Trailing segment after the last separator.
    if seg_start < raw.len() {
        segments.push(s[seg_start..].to_string());
    }
    segments
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Names lacking the leading `?` are rejected without parsing.
    #[test]
    fn early_out_on_non_mangled() {
        for plain in ["plain_c_name", "Foo::Bar"] {
            assert!(demangle(plain).is_none());
        }
    }

    /// The raw fallback echoes the input and leaves structured fields empty.
    #[test]
    fn demangle_or_raw_records_failures() {
        let record = demangle_or_raw("not_mangled");
        assert_eq!(
            (record.mangled.as_str(), record.raw_demangled.as_str()),
            ("not_mangled", "not_mangled")
        );
        assert!(record.method_name.is_none());
    }

    /// `?Setup@AudioSystem@apu@xe@@QEAAXXZ` →
    /// `public: __cdecl xe::apu::AudioSystem::Setup(void)`.
    #[test]
    fn simple_member_function() {
        let parsed = demangle("?Setup@AudioSystem@apu@xe@@QEAAXXZ").expect("should parse");
        assert_eq!(parsed.method_name.as_deref(), Some("Setup"));
        assert_eq!(parsed.class_name.as_deref(), Some("AudioSystem"));
        assert_eq!(parsed.namespace_path.as_deref(), Some("xe::apu"));
        assert_eq!(parsed.params_signature.as_deref(), Some("void"));
    }

    /// RTTI TypeDescriptor names have the form `.?AVClassName@@` →
    /// `class ClassName`. Stripping the leading `.` and feeding the `?AV…`
    /// remainder to `demangle` is left to M3; here we only confirm that the
    /// minimal class form is accepted by the demangler.
    #[test]
    fn rtti_type_descriptor_string() {
        let parsed = demangle("?AVAudioSystem@apu@xe@@").expect("should parse");
        assert!(
            parsed.raw_demangled.contains("AudioSystem"),
            "raw='{}'",
            parsed.raw_demangled
        );
    }

    /// Three or more segments: leading ones collapse into the namespace path.
    #[test]
    fn split_qname_handles_namespace_chain() {
        let (namespace, class, method) = split_qname("a::b::c::Klass::method");
        assert_eq!(namespace.as_deref(), Some("a::b::c"));
        assert_eq!(class.as_deref(), Some("Klass"));
        assert_eq!(method.as_deref(), Some("method"));
    }

    /// Template brackets inside the parameter list must not disturb paren
    /// matching.
    #[test]
    fn paren_split_handles_template_in_args() {
        let signature = "void __cdecl Foo::Bar(std::vector<int>, std::map<a, b>)";
        let (head, args) = find_paren_split(signature).expect("paren found");
        assert_eq!(head, "void __cdecl Foo::Bar");
        assert_eq!(args, "std::vector<int>, std::map<a, b>");
    }

    /// A `::` nested in `<...>` stays inside its segment.
    #[test]
    fn double_colon_inside_template_not_split() {
        let segments = top_level_split_colon_colon("a::b<c::d>::e");
        assert_eq!(segments, vec!["a", "b<c::d>", "e"]);
    }
}

View File

@@ -6,6 +6,7 @@ pub mod disasm;
pub mod formatter; pub mod formatter;
pub mod sinks; pub mod sinks;
pub mod sql_views; pub mod sql_views;
pub mod demangle;
mod ordinals; mod ordinals;
pub use ordinals::resolve_ordinal; pub use ordinals::resolve_ordinal;

View File

@@ -172,6 +172,15 @@ fn db_schema_matches_expected_columns() {
("name", "VARCHAR"), ("name", "VARCHAR"),
("kind", "VARCHAR"), ("kind", "VARCHAR"),
]), ]),
("demangled_names", &[
("address", "BIGINT"),
("mangled", "VARCHAR"),
("raw_demangled", "VARCHAR"),
("namespace_path", "VARCHAR"),
("class_name", "VARCHAR"),
("method_name", "VARCHAR"),
("params_signature", "VARCHAR"),
]),
("xrefs", &[ ("xrefs", &[
("source", "BIGINT"), ("source", "BIGINT"),
("target", "BIGINT"), ("target", "BIGINT"),