From 89f5f7e4a9a3d2fce59a3d7cb02118934c1a3420 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Fri, 8 May 2026 20:02:21 +0200 Subject: [PATCH] M2: MSVC C++ demangler + demangled_names DB table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an MSVC name-demangling layer in front of M3's vtable / RTTI work: - New `xenia_analysis::demangle` wraps the `msvc-demangler` crate (a Rust port of LLVM's `MicrosoftDemangle.cpp`). `demangle()` short-circuits on non-mangled inputs (`?` prefix check); `demangle_or_raw()` always returns a record (raw passthrough on parse failure). - Heuristic split of the formatted demangled string into structured fields `(namespace_path, class_name, method_name, params_signature)`. Top-level paren / template-bracket aware, so qualified names with `::` inside template arguments (e.g. `a::b<c::d>::e`) and signatures with templated arg types parse correctly. - DB: new `demangled_names(address, mangled, raw_demangled, namespace_path, class_name, method_name, params_signature)` with indices on address / class_name / method_name. Populated from any label whose name starts with `?` plus any import name that happens to be mangled. For Sylpheed (a fully stripped binary) this table is empty out-of-the-box; the layer's value lands in M3, which will append rows for every RTTI TypeDescriptor name found in `.rdata`. Tests 610→617 (+7 demangler unit tests covering early-out, raw fallback, member function form, RTTI form, qname split, paren-template safety, and top-level `::` splitting). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 11 + crates/xenia-analysis/Cargo.toml | 1 + crates/xenia-analysis/SCHEMA.md | 35 ++- crates/xenia-analysis/src/db.rs | 77 +++++ crates/xenia-analysis/src/demangle.rs | 277 ++++++++++++++++++ crates/xenia-analysis/src/lib.rs | 1 + .../xenia-analysis/tests/db_schema_golden.rs | 9 + 7 files changed, 405 insertions(+), 6 deletions(-) create mode 100644 crates/xenia-analysis/src/demangle.rs diff --git a/Cargo.lock b/Cargo.lock index 6b7f034..cbd6c5e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2160,6 +2160,16 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "msvc-demangler" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbeff6bd154a309b2ada5639b2661ca6ae4599b34e8487dc276d2cd637da2d76" +dependencies = [ + "bitflags 2.11.0", + "itoa", +] + [[package]] name = "naga" version = "22.1.0" @@ -4908,6 +4918,7 @@ dependencies = [ "anyhow", "duckdb", "metrics", + "msvc-demangler", "serde", "serde_json", "tracing", diff --git a/crates/xenia-analysis/Cargo.toml b/crates/xenia-analysis/Cargo.toml index d8c9fc2..ae9a234 100644 --- a/crates/xenia-analysis/Cargo.toml +++ b/crates/xenia-analysis/Cargo.toml @@ -14,3 +14,4 @@ anyhow = { workspace = true } tracing = { workspace = true } metrics = { workspace = true } duckdb = { workspace = true } +msvc-demangler = "0.11" diff --git a/crates/xenia-analysis/SCHEMA.md b/crates/xenia-analysis/SCHEMA.md index 65e707c..4e40166 100644 --- a/crates/xenia-analysis/SCHEMA.md +++ b/crates/xenia-analysis/SCHEMA.md @@ -71,13 +71,36 @@ SELECT name FROM functions WHERE address = 2186674160; -- 0x824D29F0 --- -## Layer M2 — MSVC C++ name demangler (planned) +## Layer M2 — MSVC C++ name demangler (landed) -Adds `demangled_names(address, mangled, namespace_path, class_name, -method_name, params_signature, raw_demangled)`. Populates from any label / -import / RTTI string starting with `?`. 
Falls back to `raw_demangled = mangled` -when the parser cannot decode (e.g. exotic templates). See -`crates/xenia-analysis/src/demangle.rs` (when landed). +### Schema additions +- New table `demangled_names(address BIGINT NULL, mangled VARCHAR NOT NULL, + raw_demangled VARCHAR NOT NULL, namespace_path VARCHAR NULL, + class_name VARCHAR NULL, method_name VARCHAR NULL, + params_signature VARCHAR NULL)`. +- Indices on `address`, `class_name`, `method_name`. + +### What this layer does +- Wraps `msvc_demangler::demangle` (a Rust port of LLVM's + `MicrosoftDemangle.cpp`) and splits the formatted output into structured + fields via a heuristic top-level parser (handles templates and nested parens + correctly). +- Populates `demangled_names` from any label whose name starts with `?` plus + any import name that happens to be mangled (defensive — typical kernel + imports use C names). + +### What this layer does NOT do +- Does not parse the AST returned by `msvc_demangler::parse` — uses the formatted + string and a heuristic split. Adequate for typical class member functions + and RTTI strings; exotic template / lambda forms still get `raw_demangled` + populated but may have NULL structured fields. +- Does not yet ingest RTTI strings discovered in `.rdata` — that's M3's job; + M3 will append rows to this table at the addresses where it finds RTTI + TypeDescriptors. + +### Reference docs +- `msvc-demangler` crate (`https://docs.rs/msvc-demangler/0.11`). +- LLVM `MicrosoftDemangle.cpp` (the parser this crate ports). 
## Layer M3 — Vtable + RTTI detection (planned) diff --git a/crates/xenia-analysis/src/db.rs b/crates/xenia-analysis/src/db.rs index 02fab6a..dbdf4b9 100644 --- a/crates/xenia-analysis/src/db.rs +++ b/crates/xenia-analysis/src/db.rs @@ -339,6 +339,16 @@ impl DbWriter { kind VARCHAR NOT NULL -- function | import | saverestore | local | data | other ); + CREATE TABLE demangled_names ( + address BIGINT, -- VA the mangled name is associated with; NULL when from a non-address source (e.g. RTTI-only string) + mangled VARCHAR NOT NULL, -- original mangled symbol (e.g. ?Foo@Bar@@QEAAXXZ) + raw_demangled VARCHAR NOT NULL, -- LLVM-style demangled output (or mangled string on parse failure) + namespace_path VARCHAR, -- e.g. xe::apu (NULL = global / parser failure) + class_name VARCHAR, -- e.g. AudioSystem (NULL = free function / parser failure) + method_name VARCHAR, -- e.g. Setup (NULL on parser failure) + params_signature VARCHAR -- contents of the outermost (...) (NULL = not a function) + ); + CREATE TABLE xrefs ( source BIGINT NOT NULL, -- VA of the referencing instruction target BIGINT NOT NULL, -- VA of the referenced destination @@ -353,6 +363,7 @@ impl DbWriter { insert_functions(&self.conn, func_analysis, labels)?; insert_pdata_entries(&self.conn, &func_analysis.pdata_entries)?; insert_labels(&self.conn, labels)?; + insert_demangled_from_labels(&self.conn, labels, info.import_libraries)?; insert_xrefs_streaming(&self.conn, xrefs, pe, info.image_base, func_analysis, labels)?; let indices = [ @@ -360,6 +371,9 @@ impl DbWriter { ("idx_functions_pdata_validated", "CREATE INDEX idx_functions_pdata_validated ON functions(pdata_validated)"), ("idx_labels_kind", "CREATE INDEX idx_labels_kind ON labels(kind)"), ("idx_labels_name", "CREATE INDEX idx_labels_name ON labels(name)"), + ("idx_demangled_address", "CREATE INDEX idx_demangled_address ON demangled_names(address)"), + ("idx_demangled_class", "CREATE INDEX idx_demangled_class ON demangled_names(class_name)"), + 
("idx_demangled_method", "CREATE INDEX idx_demangled_method ON demangled_names(method_name)"), ("idx_xrefs_target", "CREATE INDEX idx_xrefs_target ON xrefs(target)"), ("idx_xrefs_source", "CREATE INDEX idx_xrefs_source ON xrefs(source)"), ("idx_xrefs_source_func", "CREATE INDEX idx_xrefs_source_func ON xrefs(source_func)"), @@ -716,6 +730,69 @@ fn insert_functions( Ok(()) } +fn insert_demangled_from_labels( + conn: &Connection, + labels: &HashMap, + import_libraries: &[xenia_xex::header::ImportLibrary], +) -> anyhow::Result<()> { + let mut stmt = conn.prepare( + "INSERT INTO demangled_names + (address, mangled, raw_demangled, namespace_path, class_name, + method_name, params_signature) + VALUES (?, ?, ?, ?, ?, ?, ?)" + )?; + + let mut count = 0u64; + + for (&addr, name) in labels { + // The label table holds raw symbol names (`?...@...`). Imports come + // wrapped as `__imp__`; strip the `__imp__` prefix to + // recover any mangled inner name (rare for kernel imports but + // defensive). For now, skip imports entirely — they're handled below + // via `import_libraries`. + if name.starts_with("__imp_") { + continue; + } + if let Some(d) = crate::demangle::demangle(name) { + stmt.execute(params![ + addr as i64, + d.mangled, + d.raw_demangled, + d.namespace_path, + d.class_name, + d.method_name, + d.params_signature, + ])?; + count += 1; + } + } + + // Defensive: also demangle any import name that happens to be mangled. 
+ for lib in import_libraries { + for imp in &lib.imports { + let resolved = crate::resolve_ordinal(&lib.name, imp.ordinal); + if let Some(name) = resolved + && let Some(d) = crate::demangle::demangle(name) + { + stmt.execute(params![ + imp.address as i64, + d.mangled, + d.raw_demangled, + d.namespace_path, + d.class_name, + d.method_name, + d.params_signature, + ])?; + count += 1; + } + } + } + + metrics::counter!("db.rows", "table" => "demangled_names").increment(count); + tracing::info!(rows = count, table = "demangled_names", "demangler complete"); + Ok(()) +} + fn insert_pdata_entries( conn: &Connection, entries: &[xenia_xex::pdata::PdataEntry], diff --git a/crates/xenia-analysis/src/demangle.rs b/crates/xenia-analysis/src/demangle.rs new file mode 100644 index 0000000..a9b955c --- /dev/null +++ b/crates/xenia-analysis/src/demangle.rs @@ -0,0 +1,277 @@ +//! MSVC C++ name demangling for Xbox 360 binaries. +//! +//! Wraps [`msvc_demangler::demangle`] (a Rust port of LLVM's +//! `MicrosoftDemangle.cpp`) and splits the resulting human-readable string +//! into structured fields (namespace path, class name, method name, params +//! signature) for storage in the `demangled_names` DB table. +//! +//! The structured split is heuristic — it operates on the formatted output, +//! not the parsed AST. This is good enough for typical RTTI strings of the +//! form `?AVClassName@Namespace@@` and standard member functions; exotic +//! template / lambda forms degrade gracefully (the structured fields end up +//! `None` while `raw_demangled` retains the full LLVM-style output). +//! +//! Reference: `msvc-demangler` crate docs on docs.rs (LLVM `MicrosoftDemangle.cpp` port). + +use msvc_demangler::DemangleFlags; + +/// Structured view of one demangled MSVC symbol. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Demangled { + /// Original mangled string. + pub mangled: String, + /// Full LLVM-style demangled output (e.g. `xe::apu::AudioSystem::Setup(void)`). 
+ pub raw_demangled: String, + /// `::`-joined namespace path leading up to the class, e.g. `xe::apu`. None + /// when the symbol is at global scope. + pub namespace_path: Option, + /// Class name for member functions, e.g. `AudioSystem`. None when the + /// symbol is a free function. + pub class_name: Option, + /// Method or free-function name, e.g. `Setup`. None when the heuristic + /// could not separate the name from the rest of the demangled string. + pub method_name: Option, + /// Parameter signature without the surrounding parens, e.g. `void` or + /// `int, char *`. None when not a function or no `(...)` was found. + pub params_signature: Option, +} + +/// Demangle one mangled MSVC C++ symbol. Returns `None` if the input does not +/// start with `?` (early-out for non-mangled names) OR if the underlying +/// demangler fails to parse it. Callers that want a "best effort" record +/// (NULL fields + raw=mangled) should use [`demangle_or_raw`] instead. +pub fn demangle(mangled: &str) -> Option { + if !mangled.starts_with('?') { + return None; + } + let raw = msvc_demangler::demangle(mangled, DemangleFlags::llvm()).ok()?; + Some(split_structured(mangled.to_string(), raw)) +} + +/// Demangle, or fall back to a record that just carries the original mangled +/// string in `raw_demangled` and leaves all structured fields `None`. Useful +/// for DB insert paths that want one row per mangled input regardless of +/// parser success. +pub fn demangle_or_raw(mangled: &str) -> Demangled { + if let Some(d) = demangle(mangled) { + return d; + } + Demangled { + mangled: mangled.to_string(), + raw_demangled: mangled.to_string(), + namespace_path: None, + class_name: None, + method_name: None, + params_signature: None, + } +} + +/// Split a fully-formatted demangled string into structured fields. +/// +/// Strategy: +/// 1. Find the first un-nested `(` — everything before it is the qualified +/// name; everything inside the matching parens is `params_signature`. +/// 2. 
Strip leading return-type tokens before the qualified name (everything +/// up to the LAST whitespace not inside `<...>` or `(...)` brackets). +/// 3. Split the qualified name on `::` (top-level only) — last segment is +/// `method_name`, second-to-last is `class_name`, the rest joined back +/// with `::` is `namespace_path`. +fn split_structured(mangled: String, raw: String) -> Demangled { + let raw_view = raw.as_str(); + + let (qualified_name, params) = match find_paren_split(raw_view) { + Some((before, inside)) => (before.trim_end().to_string(), Some(inside.to_string())), + None => (raw_view.to_string(), None), + }; + + // Drop any return-type prefix: keep everything after the last top-level + // whitespace boundary (where "top-level" means depth-0 in <...>/(...)). + let qname_clean = strip_return_type_prefix(&qualified_name); + + let (namespace_path, class_name, method_name) = split_qname(&qname_clean); + + Demangled { + mangled, + raw_demangled: raw, + namespace_path, + class_name, + method_name, + params_signature: params, + } +} + +/// Returns `(text_before_paren, text_inside_outer_parens)` for the first +/// top-level `(` in `s`. Returns `None` when no top-level paren is present. +fn find_paren_split(s: &str) -> Option<(&str, &str)> { + let bytes = s.as_bytes(); + let mut depth_angle: i32 = 0; + for (i, &b) in bytes.iter().enumerate() { + match b { + b'<' => depth_angle += 1, + b'>' if depth_angle > 0 => depth_angle -= 1, + b'(' if depth_angle == 0 => { + // Find matching close at depth 0 on parens. 
+ let mut depth_paren = 1i32; + let mut depth_angle2 = 0i32; + for (j, &b2) in bytes.iter().enumerate().skip(i + 1) { + match b2 { + b'<' => depth_angle2 += 1, + b'>' if depth_angle2 > 0 => depth_angle2 -= 1, + b'(' => depth_paren += 1, + b')' => { + depth_paren -= 1; + if depth_paren == 0 { + return Some((&s[..i], &s[i + 1..j])); + } + } + _ => {} + } + } + return None; + } + _ => {} + } + } + None +} + +/// Strip a leading return-type token (everything up to and including the +/// last top-level whitespace). E.g. `void __cdecl Foo::Bar` → `Foo::Bar`. +fn strip_return_type_prefix(s: &str) -> String { + let bytes = s.as_bytes(); + let mut depth_angle: i32 = 0; + let mut depth_paren: i32 = 0; + let mut last_ws_at: Option = None; + for (i, &b) in bytes.iter().enumerate() { + match b { + b'<' => depth_angle += 1, + b'>' if depth_angle > 0 => depth_angle -= 1, + b'(' => depth_paren += 1, + b')' if depth_paren > 0 => depth_paren -= 1, + b' ' if depth_angle == 0 && depth_paren == 0 => last_ws_at = Some(i), + _ => {} + } + } + match last_ws_at { + Some(i) => s[i + 1..].to_string(), + None => s.to_string(), + } +} + +/// Split a fully-qualified name on top-level `::` and tag the parts. +fn split_qname(qname: &str) -> (Option, Option, Option) { + if qname.is_empty() { + return (None, None, None); + } + let parts = top_level_split_colon_colon(qname); + match parts.len() { + 0 => (None, None, None), + 1 => (None, None, Some(parts[0].clone())), + 2 => (None, Some(parts[0].clone()), Some(parts[1].clone())), + _ => { + let n = parts.len(); + let method = parts[n - 1].clone(); + let class = parts[n - 2].clone(); + let ns = parts[..n - 2].join("::"); + (Some(ns), Some(class), Some(method)) + } + } +} + +/// Split on top-level `::` — `::` inside `<...>` or `(...)` is preserved. 
+fn top_level_split_colon_colon(s: &str) -> Vec { + let bytes = s.as_bytes(); + let mut depth_angle: i32 = 0; + let mut depth_paren: i32 = 0; + let mut out: Vec = Vec::new(); + let mut start = 0usize; + let mut i = 0usize; + while i < bytes.len() { + let b = bytes[i]; + match b { + b'<' => depth_angle += 1, + b'>' if depth_angle > 0 => depth_angle -= 1, + b'(' => depth_paren += 1, + b')' if depth_paren > 0 => depth_paren -= 1, + b':' if depth_angle == 0 + && depth_paren == 0 + && i + 1 < bytes.len() + && bytes[i + 1] == b':' => + { + out.push(s[start..i].to_string()); + start = i + 2; + i += 2; + continue; + } + _ => {} + } + i += 1; + } + out.push(s[start..].to_string()); + out.into_iter().filter(|p| !p.is_empty()).collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn early_out_on_non_mangled() { + assert!(demangle("plain_c_name").is_none()); + assert!(demangle("Foo::Bar").is_none()); + } + + #[test] + fn demangle_or_raw_records_failures() { + let d = demangle_or_raw("not_mangled"); + assert_eq!(d.mangled, "not_mangled"); + assert_eq!(d.raw_demangled, "not_mangled"); + assert!(d.method_name.is_none()); + } + + #[test] + fn simple_member_function() { + // ?Setup@AudioSystem@apu@xe@@QEAAXXZ → public: __cdecl xe::apu::AudioSystem::Setup(void) + let d = demangle("?Setup@AudioSystem@apu@xe@@QEAAXXZ").expect("should parse"); + assert_eq!(d.method_name.as_deref(), Some("Setup")); + assert_eq!(d.class_name.as_deref(), Some("AudioSystem")); + assert_eq!(d.namespace_path.as_deref(), Some("xe::apu")); + assert_eq!(d.params_signature.as_deref(), Some("void")); + } + + #[test] + fn rtti_type_descriptor_string() { + // RTTI TypeDescriptor mangled name format: ".?AVClassName@@" → "class ClassName". + // We strip the leading "." and call demangle on the "?AV…" part below in M3. + // For now confirm the demangler handles the minimal class form. 
+ let d = demangle("?AVAudioSystem@apu@xe@@").expect("should parse"); + assert!( + d.raw_demangled.contains("AudioSystem"), + "raw='{}'", + d.raw_demangled + ); + } + + #[test] + fn split_qname_handles_namespace_chain() { + let (ns, cls, m) = split_qname("a::b::c::Klass::method"); + assert_eq!(ns.as_deref(), Some("a::b::c")); + assert_eq!(cls.as_deref(), Some("Klass")); + assert_eq!(m.as_deref(), Some("method")); + } + + #[test] + fn paren_split_handles_template_in_args() { + // Templates inside the param list must not confuse paren matching. + let s = "void __cdecl Foo::Bar(std::vector, std::map)"; + let (before, inside) = find_paren_split(s).expect("paren found"); + assert_eq!(before, "void __cdecl Foo::Bar"); + assert_eq!(inside, "std::vector, std::map"); + } + + #[test] + fn double_colon_inside_template_not_split() { + let parts = top_level_split_colon_colon("a::b::e"); + assert_eq!(parts, vec!["a", "b", "e"]); + } +} diff --git a/crates/xenia-analysis/src/lib.rs b/crates/xenia-analysis/src/lib.rs index e3f6639..ef52c20 100644 --- a/crates/xenia-analysis/src/lib.rs +++ b/crates/xenia-analysis/src/lib.rs @@ -6,6 +6,7 @@ pub mod disasm; pub mod formatter; pub mod sinks; pub mod sql_views; +pub mod demangle; mod ordinals; pub use ordinals::resolve_ordinal; diff --git a/crates/xenia-analysis/tests/db_schema_golden.rs b/crates/xenia-analysis/tests/db_schema_golden.rs index 821a548..dc69b62 100644 --- a/crates/xenia-analysis/tests/db_schema_golden.rs +++ b/crates/xenia-analysis/tests/db_schema_golden.rs @@ -172,6 +172,15 @@ fn db_schema_matches_expected_columns() { ("name", "VARCHAR"), ("kind", "VARCHAR"), ]), + ("demangled_names", &[ + ("address", "BIGINT"), + ("mangled", "VARCHAR"), + ("raw_demangled", "VARCHAR"), + ("namespace_path", "VARCHAR"), + ("class_name", "VARCHAR"), + ("method_name", "VARCHAR"), + ("params_signature", "VARCHAR"), + ]), ("xrefs", &[ ("source", "BIGINT"), ("target", "BIGINT"),