M2: MSVC C++ demangler + demangled_names DB table
Adds an MSVC name-demangling layer in front of M3's vtable / RTTI work: - New `xenia_analysis::demangle` wraps the `msvc-demangler` crate (a Rust port of LLVM's `MicrosoftDemangle.cpp`). `demangle()` short-circuits on non-mangled inputs (`?` prefix check); `demangle_or_raw()` always returns a record (raw passthrough on parse failure). - Heuristic split of the formatted demangled string into structured fields `(namespace_path, class_name, method_name, params_signature)`. Top-level paren / template-bracket aware, so `a::b<c::d>::e` and signatures with templated arg types parse correctly. - DB: new `demangled_names(address, mangled, raw_demangled, namespace_path, class_name, method_name, params_signature)` with indices on address / class_name / method_name. Populated from any label whose name starts with `?` plus any import name that happens to be mangled. For Sylpheed (a fully stripped binary) this table is empty out-of-the-box; the layer's value lands in M3, which will append rows for every RTTI TypeDescriptor name found in `.rdata`. Tests 610→617 (+7 demangler unit tests covering early-out, raw fallback, member function form, RTTI form, qname split, paren-template safety, and top-level `::` splitting). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -14,3 +14,4 @@ anyhow = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
metrics = { workspace = true }
|
||||
duckdb = { workspace = true }
|
||||
msvc-demangler = "0.11"
|
||||
|
||||
@@ -71,13 +71,36 @@ SELECT name FROM functions WHERE address = 2186674160; -- 0x824D29F0
|
||||
|
||||
---
|
||||
|
||||
## Layer M2 — MSVC C++ name demangler (planned)
|
||||
## Layer M2 — MSVC C++ name demangler (landed)
|
||||
|
||||
Adds `demangled_names(address, mangled, namespace_path, class_name,
|
||||
method_name, params_signature, raw_demangled)`. Populates from any label /
|
||||
import / RTTI string starting with `?`. Falls back to `raw_demangled = mangled`
|
||||
when the parser cannot decode (e.g. exotic templates). See
|
||||
`crates/xenia-analysis/src/demangle.rs` (when landed).
|
||||
### Schema additions
|
||||
- New table `demangled_names(address BIGINT NULL, mangled VARCHAR NOT NULL,
|
||||
raw_demangled VARCHAR NOT NULL, namespace_path VARCHAR NULL,
|
||||
class_name VARCHAR NULL, method_name VARCHAR NULL,
|
||||
params_signature VARCHAR NULL)`.
|
||||
- Indices on `address`, `class_name`, `method_name`.
|
||||
|
||||
### What this layer does
|
||||
- Wraps `msvc_demangler::demangle` (a Rust port of LLVM's
|
||||
`MicrosoftDemangle.cpp`) and splits the formatted output into structured
|
||||
fields via a heuristic top-level parser (handles templates and nested parens
|
||||
correctly).
|
||||
- Populates `demangled_names` from any label whose name starts with `?` plus
|
||||
any import name that happens to be mangled (defensive — typical kernel
|
||||
imports use C names).
|
||||
|
||||
### What this layer does NOT do
|
||||
- Does not parse the AST returned by `msvc_demangler::parse` — uses the formatted
|
||||
string and a heuristic split. Adequate for typical class member functions
|
||||
and RTTI strings; exotic template / lambda forms still get `raw_demangled`
|
||||
populated but may have NULL structured fields.
|
||||
- Does not yet ingest RTTI strings discovered in `.rdata` — that's M3's job;
|
||||
M3 will append rows to this table at the addresses where it finds RTTI
|
||||
TypeDescriptors.
|
||||
|
||||
### Reference docs
|
||||
- `msvc-demangler` crate (`https://docs.rs/msvc-demangler/0.11`).
|
||||
- LLVM `MicrosoftDemangle.cpp` (the parser this crate ports).
|
||||
|
||||
## Layer M3 — Vtable + RTTI detection (planned)
|
||||
|
||||
|
||||
@@ -339,6 +339,16 @@ impl DbWriter {
|
||||
kind VARCHAR NOT NULL -- function | import | saverestore | local | data | other
|
||||
);
|
||||
|
||||
CREATE TABLE demangled_names (
|
||||
address BIGINT, -- VA the mangled name is associated with; NULL when from a non-address source (e.g. RTTI-only string)
|
||||
mangled VARCHAR NOT NULL, -- original mangled symbol (e.g. ?Foo@Bar@@QEAAXXZ)
|
||||
raw_demangled VARCHAR NOT NULL, -- LLVM-style demangled output (or mangled string on parse failure)
|
||||
namespace_path VARCHAR, -- e.g. xe::apu (NULL = global / parser failure)
|
||||
class_name VARCHAR, -- e.g. AudioSystem (NULL = free function / parser failure)
|
||||
method_name VARCHAR, -- e.g. Setup (NULL on parser failure)
|
||||
params_signature VARCHAR -- contents of the outermost (...) (NULL = not a function)
|
||||
);
|
||||
|
||||
CREATE TABLE xrefs (
|
||||
source BIGINT NOT NULL, -- VA of the referencing instruction
|
||||
target BIGINT NOT NULL, -- VA of the referenced destination
|
||||
@@ -353,6 +363,7 @@ impl DbWriter {
|
||||
insert_functions(&self.conn, func_analysis, labels)?;
|
||||
insert_pdata_entries(&self.conn, &func_analysis.pdata_entries)?;
|
||||
insert_labels(&self.conn, labels)?;
|
||||
insert_demangled_from_labels(&self.conn, labels, info.import_libraries)?;
|
||||
insert_xrefs_streaming(&self.conn, xrefs, pe, info.image_base, func_analysis, labels)?;
|
||||
|
||||
let indices = [
|
||||
@@ -360,6 +371,9 @@ impl DbWriter {
|
||||
("idx_functions_pdata_validated", "CREATE INDEX idx_functions_pdata_validated ON functions(pdata_validated)"),
|
||||
("idx_labels_kind", "CREATE INDEX idx_labels_kind ON labels(kind)"),
|
||||
("idx_labels_name", "CREATE INDEX idx_labels_name ON labels(name)"),
|
||||
("idx_demangled_address", "CREATE INDEX idx_demangled_address ON demangled_names(address)"),
|
||||
("idx_demangled_class", "CREATE INDEX idx_demangled_class ON demangled_names(class_name)"),
|
||||
("idx_demangled_method", "CREATE INDEX idx_demangled_method ON demangled_names(method_name)"),
|
||||
("idx_xrefs_target", "CREATE INDEX idx_xrefs_target ON xrefs(target)"),
|
||||
("idx_xrefs_source", "CREATE INDEX idx_xrefs_source ON xrefs(source)"),
|
||||
("idx_xrefs_source_func", "CREATE INDEX idx_xrefs_source_func ON xrefs(source_func)"),
|
||||
@@ -716,6 +730,69 @@ fn insert_functions(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_demangled_from_labels(
|
||||
conn: &Connection,
|
||||
labels: &HashMap<u32, String>,
|
||||
import_libraries: &[xenia_xex::header::ImportLibrary],
|
||||
) -> anyhow::Result<()> {
|
||||
let mut stmt = conn.prepare(
|
||||
"INSERT INTO demangled_names
|
||||
(address, mangled, raw_demangled, namespace_path, class_name,
|
||||
method_name, params_signature)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)"
|
||||
)?;
|
||||
|
||||
let mut count = 0u64;
|
||||
|
||||
for (&addr, name) in labels {
|
||||
// The label table holds raw symbol names (`?...@...`). Imports come
|
||||
// wrapped as `__imp_<lib>_<sym>`; strip the `__imp_<lib>_` prefix to
|
||||
// recover any mangled inner name (rare for kernel imports but
|
||||
// defensive). For now, skip imports entirely — they're handled below
|
||||
// via `import_libraries`.
|
||||
if name.starts_with("__imp_") {
|
||||
continue;
|
||||
}
|
||||
if let Some(d) = crate::demangle::demangle(name) {
|
||||
stmt.execute(params![
|
||||
addr as i64,
|
||||
d.mangled,
|
||||
d.raw_demangled,
|
||||
d.namespace_path,
|
||||
d.class_name,
|
||||
d.method_name,
|
||||
d.params_signature,
|
||||
])?;
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Defensive: also demangle any import name that happens to be mangled.
|
||||
for lib in import_libraries {
|
||||
for imp in &lib.imports {
|
||||
let resolved = crate::resolve_ordinal(&lib.name, imp.ordinal);
|
||||
if let Some(name) = resolved
|
||||
&& let Some(d) = crate::demangle::demangle(name)
|
||||
{
|
||||
stmt.execute(params![
|
||||
imp.address as i64,
|
||||
d.mangled,
|
||||
d.raw_demangled,
|
||||
d.namespace_path,
|
||||
d.class_name,
|
||||
d.method_name,
|
||||
d.params_signature,
|
||||
])?;
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
metrics::counter!("db.rows", "table" => "demangled_names").increment(count);
|
||||
tracing::info!(rows = count, table = "demangled_names", "demangler complete");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_pdata_entries(
|
||||
conn: &Connection,
|
||||
entries: &[xenia_xex::pdata::PdataEntry],
|
||||
|
||||
277
crates/xenia-analysis/src/demangle.rs
Normal file
277
crates/xenia-analysis/src/demangle.rs
Normal file
@@ -0,0 +1,277 @@
|
||||
//! MSVC C++ name demangling for Xbox 360 binaries.
|
||||
//!
|
||||
//! Wraps [`msvc_demangler::demangle`] (a Rust port of LLVM's
|
||||
//! `MicrosoftDemangle.cpp`) and splits the resulting human-readable string
|
||||
//! into structured fields (namespace path, class name, method name, params
|
||||
//! signature) for storage in the `demangled_names` DB table.
|
||||
//!
|
||||
//! The structured split is heuristic — it operates on the formatted output,
|
||||
//! not the parsed AST. This is good enough for typical RTTI strings of the
|
||||
//! form `?AVClassName@Namespace@@` and standard member functions; exotic
|
||||
//! template / lambda forms degrade gracefully (the structured fields end up
|
||||
//! `None` while `raw_demangled` retains the full LLVM-style output).
|
||||
//!
|
||||
//! Reference: <https://docs.rs/msvc-demangler> (LLVM `MicrosoftDemangle.cpp` port).
|
||||
|
||||
use msvc_demangler::DemangleFlags;
|
||||
|
||||
/// One demangled MSVC symbol, broken into the columns stored in the
/// `demangled_names` table.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Demangled {
    /// The mangled input exactly as it was given to the demangler.
    pub mangled: String,
    /// The complete LLVM-style rendering of the symbol, for example
    /// `xe::apu::AudioSystem::Setup(void)`.
    pub raw_demangled: String,
    /// Namespaces in front of the class, joined with `::` (for example
    /// `xe::apu`). `None` for symbols at global scope.
    pub namespace_path: Option<String>,
    /// Owning class of a member function (for example `AudioSystem`).
    /// `None` for free functions.
    pub class_name: Option<String>,
    /// Name of the method or free function (for example `Setup`). `None`
    /// when the heuristic could not isolate a name in the demangled string.
    pub method_name: Option<String>,
    /// Text between the outermost parentheses, without the parentheses
    /// themselves (for example `void` or `int, char *`). `None` when the
    /// symbol is not a function or no `(...)` was found.
    pub params_signature: Option<String>,
}
|
||||
|
||||
/// Demangle one mangled MSVC C++ symbol. Returns `None` if the input does not
|
||||
/// start with `?` (early-out for non-mangled names) OR if the underlying
|
||||
/// demangler fails to parse it. Callers that want a "best effort" record
|
||||
/// (NULL fields + raw=mangled) should use [`demangle_or_raw`] instead.
|
||||
pub fn demangle(mangled: &str) -> Option<Demangled> {
|
||||
if !mangled.starts_with('?') {
|
||||
return None;
|
||||
}
|
||||
let raw = msvc_demangler::demangle(mangled, DemangleFlags::llvm()).ok()?;
|
||||
Some(split_structured(mangled.to_string(), raw))
|
||||
}
|
||||
|
||||
/// Demangle, or fall back to a record that just carries the original mangled
|
||||
/// string in `raw_demangled` and leaves all structured fields `None`. Useful
|
||||
/// for DB insert paths that want one row per mangled input regardless of
|
||||
/// parser success.
|
||||
pub fn demangle_or_raw(mangled: &str) -> Demangled {
|
||||
if let Some(d) = demangle(mangled) {
|
||||
return d;
|
||||
}
|
||||
Demangled {
|
||||
mangled: mangled.to_string(),
|
||||
raw_demangled: mangled.to_string(),
|
||||
namespace_path: None,
|
||||
class_name: None,
|
||||
method_name: None,
|
||||
params_signature: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Split a fully-formatted demangled string into structured fields.
|
||||
///
|
||||
/// Strategy:
|
||||
/// 1. Find the first un-nested `(` — everything before it is the qualified
|
||||
/// name; everything inside the matching parens is `params_signature`.
|
||||
/// 2. Strip leading return-type tokens before the qualified name (everything
|
||||
/// up to the LAST whitespace not inside `<...>` or `(...)` brackets).
|
||||
/// 3. Split the qualified name on `::` (top-level only) — last segment is
|
||||
/// `method_name`, second-to-last is `class_name`, the rest joined back
|
||||
/// with `::` is `namespace_path`.
|
||||
fn split_structured(mangled: String, raw: String) -> Demangled {
|
||||
let raw_view = raw.as_str();
|
||||
|
||||
let (qualified_name, params) = match find_paren_split(raw_view) {
|
||||
Some((before, inside)) => (before.trim_end().to_string(), Some(inside.to_string())),
|
||||
None => (raw_view.to_string(), None),
|
||||
};
|
||||
|
||||
// Drop any return-type prefix: keep everything after the last top-level
|
||||
// whitespace boundary (where "top-level" means depth-0 in <...>/(...)).
|
||||
let qname_clean = strip_return_type_prefix(&qualified_name);
|
||||
|
||||
let (namespace_path, class_name, method_name) = split_qname(&qname_clean);
|
||||
|
||||
Demangled {
|
||||
mangled,
|
||||
raw_demangled: raw,
|
||||
namespace_path,
|
||||
class_name,
|
||||
method_name,
|
||||
params_signature: params,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `(text_before_paren, text_inside_outer_parens)` for the first
/// top-level `(` in `s`, i.e. the first `(` that is not nested inside a
/// `<...>` template argument list.
///
/// Operator names are treated as part of the name rather than as brackets:
/// the `<` / `>` in `operator<`, `operator<<=`, `operator->` etc. do not
/// open or close a template list, and the `()` of `operator()` is not taken
/// for the parameter list.
///
/// Returns `None` when no top-level `(` exists or when the first one has no
/// matching `)`.
fn find_paren_split(s: &str) -> Option<(&str, &str)> {
    let bytes = s.as_bytes();
    let mut depth_angle: i32 = 0;
    let mut i = 0usize;
    while i < bytes.len() {
        if depth_angle == 0 {
            // Step over `operator<sym>` as one unit so its punctuation is
            // never interpreted as bracket structure.
            let skip = operator_token_len(bytes, i);
            if skip > 0 {
                i += skip;
                continue;
            }
        }
        match bytes[i] {
            b'<' => depth_angle += 1,
            // A stray `>` at depth 0 (e.g. the tail of `->`) is ignored.
            b'>' if depth_angle > 0 => depth_angle -= 1,
            b'(' if depth_angle == 0 => {
                let close = matching_close_paren(bytes, i)?;
                // `(` and `)` are ASCII, so both indices are char boundaries.
                return Some((&s[..i], &s[i + 1..close]));
            }
            _ => {}
        }
        i += 1;
    }
    None
}

/// Length in bytes of an `operator` keyword plus its bracket-like symbol
/// suffix starting at `at`, or 0 when there is no such token there.
///
/// Only suffixes that could confuse bracket matching are recognised: the
/// exact pair `()` and runs made of `<`, `>`, `=`, `-`, `*`. The keyword must
/// start at an identifier boundary, so `my_operator` does not match.
fn operator_token_len(bytes: &[u8], at: usize) -> usize {
    const KEYWORD: &[u8] = b"operator";
    let is_ident = |b: u8| b.is_ascii_alphanumeric() || b == b'_' || b >= 0x80;

    if !bytes[at..].starts_with(KEYWORD) {
        return 0;
    }
    if at > 0 && is_ident(bytes[at - 1]) {
        return 0;
    }
    let rest = &bytes[at + KEYWORD.len()..];
    if rest.starts_with(b"()") {
        return KEYWORD.len() + 2;
    }
    let symbol_len = rest
        .iter()
        .take_while(|&&b| matches!(b, b'<' | b'>' | b'=' | b'-' | b'*'))
        .count();
    if symbol_len == 0 {
        0
    } else {
        KEYWORD.len() + symbol_len
    }
}

/// Index of the `)` that closes the `(` at `open`, honouring nested
/// parentheses. `None` when the parenthesis is never closed.
fn matching_close_paren(bytes: &[u8], open: usize) -> Option<usize> {
    let mut depth = 0i32;
    for (j, &b) in bytes.iter().enumerate().skip(open) {
        match b {
            b'(' => depth += 1,
            b')' => {
                depth -= 1;
                if depth == 0 {
                    return Some(j);
                }
            }
            _ => {}
        }
    }
    None
}
|
||||
|
||||
/// Strip the leading return-type / access / calling-convention tokens from
/// the text in front of a parameter list, leaving the qualified name.
///
/// Normally this keeps everything after the last top-level space, where
/// "top-level" means outside any `<...>` or `(...)`:
/// `void __cdecl Foo::Bar` → `Foo::Bar`.
///
/// Operator names that themselves contain a space are kept whole: once a
/// top-level token is the keyword `operator` (bare or qualified, e.g.
/// `Foo::operator`), the name is taken to start at that token, so
/// `void * __cdecl operator new` → `operator new` and
/// `__cdecl Foo::operator int` → `Foo::operator int`.
///
/// A string with no top-level space is returned unchanged.
fn strip_return_type_prefix(s: &str) -> String {
    let is_operator_keyword =
        |token: &str| token == "operator" || token.ends_with("::operator");

    let mut depth_angle: i32 = 0;
    let mut depth_paren: i32 = 0;
    // Byte offset where the current top-level token begins.
    let mut token_start = 0usize;

    for (i, &b) in s.as_bytes().iter().enumerate() {
        match b {
            b'<' => depth_angle += 1,
            b'>' if depth_angle > 0 => depth_angle -= 1,
            b'(' => depth_paren += 1,
            b')' if depth_paren > 0 => depth_paren -= 1,
            b' ' if depth_angle == 0 && depth_paren == 0 => {
                // `token_start` is 0 or follows an ASCII space and `i` is an
                // ASCII space, so both are valid char boundaries.
                if is_operator_keyword(&s[token_start..i]) {
                    // The rest of the string is the operator's own name.
                    return s[token_start..].to_string();
                }
                token_start = i + 1;
            }
            _ => {}
        }
    }
    s[token_start..].to_string()
}
|
||||
|
||||
/// Split a fully-qualified name on top-level `::` and tag the parts.
|
||||
fn split_qname(qname: &str) -> (Option<String>, Option<String>, Option<String>) {
|
||||
if qname.is_empty() {
|
||||
return (None, None, None);
|
||||
}
|
||||
let parts = top_level_split_colon_colon(qname);
|
||||
match parts.len() {
|
||||
0 => (None, None, None),
|
||||
1 => (None, None, Some(parts[0].clone())),
|
||||
2 => (None, Some(parts[0].clone()), Some(parts[1].clone())),
|
||||
_ => {
|
||||
let n = parts.len();
|
||||
let method = parts[n - 1].clone();
|
||||
let class = parts[n - 2].clone();
|
||||
let ns = parts[..n - 2].join("::");
|
||||
(Some(ns), Some(class), Some(method))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Cut `s` at every `::` that sits outside `<...>` and `(...)`, returning the
/// non-empty pieces in order. A `::` nested inside template arguments or a
/// parameter list stays part of its piece.
fn top_level_split_colon_colon(s: &str) -> Vec<String> {
    let raw = s.as_bytes();
    let (mut angles, mut parens) = (0i32, 0i32);
    let mut pieces: Vec<String> = Vec::new();
    let mut piece_begin = 0usize;
    // Set right after a separator is recognised so its second `:` is not
    // examined again.
    let mut skip_second_colon = false;

    for (pos, &ch) in raw.iter().enumerate() {
        if skip_second_colon {
            skip_second_colon = false;
            continue;
        }
        match ch {
            b'<' => angles += 1,
            b'>' => {
                if angles > 0 {
                    angles -= 1;
                }
            }
            b'(' => parens += 1,
            b')' => {
                if parens > 0 {
                    parens -= 1;
                }
            }
            b':' => {
                let at_top_level = angles == 0 && parens == 0;
                if at_top_level && raw.get(pos + 1) == Some(&b':') {
                    // Empty pieces (leading or doubled separators) are dropped.
                    if pos > piece_begin {
                        pieces.push(s[piece_begin..pos].to_string());
                    }
                    piece_begin = pos + 2;
                    skip_second_colon = true;
                }
            }
            _ => {}
        }
    }

    if piece_begin < s.len() {
        pieces.push(s[piece_begin..].to_string());
    }
    pieces
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Names without the `?` prefix never reach the demangler.
    #[test]
    fn early_out_on_non_mangled() {
        for plain in ["plain_c_name", "Foo::Bar"] {
            assert!(demangle(plain).is_none());
        }
    }

    /// The fallback record echoes the input and leaves the structure empty.
    #[test]
    fn demangle_or_raw_records_failures() {
        let record = demangle_or_raw("not_mangled");
        assert_eq!(record.mangled, "not_mangled");
        assert_eq!(record.raw_demangled, "not_mangled");
        assert!(record.method_name.is_none());
    }

    /// `?Setup@AudioSystem@apu@xe@@QEAAXXZ` is a public member function
    /// `xe::apu::AudioSystem::Setup(void)`.
    #[test]
    fn simple_member_function() {
        let record = demangle("?Setup@AudioSystem@apu@xe@@QEAAXXZ").expect("should parse");
        assert_eq!(record.method_name.as_deref(), Some("Setup"));
        assert_eq!(record.class_name.as_deref(), Some("AudioSystem"));
        assert_eq!(record.namespace_path.as_deref(), Some("xe::apu"));
        assert_eq!(record.params_signature.as_deref(), Some("void"));
    }

    /// RTTI TypeDescriptor names look like `.?AVClassName@@`; the leading `.`
    /// is expected to be stripped by the caller (M3). Here we only confirm
    /// the demangler accepts the minimal `?AV…` class form.
    #[test]
    fn rtti_type_descriptor_string() {
        let record = demangle("?AVAudioSystem@apu@xe@@").expect("should parse");
        assert!(
            record.raw_demangled.contains("AudioSystem"),
            "raw='{}'",
            record.raw_demangled
        );
    }

    /// Everything in front of the class is folded into the namespace path.
    #[test]
    fn split_qname_handles_namespace_chain() {
        let (namespace, class, method) = split_qname("a::b::c::Klass::method");
        assert_eq!(namespace.as_deref(), Some("a::b::c"));
        assert_eq!(class.as_deref(), Some("Klass"));
        assert_eq!(method.as_deref(), Some("method"));
    }

    /// Templates inside the param list must not confuse paren matching.
    #[test]
    fn paren_split_handles_template_in_args() {
        let formatted = "void __cdecl Foo::Bar(std::vector<int>, std::map<a, b>)";
        let (head, args) = find_paren_split(formatted).expect("paren found");
        assert_eq!(head, "void __cdecl Foo::Bar");
        assert_eq!(args, "std::vector<int>, std::map<a, b>");
    }

    /// A `::` nested in template arguments is not a separator.
    #[test]
    fn double_colon_inside_template_not_split() {
        let pieces = top_level_split_colon_colon("a::b<c::d>::e");
        assert_eq!(pieces, vec!["a", "b<c::d>", "e"]);
    }
}
|
||||
@@ -6,6 +6,7 @@ pub mod disasm;
|
||||
pub mod formatter;
|
||||
pub mod sinks;
|
||||
pub mod sql_views;
|
||||
pub mod demangle;
|
||||
|
||||
mod ordinals;
|
||||
pub use ordinals::resolve_ordinal;
|
||||
|
||||
@@ -172,6 +172,15 @@ fn db_schema_matches_expected_columns() {
|
||||
("name", "VARCHAR"),
|
||||
("kind", "VARCHAR"),
|
||||
]),
|
||||
("demangled_names", &[
|
||||
("address", "BIGINT"),
|
||||
("mangled", "VARCHAR"),
|
||||
("raw_demangled", "VARCHAR"),
|
||||
("namespace_path", "VARCHAR"),
|
||||
("class_name", "VARCHAR"),
|
||||
("method_name", "VARCHAR"),
|
||||
("params_signature", "VARCHAR"),
|
||||
]),
|
||||
("xrefs", &[
|
||||
("source", "BIGINT"),
|
||||
("target", "BIGINT"),
|
||||
|
||||
Reference in New Issue
Block a user