xenia-analysis: unify disasm via xenia-cpu, split ingest/analyze, add sinks
The old src/ppc.rs that re-implemented PPC formatting collapses into a 30-line shim that delegates to xenia-cpu's single-source-of-truth disasm. A new disasm.rs wraps the shared iterator and feeds enriched items (analysis context: function membership, xrefs, mnemonics) into pluggable sinks. Sinks split: text.rs (objdump-like output), json.rs (JSONL stream matching the new xenia dis --json mode), duckdb.rs (the analysis DB ingest). db.rs is restructured into ingest_instructions + write_analysis_results so a run can stop after raw ingest, and a new target_hex column lands on the instructions table. sql_views.rs adds five additive views layered on top of the raw tables. Tests: assert-based JSON-fixture goldens (disasm_goldens) and a PRAGMA-table_info schema golden (db_schema_golden) covering all ingested tables and the SQL views. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
//! SQLite database writer for xenia-rs.
|
||||
//! DuckDB writer for xenia-rs.
|
||||
//!
|
||||
//! Layered, streaming writes shared by `extract`, `dis`, and `exec`.
|
||||
//! Each command's output is a superset of the previous:
|
||||
@@ -6,19 +6,119 @@
|
||||
//! - `dis --db` -> base + disasm tables (functions, labels, instructions, xrefs)
|
||||
//! - `exec --db` -> base + disasm + opt-in trace tables (exec_trace, import_calls, branch_trace)
|
||||
//!
|
||||
//! Performance: streaming commits every 100k rows, no end-of-run ANALYZE,
|
||||
//! progress messages before each index build.
|
||||
//! Bulk inserts use the DuckDB Appender API, which bypasses the SQL layer and
|
||||
//! writes directly to columnar storage — no transaction batching required.
|
||||
//!
|
||||
//! Trace kind values for `branch_trace.kind`:
|
||||
//! - "call" : any branch with LK set (raw & 1 == 1)
|
||||
//! - "return" : bclrx without LK
|
||||
//! - "jump" : bcctrx without LK
|
||||
//! - "branch" : bx/bcx without LK
|
||||
//! - `"call"` : any branch with LK set (raw & 1 == 1)
|
||||
//! - `"return"` : bclrx without LK
|
||||
//! - `"jump"` : bcctrx without LK
|
||||
//! - `"branch"` : bx/bcx without LK
|
||||
//!
|
||||
//! # Schema
|
||||
//!
|
||||
//! ## `metadata`
|
||||
//! Key-value table. One row per XEX header field. Values are strings.
|
||||
//!
|
||||
//! | key | value format | meaning |
|
||||
//! |--------------------|------------------|----------------------------------------------------|
|
||||
//! | `image_base` | `"0xXXXXXXXX"` | Virtual address where the PE image is mapped |
|
||||
//! | `entry_point` | `"0xXXXXXXXX"` | Absolute VA of the XEX entry point |
|
||||
//! | `original_pe_name` | string | Original PE filename from XEX optional headers |
|
||||
//! | `title_id` | `"0xXXXXXXXX"` | Xbox 360 Title ID (identifies the game) |
|
||||
//! | `media_id` | `"0xXXXXXXXX"` | Disc/media ID (identifies the specific disc build) |
|
||||
//!
|
||||
//! ## `sections`
|
||||
//! One row per PE section (`.text`, `.data`, etc.).
|
||||
//! - `name` — PE section name
|
||||
//! - `virtual_address` — RVA relative to `image_base` where the section is mapped in memory
|
||||
//! - `virtual_size` — Size in memory; may exceed `raw_size` due to BSS zero-fill
|
||||
//! - `raw_offset` — Byte offset of section data within the XEX/PE file
|
||||
//! - `raw_size` — Size of section data on disk
|
||||
//! - `flags` — `IMAGE_SCN_*` characteristics bit field
|
||||
//! - `is_code` — `true` if `IMAGE_SCN_CNT_CODE` is set
|
||||
//!
|
||||
//! ## `imports`
|
||||
//! One row per import record from the XEX import descriptor table.
|
||||
//! - `library` — Module name (e.g. `xboxkrnl.exe`, `xam.xex`)
|
||||
//! - `ordinal` — Numeric ordinal identifying the export within the library
|
||||
//! - `name` — Resolved human-readable symbol name; `NULL` if not in symbol table
|
||||
//! - `record_type` — XEX import record type: `0` = function thunk, `1` = variable
|
||||
//! - `address` — Absolute VA of the import thunk or variable in the binary
|
||||
//!
|
||||
//! ## `functions`
|
||||
//! One row per detected function (from prologue analysis).
|
||||
//! - `address` — Absolute VA of the function entry point (PK)
|
||||
//! - `name` — Symbol name, or `sub_XXXXXXXX` if unresolved
|
||||
//! - `end_address` — Absolute VA of last instruction + 4 (exclusive end)
|
||||
//! - `frame_size` — Stack frame size in bytes (from prologue)
|
||||
//! - `saved_gprs` — Bitmask of GPRs saved in prologue (bit N set ⇒ rN is saved)
|
||||
//! - `is_leaf` — `true` if the function has no outgoing calls (no branch-and-link instructions such as `bl`/`bctrl`; a plain `blr` is a return, not a call)
|
||||
//! - `is_saverestore` — `true` if this is a `__savegprlr_*`/`__restgprlr_*` compiler stub
|
||||
//!
|
||||
//! ## `labels`
|
||||
//! One row per named address; superset of functions.
|
||||
//! - `address` — Absolute VA (PK)
|
||||
//! - `name` — Symbol name
|
||||
//! - `kind` — One of: `function`, `import`, `saverestore`, `local`, `data`, `other`
|
||||
//!
|
||||
//! ## `instructions`
|
||||
//! One row per disassembled instruction.
|
||||
//! - `address` — Absolute VA (PK)
|
||||
//! - `raw` — 4-byte big-endian instruction word as integer
|
||||
//! - `mnemonic` — Base mnemonic (e.g. `stw`, `bl`, `cmpwi`)
|
||||
//! - `operands` — Operand string from base disassembly
|
||||
//! - `disasm` — Full base disassembly string (`mnemonic + " " + operands`)
|
||||
//! - `ext_mnemonic` — Simplified mnemonic (e.g. `mr` for `or rX,rY,rY`); `NULL` if none
|
||||
//! - `ext_operands` — Operands for the extended form; `NULL` if none
|
||||
//! - `ext_disasm` — Full extended disassembly string; `NULL` if none
|
||||
//! - `target_hex` — Resolved absolute branch target for `b`/`bc` (and link/AA variants); `NULL` for indirect or non-branch instructions. SQL views (`v_branch_xrefs`) self-join on this column.
|
||||
//! - `section` — Name of the PE section containing this instruction
|
||||
//! - `function` — VA of the enclosing function; `NULL` if not inside a detected function
|
||||
//! - `label` — Label name at this address; `NULL` if none
|
||||
//!
|
||||
//! ## `xrefs`
|
||||
//! One row per cross-reference edge (call, jump, data access).
|
||||
//! - `source` — Absolute VA of the instruction making the reference
|
||||
//! - `target` — Absolute VA of the referenced destination
|
||||
//! - `kind` — Reference type as the short tag from [`crate::xref::XrefKind::tag`]:
|
||||
//! `call`, `j` (jump), `br` (branch), `read` (data_read),
|
||||
//! `write` (data_write), `ref` (data_ref).
|
||||
//! Note: this is a different convention from `branch_trace.kind`,
|
||||
//! which uses the long names (`call` / `return` / `jump` / `branch`).
|
||||
//! - `instruction` — Mnemonic of the source instruction; `NULL` if address is not in binary
|
||||
//! - `source_func` — VA of the function containing `source`; `NULL` if unknown
|
||||
//! - `source_label` — Label at `source`; `NULL` if none
|
||||
//! - `target_label` — Label at `target`; `NULL` if none
|
||||
//!
|
||||
//! ## `exec_trace` *(opt-in: `--trace-instructions`)*
|
||||
//! One row per executed instruction.
|
||||
//! - `address` — Absolute VA of the instruction
|
||||
//! - `cycle` — Monotonic instruction counter (execution order)
|
||||
//! - `r3`, `r4`, `lr`, `sp` — Snapshot of key GPRs at time of execution
|
||||
//!
|
||||
//! ## `import_calls` *(opt-in: `--trace-imports`)*
|
||||
//! One row per intercepted kernel/import call.
|
||||
//! - `address` — VA of the import thunk
|
||||
//! - `cycle` — Instruction counter at point of interception
|
||||
//! - `module` — Library name (e.g. `xboxkrnl.exe`)
|
||||
//! - `ordinal` — Numeric ordinal within the module
|
||||
//! - `name` — Resolved symbol name
|
||||
//! - `arg_r3`–`arg_r6` — First four call arguments (PowerPC ABI: r3–r6)
|
||||
//! - `return_value` — Value in r3 after the call returns
|
||||
//!
|
||||
//! ## `branch_trace` *(opt-in: `--trace-branches`)*
|
||||
//! One row per taken branch.
|
||||
//! - `cycle` — Instruction counter
|
||||
//! - `source` — VA of the branch instruction
|
||||
//! - `target` — VA of the branch destination
|
||||
//! - `kind` — `call`, `return`, `jump`, or `branch` (see top-level doc)
|
||||
//! - `lr` — Link register value at time of branch
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
use rusqlite::{Connection, params};
|
||||
use duckdb::{Connection, params};
|
||||
|
||||
use crate::func::FuncAnalysis;
|
||||
use crate::xref::{XrefMap, resolve_source_label};
|
||||
@@ -26,12 +126,9 @@ use crate::formatter::DisasmInfo;
|
||||
|
||||
const DEFAULT_BATCH_SIZE: u64 = 100_000;
|
||||
|
||||
/// Number of rows per DB commit / trace buffer flush.
|
||||
/// Configurable via the `XENIA_DB_BATCH_SIZE` env var (default 100_000).
|
||||
/// Used for:
|
||||
/// - `instructions` and `xrefs` streaming commits in `write_disasm`
|
||||
/// - `exec_trace` and `branch_trace` buffer thresholds during exec
|
||||
/// (`import_calls` always flushes at 1000 — low volume, not worth scaling.)
|
||||
/// Rows per trace buffer flush. Configurable via `XENIA_DB_BATCH_SIZE` env var (default 100_000).
|
||||
/// Applies to `exec_trace` and `branch_trace` buffer thresholds.
|
||||
/// `import_calls` always flushes at 1000 — low volume, not worth scaling.
|
||||
fn batch_size() -> u64 {
|
||||
use std::sync::OnceLock;
|
||||
static CACHED: OnceLock<u64> = OnceLock::new();
|
||||
@@ -94,12 +191,6 @@ impl DbWriter {
|
||||
std::fs::remove_file(path)?;
|
||||
}
|
||||
let conn = Connection::open(path)?;
|
||||
conn.execute_batch("
|
||||
PRAGMA journal_mode = OFF;
|
||||
PRAGMA synchronous = OFF;
|
||||
PRAGMA locking_mode = EXCLUSIVE;
|
||||
PRAGMA temp_store = MEMORY;
|
||||
")?;
|
||||
let cap = batch_size() as usize;
|
||||
Ok(Self {
|
||||
conn,
|
||||
@@ -118,29 +209,30 @@ impl DbWriter {
|
||||
// ── Base layer (written by extract/dis/exec) ─────────────────────────────
|
||||
|
||||
/// Write metadata, sections, imports tables and their indices.
|
||||
#[tracing::instrument(skip_all, name = "db.write_base")]
|
||||
pub fn write_base(&mut self, info: &DisasmInfo) -> anyhow::Result<()> {
|
||||
self.conn.execute_batch("
|
||||
CREATE TABLE metadata (
|
||||
key TEXT PRIMARY KEY,
|
||||
value TEXT NOT NULL
|
||||
key VARCHAR PRIMARY KEY, -- header field name
|
||||
value VARCHAR NOT NULL -- hex-formatted or plain string value
|
||||
);
|
||||
|
||||
CREATE TABLE sections (
|
||||
name TEXT NOT NULL,
|
||||
virtual_address INTEGER NOT NULL,
|
||||
virtual_size INTEGER NOT NULL,
|
||||
raw_offset INTEGER NOT NULL,
|
||||
raw_size INTEGER NOT NULL,
|
||||
flags INTEGER NOT NULL,
|
||||
is_code BOOLEAN NOT NULL
|
||||
name VARCHAR NOT NULL, -- PE section name (e.g. .text, .rdata)
|
||||
virtual_address BIGINT NOT NULL, -- RVA relative to image_base
|
||||
virtual_size BIGINT NOT NULL, -- size in memory; may exceed raw_size (BSS)
|
||||
raw_offset BIGINT NOT NULL, -- byte offset of section data in the file
|
||||
raw_size BIGINT NOT NULL, -- size of section data on disk
|
||||
flags BIGINT NOT NULL, -- IMAGE_SCN_* characteristics bit field
|
||||
is_code BOOLEAN NOT NULL -- true if IMAGE_SCN_CNT_CODE is set
|
||||
);
|
||||
|
||||
CREATE TABLE imports (
|
||||
library TEXT NOT NULL,
|
||||
ordinal INTEGER NOT NULL,
|
||||
name TEXT,
|
||||
record_type INTEGER NOT NULL,
|
||||
address INTEGER NOT NULL
|
||||
library VARCHAR NOT NULL, -- module name (e.g. xboxkrnl.exe, xam.xex)
|
||||
ordinal BIGINT NOT NULL, -- ordinal identifying the export within the library
|
||||
name VARCHAR, -- resolved symbol name; NULL if not in symbol table
|
||||
record_type BIGINT NOT NULL, -- 0 = function thunk, 1 = variable
|
||||
address BIGINT NOT NULL -- absolute VA of the thunk or variable
|
||||
);
|
||||
")?;
|
||||
|
||||
@@ -150,15 +242,69 @@ impl DbWriter {
|
||||
|
||||
self.conn.execute_batch("
|
||||
CREATE INDEX idx_imports_library ON imports(library);
|
||||
CREATE INDEX idx_imports_name ON imports(name);
|
||||
CREATE INDEX idx_imports_name ON imports(name);
|
||||
")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ── Disasm layer (written by dis/exec) ───────────────────────────────────
|
||||
|
||||
/// Write functions, labels, instructions, xrefs tables and indices.
|
||||
pub fn write_disasm(
|
||||
/// Phase-3 ingest pass — purely mechanical disasm rows. Creates the
|
||||
/// `instructions` table (and its indices) and streams every code-section
|
||||
/// instruction through the iterator + DuckDB sink. Does NOT touch
|
||||
/// `functions` / `labels` / `xrefs` — that's [`Self::write_analysis_results`].
|
||||
///
|
||||
/// `func_analysis` and `labels` are still required at this layer because
|
||||
/// each row carries the rolling-window `function` and `label` columns for
|
||||
/// downstream queries.
|
||||
#[tracing::instrument(skip_all, name = "db.ingest_instructions")]
|
||||
pub fn ingest_instructions(
|
||||
&mut self,
|
||||
pe: &[u8],
|
||||
info: &DisasmInfo,
|
||||
func_analysis: &FuncAnalysis,
|
||||
labels: &HashMap<u32, String>,
|
||||
) -> anyhow::Result<()> {
|
||||
self.conn.execute_batch("
|
||||
CREATE TABLE instructions (
|
||||
address BIGINT PRIMARY KEY, -- absolute VA
|
||||
raw BIGINT NOT NULL, -- 4-byte big-endian instruction word as integer
|
||||
mnemonic VARCHAR NOT NULL, -- base mnemonic (e.g. stw, bl, cmpwi)
|
||||
operands VARCHAR NOT NULL, -- operand string from base disassembly
|
||||
disasm VARCHAR NOT NULL, -- full base disassembly (mnemonic + operands)
|
||||
ext_mnemonic VARCHAR, -- simplified mnemonic (e.g. mr); NULL if none
|
||||
ext_operands VARCHAR, -- operands for the extended form; NULL if none
|
||||
ext_disasm VARCHAR, -- full extended disassembly string; NULL if none
|
||||
target_hex BIGINT, -- resolved absolute target for direct branches; NULL for indirect/non-branch
|
||||
section VARCHAR NOT NULL, -- PE section name containing this instruction
|
||||
function BIGINT, -- VA of the enclosing function; NULL if unknown
|
||||
label VARCHAR -- label at this address; NULL if none
|
||||
);
|
||||
")?;
|
||||
|
||||
insert_instructions_streaming(&self.conn, pe, info, func_analysis, labels)?;
|
||||
|
||||
let indices = [
|
||||
("idx_instructions_function", "CREATE INDEX idx_instructions_function ON instructions(function)"),
|
||||
("idx_instructions_mnemonic", "CREATE INDEX idx_instructions_mnemonic ON instructions(mnemonic)"),
|
||||
("idx_instructions_ext_mnemonic", "CREATE INDEX idx_instructions_ext_mnemonic ON instructions(ext_mnemonic)"),
|
||||
("idx_instructions_section", "CREATE INDEX idx_instructions_section ON instructions(section)"),
|
||||
("idx_instructions_label", "CREATE INDEX idx_instructions_label ON instructions(label)"),
|
||||
("idx_instructions_target_hex", "CREATE INDEX idx_instructions_target_hex ON instructions(target_hex)"),
|
||||
];
|
||||
for (name, sql) in indices {
|
||||
tracing::debug!(index = name, "creating instructions index");
|
||||
self.conn.execute_batch(sql)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Phase-3 analyze pass — writes the Rust-pass-derived tables
|
||||
/// (`functions`, `labels`, `xrefs`) and their indices. Always executes
|
||||
/// in `--analyze=rust` and `--analyze=both` modes; skipped only when
|
||||
/// the caller deliberately chooses a Rust-free DB layout.
|
||||
#[tracing::instrument(skip_all, name = "db.write_analysis_results")]
|
||||
pub fn write_analysis_results(
|
||||
&mut self,
|
||||
pe: &[u8],
|
||||
info: &DisasmInfo,
|
||||
@@ -168,74 +314,111 @@ impl DbWriter {
|
||||
) -> anyhow::Result<()> {
|
||||
self.conn.execute_batch("
|
||||
CREATE TABLE functions (
|
||||
address INTEGER PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
end_address INTEGER NOT NULL,
|
||||
frame_size INTEGER NOT NULL,
|
||||
saved_gprs INTEGER NOT NULL,
|
||||
is_leaf BOOLEAN NOT NULL,
|
||||
is_saverestore BOOLEAN NOT NULL
|
||||
address BIGINT PRIMARY KEY, -- absolute VA of entry point
|
||||
name VARCHAR NOT NULL, -- symbol name, or sub_XXXXXXXX if unresolved
|
||||
end_address BIGINT NOT NULL, -- VA of last instruction + 4 (exclusive end)
|
||||
frame_size BIGINT NOT NULL, -- stack frame size in bytes (from prologue)
|
||||
saved_gprs BIGINT NOT NULL, -- bitmask of GPRs saved in prologue (bit N = rN)
|
||||
is_leaf BOOLEAN NOT NULL, -- true if the function has no outgoing calls
|
||||
is_saverestore BOOLEAN NOT NULL -- true if __savegprlr_* / __restgprlr_* stub
|
||||
);
|
||||
|
||||
CREATE TABLE labels (
|
||||
address INTEGER PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
kind TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE instructions (
|
||||
address INTEGER PRIMARY KEY,
|
||||
raw INTEGER NOT NULL,
|
||||
mnemonic TEXT NOT NULL,
|
||||
operands TEXT NOT NULL,
|
||||
disasm TEXT NOT NULL,
|
||||
ext_mnemonic TEXT,
|
||||
ext_operands TEXT,
|
||||
ext_disasm TEXT,
|
||||
section TEXT NOT NULL,
|
||||
function INTEGER,
|
||||
label TEXT
|
||||
address BIGINT PRIMARY KEY, -- absolute VA
|
||||
name VARCHAR NOT NULL, -- symbol name
|
||||
kind VARCHAR NOT NULL -- function | import | saverestore | local | data | other
|
||||
);
|
||||
|
||||
CREATE TABLE xrefs (
|
||||
source INTEGER NOT NULL,
|
||||
target INTEGER NOT NULL,
|
||||
kind TEXT NOT NULL,
|
||||
instruction TEXT,
|
||||
source_func INTEGER,
|
||||
source_label TEXT,
|
||||
target_label TEXT
|
||||
source BIGINT NOT NULL, -- VA of the referencing instruction
|
||||
target BIGINT NOT NULL, -- VA of the referenced destination
|
||||
kind VARCHAR NOT NULL, -- short tag: call | j | br | read | write | ref
|
||||
instruction VARCHAR, -- mnemonic of source instruction; NULL if not in binary
|
||||
source_func BIGINT, -- VA of the function containing source; NULL if unknown
|
||||
source_label VARCHAR, -- label at source; NULL if none
|
||||
target_label VARCHAR -- label at target; NULL if none
|
||||
);
|
||||
")?;
|
||||
|
||||
insert_functions(&self.conn, func_analysis, labels)?;
|
||||
insert_labels(&self.conn, labels)?;
|
||||
insert_instructions_streaming(&self.conn, pe, info, func_analysis, labels)?;
|
||||
insert_xrefs_streaming(&self.conn, xrefs, pe, info.image_base, func_analysis, labels)?;
|
||||
|
||||
let indices = [
|
||||
("idx_functions_name", "CREATE INDEX idx_functions_name ON functions(name)"),
|
||||
("idx_labels_kind", "CREATE INDEX idx_labels_kind ON labels(kind)"),
|
||||
("idx_labels_name", "CREATE INDEX idx_labels_name ON labels(name)"),
|
||||
("idx_instructions_function", "CREATE INDEX idx_instructions_function ON instructions(function)"),
|
||||
("idx_instructions_mnemonic", "CREATE INDEX idx_instructions_mnemonic ON instructions(mnemonic)"),
|
||||
("idx_instructions_ext_mnemonic","CREATE INDEX idx_instructions_ext_mnemonic ON instructions(ext_mnemonic)"),
|
||||
("idx_instructions_section", "CREATE INDEX idx_instructions_section ON instructions(section)"),
|
||||
("idx_instructions_label", "CREATE INDEX idx_instructions_label ON instructions(label)"),
|
||||
("idx_xrefs_target", "CREATE INDEX idx_xrefs_target ON xrefs(target)"),
|
||||
("idx_xrefs_source", "CREATE INDEX idx_xrefs_source ON xrefs(source)"),
|
||||
("idx_xrefs_source_func", "CREATE INDEX idx_xrefs_source_func ON xrefs(source_func)"),
|
||||
("idx_xrefs_kind", "CREATE INDEX idx_xrefs_kind ON xrefs(kind)"),
|
||||
("idx_xrefs_instruction", "CREATE INDEX idx_xrefs_instruction ON xrefs(instruction)"),
|
||||
("idx_xrefs_target_label", "CREATE INDEX idx_xrefs_target_label ON xrefs(target_label)"),
|
||||
("idx_functions_name", "CREATE INDEX idx_functions_name ON functions(name)"),
|
||||
("idx_labels_kind", "CREATE INDEX idx_labels_kind ON labels(kind)"),
|
||||
("idx_labels_name", "CREATE INDEX idx_labels_name ON labels(name)"),
|
||||
("idx_xrefs_target", "CREATE INDEX idx_xrefs_target ON xrefs(target)"),
|
||||
("idx_xrefs_source", "CREATE INDEX idx_xrefs_source ON xrefs(source)"),
|
||||
("idx_xrefs_source_func", "CREATE INDEX idx_xrefs_source_func ON xrefs(source_func)"),
|
||||
("idx_xrefs_kind", "CREATE INDEX idx_xrefs_kind ON xrefs(kind)"),
|
||||
("idx_xrefs_instruction", "CREATE INDEX idx_xrefs_instruction ON xrefs(instruction)"),
|
||||
("idx_xrefs_target_label", "CREATE INDEX idx_xrefs_target_label ON xrefs(target_label)"),
|
||||
];
|
||||
for (name, sql) in indices {
|
||||
eprintln!("[db] creating {name}...");
|
||||
tracing::debug!(index = name, "creating analysis index");
|
||||
self.conn.execute_batch(sql)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Back-compat wrapper for callers that want the full pre-Phase-3
|
||||
/// "everything in one shot" behaviour. Equivalent to
|
||||
/// `ingest_instructions` + `write_analysis_results`.
|
||||
#[tracing::instrument(skip_all, name = "db.write_disasm")]
|
||||
pub fn write_disasm(
|
||||
&mut self,
|
||||
pe: &[u8],
|
||||
info: &DisasmInfo,
|
||||
func_analysis: &FuncAnalysis,
|
||||
labels: &HashMap<u32, String>,
|
||||
xrefs: &XrefMap,
|
||||
) -> anyhow::Result<()> {
|
||||
self.ingest_instructions(pe, info, func_analysis, labels)?;
|
||||
self.write_analysis_results(pe, info, func_analysis, labels, xrefs)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Phase-3 SQL-views layer — defines additive read-only views over
|
||||
/// `instructions` (and optionally `xrefs`/`functions`/`labels`).
|
||||
/// See [`crate::sql_views`] for the SQL definitions.
|
||||
///
|
||||
/// Called when `--analyze=sql` or `--analyze=both` is in effect.
|
||||
#[tracing::instrument(skip_all, name = "db.create_sql_views")]
|
||||
pub fn create_sql_views(&mut self) -> anyhow::Result<()> {
|
||||
for (name, sql) in crate::sql_views::ALL_VIEWS {
|
||||
tracing::debug!(view = name, "creating SQL view");
|
||||
self.conn.execute_batch(sql)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Cross-check: count branch xrefs found by the SQL view that are absent
|
||||
/// from the Rust-pass `xrefs` table (and vice versa). Returns
|
||||
/// `(sql_only, rust_only)` row counts. Both should be zero — the two
|
||||
/// surfaces produce identical edges by construction. A non-zero count
|
||||
/// signals drift between the formatter's `mnemonic` column and
|
||||
/// `xref.rs`'s opcode classification, and is logged as a warning by the
|
||||
/// caller.
|
||||
#[tracing::instrument(skip_all, name = "db.cross_check_branch_xrefs")]
|
||||
pub fn cross_check_branch_xrefs(&self) -> anyhow::Result<(u64, u64)> {
|
||||
let sql_only: i64 = self.conn.query_row(
|
||||
"SELECT COUNT(*) FROM v_branch_xrefs vb \
|
||||
LEFT JOIN xrefs x \
|
||||
ON x.source = vb.source AND x.target = vb.target AND x.kind = vb.kind \
|
||||
WHERE x.source IS NULL",
|
||||
[], |row| row.get(0)
|
||||
)?;
|
||||
let rust_only: i64 = self.conn.query_row(
|
||||
"SELECT COUNT(*) FROM xrefs x \
|
||||
LEFT JOIN v_branch_xrefs vb \
|
||||
ON vb.source = x.source AND vb.target = x.target AND vb.kind = x.kind \
|
||||
WHERE x.kind IN ('call','j','br') AND vb.source IS NULL",
|
||||
[], |row| row.get(0)
|
||||
)?;
|
||||
Ok((sql_only as u64, rust_only as u64))
|
||||
}
|
||||
|
||||
// ── Trace layer (written by exec when flags enabled) ─────────────────────
|
||||
|
||||
/// Create the opt-in trace tables. No-op if all flags are false.
|
||||
@@ -251,49 +434,43 @@ impl DbWriter {
|
||||
|
||||
if trace_instructions {
|
||||
self.conn.execute_batch("
|
||||
CREATE TABLE IF NOT EXISTS exec_trace (
|
||||
id INTEGER PRIMARY KEY,
|
||||
address INTEGER NOT NULL,
|
||||
cycle INTEGER NOT NULL,
|
||||
r3 INTEGER NOT NULL,
|
||||
r4 INTEGER NOT NULL,
|
||||
lr INTEGER NOT NULL,
|
||||
sp INTEGER NOT NULL
|
||||
CREATE TABLE exec_trace (
|
||||
address BIGINT NOT NULL, -- absolute VA of the instruction
|
||||
cycle BIGINT NOT NULL, -- monotonic instruction counter (execution order)
|
||||
r3 BIGINT NOT NULL, -- r3 at time of execution
|
||||
r4 BIGINT NOT NULL, -- r4 at time of execution
|
||||
lr BIGINT NOT NULL, -- link register
|
||||
sp BIGINT NOT NULL -- stack pointer
|
||||
);
|
||||
DELETE FROM exec_trace;
|
||||
")?;
|
||||
}
|
||||
|
||||
if trace_imports {
|
||||
self.conn.execute_batch("
|
||||
CREATE TABLE IF NOT EXISTS import_calls (
|
||||
id INTEGER PRIMARY KEY,
|
||||
address INTEGER NOT NULL,
|
||||
cycle INTEGER NOT NULL,
|
||||
module TEXT NOT NULL,
|
||||
ordinal INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
arg_r3 INTEGER NOT NULL,
|
||||
arg_r4 INTEGER NOT NULL,
|
||||
arg_r5 INTEGER NOT NULL,
|
||||
arg_r6 INTEGER NOT NULL,
|
||||
return_value INTEGER NOT NULL
|
||||
CREATE TABLE import_calls (
|
||||
address BIGINT NOT NULL, -- VA of the import thunk
|
||||
cycle BIGINT NOT NULL, -- instruction counter at interception
|
||||
module VARCHAR NOT NULL, -- library name (e.g. xboxkrnl.exe)
|
||||
ordinal BIGINT NOT NULL, -- ordinal within the module
|
||||
name VARCHAR NOT NULL, -- resolved symbol name
|
||||
arg_r3 BIGINT NOT NULL, -- first argument (r3)
|
||||
arg_r4 BIGINT NOT NULL, -- second argument (r4)
|
||||
arg_r5 BIGINT NOT NULL, -- third argument (r5)
|
||||
arg_r6 BIGINT NOT NULL, -- fourth argument (r6)
|
||||
return_value BIGINT NOT NULL -- r3 after the call returns
|
||||
);
|
||||
DELETE FROM import_calls;
|
||||
")?;
|
||||
}
|
||||
|
||||
if trace_branches {
|
||||
self.conn.execute_batch("
|
||||
CREATE TABLE IF NOT EXISTS branch_trace (
|
||||
id INTEGER PRIMARY KEY,
|
||||
cycle INTEGER NOT NULL,
|
||||
source INTEGER NOT NULL,
|
||||
target INTEGER NOT NULL,
|
||||
kind TEXT NOT NULL,
|
||||
lr INTEGER NOT NULL
|
||||
CREATE TABLE branch_trace (
|
||||
cycle BIGINT NOT NULL, -- instruction counter
|
||||
source BIGINT NOT NULL, -- VA of the branch instruction
|
||||
target BIGINT NOT NULL, -- VA of the branch destination
|
||||
kind VARCHAR NOT NULL, -- call | return | jump | branch
|
||||
lr BIGINT NOT NULL -- link register at time of branch
|
||||
);
|
||||
DELETE FROM branch_trace;
|
||||
")?;
|
||||
}
|
||||
|
||||
@@ -326,109 +503,99 @@ impl DbWriter {
|
||||
|
||||
fn flush_exec(&mut self) {
|
||||
if self.exec_buffer.is_empty() { return; }
|
||||
let tx = self.conn.unchecked_transaction().unwrap();
|
||||
{
|
||||
let mut stmt = tx.prepare_cached(
|
||||
"INSERT INTO exec_trace (address, cycle, r3, r4, lr, sp) VALUES (?1, ?2, ?3, ?4, ?5, ?6)"
|
||||
).unwrap();
|
||||
for e in &self.exec_buffer {
|
||||
stmt.execute(params![
|
||||
e.address as i64,
|
||||
e.cycle as i64,
|
||||
e.r3 as i64,
|
||||
e.r4 as i64,
|
||||
e.lr as i64,
|
||||
e.sp as i64,
|
||||
]).ok();
|
||||
}
|
||||
let mut appender = self.conn.appender("exec_trace").unwrap();
|
||||
for e in &self.exec_buffer {
|
||||
appender.append_row(params![
|
||||
e.address as i64,
|
||||
e.cycle as i64,
|
||||
e.r3 as i64,
|
||||
e.r4 as i64,
|
||||
e.lr as i64,
|
||||
e.sp as i64,
|
||||
]).ok();
|
||||
}
|
||||
tx.commit().ok();
|
||||
appender.flush().ok();
|
||||
self.exec_count += self.exec_buffer.len() as u64;
|
||||
self.exec_buffer.clear();
|
||||
}
|
||||
|
||||
fn flush_imports(&mut self) {
|
||||
if self.import_buffer.is_empty() { return; }
|
||||
let tx = self.conn.unchecked_transaction().unwrap();
|
||||
{
|
||||
let mut stmt = tx.prepare_cached(
|
||||
"INSERT INTO import_calls (address, cycle, module, ordinal, name, arg_r3, arg_r4, arg_r5, arg_r6, return_value)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10)"
|
||||
).unwrap();
|
||||
for e in &self.import_buffer {
|
||||
stmt.execute(params![
|
||||
e.address as i64,
|
||||
e.cycle as i64,
|
||||
e.module,
|
||||
e.ordinal as i64,
|
||||
e.name,
|
||||
e.arg_r3 as i64,
|
||||
e.arg_r4 as i64,
|
||||
e.arg_r5 as i64,
|
||||
e.arg_r6 as i64,
|
||||
e.return_value as i64,
|
||||
]).ok();
|
||||
}
|
||||
let mut appender = self.conn.appender("import_calls").unwrap();
|
||||
for e in &self.import_buffer {
|
||||
appender.append_row(params![
|
||||
e.address as i64,
|
||||
e.cycle as i64,
|
||||
e.module.as_str(),
|
||||
e.ordinal as i64,
|
||||
e.name.as_str(),
|
||||
e.arg_r3 as i64,
|
||||
e.arg_r4 as i64,
|
||||
e.arg_r5 as i64,
|
||||
e.arg_r6 as i64,
|
||||
e.return_value as i64,
|
||||
]).ok();
|
||||
}
|
||||
tx.commit().ok();
|
||||
appender.flush().ok();
|
||||
self.import_count += self.import_buffer.len() as u64;
|
||||
self.import_buffer.clear();
|
||||
}
|
||||
|
||||
fn flush_branches(&mut self) {
|
||||
if self.branch_buffer.is_empty() { return; }
|
||||
let tx = self.conn.unchecked_transaction().unwrap();
|
||||
{
|
||||
let mut stmt = tx.prepare_cached(
|
||||
"INSERT INTO branch_trace (cycle, source, target, kind, lr) VALUES (?1, ?2, ?3, ?4, ?5)"
|
||||
).unwrap();
|
||||
for e in &self.branch_buffer {
|
||||
stmt.execute(params![
|
||||
e.cycle as i64,
|
||||
e.source as i64,
|
||||
e.target as i64,
|
||||
e.kind,
|
||||
e.lr as i64,
|
||||
]).ok();
|
||||
}
|
||||
let mut appender = self.conn.appender("branch_trace").unwrap();
|
||||
for e in &self.branch_buffer {
|
||||
appender.append_row(params![
|
||||
e.cycle as i64,
|
||||
e.source as i64,
|
||||
e.target as i64,
|
||||
e.kind,
|
||||
e.lr as i64,
|
||||
]).ok();
|
||||
}
|
||||
tx.commit().ok();
|
||||
appender.flush().ok();
|
||||
self.branch_count += self.branch_buffer.len() as u64;
|
||||
self.branch_buffer.clear();
|
||||
}
|
||||
|
||||
/// Flush remaining trace buffers and create their indices.
|
||||
#[tracing::instrument(skip_all, name = "db.finalize_traces")]
|
||||
pub fn finalize_traces(&mut self) -> anyhow::Result<()> {
|
||||
self.flush_exec();
|
||||
self.flush_imports();
|
||||
self.flush_branches();
|
||||
|
||||
if self.trace_instructions {
|
||||
eprintln!("[db] creating idx_exec_trace_address...");
|
||||
self.conn.execute_batch("CREATE INDEX IF NOT EXISTS idx_exec_trace_address ON exec_trace(address);")?;
|
||||
eprintln!("[db] creating idx_exec_trace_cycle...");
|
||||
self.conn.execute_batch("CREATE INDEX IF NOT EXISTS idx_exec_trace_cycle ON exec_trace(cycle);")?;
|
||||
tracing::debug!("creating idx_exec_trace_address");
|
||||
self.conn.execute_batch("CREATE INDEX idx_exec_trace_address ON exec_trace(address);")?;
|
||||
tracing::debug!("creating idx_exec_trace_cycle");
|
||||
self.conn.execute_batch("CREATE INDEX idx_exec_trace_cycle ON exec_trace(cycle);")?;
|
||||
}
|
||||
if self.trace_imports {
|
||||
eprintln!("[db] creating idx_import_calls_name...");
|
||||
self.conn.execute_batch("CREATE INDEX IF NOT EXISTS idx_import_calls_name ON import_calls(name);")?;
|
||||
eprintln!("[db] creating idx_import_calls_cycle...");
|
||||
self.conn.execute_batch("CREATE INDEX IF NOT EXISTS idx_import_calls_cycle ON import_calls(cycle);")?;
|
||||
tracing::debug!("creating idx_import_calls_name");
|
||||
self.conn.execute_batch("CREATE INDEX idx_import_calls_name ON import_calls(name);")?;
|
||||
tracing::debug!("creating idx_import_calls_cycle");
|
||||
self.conn.execute_batch("CREATE INDEX idx_import_calls_cycle ON import_calls(cycle);")?;
|
||||
}
|
||||
if self.trace_branches {
|
||||
eprintln!("[db] creating idx_branch_trace_source...");
|
||||
self.conn.execute_batch("CREATE INDEX IF NOT EXISTS idx_branch_trace_source ON branch_trace(source);")?;
|
||||
eprintln!("[db] creating idx_branch_trace_target...");
|
||||
self.conn.execute_batch("CREATE INDEX IF NOT EXISTS idx_branch_trace_target ON branch_trace(target);")?;
|
||||
eprintln!("[db] creating idx_branch_trace_kind...");
|
||||
self.conn.execute_batch("CREATE INDEX IF NOT EXISTS idx_branch_trace_kind ON branch_trace(kind);")?;
|
||||
eprintln!("[db] creating idx_branch_trace_cycle...");
|
||||
self.conn.execute_batch("CREATE INDEX IF NOT EXISTS idx_branch_trace_cycle ON branch_trace(cycle);")?;
|
||||
tracing::debug!("creating idx_branch_trace_source");
|
||||
self.conn.execute_batch("CREATE INDEX idx_branch_trace_source ON branch_trace(source);")?;
|
||||
tracing::debug!("creating idx_branch_trace_target");
|
||||
self.conn.execute_batch("CREATE INDEX idx_branch_trace_target ON branch_trace(target);")?;
|
||||
tracing::debug!("creating idx_branch_trace_kind");
|
||||
self.conn.execute_batch("CREATE INDEX idx_branch_trace_kind ON branch_trace(kind);")?;
|
||||
tracing::debug!("creating idx_branch_trace_cycle");
|
||||
self.conn.execute_batch("CREATE INDEX idx_branch_trace_cycle ON branch_trace(cycle);")?;
|
||||
}
|
||||
|
||||
eprintln!(
|
||||
"[db] trace totals: {} instructions, {} imports, {} branches",
|
||||
self.exec_count, self.import_count, self.branch_count
|
||||
metrics::counter!("db.rows", "table" => "exec_trace").increment(self.exec_count);
|
||||
metrics::counter!("db.rows", "table" => "import_calls").increment(self.import_count);
|
||||
metrics::counter!("db.rows", "table" => "branch_trace").increment(self.branch_count);
|
||||
tracing::info!(
|
||||
instructions = self.exec_count,
|
||||
imports = self.import_count,
|
||||
branches = self.branch_count,
|
||||
"trace totals"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
@@ -453,7 +620,7 @@ pub fn write_db(
|
||||
// ── Helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
fn insert_metadata(conn: &Connection, info: &DisasmInfo) -> anyhow::Result<()> {
|
||||
let mut stmt = conn.prepare("INSERT INTO metadata (key, value) VALUES (?1, ?2)")?;
|
||||
let mut stmt = conn.prepare("INSERT INTO metadata (key, value) VALUES (?, ?)")?;
|
||||
stmt.execute(params!["image_base", format!("0x{:08X}", info.image_base)])?;
|
||||
stmt.execute(params!["entry_point", format!("0x{:08X}", info.entry_point)])?;
|
||||
if let Some(name) = info.original_pe_name {
|
||||
@@ -471,7 +638,7 @@ fn insert_metadata(conn: &Connection, info: &DisasmInfo) -> anyhow::Result<()> {
|
||||
fn insert_sections(conn: &Connection, sections: &[xenia_xex::pe::PeSection]) -> anyhow::Result<()> {
|
||||
let mut stmt = conn.prepare(
|
||||
"INSERT INTO sections (name, virtual_address, virtual_size, raw_offset, raw_size, flags, is_code)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)"
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)"
|
||||
)?;
|
||||
for s in sections {
|
||||
stmt.execute(params![
|
||||
@@ -481,7 +648,7 @@ fn insert_sections(conn: &Connection, sections: &[xenia_xex::pe::PeSection]) ->
|
||||
s.raw_offset as i64,
|
||||
s.raw_size as i64,
|
||||
s.flags as i64,
|
||||
s.is_code() as i32,
|
||||
s.is_code(),
|
||||
])?;
|
||||
}
|
||||
Ok(())
|
||||
@@ -490,7 +657,7 @@ fn insert_sections(conn: &Connection, sections: &[xenia_xex::pe::PeSection]) ->
|
||||
fn insert_imports(conn: &Connection, info: &DisasmInfo) -> anyhow::Result<()> {
|
||||
let mut stmt = conn.prepare(
|
||||
"INSERT INTO imports (library, ordinal, name, record_type, address)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5)"
|
||||
VALUES (?, ?, ?, ?, ?)"
|
||||
)?;
|
||||
for lib in info.import_libraries {
|
||||
for imp in &lib.imports {
|
||||
@@ -514,7 +681,7 @@ fn insert_functions(
|
||||
) -> anyhow::Result<()> {
|
||||
let mut stmt = conn.prepare(
|
||||
"INSERT INTO functions (address, name, end_address, frame_size, saved_gprs, is_leaf, is_saverestore)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)"
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)"
|
||||
)?;
|
||||
for (&addr, fi) in &func_analysis.functions {
|
||||
let name = labels.get(&addr)
|
||||
@@ -526,8 +693,8 @@ fn insert_functions(
|
||||
fi.end as i64,
|
||||
fi.frame_size as i64,
|
||||
fi.saved_gprs as i64,
|
||||
fi.is_leaf as i32,
|
||||
fi.is_saverestore as i32,
|
||||
fi.is_leaf,
|
||||
fi.is_saverestore,
|
||||
])?;
|
||||
}
|
||||
Ok(())
|
||||
@@ -538,7 +705,7 @@ fn insert_labels(
|
||||
labels: &HashMap<u32, String>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut stmt = conn.prepare(
|
||||
"INSERT OR IGNORE INTO labels (address, name, kind) VALUES (?1, ?2, ?3)"
|
||||
"INSERT INTO labels (address, name, kind) VALUES (?, ?, ?) ON CONFLICT DO NOTHING"
|
||||
)?;
|
||||
for (&addr, name) in labels {
|
||||
let kind = if name.starts_with("sub_") || name == "entry_point" {
|
||||
@@ -566,78 +733,22 @@ fn insert_instructions_streaming(
|
||||
func_analysis: &FuncAnalysis,
|
||||
labels: &HashMap<u32, String>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut tx = conn.unchecked_transaction()?;
|
||||
let mut count: u64 = 0;
|
||||
let mut since_commit: u64 = 0;
|
||||
let mut appender = conn.appender("instructions")?;
|
||||
let mut total: u64 = 0;
|
||||
|
||||
for section in info.sections {
|
||||
if !section.is_code() { continue; }
|
||||
|
||||
let va_start = section.virtual_address;
|
||||
let va_end = va_start + section.virtual_size;
|
||||
let file_start = section.virtual_address as usize;
|
||||
|
||||
let mut current_func: Option<u32> = None;
|
||||
let mut addr = va_start;
|
||||
|
||||
while addr < va_end {
|
||||
let abs_addr = info.image_base + addr;
|
||||
let off = (addr - va_start) as usize + file_start;
|
||||
if off + 4 > pe.len() { break; }
|
||||
|
||||
if func_analysis.is_function_start(abs_addr) {
|
||||
current_func = Some(abs_addr);
|
||||
}
|
||||
|
||||
let instr = u32::from_be_bytes([pe[off], pe[off+1], pe[off+2], pe[off+3]]);
|
||||
let decoded = crate::ppc::disasm(instr, abs_addr);
|
||||
let (mnemonic, operands) = split_disasm(&decoded.base);
|
||||
|
||||
let (ext_mnemonic, ext_operands, ext_disasm): (Option<&str>, Option<&str>, Option<&str>) =
|
||||
match &decoded.ext {
|
||||
Some(ext) => {
|
||||
let (em, eo) = split_disasm(ext);
|
||||
(Some(em), Some(eo), Some(ext.as_str()))
|
||||
}
|
||||
None => (None, None, None),
|
||||
};
|
||||
let label = labels.get(&abs_addr).map(|s| s.as_str());
|
||||
|
||||
{
|
||||
let mut stmt = tx.prepare_cached(
|
||||
"INSERT INTO instructions (address, raw, mnemonic, operands, disasm, ext_mnemonic, ext_operands, ext_disasm, section, function, label)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11)"
|
||||
)?;
|
||||
stmt.execute(params![
|
||||
abs_addr as i64,
|
||||
instr as i64,
|
||||
mnemonic,
|
||||
operands,
|
||||
decoded.base,
|
||||
ext_mnemonic,
|
||||
ext_operands,
|
||||
ext_disasm,
|
||||
section.name,
|
||||
current_func.map(|a| a as i64),
|
||||
label,
|
||||
])?;
|
||||
}
|
||||
|
||||
count += 1;
|
||||
since_commit += 1;
|
||||
addr += 4;
|
||||
|
||||
if since_commit >= batch_size() {
|
||||
tx.commit()?;
|
||||
eprintln!("[db] instructions: {count} committed");
|
||||
tx = conn.unchecked_transaction()?;
|
||||
since_commit = 0;
|
||||
}
|
||||
}
|
||||
let va_start = info.image_base + section.virtual_address;
|
||||
let va_end = info.image_base + section.virtual_address + section.virtual_size;
|
||||
let items = crate::disasm::enrich_section(
|
||||
pe, info.image_base, &section.name, va_start, va_end, func_analysis, labels,
|
||||
);
|
||||
total += crate::sinks::duckdb::append_instructions(&mut appender, items)?;
|
||||
}
|
||||
|
||||
tx.commit()?;
|
||||
eprintln!("[db] inserted {count} instructions");
|
||||
appender.flush()?;
|
||||
metrics::counter!("db.rows", "table" => "instructions").increment(total);
|
||||
tracing::info!(rows = total, table = "instructions", "bulk insert complete");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -649,9 +760,8 @@ fn insert_xrefs_streaming(
|
||||
func_analysis: &FuncAnalysis,
|
||||
labels: &HashMap<u32, String>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut tx = conn.unchecked_transaction()?;
|
||||
let mut appender = conn.appender("xrefs")?;
|
||||
let mut count: u64 = 0;
|
||||
let mut since_commit: u64 = 0;
|
||||
|
||||
for (&target, refs) in xrefs {
|
||||
let target_label = labels.get(&target).map(|s| s.as_str());
|
||||
@@ -663,10 +773,11 @@ fn insert_xrefs_streaming(
|
||||
let off = xref.source.wrapping_sub(image_base) as usize;
|
||||
if off + 4 <= pe.len() {
|
||||
let raw = u32::from_be_bytes([pe[off], pe[off+1], pe[off+2], pe[off+3]]);
|
||||
let decoded = crate::ppc::disasm(raw, xref.source);
|
||||
let display = decoded.display().to_string();
|
||||
let (mnem, _) = split_disasm(&display);
|
||||
Some(mnem.to_string())
|
||||
let d = xenia_cpu::decode(raw, xref.source);
|
||||
let t = xenia_cpu::disasm::format(&d);
|
||||
// Prefer the simplified mnemonic when present (matches what
|
||||
// a human reading the .asm file sees for that line).
|
||||
Some(t.ext_mnemonic.unwrap_or(t.mnemonic))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -681,47 +792,22 @@ fn insert_xrefs_streaming(
|
||||
xref.source, func_analysis, labels,
|
||||
);
|
||||
|
||||
{
|
||||
let mut stmt = tx.prepare_cached(
|
||||
"INSERT INTO xrefs (source, target, kind, instruction, source_func, source_label, target_label)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)"
|
||||
)?;
|
||||
stmt.execute(params![
|
||||
xref.source as i64,
|
||||
target as i64,
|
||||
kind,
|
||||
instruction,
|
||||
source_func,
|
||||
source_label,
|
||||
target_label,
|
||||
])?;
|
||||
}
|
||||
appender.append_row(params![
|
||||
xref.source as i64,
|
||||
target as i64,
|
||||
kind,
|
||||
instruction.as_deref(),
|
||||
source_func,
|
||||
source_label.as_str(),
|
||||
target_label,
|
||||
])?;
|
||||
|
||||
count += 1;
|
||||
since_commit += 1;
|
||||
|
||||
if since_commit >= batch_size() {
|
||||
tx.commit()?;
|
||||
eprintln!("[db] xrefs: {count} committed");
|
||||
tx = conn.unchecked_transaction()?;
|
||||
since_commit = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tx.commit()?;
|
||||
eprintln!("[db] inserted {count} xrefs");
|
||||
appender.flush()?;
|
||||
metrics::counter!("db.rows", "table" => "xrefs").increment(count);
|
||||
tracing::info!(rows = count, table = "xrefs", "bulk insert complete");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Break a rendered instruction into `(mnemonic, operands)`.
///
/// Leading and trailing whitespace of `disasm` is ignored. The mnemonic is
/// everything up to the first whitespace character; the operands are whatever
/// follows, with the separating whitespace removed. Text without any interior
/// whitespace is returned whole as the mnemonic, paired with an empty operand
/// string.
fn split_disasm(disasm: &str) -> (&str, &str) {
    let text = disasm.trim();
    match text.split_once(char::is_whitespace) {
        // `split_once` drops only the first separator character; any further
        // padding before the operands is removed here.
        Some((mnemonic, rest)) => (mnemonic, rest.trim_start()),
        None => (text, ""),
    }
}
|
||||
|
||||
51
crates/xenia-analysis/src/disasm.rs
Normal file
51
crates/xenia-analysis/src/disasm.rs
Normal file
@@ -0,0 +1,51 @@
|
||||
//! Analysis-side enrichment over [`xenia_cpu::disasm::iter_disasm`].
|
||||
//!
|
||||
//! Turns a stream of decoder-only [`xenia_cpu::disasm::DisasmItem`]s into a
|
||||
//! stream of [`RichDisasmItem`]s carrying section name + enclosing function +
|
||||
//! label name. The three sinks in [`crate::sinks`] (text, JSON, DuckDB) all
|
||||
//! consume `RichDisasmItem`.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use xenia_cpu::disasm::DisasmItem;
|
||||
|
||||
use crate::func::FuncAnalysis;
|
||||
|
||||
/// `DisasmItem` plus the analysis context (section/function/label).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RichDisasmItem<'a> {
|
||||
pub item: DisasmItem,
|
||||
pub section: &'a str,
|
||||
pub function: Option<u32>,
|
||||
pub label: Option<&'a str>,
|
||||
}
|
||||
|
||||
/// Walk one code section, yielding rich items annotated with section name,
|
||||
/// rolling-window enclosing function, and label-at-address.
|
||||
///
|
||||
/// The `function` field tracks the most recent function-start the iterator
|
||||
/// has crossed — matching the legacy `current_func` behaviour in
|
||||
/// `db.rs::insert_instructions_streaming`.
|
||||
pub fn enrich_section<'a>(
|
||||
image: &'a [u8],
|
||||
image_base: u32,
|
||||
section_name: &'a str,
|
||||
va_start: u32,
|
||||
va_end: u32,
|
||||
func_analysis: &'a FuncAnalysis,
|
||||
labels: &'a HashMap<u32, String>,
|
||||
) -> impl Iterator<Item = RichDisasmItem<'a>> + 'a {
|
||||
let mut current_func: Option<u32> = None;
|
||||
xenia_cpu::disasm::iter_disasm(image, image_base, va_start, va_end).map(move |item| {
|
||||
if func_analysis.is_function_start(item.addr) {
|
||||
current_func = Some(item.addr);
|
||||
}
|
||||
let label = labels.get(&item.addr).map(|s| s.as_str());
|
||||
RichDisasmItem {
|
||||
item,
|
||||
section: section_name,
|
||||
function: current_func,
|
||||
label,
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -6,8 +6,10 @@ use std::io::Write;
|
||||
use xenia_xex::header::ImportLibrary;
|
||||
use xenia_xex::pe::PeSection;
|
||||
|
||||
use crate::disasm::enrich_section;
|
||||
use crate::func::FuncAnalysis;
|
||||
use crate::xref::{XrefKind, Xref, XrefMap, section_for_addr, resolve_source_label};
|
||||
use crate::sinks::text::write_instr_line;
|
||||
use crate::xref::{XrefKind, Xref, XrefMap, resolve_source_label};
|
||||
|
||||
/// Metadata passed to the formatter (avoids exposing full Xex2Header internals).
|
||||
pub struct DisasmInfo<'a> {
|
||||
@@ -88,11 +90,14 @@ pub fn write_asm(
|
||||
writeln!(out)?;
|
||||
|
||||
let mut in_function = false;
|
||||
let mut addr = va_start;
|
||||
while addr < va_end {
|
||||
let abs_addr = info.image_base + addr;
|
||||
let off = (addr - va_start) as usize + file_start;
|
||||
if off + 4 > pe.len() { break; }
|
||||
let abs_start = info.image_base + va_start;
|
||||
let abs_end = info.image_base + va_end;
|
||||
|
||||
let items = enrich_section(
|
||||
pe, info.image_base, &section.name, abs_start, abs_end, func_analysis, labels,
|
||||
);
|
||||
for ri in items {
|
||||
let abs_addr = ri.item.addr;
|
||||
|
||||
// Function start? Emit separator + header
|
||||
if let Some(fi) = func_analysis.get(abs_addr) {
|
||||
@@ -126,7 +131,6 @@ pub fn write_asm(
|
||||
writeln!(out, "; FUNCTION: {lbl}{detail_str}")?;
|
||||
}
|
||||
|
||||
// Xrefs for function entry
|
||||
if let Some(xref_lines) = format_xrefs(abs_addr, xrefs, func_analysis, labels) {
|
||||
for line in &xref_lines {
|
||||
writeln!(out, "{line}")?;
|
||||
@@ -141,7 +145,6 @@ pub fn write_asm(
|
||||
if let Some(lbl) = labels.get(&abs_addr) {
|
||||
if !func_analysis.is_function_start(abs_addr) {
|
||||
writeln!(out)?;
|
||||
// Xrefs for local labels
|
||||
if let Some(xref_lines) = format_xrefs(abs_addr, xrefs, func_analysis, labels) {
|
||||
for line in &xref_lines {
|
||||
writeln!(out, "{line}")?;
|
||||
@@ -159,37 +162,8 @@ pub fn write_asm(
|
||||
writeln!(out, " ; IMPORT: {imp_name}")?;
|
||||
}
|
||||
|
||||
let instr = u32::from_be_bytes([
|
||||
pe[off], pe[off+1], pe[off+2], pe[off+3]
|
||||
]);
|
||||
|
||||
let decoded = crate::ppc::disasm(instr, abs_addr);
|
||||
let disasm_text = decoded.display().to_string();
|
||||
|
||||
// Annotate branch targets with label names
|
||||
let mut annotated = annotate_branch(&disasm_text, labels);
|
||||
|
||||
// Annotate data references
|
||||
if let Some(&(data_addr, kind)) = data_annotations.get(&abs_addr) {
|
||||
let tag = match kind {
|
||||
XrefKind::DataRead => "[R]",
|
||||
XrefKind::DataWrite => "[W]",
|
||||
_ => "[&]",
|
||||
};
|
||||
let sec = section_for_addr(data_addr, info.sections, info.image_base)
|
||||
.unwrap_or("?");
|
||||
let data_lbl = labels.get(&data_addr)
|
||||
.map(|s| format!(" = {s}"))
|
||||
.unwrap_or_default();
|
||||
if !annotated.contains("; ->") {
|
||||
annotated = format!("{annotated:<40} ; {tag} 0x{data_addr:08X} ({sec}){data_lbl}");
|
||||
} else {
|
||||
annotated = format!("{annotated} {tag} 0x{data_addr:08X} ({sec}){data_lbl}");
|
||||
}
|
||||
}
|
||||
|
||||
writeln!(out, " {:08X}: {:08X} {}", abs_addr, instr, annotated)?;
|
||||
addr += 4;
|
||||
let data_annot = data_annotations.get(&abs_addr).copied();
|
||||
write_instr_line(out, &ri, labels, info.sections, info.image_base, data_annot)?;
|
||||
}
|
||||
if in_function {
|
||||
writeln!(out, "; end function")?;
|
||||
@@ -298,21 +272,3 @@ fn format_xrefs(
|
||||
|
||||
Some(lines)
|
||||
}
|
||||
|
||||
/// Append a `; -> label` comment to a disassembly line whose first `0x…`
/// literal names a labelled address.
///
/// Only the first occurrence of `0x` in `disasm` is examined. The hex digits
/// that follow it must number exactly eight and must parse to an address
/// present in `labels`; in that case the line is left-padded to 40 columns and
/// the label is appended. In every other case the input is returned unchanged.
fn annotate_branch(disasm: &str, labels: &HashMap<u32, String>) -> String {
    let label = disasm
        .find("0x")
        .map(|prefix| &disasm[prefix + 2..])
        .map(|tail| {
            // Keep only the run of hex digits directly after the prefix.
            let digits = tail
                .find(|c: char| !c.is_ascii_hexdigit())
                .unwrap_or(tail.len());
            &tail[..digits]
        })
        .filter(|hex| hex.len() == 8)
        .and_then(|hex| u32::from_str_radix(hex, 16).ok())
        .and_then(|addr| labels.get(&addr));

    match label {
        Some(lbl) => format!("{disasm:<40} ; -> {lbl}"),
        None => disasm.to_string(),
    }
}
|
||||
|
||||
@@ -184,12 +184,14 @@ fn find_saverestore_stubs(
|
||||
|
||||
// ── Main analysis ──────────────────────────────────────────────────────────
|
||||
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base), entry_point = format_args!("{:#010x}", entry_point)))]
|
||||
pub fn analyze(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
entry_point: u32,
|
||||
code_sections: &[(u32, u32, u32)], // (va_start, va_size, flags)
|
||||
) -> FuncAnalysis {
|
||||
let started = std::time::Instant::now();
|
||||
let code_ranges: Vec<(u32, u32)> = code_sections.iter()
|
||||
.map(|(va, sz, _)| (image_base + va, image_base + va + sz))
|
||||
.collect();
|
||||
@@ -197,10 +199,10 @@ pub fn analyze(
|
||||
// 1. Find save/restore stubs
|
||||
let (save_base, restore_base) = find_saverestore_stubs(pe, image_base, &code_ranges);
|
||||
if let Some(sb) = save_base {
|
||||
eprintln!("[func] __savegprlr stub at 0x{sb:08X}");
|
||||
tracing::debug!(addr = format_args!("{:#010x}", sb), "__savegprlr stub");
|
||||
}
|
||||
if let Some(rb) = restore_base {
|
||||
eprintln!("[func] __restgprlr stub at 0x{rb:08X}");
|
||||
tracing::debug!(addr = format_args!("{:#010x}", rb), "__restgprlr stub");
|
||||
}
|
||||
|
||||
// Set of addresses in the save/restore region (to exclude from function detection)
|
||||
@@ -221,18 +223,17 @@ pub fn analyze(
|
||||
for &(start, end) in &code_ranges {
|
||||
let mut addr = start;
|
||||
while addr < end {
|
||||
if let Some(instr) = read_instr(pe, addr, image_base) {
|
||||
if let Some(target) = bl_target(instr, addr) {
|
||||
if let Some(instr) = read_instr(pe, addr, image_base)
|
||||
&& let Some(target) = bl_target(instr, addr) {
|
||||
// Don't count calls into save/restore stubs as function entries
|
||||
if !saverestore_addrs.contains(&target) {
|
||||
call_targets.insert(target);
|
||||
}
|
||||
}
|
||||
}
|
||||
addr += 4;
|
||||
}
|
||||
}
|
||||
eprintln!("[func] {} bl targets (candidate functions)", call_targets.len());
|
||||
tracing::debug!(candidates = call_targets.len(), "bl targets collected");
|
||||
|
||||
// 3. For each candidate, detect prologue and walk to epilogue
|
||||
let mut functions: BTreeMap<u32, FuncInfo> = BTreeMap::new();
|
||||
@@ -267,7 +268,13 @@ pub fn analyze(
|
||||
});
|
||||
}
|
||||
|
||||
eprintln!("[func] {} functions detected", functions.len());
|
||||
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||
metrics::histogram!("analysis.phase_ms", "phase" => "functions").record(elapsed_ms);
|
||||
tracing::info!(
|
||||
functions = functions.len(),
|
||||
elapsed_ms,
|
||||
"function detection complete"
|
||||
);
|
||||
|
||||
FuncAnalysis {
|
||||
functions,
|
||||
@@ -302,15 +309,13 @@ fn analyze_function(
|
||||
let instr1 = read_instr(pe, func_addr + 4, image_base).unwrap_or(0);
|
||||
|
||||
// Check if next is bl to save stub
|
||||
if let Some(target) = bl_target(instr1, func_addr + 4) {
|
||||
if let Some(sb) = save_base {
|
||||
if target >= sb && target < sb + 18 * 4 {
|
||||
if let Some(target) = bl_target(instr1, func_addr + 4)
|
||||
&& let Some(sb) = save_base
|
||||
&& target >= sb && target < sb + 18 * 4 {
|
||||
let idx = (target - sb) / 4;
|
||||
saved_gprs = 18 - idx;
|
||||
prologue_len = 8;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Next should be stwu r1, -N(r1)
|
||||
let stwu_instr = read_instr(pe, func_addr + prologue_len, image_base).unwrap_or(0);
|
||||
@@ -356,14 +361,12 @@ fn analyze_function(
|
||||
}
|
||||
|
||||
// Epilogue: b __restgprlr_NN (tail branch into restore stub)
|
||||
if let Some(target) = b_target(instr, addr) {
|
||||
if let Some(rb) = restore_base {
|
||||
if target >= rb && target < rb + 18 * 4 {
|
||||
if let Some(target) = b_target(instr, addr)
|
||||
&& let Some(rb) = restore_base
|
||||
&& target >= rb && target < rb + 18 * 4 {
|
||||
end_addr = addr + 4;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Epilogue: bctr (indirect tail call — end of function)
|
||||
if is_bctr(instr) {
|
||||
@@ -407,24 +410,22 @@ impl FuncAnalysis {
|
||||
for (&addr, fi) in &self.functions {
|
||||
if fi.is_saverestore {
|
||||
// Label the block start, plus individual register entry points
|
||||
if let Some(sb) = self.save_gpr_base {
|
||||
if addr == sb {
|
||||
if let Some(sb) = self.save_gpr_base
|
||||
&& addr == sb {
|
||||
for i in 0u32..18 {
|
||||
let reg = 14 + i;
|
||||
labels.insert(sb + i * 4, format!("__savegprlr_{reg}"));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if let Some(rb) = self.restore_gpr_base {
|
||||
if addr == rb {
|
||||
if let Some(rb) = self.restore_gpr_base
|
||||
&& addr == rb {
|
||||
for i in 0u32..18 {
|
||||
let reg = 14 + i;
|
||||
labels.insert(rb + i * 4, format!("__restgprlr_{reg}"));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
labels.insert(addr, format!("sub_{addr:08X}"));
|
||||
}
|
||||
|
||||
@@ -2,9 +2,13 @@ pub mod ppc;
|
||||
pub mod func;
|
||||
pub mod xref;
|
||||
pub mod db;
|
||||
pub mod disasm;
|
||||
pub mod formatter;
|
||||
pub mod sinks;
|
||||
pub mod sql_views;
|
||||
|
||||
mod ordinals;
|
||||
pub use ordinals::resolve_ordinal;
|
||||
pub use xref::{XrefKind, Xref, XrefMap, resolve_source_label};
|
||||
pub use db::{DbWriter, ExecTraceEntry, ImportCallEntry, BranchTraceEntry};
|
||||
pub use disasm::{RichDisasmItem, enrich_section};
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
37
crates/xenia-analysis/src/sinks/duckdb.rs
Normal file
37
crates/xenia-analysis/src/sinks/duckdb.rs
Normal file
@@ -0,0 +1,37 @@
|
||||
//! DuckDB sink — appends rich disasm items to the `instructions` table.
|
||||
//!
|
||||
//! Column layout matches [`crate::db`]: address, raw, mnemonic, operands,
|
||||
//! disasm, ext_mnemonic, ext_operands, ext_disasm, target_hex, section, function, label.
|
||||
|
||||
use duckdb::{Appender, params};
|
||||
|
||||
use crate::disasm::RichDisasmItem;
|
||||
|
||||
/// Append every item to the appender. Returns the number of rows written.
|
||||
/// Does NOT flush — the caller decides when to flush, since multiple
|
||||
/// section iterators typically share one appender.
|
||||
pub fn append_instructions<'a>(
|
||||
appender: &mut Appender<'_>,
|
||||
items: impl IntoIterator<Item = RichDisasmItem<'a>>,
|
||||
) -> duckdb::Result<u64> {
|
||||
let mut count: u64 = 0;
|
||||
for ri in items {
|
||||
let t = &ri.item.text;
|
||||
appender.append_row(params![
|
||||
ri.item.addr as i64,
|
||||
ri.item.raw as i64,
|
||||
t.mnemonic.as_str(),
|
||||
t.operands.as_str(),
|
||||
t.disasm.as_str(),
|
||||
t.ext_mnemonic.as_deref(),
|
||||
t.ext_operands.as_deref(),
|
||||
t.ext_disasm.as_deref(),
|
||||
t.branch_target.map(|t| t as i64),
|
||||
ri.section,
|
||||
ri.function.map(|f| f as i64),
|
||||
ri.label,
|
||||
])?;
|
||||
count += 1;
|
||||
}
|
||||
Ok(count)
|
||||
}
|
||||
63
crates/xenia-analysis/src/sinks/json.rs
Normal file
63
crates/xenia-analysis/src/sinks/json.rs
Normal file
@@ -0,0 +1,63 @@
|
||||
//! JSON Lines sink — one structured row per line, constant memory.
|
||||
//!
|
||||
//! Suited for piping into `jq`, importing into pandas / DuckDB's
|
||||
//! `read_json_auto`, or feeding downstream tooling that expects a
|
||||
//! line-delimited stream rather than a single megaobject.
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::disasm::RichDisasmItem;
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct JsonRow<'a> {
|
||||
addr: u32,
|
||||
raw: u32,
|
||||
mnemonic: &'a str,
|
||||
operands: &'a str,
|
||||
disasm: &'a str,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
ext_mnemonic: Option<&'a str>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
ext_operands: Option<&'a str>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
ext_disasm: Option<&'a str>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
branch_target: Option<u32>,
|
||||
section: &'a str,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
function: Option<u32>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
label: Option<&'a str>,
|
||||
}
|
||||
|
||||
/// Write each item as a single JSON object on its own line. Returns the
|
||||
/// number of rows written.
|
||||
pub fn write_jsonl<'a, W: Write>(
|
||||
out: &mut W,
|
||||
items: impl IntoIterator<Item = RichDisasmItem<'a>>,
|
||||
) -> io::Result<u64> {
|
||||
let mut count: u64 = 0;
|
||||
for ri in items {
|
||||
let t = &ri.item.text;
|
||||
let row = JsonRow {
|
||||
addr: ri.item.addr,
|
||||
raw: ri.item.raw,
|
||||
mnemonic: &t.mnemonic,
|
||||
operands: &t.operands,
|
||||
disasm: &t.disasm,
|
||||
ext_mnemonic: t.ext_mnemonic.as_deref(),
|
||||
ext_operands: t.ext_operands.as_deref(),
|
||||
ext_disasm: t.ext_disasm.as_deref(),
|
||||
branch_target: t.branch_target,
|
||||
section: ri.section,
|
||||
function: ri.function,
|
||||
label: ri.label,
|
||||
};
|
||||
serde_json::to_writer(&mut *out, &row)?;
|
||||
out.write_all(b"\n")?;
|
||||
count += 1;
|
||||
}
|
||||
Ok(count)
|
||||
}
|
||||
8
crates/xenia-analysis/src/sinks/mod.rs
Normal file
8
crates/xenia-analysis/src/sinks/mod.rs
Normal file
@@ -0,0 +1,8 @@
|
||||
//! Output sinks for [`crate::disasm::RichDisasmItem`] streams.
|
||||
//!
|
||||
//! Each sink consumes the same iterator shape and writes to a different
|
||||
//! medium: human-readable .asm text, JSON Lines, or DuckDB rows.
|
||||
|
||||
pub mod duckdb;
|
||||
pub mod json;
|
||||
pub mod text;
|
||||
58
crates/xenia-analysis/src/sinks/text.rs
Normal file
58
crates/xenia-analysis/src/sinks/text.rs
Normal file
@@ -0,0 +1,58 @@
|
||||
//! Text sink — renders one .asm instruction line with optional
|
||||
//! branch-target / data-ref annotations.
|
||||
//!
|
||||
//! The full `write_asm` orchestration (section headers, function prologue
|
||||
//! info, xref comment blocks, hex-dump of data sections) stays in
|
||||
//! [`crate::formatter`]; this sink only owns the per-instruction line.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::io::{self, Write};
|
||||
|
||||
use xenia_xex::pe::PeSection;
|
||||
|
||||
use crate::disasm::RichDisasmItem;
|
||||
use crate::xref::{XrefKind, section_for_addr};
|
||||
|
||||
/// Render one instruction line:
|
||||
/// ` 82000000: 60000000 nop`
|
||||
/// ` 82000004: 4800FFFC bl 0x82000000 ; -> entry_point`
|
||||
/// ` 82000010: 812A0000 lwz r9, 0(r10) ; [R] 0x828A0000 (.rdata) = dat_…`
|
||||
pub fn write_instr_line<W: Write + ?Sized>(
|
||||
out: &mut W,
|
||||
item: &RichDisasmItem<'_>,
|
||||
labels: &HashMap<u32, String>,
|
||||
sections: &[PeSection],
|
||||
image_base: u32,
|
||||
data_annotation: Option<(u32, XrefKind)>,
|
||||
) -> io::Result<()> {
|
||||
let disasm_text = item.item.text.display();
|
||||
|
||||
// Branch-target → label annotation. Uses the structured `branch_target`
|
||||
// field (cleaner than the legacy "find 0x in disasm string" regex).
|
||||
let mut annotated = match item.item.text.branch_target {
|
||||
Some(target) => match labels.get(&target) {
|
||||
Some(lbl) => format!("{disasm_text:<40} ; -> {lbl}"),
|
||||
None => disasm_text.to_string(),
|
||||
},
|
||||
None => disasm_text.to_string(),
|
||||
};
|
||||
|
||||
if let Some((data_addr, kind)) = data_annotation {
|
||||
let tag = match kind {
|
||||
XrefKind::DataRead => "[R]",
|
||||
XrefKind::DataWrite => "[W]",
|
||||
_ => "[&]",
|
||||
};
|
||||
let sec = section_for_addr(data_addr, sections, image_base).unwrap_or("?");
|
||||
let data_lbl = labels.get(&data_addr)
|
||||
.map(|s| format!(" = {s}"))
|
||||
.unwrap_or_default();
|
||||
if !annotated.contains("; ->") {
|
||||
annotated = format!("{annotated:<40} ; {tag} 0x{data_addr:08X} ({sec}){data_lbl}");
|
||||
} else {
|
||||
annotated = format!("{annotated} {tag} 0x{data_addr:08X} ({sec}){data_lbl}");
|
||||
}
|
||||
}
|
||||
|
||||
writeln!(out, " {:08X}: {:08X} {}", item.item.addr, item.item.raw, annotated)
|
||||
}
|
||||
141
crates/xenia-analysis/src/sql_views.rs
Normal file
141
crates/xenia-analysis/src/sql_views.rs
Normal file
@@ -0,0 +1,141 @@
|
||||
//! Additive SQL views over the Phase-3 ingest tables.
|
||||
//!
|
||||
//! These views are created when `--analyze=sql` or `--analyze=both` is set.
|
||||
//! They are *not* a replacement for the Rust passes ([`crate::xref`],
|
||||
//! [`crate::func`]) — those still own data-ref resolution and prologue
|
||||
//! pattern matching. The views cover the cleanly-relational parts:
|
||||
//!
|
||||
//! - branch xrefs (self-join on `instructions.target_hex`)
|
||||
//! - call graph + reachability (recursive CTE over `xrefs`)
|
||||
//! - convenience joins (function-first-instruction, imports-called)
|
||||
//!
|
||||
//! All views are read-only and stable across re-creation: dropping and
|
||||
//! recreating the database via [`crate::db::DbWriter::open_fresh`] re-runs
|
||||
//! these definitions.
|
||||
//!
|
||||
//! ## Cross-check semantics
|
||||
//!
|
||||
//! `v_branch_xrefs` is intended to produce *exactly* the same `(source,
|
||||
//! target, kind)` tuples as the Rust `xref.rs` first pass — given the same
|
||||
//! input image. [`crate::db::DbWriter::cross_check_branch_xrefs`] queries
|
||||
//! the symmetric difference and returns the row counts; both should be
|
||||
//! zero. A non-zero count means the formatter's `mnemonic` column or the
|
||||
//! kind-classification CASE drifted out of agreement with `xref.rs`, and
|
||||
//! is worth a one-line warning at log time.
|
||||
|
||||
/// `(view_name, CREATE VIEW … SQL)` pairs in the order they must run.
|
||||
/// Later views may depend on earlier ones (e.g. `v_call_graph` reads
|
||||
/// `xrefs`, which is the Rust-pass table; `v_branch_xrefs` is independent).
|
||||
pub const ALL_VIEWS: &[(&str, &str)] = &[
|
||||
("v_branch_xrefs", V_BRANCH_XREFS),
|
||||
("v_call_graph", V_CALL_GRAPH),
|
||||
("v_reachability_from_entry", V_REACHABILITY_FROM_ENTRY),
|
||||
("v_function_first_instruction", V_FUNCTION_FIRST_INSTRUCTION),
|
||||
("v_imports_called", V_IMPORTS_CALLED),
|
||||
];
|
||||
|
||||
/// Branch cross-references derived purely from `instructions.target_hex`.
|
||||
///
|
||||
/// Mirrors the kind classification in [`crate::xref::collect_branch_target`]
|
||||
/// and the short tags returned by [`crate::xref::XrefKind::tag`] (which are
|
||||
/// what `xrefs.kind` actually stores):
|
||||
/// - I-form (`b`/`bl`/`ba`/`bla`): `bl`/`bla` → `"call"`, `b`/`ba` → `"j"`
|
||||
/// - B-form (`bc`/`bcl`/`bca`/`bcla`): always → `"br"`
|
||||
///
|
||||
/// Indirect branches (`bclr`/`bcctr`) leave `target_hex` NULL and are
|
||||
/// excluded from this view by design.
|
||||
const V_BRANCH_XREFS: &str = "
|
||||
CREATE OR REPLACE VIEW v_branch_xrefs AS
|
||||
SELECT
|
||||
address AS source,
|
||||
target_hex AS target,
|
||||
CASE
|
||||
WHEN mnemonic IN ('bl', 'bla') THEN 'call'
|
||||
WHEN mnemonic IN ('b', 'ba') THEN 'j'
|
||||
WHEN mnemonic IN ('bc', 'bcl', 'bca', 'bcla') THEN 'br'
|
||||
ELSE 'br'
|
||||
END AS kind,
|
||||
mnemonic AS instruction,
|
||||
function AS source_func
|
||||
FROM instructions
|
||||
WHERE target_hex IS NOT NULL;
|
||||
";
|
||||
|
||||
/// Call-graph edges resolved against function names.
|
||||
///
|
||||
/// Reads from `xrefs` (the Rust-pass table) — this is the canonical source
|
||||
/// for *all* edge kinds, including indirect/data; SQL can't reconstruct the
|
||||
/// data-ref edges cleanly because they require register tracking. For pure
|
||||
/// branch edges, `v_branch_xrefs` produces equivalent rows directly from
|
||||
/// `instructions`.
|
||||
const V_CALL_GRAPH: &str = "
|
||||
CREATE OR REPLACE VIEW v_call_graph AS
|
||||
SELECT
|
||||
x.source AS caller_addr,
|
||||
cf.name AS caller_name,
|
||||
x.target AS callee_addr,
|
||||
tf.name AS callee_name,
|
||||
x.kind AS edge_kind
|
||||
FROM xrefs x
|
||||
LEFT JOIN functions cf ON cf.address = x.source_func
|
||||
LEFT JOIN functions tf ON tf.address = x.target
|
||||
WHERE x.kind = 'call';
|
||||
";
|
||||
|
||||
/// Transitive function-level reachability from the entry point over
|
||||
/// call/jump/branch edges. Useful for finding dead code
|
||||
/// (`SELECT address FROM functions
|
||||
/// WHERE address NOT IN (SELECT addr FROM v_reachability_from_entry)`)
|
||||
/// and for scoping analysis to the live subset.
|
||||
///
|
||||
/// Seeds from the function containing the `entry_point` label and walks
|
||||
/// the recursive closure: a reachable function's instructions branch into
|
||||
/// the functions enclosing the branch targets, which are then reachable
|
||||
/// in turn. `UNION` (not `UNION ALL`) deduplicates to handle call-graph
|
||||
/// cycles (recursive functions, mutually-recursive pairs).
///
/// The view exposes a single column, `addr`, holding the `instructions.function`
/// value of each reachable function.
/// NOTE(review): the dead-code query above compares `addr` with
/// `functions.address`, so `instructions.function` is presumably the enclosing
/// function's start address — confirm against the ingest schema.
///
/// Details of the recursive step:
/// - only `xrefs` rows with `kind IN ('call', 'j', 'br')` are followed;
/// - source and target are both resolved through inner joins on
///   `instructions.address`, so an edge whose source or target has no
///   `instructions` row contributes nothing;
/// - rows with a NULL `function` are excluded from both the seed and the
///   recursive step, so `addr` is never NULL (which keeps the `NOT IN`
///   example above well-behaved).
|
||||
const V_REACHABILITY_FROM_ENTRY: &str = "
|
||||
CREATE OR REPLACE VIEW v_reachability_from_entry AS
|
||||
WITH RECURSIVE reach(fn) AS (
|
||||
SELECT i.function FROM instructions i
|
||||
JOIN labels l ON l.address = i.address
|
||||
WHERE l.name = 'entry_point' AND i.function IS NOT NULL
|
||||
UNION
|
||||
SELECT tgt.function FROM xrefs x
|
||||
JOIN instructions src ON src.address = x.source
|
||||
JOIN instructions tgt ON tgt.address = x.target
|
||||
JOIN reach r ON src.function = r.fn
|
||||
WHERE x.kind IN ('call', 'j', 'br')
|
||||
AND tgt.function IS NOT NULL
|
||||
)
|
||||
SELECT fn AS addr FROM reach;
|
||||
";
|
||||
|
||||
/// Convenience join: each function's first decoded instruction. Useful for
|
||||
/// quickly inspecting prologue patterns without computing offsets manually.
///
/// Output columns: `function_addr`, `function_name`, `first_raw`,
/// `first_disasm` and `first_ext_disasm`, the last three taken from the
/// `instructions` row whose `address` equals the function's `address`.
///
/// This is an inner join, so a function with no `instructions` row at its
/// start address does not appear in the view.
|
||||
const V_FUNCTION_FIRST_INSTRUCTION: &str = "
|
||||
CREATE OR REPLACE VIEW v_function_first_instruction AS
|
||||
SELECT
|
||||
f.address AS function_addr,
|
||||
f.name AS function_name,
|
||||
i.raw AS first_raw,
|
||||
i.disasm AS first_disasm,
|
||||
i.ext_disasm AS first_ext_disasm
|
||||
FROM functions f
|
||||
JOIN instructions i ON i.address = f.address;
|
||||
";
|
||||
|
||||
/// Per-function summary of which kernel/library imports it calls. Joins
|
||||
/// xrefs (call edges) against the labels table to surface import names.
///
/// Output columns: `function_addr` (`xrefs.source_func`), `function_name`,
/// `import_addr` (`xrefs.target`) and `import_name` (`labels.name`).
///
/// Row granularity is one row per matching call xref: the query has no
/// `DISTINCT` or `GROUP BY`, so a function that calls the same import from
/// several call sites yields several identical rows. Wrap the view in
/// `SELECT DISTINCT ...` when a de-duplicated list is wanted.
///
/// Only xrefs with `kind = 'call'` whose target carries a label of
/// `kind = 'import'` are kept (inner join on `labels`). The join to
/// `functions` is a `LEFT JOIN`, so `function_name` is NULL when
/// `source_func` matches no `functions.address` row.
|
||||
const V_IMPORTS_CALLED: &str = "
|
||||
CREATE OR REPLACE VIEW v_imports_called AS
|
||||
SELECT
|
||||
x.source_func AS function_addr,
|
||||
f.name AS function_name,
|
||||
x.target AS import_addr,
|
||||
l.name AS import_name
|
||||
FROM xrefs x
|
||||
JOIN labels l ON l.address = x.target
|
||||
LEFT JOIN functions f ON f.address = x.source_func
|
||||
WHERE x.kind = 'call'
|
||||
AND l.kind = 'import';
|
||||
";
|
||||
@@ -53,6 +53,7 @@ pub struct XrefResult {
|
||||
}
|
||||
|
||||
/// Perform full cross-reference analysis on a PE image.
|
||||
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base), entry_point = format_args!("{:#010x}", entry_point)))]
|
||||
pub fn analyze_xrefs(
|
||||
pe: &[u8],
|
||||
image_base: u32,
|
||||
@@ -61,6 +62,7 @@ pub fn analyze_xrefs(
|
||||
func_analysis: &FuncAnalysis,
|
||||
import_map: &HashMap<u32, String>,
|
||||
) -> XrefResult {
|
||||
let started = std::time::Instant::now();
|
||||
let func_labels = func_analysis.generate_labels();
|
||||
let mut labels: HashMap<u32, String> = func_labels;
|
||||
labels.insert(entry_point, "entry_point".to_string());
|
||||
@@ -124,7 +126,7 @@ pub fn analyze_xrefs(
|
||||
let rd = ((instr >> 21) & 0x1F) as usize;
|
||||
let ra = ((instr >> 16) & 0x1F) as usize;
|
||||
let simm = ((instr & 0xFFFF) as i16) as i32;
|
||||
let uimm = (instr & 0xFFFF) as u32;
|
||||
let uimm = instr & 0xFFFF;
|
||||
|
||||
// Reset tracking on function boundaries (prologue = mfspr rN, LR)
|
||||
if opcode == 31 {
|
||||
@@ -181,8 +183,8 @@ pub fn analyze_xrefs(
|
||||
}
|
||||
// Load instructions: lwz, lbz, lhz, lha, lfs, lfd, lwzu, etc.
|
||||
32 | 33 | 34 | 35 | 40 | 41 | 42 | 43 | 46 | 48 | 49 | 50 | 51 => {
|
||||
if ra != 0 {
|
||||
if let Some(base) = reg_hi[ra] {
|
||||
if ra != 0
|
||||
&& let Some(base) = reg_hi[ra] {
|
||||
let data_addr = base.wrapping_add(simm as u32);
|
||||
if is_in_ranges(data_addr, &data_ranges) {
|
||||
data_annotations.insert(abs_addr, (data_addr, XrefKind::DataRead));
|
||||
@@ -190,14 +192,13 @@ pub fn analyze_xrefs(
|
||||
labels.entry(data_addr).or_insert_with(|| format!("dat_{data_addr:08X}"));
|
||||
}
|
||||
}
|
||||
}
|
||||
// Load into rD may clobber the tracked value
|
||||
reg_hi[rd] = None;
|
||||
}
|
||||
// Store instructions: stw, stb, sth, stfs, stfd, stwu, etc.
|
||||
36 | 37 | 38 | 39 | 44 | 45 | 47 | 52 | 53 | 54 | 55 => {
|
||||
if ra != 0 {
|
||||
if let Some(base) = reg_hi[ra] {
|
||||
if ra != 0
|
||||
&& let Some(base) = reg_hi[ra] {
|
||||
let data_addr = base.wrapping_add(simm as u32);
|
||||
if is_in_ranges(data_addr, &data_ranges) {
|
||||
data_annotations.insert(abs_addr, (data_addr, XrefKind::DataWrite));
|
||||
@@ -205,7 +206,6 @@ pub fn analyze_xrefs(
|
||||
labels.entry(data_addr).or_insert_with(|| format!("dat_{data_addr:08X}"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Any other instruction writing to rD: invalidate
|
||||
_ => {
|
||||
@@ -221,6 +221,17 @@ pub fn analyze_xrefs(
|
||||
}
|
||||
}
|
||||
|
||||
let elapsed_ms = started.elapsed().as_millis() as f64;
|
||||
metrics::histogram!("analysis.phase_ms", "phase" => "xrefs").record(elapsed_ms);
|
||||
let total_xrefs: usize = xrefs.values().map(|v| v.len()).sum();
|
||||
tracing::info!(
|
||||
labels = labels.len(),
|
||||
xrefs = total_xrefs,
|
||||
data_annotations = data_annotations.len(),
|
||||
elapsed_ms,
|
||||
"xref analysis complete"
|
||||
);
|
||||
|
||||
XrefResult { labels, xrefs, data_annotations }
|
||||
}
|
||||
|
||||
@@ -262,7 +273,7 @@ fn is_in_ranges(addr: u32, ranges: &[(u32, u32)]) -> bool {
|
||||
}
|
||||
|
||||
/// Find which section a data address falls in.
|
||||
pub fn section_for_addr<'a>(addr: u32, sections: &'a [PeSection], image_base: u32) -> Option<&'a str> {
|
||||
pub fn section_for_addr(addr: u32, sections: &[PeSection], image_base: u32) -> Option<&str> {
|
||||
for s in sections {
|
||||
let start = image_base + s.virtual_address;
|
||||
let end = start + s.virtual_size;
|
||||
@@ -285,12 +296,11 @@ pub fn resolve_source_label(
|
||||
}
|
||||
|
||||
// Find the containing function (largest start <= addr)
|
||||
if let Some((&func_start, _fi)) = func_analysis.functions.range(..=addr).next_back() {
|
||||
if let Some(func_label) = labels.get(&func_start) {
|
||||
if let Some((&func_start, _fi)) = func_analysis.functions.range(..=addr).next_back()
|
||||
&& let Some(func_label) = labels.get(&func_start) {
|
||||
let offset = addr - func_start;
|
||||
return format!("{func_label}+0x{offset:X}");
|
||||
}
|
||||
}
|
||||
|
||||
format!("0x{addr:08X}")
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user