Files
xenia-rs/crates/xenia-analysis/src/sql_views.rs
MechaCat02 ab4fe211e5 M5+M7: indirect-dispatch reachability + .rdata string detection
Two MEDIUM milestones bundled (both opportunistic per plan; both small).

## M5 — indirect-dispatch reachability

- `xenia_analysis::indirect`: per-basic-block register tracker over each
  detected function. Recognises the canonical static-vtable pattern
  `lis+addi → lwz off(rA) → mtctr → bcctrl` where rA holds a known M3
  vtable address. Emits one `Xref { kind: IndirectCall }` per resolvable
  bcctrl site.
- PowerPC ABI awareness: `bl`-style calls clobber volatile r0..r12 + ctr
  but preserve non-volatile r13..r31, so a vtable pointer parked in r30/r31
  before a call survives.
- Label-based basic-block boundaries kill register state — bounds
  false-positive risk for jump-IN paths.
- New `XrefKind::IndirectCall` variant (DB tag `'ind_call'`).
- New SQL view `v_indirect_reachability_from_entry` — a superset of
  `v_reachability_from_entry` (identical when no `ind_call` edges exist),
  additionally following `ind_call` edges in the BFS.

Sylpheed yield: 0 edges detected. The binary's 1,001 static lis+addi
references into vtables are nearly all constructor-side vptr writes, not
dispatches; real method dispatch goes through `this->vptr` which requires
alias analysis we explicitly don't do. Documented in SCHEMA.md as the
expected limitation. Three unit tests cover the synthetic-correctness path.

## M7 — string / constant-pool detection

- `xenia_analysis::strings`: scans `.rdata` for runs of ≥ 6 printable
  ASCII bytes (NUL-terminated) and ≥ 6 UTF-16LE code units (basic-plane
  printable ASCII, NUL u16 terminator).
- New `strings(address PK, encoding, length, content)` table + encoding index.
- Implicit cross-ref via existing `xrefs.kind='ref'` rows whose target
  matches a strings.address.

Sylpheed yield: 6,311 ASCII strings (including embedded HLSL shader source
and AS_CB_SURFACE_SWIZZLE_* assertion strings). 9,132 lis+addi sites
cross-reference detected strings — names source PCs near each string in
one query. Four unit tests cover encoding detection, NUL termination, and
short-run rejection.

Tests 626→633 (+3 indirect, +4 strings).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 21:22:50 +02:00

166 lines
6.8 KiB
Rust

//! Additive SQL views over the Phase-3 ingest tables.
//!
//! These views are created when `--analyze=sql` or `--analyze=both` is set.
//! They are *not* a replacement for the Rust passes ([`crate::xref`],
//! [`crate::func`]) — those still own data-ref resolution and prologue
//! pattern matching. The views cover the cleanly-relational parts:
//!
//! - branch xrefs (self-join on `instructions.target_hex`)
//! - call graph + reachability (recursive CTE over `xrefs`)
//! - convenience joins (function-first-instruction, imports-called)
//!
//! All views are read-only and stable across re-creation: dropping and
//! recreating the database via [`crate::db::DbWriter::open_fresh`] re-runs
//! these definitions.
//!
//! ## Cross-check semantics
//!
//! `v_branch_xrefs` is intended to produce *exactly* the same `(source,
//! target, kind)` tuples as the Rust `xref.rs` first pass — given the same
//! input image. [`crate::db::DbWriter::cross_check_branch_xrefs`] queries
//! the symmetric difference and returns the row counts; both should be
//! zero. A non-zero count means the formatter's `mnemonic` column or the
//! kind-classification CASE drifted out of agreement with `xref.rs`, and
//! is worth a one-line warning at log time.
/// `(view_name, CREATE VIEW … SQL)` pairs in the order they must run.
///
/// The first tuple element is the view's name and the second is the full
/// `CREATE OR REPLACE VIEW` statement that defines it.
///
/// Later views may depend on earlier ones (e.g. `v_call_graph` reads
/// `xrefs`, which is the Rust-pass table; `v_branch_xrefs` is independent).
/// As currently written, every definition in this list selects only from
/// the base tables (`instructions`, `xrefs`, `labels`, `functions`) and
/// none selects from another view, so the ordering only becomes
/// load-bearing once a view-on-view dependency is introduced. Keep new
/// entries after anything they read from.
pub const ALL_VIEWS: &[(&str, &str)] = &[
("v_branch_xrefs", V_BRANCH_XREFS),
("v_call_graph", V_CALL_GRAPH),
("v_reachability_from_entry", V_REACHABILITY_FROM_ENTRY),
("v_indirect_reachability_from_entry", V_INDIRECT_REACHABILITY_FROM_ENTRY),
("v_function_first_instruction", V_FUNCTION_FIRST_INSTRUCTION),
("v_imports_called", V_IMPORTS_CALLED),
];
/// Branch cross-references derived purely from `instructions.target_hex`.
///
/// Mirrors the kind classification in [`crate::xref::collect_branch_target`]
/// and the short tags returned by [`crate::xref::XrefKind::tag`] (which are
/// what `xrefs.kind` actually stores):
/// - I-form (`b`/`bl`/`ba`/`bla`): `bl`/`bla` → `"call"`, `b`/`ba` → `"j"`
/// - B-form (`bc`/`bcl`/`bca`/`bcla`): always → `"br"`
///
/// Any other mnemonic that carries a non-NULL `target_hex` also maps to
/// `"br"` through the `ELSE` arm, so the explicit B-form `WHEN` arm yields
/// the same result as the fallback. Because this view is cross-checked
/// against the Rust pass, change the `CASE` only together with `xref.rs`.
///
/// Indirect branches (`bclr`/`bcctr`) leave `target_hex` NULL and are
/// excluded from this view by design.
///
/// Output columns:
/// - `source` — `instructions.address` of the branch instruction
/// - `target` — `instructions.target_hex`, passed through unchanged
/// - `kind` — `'call'`, `'j'` or `'br'` as classified above
/// - `instruction` — the row's `mnemonic`
/// - `source_func` — the row's `function` value
const V_BRANCH_XREFS: &str = "
CREATE OR REPLACE VIEW v_branch_xrefs AS
SELECT
address AS source,
target_hex AS target,
CASE
WHEN mnemonic IN ('bl', 'bla') THEN 'call'
WHEN mnemonic IN ('b', 'ba') THEN 'j'
WHEN mnemonic IN ('bc', 'bcl', 'bca', 'bcla') THEN 'br'
ELSE 'br'
END AS kind,
mnemonic AS instruction,
function AS source_func
FROM instructions
WHERE target_hex IS NOT NULL;
";
/// Call-graph edges resolved against function names.
///
/// Reads from `xrefs` (the Rust-pass table) — this is the canonical source
/// for *all* edge kinds, including indirect/data; SQL can't reconstruct the
/// data-ref edges cleanly because they require register tracking. For pure
/// branch edges, `v_branch_xrefs` produces equivalent rows directly from
/// `instructions`.
///
/// Only rows with `kind = 'call'` are selected, so `edge_kind` is always
/// `'call'` and other kinds stored in `xrefs` (for example `'ind_call'`)
/// do not appear in this view.
///
/// Output columns:
/// - `caller_addr` — `xrefs.source`, i.e. the address of the xref's source
///   row, not the address of the enclosing function
/// - `caller_name` — name of the function whose `address` equals
///   `xrefs.source_func`
/// - `callee_addr` — `xrefs.target`
/// - `callee_name` — name of the function whose `address` equals the target
/// - `edge_kind` — `xrefs.kind`
///
/// Both name lookups are `LEFT JOIN`s: an edge is kept even when no
/// `functions` row matches, in which case the corresponding name is NULL.
const V_CALL_GRAPH: &str = "
CREATE OR REPLACE VIEW v_call_graph AS
SELECT
x.source AS caller_addr,
cf.name AS caller_name,
x.target AS callee_addr,
tf.name AS callee_name,
x.kind AS edge_kind
FROM xrefs x
LEFT JOIN functions cf ON cf.address = x.source_func
LEFT JOIN functions tf ON tf.address = x.target
WHERE x.kind = 'call';
";
/// Transitive function-level reachability from the entry point over
/// call/jump/branch edges. Useful for finding dead code
/// (`SELECT address FROM functions
/// WHERE address NOT IN (SELECT addr FROM v_reachability_from_entry)`)
/// and for scoping analysis to the live subset.
///
/// Seeds from the function containing the `entry_point` label and walks
/// the recursive closure: a reachable function's instructions branch into
/// the functions enclosing the branch targets, which are then reachable
/// in turn. `UNION` (not `UNION ALL`) deduplicates to handle call-graph
/// cycles (recursive functions, mutually-recursive pairs).
///
/// The view exposes a single column, `addr`, holding the
/// `instructions.function` value of each reachable function. NULL never
/// appears in it, because both the seed and the recursive step filter on
/// `function IS NOT NULL`; that keeps the `NOT IN` query above well-behaved.
///
/// An xref contributes an edge only when both its `source` and its `target`
/// match an `instructions.address` (inner joins); edges pointing outside
/// the decoded instruction set are silently skipped. Intra-function `'j'`
/// and `'br'` edges simply re-derive the function they start in and are
/// absorbed by the `UNION` deduplication.
const V_REACHABILITY_FROM_ENTRY: &str = "
CREATE OR REPLACE VIEW v_reachability_from_entry AS
WITH RECURSIVE reach(fn) AS (
SELECT i.function FROM instructions i
JOIN labels l ON l.address = i.address
WHERE l.name = 'entry_point' AND i.function IS NOT NULL
UNION
SELECT tgt.function FROM xrefs x
JOIN instructions src ON src.address = x.source
JOIN instructions tgt ON tgt.address = x.target
JOIN reach r ON src.function = r.fn
WHERE x.kind IN ('call', 'j', 'br')
AND tgt.function IS NOT NULL
)
SELECT fn AS addr FROM reach;
";
/// Reachability extended over `kind='ind_call'` edges from M5. A superset
/// of `v_reachability_from_entry` — every fn there is also here, plus any
/// function reached only via a vtable bcctrl whose vtable+slot the M5
/// dataflow could resolve. The two views return identical rows when
/// `xrefs` holds no `'ind_call'` edges leaving a reachable function, so
/// the superset is not necessarily a strict one.
///
/// Sample 5 newly-reachable PCs in canary before trusting widely; the
/// analysis intentionally leaves out alias-dependent indirect calls
/// (vtable loaded from a `this` field).
///
/// The definition is the same recursive CTE as `v_reachability_from_entry`
/// with `'ind_call'` added to the accepted `xrefs.kind` list; keep the two
/// statements in sync when changing either. The single output column is
/// `addr`.
const V_INDIRECT_REACHABILITY_FROM_ENTRY: &str = "
CREATE OR REPLACE VIEW v_indirect_reachability_from_entry AS
WITH RECURSIVE reach(fn) AS (
SELECT i.function FROM instructions i
JOIN labels l ON l.address = i.address
WHERE l.name = 'entry_point' AND i.function IS NOT NULL
UNION
SELECT tgt.function FROM xrefs x
JOIN instructions src ON src.address = x.source
JOIN instructions tgt ON tgt.address = x.target
JOIN reach r ON src.function = r.fn
WHERE x.kind IN ('call', 'ind_call', 'j', 'br')
AND tgt.function IS NOT NULL
)
SELECT fn AS addr FROM reach;
";
/// Convenience join: each function's first decoded instruction. Useful for
/// quickly inspecting prologue patterns without computing offsets manually.
///
/// "First" means the `instructions` row whose `address` equals the
/// function's `address`. The join is an inner join, so a function with no
/// instruction row at its start address does not appear in this view.
///
/// Output columns: `function_addr`, `function_name`, and the matching
/// instruction's `raw`, `disasm` and `ext_disasm` values (exposed as
/// `first_raw`, `first_disasm`, `first_ext_disasm`).
const V_FUNCTION_FIRST_INSTRUCTION: &str = "
CREATE OR REPLACE VIEW v_function_first_instruction AS
SELECT
f.address AS function_addr,
f.name AS function_name,
i.raw AS first_raw,
i.disasm AS first_disasm,
i.ext_disasm AS first_ext_disasm
FROM functions f
JOIN instructions i ON i.address = f.address;
";
/// Per-function listing of which kernel/library imports it calls. Joins
/// xrefs (call edges) against the labels table to surface import names.
///
/// A row is produced for every `xrefs` row with `kind = 'call'` whose
/// `target` matches a label of `kind = 'import'`. There is no `DISTINCT`
/// or `GROUP BY`, so a function that calls the same import from several
/// sites yields one row per call xref; aggregate in the consuming query
/// if a deduplicated summary is wanted.
///
/// Output columns:
/// - `function_addr` — `xrefs.source_func`
/// - `function_name` — name of the matching `functions` row, or NULL when
///   none matches (`LEFT JOIN`)
/// - `import_addr` — `xrefs.target`
/// - `import_name` — the import label's `name`
const V_IMPORTS_CALLED: &str = "
CREATE OR REPLACE VIEW v_imports_called AS
SELECT
x.source_func AS function_addr,
f.name AS function_name,
x.target AS import_addr,
l.name AS import_name
FROM xrefs x
JOIN labels l ON l.address = x.target
LEFT JOIN functions f ON f.address = x.source_func
WHERE x.kind = 'call'
AND l.kind = 'import';
";