M1: parse .pdata RUNTIME_FUNCTION; cross-validate function boundaries

Adds an authoritative function-boundary source from the linker:
- New `xenia_xex::pdata` parses .pdata 8-byte entries (BeginAddress + packed
  prolog/length/flags). Bit layout per Microsoft PE32 PowerPC spec: prolog in
  bits 0..7, function_length in bits 8..29, flags in 30..31.
- `func::analyze_with_pdata` unions pdata BeginAddresses into the candidate
  set, attaches `pdata_validated`/`pdata_length` to each `FuncInfo`, and trims
  any function whose `end` overlaps the next start (catches mis-merge where
  one row spanned two prologues — the audit-031 sub_824D23B0/sub_824D29F0
  case).
- DB: extends `functions` with `pdata_validated BOOLEAN`, `pdata_length BIGINT`;
  new table `pdata_entries`; index on pdata_validated.
- New `crates/xenia-analysis/SCHEMA.md` documents M1 layer + forward work.

Validation on Sylpheed: 25481 functions (was 12156) / 23073 pdata_validated /
0 orphans / 0 mis-merges. Audit-031 mis-merge resolved: sub_824D29F0 now has
its own row with `pdata_length=280` (70 dwords); sub_824D23B0 now correctly
ends at 0x824D2878 (`pdata_length=1224` matches prologue walk).

Tests 605→610. New 5-test pdata unit suite covers bit layout + sentinel +
out-of-range filtering + real-world layout round-trip.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-08 19:44:02 +02:00
parent e061e21851
commit 70120465a3
7 changed files with 503 additions and 16 deletions

View File

@@ -314,13 +314,23 @@ impl DbWriter {
) -> anyhow::Result<()> {
self.conn.execute_batch("
CREATE TABLE functions (
address BIGINT PRIMARY KEY, -- absolute VA of entry point
name VARCHAR NOT NULL, -- symbol name, or sub_XXXXXXXX if unresolved
end_address BIGINT NOT NULL, -- VA of last instruction + 4 (exclusive end)
frame_size BIGINT NOT NULL, -- stack frame size in bytes (from prologue)
saved_gprs BIGINT NOT NULL, -- bitmask of GPRs saved in prologue (bit N = rN)
is_leaf BOOLEAN NOT NULL, -- true if the function has no outgoing calls
is_saverestore BOOLEAN NOT NULL -- true if __savegprlr_* / __restgprlr_* stub
address BIGINT PRIMARY KEY, -- absolute VA of entry point
name VARCHAR NOT NULL, -- symbol name, or sub_XXXXXXXX if unresolved
end_address BIGINT NOT NULL, -- VA of last instruction + 4 (exclusive end)
frame_size BIGINT NOT NULL, -- stack frame size in bytes (from prologue)
saved_gprs BIGINT NOT NULL, -- bitmask of GPRs saved in prologue (bit N = rN)
is_leaf BOOLEAN NOT NULL, -- true if the function has no outgoing calls
is_saverestore BOOLEAN NOT NULL, -- true if __savegprlr_* / __restgprlr_* stub
pdata_validated BOOLEAN NOT NULL, -- true if .pdata RUNTIME_FUNCTION exists at this VA
pdata_length BIGINT -- length in bytes per .pdata; NULL if no pdata entry
);
CREATE TABLE pdata_entries (
begin_address BIGINT PRIMARY KEY, -- absolute VA of function start (RUNTIME_FUNCTION.BeginAddress)
end_address BIGINT NOT NULL, -- begin_address + function_length (exclusive)
function_length BIGINT NOT NULL, -- function size in bytes
prolog_length BIGINT NOT NULL, -- prolog size in bytes
flags BIGINT NOT NULL -- raw 2-bit flags (bit 1=32-bit-code, bit 0=exception)
);
CREATE TABLE labels (
@@ -341,11 +351,13 @@ impl DbWriter {
")?;
insert_functions(&self.conn, func_analysis, labels)?;
insert_pdata_entries(&self.conn, &func_analysis.pdata_entries)?;
insert_labels(&self.conn, labels)?;
insert_xrefs_streaming(&self.conn, xrefs, pe, info.image_base, func_analysis, labels)?;
let indices = [
("idx_functions_name", "CREATE INDEX idx_functions_name ON functions(name)"),
("idx_functions_pdata_validated", "CREATE INDEX idx_functions_pdata_validated ON functions(pdata_validated)"),
("idx_labels_kind", "CREATE INDEX idx_labels_kind ON labels(kind)"),
("idx_labels_name", "CREATE INDEX idx_labels_name ON labels(name)"),
("idx_xrefs_target", "CREATE INDEX idx_xrefs_target ON xrefs(target)"),
@@ -680,8 +692,10 @@ fn insert_functions(
labels: &HashMap<u32, String>,
) -> anyhow::Result<()> {
let mut stmt = conn.prepare(
"INSERT INTO functions (address, name, end_address, frame_size, saved_gprs, is_leaf, is_saverestore)
VALUES (?, ?, ?, ?, ?, ?, ?)"
"INSERT INTO functions
(address, name, end_address, frame_size, saved_gprs, is_leaf, is_saverestore,
pdata_validated, pdata_length)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"
)?;
for (&addr, fi) in &func_analysis.functions {
let name = labels.get(&addr)
@@ -695,6 +709,33 @@ fn insert_functions(
fi.saved_gprs as i64,
fi.is_leaf,
fi.is_saverestore,
fi.pdata_validated,
fi.pdata_length.map(|n| n as i64),
])?;
}
Ok(())
}
fn insert_pdata_entries(
conn: &Connection,
entries: &[xenia_xex::pdata::PdataEntry],
) -> anyhow::Result<()> {
if entries.is_empty() {
return Ok(());
}
let mut stmt = conn.prepare(
"INSERT INTO pdata_entries
(begin_address, end_address, function_length, prolog_length, flags)
VALUES (?, ?, ?, ?, ?)
ON CONFLICT DO NOTHING"
)?;
for e in entries {
stmt.execute(params![
e.begin_address as i64,
e.end_address() as i64,
e.function_length as i64,
e.prolog_length as i64,
e.flags as i64,
])?;
}
Ok(())

View File

@@ -32,6 +32,13 @@ pub struct FuncInfo {
pub is_leaf: bool,
/// True if this is a save/restore GPR helper stub.
pub is_saverestore: bool,
/// True if `.pdata` has a RUNTIME_FUNCTION whose `BeginAddress` matches `start`.
/// Authoritative ground truth from the linker; rows without this flag are
/// prologue-detected only and may carry boundary errors.
pub pdata_validated: bool,
/// Function size in bytes per `.pdata`'s `function_length` field, if known.
/// Absent (None) when this row is prologue-only.
pub pdata_length: Option<u32>,
}
/// Result of the function analysis pass.
@@ -42,6 +49,9 @@ pub struct FuncAnalysis {
pub save_gpr_base: Option<u32>,
/// Addresses in the restore-GPR region (start of __restgprlr block).
pub restore_gpr_base: Option<u32>,
/// Raw `.pdata` entries from the binary, in original order. Empty when no
/// `.pdata` was supplied. Mirrored into the DB as `pdata_entries`.
pub pdata_entries: Vec<xenia_xex::pdata::PdataEntry>,
}
// ── Instruction field helpers ──────────────────────────────────────────────
@@ -190,6 +200,29 @@ pub fn analyze(
image_base: u32,
entry_point: u32,
code_sections: &[(u32, u32, u32)], // (va_start, va_size, flags)
) -> FuncAnalysis {
analyze_with_pdata(pe, image_base, entry_point, code_sections, &[])
}
/// Same as [`analyze`] but also unions `.pdata` `RUNTIME_FUNCTION` entries
/// into the candidate set. Each surviving function whose start matches a
/// pdata `BeginAddress` carries `pdata_validated` and `pdata_length` (the
/// linker-supplied length); when that length reaches past the end found by
/// the prologue walk, `end` is extended to match.
///
/// Pdata entries that have no prologue match (orphans) are still emitted,
/// using the linker-supplied length to bound the function.
///
/// What this layer does NOT do:
/// - Does not edit the `prolog_length` we'd derive from prologue analysis;
/// `frame_size` and `saved_gprs` remain best-effort prologue inferences.
/// - Does not infer base/derived call edges — that's M3+M5.
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base), entry_point = format_args!("{:#010x}", entry_point), pdata_entries = pdata.len()))]
pub fn analyze_with_pdata(
pe: &[u8],
image_base: u32,
entry_point: u32,
code_sections: &[(u32, u32, u32)],
pdata: &[xenia_xex::pdata::PdataEntry],
) -> FuncAnalysis {
let started = std::time::Instant::now();
let code_ranges: Vec<(u32, u32)> = code_sections.iter()
@@ -216,7 +249,8 @@ pub fn analyze(
for i in 0..21 { saverestore_addrs.insert(rb + i * 4); }
}
// 2. Collect all bl targets as candidate function entries
// 2. Collect all bl targets as candidate function entries.
// Union: bl targets ∪ pdata BeginAddresses ∪ entry_point.
let mut call_targets: HashSet<u32> = HashSet::new();
call_targets.insert(entry_point);
@@ -233,14 +267,58 @@ pub fn analyze(
addr += 4;
}
}
tracing::debug!(candidates = call_targets.len(), "bl targets collected");
// 3. For each candidate, detect prologue and walk to epilogue
// Index pdata by begin_address for O(1) prologue → length lookup.
let pdata_by_begin: HashMap<u32, &xenia_xex::pdata::PdataEntry> =
pdata.iter().map(|e| (e.begin_address, e)).collect();
for e in pdata {
if !saverestore_addrs.contains(&e.begin_address) {
call_targets.insert(e.begin_address);
}
}
tracing::debug!(
candidates = call_targets.len(),
pdata_entries = pdata.len(),
"function candidates (bl pdata)"
);
// 3. For each candidate, detect prologue and walk to epilogue. Pdata
// metadata is layered on after the prologue walk so a missing prologue
// still yields an entry when pdata covers it.
let mut functions: BTreeMap<u32, FuncInfo> = BTreeMap::new();
for &func_addr in &call_targets {
if let Some(fi) = analyze_function(pe, image_base, func_addr, &code_ranges, save_base, restore_base) {
let pdata_entry = pdata_by_begin.get(&func_addr).copied();
if let Some(mut fi) = analyze_function(
pe, image_base, func_addr, &code_ranges, save_base, restore_base,
) {
if let Some(p) = pdata_entry {
fi.pdata_validated = true;
fi.pdata_length = Some(p.function_length);
// If the prologue walk ended too early, trust pdata's length.
let pdata_end = p.begin_address.wrapping_add(p.function_length);
if pdata_end > fi.end {
fi.end = pdata_end;
}
}
functions.insert(func_addr, fi);
} else if let Some(p) = pdata_entry {
// Orphan: pdata claims a function here but no prologue matched.
// Emit a synthetic entry so the row exists for downstream queries.
functions.insert(
func_addr,
FuncInfo {
start: func_addr,
end: p.begin_address.wrapping_add(p.function_length),
frame_size: 0,
saved_gprs: 0,
is_leaf: false,
is_saverestore: false,
pdata_validated: true,
pdata_length: Some(p.function_length),
},
);
}
}
@@ -255,6 +333,8 @@ pub fn analyze(
saved_gprs: 18,
is_leaf: true,
is_saverestore: true,
pdata_validated: pdata_by_begin.contains_key(&sb),
pdata_length: pdata_by_begin.get(&sb).map(|p| p.function_length),
});
}
if let Some(rb) = restore_base {
@@ -265,13 +345,33 @@ pub fn analyze(
saved_gprs: 18,
is_leaf: true,
is_saverestore: true,
pdata_validated: pdata_by_begin.contains_key(&rb),
pdata_length: pdata_by_begin.get(&rb).map(|p| p.function_length),
});
}
// 5. Fix up `end_address` collisions: if function A's `end` overlaps
// function B's `start` (B > A), trim A. This catches mis-merged
// prologue walks where pdata revealed an interleaved second prologue.
// We do this in a single forward pass.
let starts: Vec<u32> = functions.keys().copied().collect();
for i in 0..starts.len().saturating_sub(1) {
let cur = starts[i];
let next = starts[i + 1];
if let Some(fi) = functions.get_mut(&cur)
&& fi.end > next
{
fi.end = next;
}
}
let elapsed_ms = started.elapsed().as_millis() as f64;
metrics::histogram!("analysis.phase_ms", "phase" => "functions").record(elapsed_ms);
let pdata_validated_count = functions.values().filter(|f| f.pdata_validated).count();
tracing::info!(
functions = functions.len(),
pdata_entries = pdata.len(),
pdata_validated = pdata_validated_count,
elapsed_ms,
"function detection complete"
);
@@ -280,6 +380,7 @@ pub fn analyze(
functions,
save_gpr_base: save_base,
restore_gpr_base: restore_base,
pdata_entries: pdata.to_vec(),
}
}
@@ -395,6 +496,8 @@ fn analyze_function(
saved_gprs,
is_leaf,
is_saverestore: false,
pdata_validated: false,
pdata_length: None,
})
}