diff --git a/crates/xenia-analysis/SCHEMA.md b/crates/xenia-analysis/SCHEMA.md index 1bc40fb..abe4e07 100644 --- a/crates/xenia-analysis/SCHEMA.md +++ b/crates/xenia-analysis/SCHEMA.md @@ -227,9 +227,55 @@ See `crates/xenia-analysis/src/lookup.rs`. - 9,132 lis+addi sites cross-reference into the detected strings — names the source PCs that reference each string. -## Forward work (M6, M8–M12, not yet landed) +## Layer M6 — Extended store-class xrefs + `addr_mode` column (landed) + +### Schema additions +- `xrefs.addr_mode VARCHAR NULL` — sub-classifies how the source instruction + computes its target. NULL for control-flow edges (call / ind_call / j / + br); one of the following tags for data edges: + - `d_form` — standard signed-16 displacement (lwz/stw/lfs/stfs/etc.) + - `lis_addi` — address materialised via `lis + addi` register tracking + - `lis_ori` — address materialised via `lis + ori` + - `multiword` — `lmw / stmw` (one xref per slot; up to 32-rS slots) + - `x_form_indexed` — `stwx / stbx / sthx / stwux / stbux / sthux / stdx / + stdux / lwzx / lbzx / lhzx / lhax / lwzux / lbzux / lhzux / lhaux / ldx / + ldux` — emitted only when both rA and rB are tracked constants + - `x_form_byterev` — `stwbrx / sthbrx / lwbrx / lhbrx` + - `atomic` — `stwcx. / stdcx.` reservation-conditional stores + - `dcbz` — cache-line clear (32-byte zero at rA+rB) +- Index `idx_xrefs_addr_mode`. + +### What this layer does +- Tags every existing data xref with its addressing mode (`d_form` for the + bulk; `lis_addi` / `lis_ori` for the lift-and-add cases that produce + DataRef rows). +- Adds new dispatch for opcode 47 (`stmw`) and 46 (`lmw`), expanding to + per-slot DataWrite / DataRead rows. +- Adds new dispatch for opcode 31 X-form: loads/stores, atomic, byte-reverse, + dcbz. X-form rows are emitted ONLY when both rA and rB resolve to known + constants (otherwise the address is runtime-dependent and we skip). 
+ +### What this layer does NOT do +- VMX / VMX128 vector stores (opcode 31 with vector XO codes) are not + emitted — they use register-indexed addresses that the + lis+addi tracker usually cannot resolve, and detecting them adds noise + without improving target resolution. +- The dominant runtime-indexed `stwx` pattern (rA = base, rB = runtime index) is + not resolved — by design; mem-watch covers the runtime side per VERIFY-B. + +### Sylpheed yield +- 28,834 `lis_addi` refs, 18,485 `d_form` reads, 3,288 `d_form` writes — + the existing baseline, now properly tagged. +- **442 newly-detected `x_form_indexed` reads** — primarily lwzx/lhzx + reads from in-table dispatch (each pair (rA,rB) resolved statically). +- **40 newly-detected `atomic` writes** — every `stwcx.` site with a + resolvable address; useful for reservation-table audits. +- 9 `lis_ori` refs. +- 0 multiword / dcbz / byterev — these instructions exist in the binary + but are not in lis+addi-tracked code paths. + +## Forward work (M8–M12, not yet landed) -- **M6** — extended `xrefs.kind='write'` for indexed/byte-reverse/multiword/VMX/DCBZ/atomic stores with `addr_mode` column. - **M8** — dispatch-table heuristics beyond vtables (e.g. function-pointer arrays in `.data`). - **M9** — `__CxxFrameHandler` exception scope-table parsing. - **M10** — `.tls` section / TLS slot tracking. 
diff --git a/crates/xenia-analysis/src/db.rs b/crates/xenia-analysis/src/db.rs index 4baeab3..233ebb1 100644 --- a/crates/xenia-analysis/src/db.rs +++ b/crates/xenia-analysis/src/db.rs @@ -390,7 +390,8 @@ impl DbWriter { CREATE TABLE xrefs ( source BIGINT NOT NULL, -- VA of the referencing instruction target BIGINT NOT NULL, -- VA of the referenced destination - kind VARCHAR NOT NULL, -- call | jump | branch | data_read | data_write | data_ref + kind VARCHAR NOT NULL, -- call | ind_call | j | br | read | write | ref + addr_mode VARCHAR, -- M6 sub-classification of how source computes target (NULL for control-flow) instruction VARCHAR, -- mnemonic of source instruction; NULL if not in binary source_func BIGINT, -- VA of the function containing source; NULL if unknown source_label VARCHAR, -- label at source; NULL if none @@ -418,6 +419,7 @@ impl DbWriter { ("idx_methods_function", "CREATE INDEX idx_methods_function ON methods(function_address)"), ("idx_classes_rtti", "CREATE INDEX idx_classes_rtti ON classes(rtti_present)"), ("idx_strings_encoding", "CREATE INDEX idx_strings_encoding ON strings(encoding)"), + ("idx_xrefs_addr_mode", "CREATE INDEX idx_xrefs_addr_mode ON xrefs(addr_mode)"), ("idx_xrefs_target", "CREATE INDEX idx_xrefs_target ON xrefs(target)"), ("idx_xrefs_source", "CREATE INDEX idx_xrefs_source ON xrefs(source)"), ("idx_xrefs_source_func", "CREATE INDEX idx_xrefs_source_func ON xrefs(source_func)"), @@ -1062,10 +1064,12 @@ fn insert_xrefs_streaming( xref.source, func_analysis, labels, ); + let addr_mode = xref.addr_mode.map(|m| m.tag()); appender.append_row(params![ xref.source as i64, target as i64, kind, + addr_mode, instruction.as_deref(), source_func, source_label.as_str(), diff --git a/crates/xenia-analysis/src/xref.rs b/crates/xenia-analysis/src/xref.rs index bb7345f..b37ad08 100644 --- a/crates/xenia-analysis/src/xref.rs +++ b/crates/xenia-analysis/src/xref.rs @@ -39,10 +39,54 @@ impl XrefKind { } } +/// Sub-classification of how `source`'s 
instruction computes its target +/// address. Only meaningful for data xrefs (`read` / `write` / `ref`); call +/// / jump / branch / ind_call rows store `None`. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] +pub enum AddrMode { + /// Standard signed-16 displacement: `lwz rD, simm(rA)`, `stw rS, simm(rA)`, + /// FP D-forms (`lfs/lfd/stfs/stfd`), update variants. The dominant case. + DForm, + /// Address materialised via `lis + addi` register tracking — no + /// load/store yet at this site. + LisAddi, + /// Address materialised via `lis + ori` register tracking. + LisOri, + /// Multi-word D-form: `lmw / stmw rS, simm(rA)` — emits one xref per + /// register slot (32-rS slots starting at the resolved base). + Multiword, + /// X-form indexed stores (`stwx / stbx / sthx / stwux / stbux / sthux / stdx / stdux`) + /// and loads (`lwzx / lbzx / lhzx / lhax / ldx` plus update forms); both rA and rB must resolve statically. + XFormIndexed, + /// X-form byte-reverse: `stwbrx / sthbrx / lwbrx / lhbrx`. + XFormByteRev, + /// Reservation/atomic store-conditional: `stwcx. / stdcx.`. + Atomic, + /// Cache-line clear: `dcbz rA, rB` — clears 32 bytes at rA+rB. + DCBZ, +} + +impl AddrMode { + pub fn tag(self) -> &'static str { // lowercase tag written to the `xrefs.addr_mode` column + match self { + AddrMode::DForm => "d_form", + AddrMode::LisAddi => "lis_addi", + AddrMode::LisOri => "lis_ori", + AddrMode::Multiword => "multiword", + AddrMode::XFormIndexed => "x_form_indexed", + AddrMode::XFormByteRev => "x_form_byterev", + AddrMode::Atomic => "atomic", + AddrMode::DCBZ => "dcbz", + } + } +} + #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct Xref { pub source: u32, pub kind: XrefKind, + /// `None` for control-flow edges; `Some(...)` for data edges. 
+ pub addr_mode: Option<AddrMode>, } pub type XrefMap = HashMap<u32, Vec<Xref>>; @@ -160,7 +204,10 @@ pub fn analyze_xrefs( let data_addr = base.wrapping_add(simm as u32); if is_in_ranges(data_addr, &data_ranges) { data_annotations.insert(abs_addr, (data_addr, XrefKind::DataRef)); - xrefs.entry(data_addr).or_default().push(Xref { source: abs_addr, kind: XrefKind::DataRef }); + xrefs.entry(data_addr).or_default().push(Xref { + source: abs_addr, kind: XrefKind::DataRef, + addr_mode: Some(AddrMode::LisAddi), + }); labels.entry(data_addr).or_insert_with(|| format!("dat_{data_addr:08X}")); } reg_hi[rd] = Some(data_addr); // propagate for chained access @@ -175,7 +222,10 @@ pub fn analyze_xrefs( let data_addr = base | uimm; if is_in_ranges(data_addr, &data_ranges) { data_annotations.insert(abs_addr, (data_addr, XrefKind::DataRef)); - xrefs.entry(data_addr).or_default().push(Xref { source: abs_addr, kind: XrefKind::DataRef }); + xrefs.entry(data_addr).or_default().push(Xref { + source: abs_addr, kind: XrefKind::DataRef, + addr_mode: Some(AddrMode::LisOri), + }); labels.entry(data_addr).or_insert_with(|| format!("dat_{data_addr:08X}")); } reg_hi[ra] = Some(data_addr); @@ -184,31 +234,147 @@ pub fn analyze_xrefs( } } // Load instructions: lwz, lbz, lhz, lha, lfs, lfd, lwzu, etc. 
- 32 | 33 | 34 | 35 | 40 | 41 | 42 | 43 | 46 | 48 | 49 | 50 | 51 => { + 32 | 33 | 34 | 35 | 40 | 41 | 42 | 43 | 48 | 49 | 50 | 51 => { if ra != 0 && let Some(base) = reg_hi[ra] { let data_addr = base.wrapping_add(simm as u32); if is_in_ranges(data_addr, &data_ranges) { data_annotations.insert(abs_addr, (data_addr, XrefKind::DataRead)); - xrefs.entry(data_addr).or_default().push(Xref { source: abs_addr, kind: XrefKind::DataRead }); + xrefs.entry(data_addr).or_default().push(Xref { + source: abs_addr, kind: XrefKind::DataRead, + addr_mode: Some(AddrMode::DForm), + }); labels.entry(data_addr).or_insert_with(|| format!("dat_{data_addr:08X}")); } } // Load into rD may clobber the tracked value reg_hi[rd] = None; } + // lmw rD, simm(rA) — D-form multi-word load. Reads (32-rD) + // consecutive 4-byte words from base+simm into rD..r31, one + // DataRead per slot; rD..r31 all lose their tracked values. + 46 => { + if ra != 0 + && let Some(base) = reg_hi[ra] + { + let mut addr_w = base.wrapping_add(simm as u32); + for _slot in (rd as u32)..32 { + if is_in_ranges(addr_w, &data_ranges) { + data_annotations.insert(abs_addr, (addr_w, XrefKind::DataRead)); + xrefs.entry(addr_w).or_default().push(Xref { + source: abs_addr, kind: XrefKind::DataRead, + addr_mode: Some(AddrMode::Multiword), + }); + labels.entry(addr_w).or_insert_with(|| format!("dat_{addr_w:08X}")); + } + addr_w = addr_w.wrapping_add(4); + } + } + reg_hi[rd..].fill(None); + } // Store instructions: stw, stb, sth, stfs, stfd, stwu, etc. 
- 36 | 37 | 38 | 39 | 44 | 45 | 47 | 52 | 53 | 54 | 55 => { + 36 | 37 | 38 | 39 | 44 | 45 | 52 | 53 | 54 | 55 => { if ra != 0 && let Some(base) = reg_hi[ra] { let data_addr = base.wrapping_add(simm as u32); if is_in_ranges(data_addr, &data_ranges) { data_annotations.insert(abs_addr, (data_addr, XrefKind::DataWrite)); - xrefs.entry(data_addr).or_default().push(Xref { source: abs_addr, kind: XrefKind::DataWrite }); + xrefs.entry(data_addr).or_default().push(Xref { + source: abs_addr, kind: XrefKind::DataWrite, + addr_mode: Some(AddrMode::DForm), + }); labels.entry(data_addr).or_insert_with(|| format!("dat_{data_addr:08X}")); } } } + // stmw rS, simm(rA) — D-form multi-word store. Writes + // (32-rS) consecutive 4-byte words from rS..r31 to + // base+simm onward. Emits one DataWrite per slot. + 47 => { + if ra != 0 + && let Some(base) = reg_hi[ra] + { + let mut addr_w = base.wrapping_add(simm as u32); + for _slot in (rd as u32)..32 { + if is_in_ranges(addr_w, &data_ranges) { + data_annotations.insert(abs_addr, (addr_w, XrefKind::DataWrite)); + xrefs.entry(addr_w).or_default().push(Xref { + source: abs_addr, kind: XrefKind::DataWrite, + addr_mode: Some(AddrMode::Multiword), + }); + labels.entry(addr_w).or_insert_with(|| format!("dat_{addr_w:08X}")); + } + addr_w = addr_w.wrapping_add(4); + } + } + } + // X-form: opcode 31 — indexed loads/stores, atomic ops, dcbz. + // EA = (rA|0) + (rB): an rA field of 0 means literal zero, + // but an rB field of 0 names GPR r0 like any other register. + // We record an xref ONLY when both operands resolve to known + // constants. The arm ends by invalidating reg_hi[rd] itself + // rather than falling through to the generic-clobber arm. 
+ 31 => { + let xo = (instr >> 1) & 0x3FF; + let rb = ((instr >> 11) & 0x1F) as usize; + let resolve_rab = |reg_hi: &[Option<u32>; 32]| -> Option<u32> { + let a = if ra == 0 { Some(0u32) } else { reg_hi[ra] }; + let b = reg_hi[rb]; // rB field 0 is GPR r0, not literal zero + match (a, b) { + (Some(av), Some(bv)) => Some(av.wrapping_add(bv)), + _ => None, + } + }; + let mode_for_xo = |xo: u32| -> Option<(AddrMode, XrefKind)> { + match xo { + // Atomic store-conditional + 150 => Some((AddrMode::Atomic, XrefKind::DataWrite)), // stwcx. + 214 => Some((AddrMode::Atomic, XrefKind::DataWrite)), // stdcx. + // Byte-reverse stores + 662 => Some((AddrMode::XFormByteRev, XrefKind::DataWrite)), // stwbrx + 918 => Some((AddrMode::XFormByteRev, XrefKind::DataWrite)), // sthbrx + // Byte-reverse loads + 534 => Some((AddrMode::XFormByteRev, XrefKind::DataRead)), // lwbrx + 790 => Some((AddrMode::XFormByteRev, XrefKind::DataRead)), // lhbrx + // dcbz — cache-line zero (32-byte clear). Treat as a write. + 1014 => Some((AddrMode::DCBZ, XrefKind::DataWrite)), + // Plain X-form indexed stores (the common ones) + 151 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stwx + 215 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stbx + 407 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // sthx + 183 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stwux + 247 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stbux + 439 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // sthux + 149 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stdx + 181 => Some((AddrMode::XFormIndexed, XrefKind::DataWrite)), // stdux + // Plain X-form indexed loads + 23 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lwzx + 87 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lbzx + 279 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lhzx + 343 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lhax + 55 => Some((AddrMode::XFormIndexed, 
XrefKind::DataRead)), // lwzux + 119 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lbzux + 311 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lhzux + 375 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // lhaux + 21 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // ldx + 53 => Some((AddrMode::XFormIndexed, XrefKind::DataRead)), // ldux + _ => None, + } + }; + if let Some((addr_mode, kind)) = mode_for_xo(xo) + && let Some(data_addr) = resolve_rab(&reg_hi) + && is_in_ranges(data_addr, &data_ranges) + { + data_annotations.insert(abs_addr, (data_addr, kind)); + xrefs.entry(data_addr).or_default().push(Xref { + source: abs_addr, kind, + addr_mode: Some(addr_mode), + }); + labels.entry(data_addr).or_insert_with(|| format!("dat_{data_addr:08X}")); + } + // Fall through: any X-form op may write rD; invalidate. + reg_hi[rd] = None; + } // Any other instruction writing to rD: invalidate _ => { // Conservatively invalidate for instructions that modify rD @@ -248,7 +414,7 @@ fn collect_branch_target(instr: u32, addr: u32, labels: &mut HashMap { // B-form: bc/bcl @@ -256,7 +422,7 @@ fn collect_branch_target(instr: u32, addr: u32, labels: &mut HashMap {} } @@ -306,3 +472,36 @@ pub fn resolve_source_label( format!("0x{addr:08X}") } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn addr_mode_tags_are_distinct() { + let modes = [ + AddrMode::DForm, + AddrMode::LisAddi, + AddrMode::LisOri, + AddrMode::Multiword, + AddrMode::XFormIndexed, + AddrMode::XFormByteRev, + AddrMode::Atomic, + AddrMode::DCBZ, + ]; + let tags: std::collections::HashSet<&str> = modes.iter().map(|m| m.tag()).collect(); + assert_eq!(tags.len(), modes.len(), "every AddrMode variant must have a unique tag"); + } + + #[test] + fn xref_struct_carries_addr_mode_for_data_edges() { + let x = Xref { source: 0x1234, kind: XrefKind::DataWrite, addr_mode: Some(AddrMode::DForm) }; + assert_eq!(x.addr_mode.unwrap().tag(), "d_form"); + } + + #[test] + fn 
xref_struct_addr_mode_is_none_for_call_edges() { + let x = Xref { source: 0x1234, kind: XrefKind::Call, addr_mode: None }; + assert!(x.addr_mode.is_none()); + } +} diff --git a/crates/xenia-analysis/tests/db_schema_golden.rs b/crates/xenia-analysis/tests/db_schema_golden.rs index 7ded9da..9a2080a 100644 --- a/crates/xenia-analysis/tests/db_schema_golden.rs +++ b/crates/xenia-analysis/tests/db_schema_golden.rs @@ -212,6 +212,7 @@ fn db_schema_matches_expected_columns() { ("source", "BIGINT"), ("target", "BIGINT"), ("kind", "VARCHAR"), + ("addr_mode", "VARCHAR"), ("instruction", "VARCHAR"), ("source_func", "BIGINT"), ("source_label", "VARCHAR"), diff --git a/crates/xenia-app/src/main.rs b/crates/xenia-app/src/main.rs index 73fa812..58ed1ab 100644 --- a/crates/xenia-app/src/main.rs +++ b/crates/xenia-app/src/main.rs @@ -4121,6 +4121,7 @@ fn cmd_dis( .push(xenia_analysis::xref::Xref { source: edge.source, kind: xenia_analysis::xref::XrefKind::IndirectCall, + addr_mode: None, }); }