M9.5 + M11.5 + VMX + SJIS/UTF-8: close the post-M5.5 deferred set

Closes the four remaining deferred follow-up items in one bundle. All four are smaller-scope and additive; lockstep determinism unaffected (analyzer-only changes). ## M9.5 — __CxxFrameHandler scope-table parsing - New `xenia_analysis::eh_scope` module. Magic-scans .rdata for the three documented MSVC FuncInfo signatures (0x19930520/21/22) on 4-byte alignment. Each match is parsed as the documented struct (BE u32 fields), with sanity caps on max_state / n_try_blocks / pointer validity. - Walks pUnwindMap (UnwindMapEntry, 8 bytes) and pTryBlockMap (TryBlockMapEntry, 20 bytes) into one row each. - New tables eh_funcinfo, eh_unwind_map, eh_try_blocks. - Sylpheed yield: 2,588 FuncInfo (all version 0x19930522) / 10,019 unwind entries / 315 try-blocks. ## M11.5 — Static-init driver chain detection - New `xenia_analysis::static_init` module. Walks every function looking for the canonical _initterm loop: lwz cursor; mtctr; bcctrl; addi cursor, cursor, 4 bounded by a compare against another constant register. Extracts (array_start, array_end) and reads the array. - Reuses `function_pointer_arrays` table — drivers' arrays land with kind='static_init' (replacing M11's prologue-heuristic output where the structurally-grounded pattern fires). - Sylpheed yield: 0 drivers detected — the binary's static-init structure does not match the canonical CRT loop. Infrastructure ready; future M11.6 can relax. ## VMX vector-store xrefs (M6 follow-up) - Adds AltiVec/VMX X-form load/store XOs to the M6 opcode-31 dispatch: lvx/lvxl/lvebx/lvehx/lvewx (reads) and stvx/stvxl/stvebx/stvehx/stvewx (writes), all addr_mode= 'x_form_indexed'. Static resolution still requires both rA and rB constant. - Sylpheed yield: 110 newly-detected stvx writes. ## Shift_JIS + UTF-8 localised-string detection (M7 follow-up) - Extends `xenia_analysis::strings::analyze` with scan_shift_jis (JIS X 0208 lead/trail byte ranges + half-width katakana pass-through) and scan_utf8 (2- and 3-byte sequences). 
At least one multi-byte unit required so pure-ASCII strings aren't double-counted. - SJIS bytes rendered as \xHH escapes for diagnostic readability; full SJIS→UTF-8 decoding deferred. - Sylpheed yield: 790 Shift_JIS strings (Japanese debug + UI text) + 39 UTF-8. ## Tests - +2 EH (parses_minimal_funcinfo_v0, rejects_bogus_max_state) - +2 static_init (detects_canonical_initterm_loop, rejects_function_without_pattern) - +2 strings (detects_shift_jis_string, detects_utf8_multibyte_string) Tests 649→655 (+6 unit tests). DB schema golden + write_analysis_results signature updated for new EH parameter. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 00:36:53 +02:00
parent b03192c772
commit e428ce33aa
9 changed files with 1159 additions and 14 deletions
--- a/crates/xenia-analysis/src/strings.rs
+++ b/crates/xenia-analysis/src/strings.rs
@@ -14,7 +14,12 @@
 //!   on this big-endian platform); we do NOT try big-endian UTF-16.
 //! - No language detection / classification beyond encoding.
 //!
-//! Reference: `objdump -s` `.rdata` walks rely on the same heuristic.
+//! Extends the original ASCII / UTF-16LE pass with Shift_JIS detection
+//! (Sylpheed is a Japanese title, so this likely surfaces mission/UI text
+//! that an ASCII-only scan misses) and UTF-8 multi-byte detection.
+//!
+//! Reference: `objdump -s` `.rdata` walks rely on the same heuristic;
+//! Shift_JIS lead/trail byte ranges per JIS X 0208.

 use xenia_xex::pe::PeSection;

@@ -23,7 +28,7 @@ use xenia_xex::pe::PeSection;
 pub struct DetectedString {
    /// Absolute VA of the first byte.
    pub address: u32,
-    /// `"ascii"` or `"utf16le"`.
+    /// `"ascii"` | `"utf16le"` | `"shift_jis"` | `"utf8"`.
    pub encoding: &'static str,
    /// Length in bytes (excluding the NUL terminator).
    pub length: u32,
@@ -48,15 +53,21 @@ pub fn analyze(pe: &[u8], image_base: u32, sections: &[PeSection]) -> Vec<Detect

        scan_ascii(bytes, va_base, &mut out);
        scan_utf16le(bytes, va_base, &mut out);
+        scan_shift_jis(bytes, va_base, &mut out);
+        scan_utf8(bytes, va_base, &mut out);
    }

    let elapsed_ms = started.elapsed().as_millis() as f64;
    let n_ascii = out.iter().filter(|s| s.encoding == "ascii").count();
-    let n_utf16 = out.len() - n_ascii;
+    let n_utf16 = out.iter().filter(|s| s.encoding == "utf16le").count();
+    let n_sjis  = out.iter().filter(|s| s.encoding == "shift_jis").count();
+    let n_utf8  = out.iter().filter(|s| s.encoding == "utf8").count();
    metrics::histogram!("analysis.phase_ms", "phase" => "strings").record(elapsed_ms);
    tracing::info!(
        ascii = n_ascii,
        utf16le = n_utf16,
+        shift_jis = n_sjis,
+        utf8 = n_utf8,
        total = out.len(),
        elapsed_ms,
        "string scan complete"
@@ -135,6 +146,134 @@ fn scan_utf16le(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
    }
 }

+/// Returns `true` when `b` can open a two-byte Shift_JIS character.
+///
+/// Per JIS X 0208 the first byte lies in [0x81, 0x9F] ∪ [0xE0, 0xEF] and
+/// the trail byte in [0x40, 0x7E] ∪ [0x80, 0xFC]. Single-byte ASCII and
+/// JIS half-width katakana (0xA1..=0xDF) are not lead bytes; they are
+/// passed through as one-byte units.
+fn is_sjis_lead(b: u8) -> bool {
+    matches!(b, 0x81..=0x9F | 0xE0..=0xEF)
+}
+/// Returns `true` when `b` is a valid second byte of a two-byte
+/// Shift_JIS character: [0x40, 0x7E] ∪ [0x80, 0xFC] (0x7F is excluded).
+fn is_sjis_trail(b: u8) -> bool {
+    matches!(b, 0x40..=0x7E | 0x80..=0xFC)
+}
+/// Returns `true` when `b` stands alone as a one-byte Shift_JIS unit:
+/// either a JIS half-width katakana byte (0xA1..=0xDF) or a byte that
+/// `is_printable_ascii` accepts.
+fn is_sjis_singlebyte(b: u8) -> bool {
+    matches!(b, 0xA1..=0xDF) || is_printable_ascii(b)
+}
+
+/// Scan `bytes` for NUL-terminated Shift_JIS strings and append each hit
+/// to `out`.
+///
+/// A candidate is a run of valid SJIS code units — single-byte ASCII /
+/// half-width katakana, OR a lead+trail pair — that is at least `MIN_LEN`
+/// bytes long and immediately followed by a NUL. At least one double-byte
+/// pair must be present so we don't double-count strings that are purely
+/// single-byte.
+///
+/// `va_base` is the virtual address of `bytes[0]`; a hit's `address` is
+/// `va_base` plus the offset of its first byte, and `length` is the run
+/// length in bytes (NUL excluded).
+///
+/// `content` rendering: ASCII is copied verbatim, half-width katakana
+/// bytes are decoded to their Unicode code points (U+FF61..=U+FF9F), and
+/// double-byte pairs are written as `\xHH\xHH` escapes because no
+/// JIS X 0208 table is shipped here.
+fn scan_shift_jis(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
+    let mut i = 0;
+    while i < bytes.len() {
+        let start = i;
+        let mut has_multibyte = false;
+        let mut nbytes = 0;
+        while i < bytes.len() {
+            let b = bytes[i];
+            if is_sjis_lead(b) && i + 1 < bytes.len() && is_sjis_trail(bytes[i + 1]) {
+                has_multibyte = true;
+                nbytes += 2;
+                i += 2;
+            } else if is_sjis_singlebyte(b) {
+                nbytes += 1;
+                i += 1;
+            } else {
+                break;
+            }
+        }
+        // Require NUL terminator + min length + at least one multi-byte char.
+        if has_multibyte
+            && nbytes >= MIN_LEN
+            && i < bytes.len() && bytes[i] == 0
+        {
+            let raw = &bytes[start..i];
+            // Worst case is 4 output bytes per input byte (`\xHH` per byte
+            // of a pair), so this never reallocates.
+            let mut s = String::with_capacity(raw.len() * 4);
+            let mut p = 0;
+            while p < raw.len() {
+                let b = raw[p];
+                if is_sjis_lead(b) && p + 1 < raw.len() && is_sjis_trail(raw[p + 1]) {
+                    // Render as an SJIS hex pair so the string is
+                    // identifiable even without a decoder. Real kanji/kana
+                    // decoding is a future enhancement.
+                    s.push_str(&format!("\\x{:02X}\\x{:02X}", b, raw[p + 1]));
+                    p += 2;
+                } else if (0xA1..=0xDF).contains(&b) {
+                    // Half-width katakana maps linearly onto
+                    // U+FF61..=U+FF9F. A plain `b as char` would instead
+                    // produce unrelated Latin-1 characters (U+00A1..).
+                    let code_point = 0xFF61 + u32::from(b - 0xA1);
+                    s.push(char::from_u32(code_point).unwrap_or(char::REPLACEMENT_CHARACTER));
+                    p += 1;
+                } else {
+                    s.push(char::from(b));
+                    p += 1;
+                }
+            }
+            out.push(DetectedString {
+                address: va_base + start as u32,
+                encoding: "shift_jis",
+                length: nbytes as u32,
+                content: s,
+            });
+            i += 1; // skip NUL
+        } else {
+            // A run that contained a double-byte pair may parse differently
+            // from the next offset, so retry one byte later (same for an
+            // empty run). A run made only of single-byte units parses the
+            // same from every offset inside it — each acceptance test
+            // depends only on that byte's position — so resume at its end
+            // rather than rescanning it once per byte (quadratic on long
+            // ASCII runs).
+            // NOTE(review): assumes `is_printable_ascii` rejects NUL — confirm.
+            if has_multibyte || i == start {
+                i = start + 1;
+            }
+            if i < bytes.len() && bytes[i] == 0 { i += 1; }
+        }
+    }
+}
+
+/// Scan `bytes` for NUL-terminated UTF-8 strings that carry at least one
+/// multi-byte sequence and append each hit to `out` (pure-ASCII strings
+/// are already caught by `scan_ascii`).
+///
+/// Only printable ASCII and 2-/3-byte sequences extend a run; a 4-byte
+/// (supplementary plane) lead byte ends it, as that is uncommon in game
+/// text. The structural check in the loop is deliberately loose:
+/// `std::str::from_utf8` has the final say, so a run containing overlong
+/// or surrogate encodings is not reported.
+///
+/// `va_base` is the virtual address of `bytes[0]`; a hit's `address` is
+/// `va_base` plus the offset of its first byte, and `length` is the run
+/// length in bytes (NUL excluded).
+fn scan_utf8(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
+    // True when `idx` is in bounds and holds a continuation byte (10xxxxxx).
+    let is_cont = |idx: usize| bytes.get(idx).is_some_and(|&c| (c & 0xC0) == 0x80);
+
+    let mut i = 0;
+    while i < bytes.len() {
+        let start = i;
+        let mut has_multibyte = false;
+        let mut nbytes = 0;
+        while i < bytes.len() {
+            let b = bytes[i];
+            if b < 0x80 {
+                if !is_printable_ascii(b) { break; }
+                nbytes += 1;
+                i += 1;
+            } else if (b & 0xE0) == 0xC0 {
+                // 2-byte: 110xxxxx 10xxxxxx
+                if !is_cont(i + 1) { break; }
+                has_multibyte = true;
+                nbytes += 2;
+                i += 2;
+            } else if (b & 0xF0) == 0xE0 {
+                // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx
+                if !(is_cont(i + 1) && is_cont(i + 2)) { break; }
+                has_multibyte = true;
+                nbytes += 3;
+                i += 3;
+            } else {
+                break;
+            }
+        }
+        if has_multibyte
+            && nbytes >= MIN_LEN
+            && i < bytes.len() && bytes[i] == 0
+            && let Ok(s) = std::str::from_utf8(&bytes[start..i])
+        {
+            out.push(DetectedString {
+                address: va_base + start as u32,
+                encoding: "utf8",
+                length: nbytes as u32,
+                content: s.to_string(),
+            });
+            i += 1; // skip NUL
+        } else {
+            // A run that contained a multi-byte sequence may parse
+            // differently from the next offset, so retry one byte later
+            // (same for an empty run). An ASCII-only run parses the same
+            // from every offset inside it, so resume at its end rather
+            // than rescanning it once per byte (quadratic on long runs).
+            // NOTE(review): assumes `is_printable_ascii` rejects NUL — confirm.
+            if has_multibyte || i == start {
+                i = start + 1;
+            }
+            if i < bytes.len() && bytes[i] == 0 { i += 1; }
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -195,6 +334,37 @@ mod tests {
        assert!(utf16.iter().any(|s| s.content == "Hello!"));
    }

+    /// A mixed ASCII + double-byte SJIS string in `.rdata` is reported
+    /// exactly once with encoding `"shift_jis"`, with the ASCII kept
+    /// verbatim and the pairs shown as hex escapes.
+    #[test]
+    fn detects_shift_jis_string() {
+        // "ABC" + (SJIS hiragana 'a' = 0x82 0xA0) + (SJIS 'i' = 0x82 0xA2) + NUL
+        const PAYLOAD: &[u8] = b"ABC\x82\xA0\x82\xA2\0";
+        const SECTION_OFFSET: usize = 0x1000;
+
+        let mut image = vec![0u8; 0x1100];
+        image[SECTION_OFFSET..SECTION_OFFSET + PAYLOAD.len()].copy_from_slice(PAYLOAD);
+
+        let sections = vec![mk_section(".rdata", 0x1000, 0x100)];
+        let found = analyze(&image, 0x82000000u32, &sections);
+        let hits: Vec<_> = found
+            .iter()
+            .filter(|d| d.encoding == "shift_jis")
+            .collect();
+
+        assert_eq!(hits.len(), 1);
+        let rendered = &hits[0].content;
+        assert!(rendered.contains("ABC"));
+        assert!(rendered.contains("\\x82\\xA0"));
+    }
+
+    /// A NUL-terminated UTF-8 string containing one 2-byte sequence is
+    /// reported exactly once with encoding `"utf8"` and decoded content.
+    #[test]
+    fn detects_utf8_multibyte_string() {
+        // "Caf" + 0xC3 0xA9 (é), followed by more ASCII so the run clears
+        // the minimum length, then the NUL terminator.
+        const PAYLOAD: &[u8] = b"Caf\xC3\xA9eteria\0";
+        const SECTION_OFFSET: usize = 0x1000;
+
+        let mut image = vec![0u8; 0x1100];
+        image[SECTION_OFFSET..SECTION_OFFSET + PAYLOAD.len()].copy_from_slice(PAYLOAD);
+
+        let sections = vec![mk_section(".rdata", 0x1000, 0x100)];
+        let found = analyze(&image, 0x82000000u32, &sections);
+        let hits: Vec<_> = found.iter().filter(|d| d.encoding == "utf8").collect();
+
+        assert_eq!(hits.len(), 1);
+        let expected = "Café".to_string() + "eteria";
+        assert_eq!(hits[0].content, expected);
+    }
+
    #[test]
    fn requires_nul_terminator() {
        let image_base = 0x82000000u32;