xenia-rs/crates/xenia-analysis/src/strings.rs

//! String / constant-pool detection in `.rdata`.
//!
//! Scans the `.rdata` section for NUL-terminated runs of printable ASCII,
//! UTF-16LE, Shift_JIS or UTF-8 text of length ≥ 6, emitting one row per
//! discovered string.
//! Cross-references against `xrefs.target` are computed by the caller —
//! this module only finds the strings; downstream queries can join.
//!
//! ### What this layer does NOT do
//!
//! - No 4-byte (supplementary-plane) UTF-8 detection; only 2- and 3-byte
//!   sequences are recognised. Xbox 360 game binaries mostly use ASCII for
//!   debug strings and UTF-16LE for localised text.
//! - Strings in `.data` (mutable globals) are not scanned by default.
//! - Wide strings on Xbox 360 are little-endian (compiler convention even
//!   on this big-endian platform); we do NOT try big-endian UTF-16.
//! - No language detection / classification beyond encoding.
//!
//! Extends the original ASCII / UTF-16LE pass with Shift_JIS detection
//! (Sylpheed is originally Japanese — likely yields mission/UI text
//! invisible to ASCII-only) and UTF-8 multi-byte detection.
//!
//! Reference: `objdump -s` `.rdata` walks rely on the same heuristic;
//! Shift_JIS lead/trail byte ranges per JIS X 0208.

use xenia_xex::pe::PeSection;

/// One detected string.
///
/// Produced by the per-encoding scanners in this module; every reported
/// string was followed by a NUL terminator in the scanned bytes.
#[derive(Debug, Clone)]
pub struct DetectedString {
    /// Absolute VA of the first byte.
    pub address: u32,
    /// `"ascii"` | `"utf16le"` | `"shift_jis"` | `"utf8"`.
    pub encoding: &'static str,
    /// Length in bytes (excluding the NUL terminator). For `"utf16le"` this
    /// is twice the number of code units.
    pub length: u32,
    /// UTF-8 representation of the string content. For `"shift_jis"` the
    /// double-byte characters are not decoded; each pair is rendered as a
    /// literal `\xHH\xLL` escape.
    pub content: String,
}

/// Scan every `.rdata` section for NUL-terminated ASCII, UTF-16LE, Shift_JIS
/// and UTF-8 strings.
///
/// * `pe` — image bytes, indexed directly by each section's
///   `virtual_address`. NOTE(review): this presumes a memory-layout image
///   rather than the on-disk file layout — confirm against the caller.
/// * `image_base` — added to the section RVA to form absolute addresses.
/// * `sections` — section table; entries not named `.rdata` are ignored.
///
/// Sections whose byte range does not lie inside `pe`, or whose absolute
/// address range does not fit in 32 bits, are skipped rather than panicking.
/// Results are appended per section in pass order (ASCII, UTF-16LE,
/// Shift_JIS, UTF-8); the same bytes may be reported by more than one pass.
///
/// Records the elapsed time in the `analysis.phase_ms` histogram and logs
/// per-encoding counts.
#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
pub fn analyze(pe: &[u8], image_base: u32, sections: &[PeSection]) -> Vec<DetectedString> {
    let started = std::time::Instant::now();
    let mut out: Vec<DetectedString> = Vec::new();

    for section in sections {
        if section.name != ".rdata" { continue; }

        // Compute the end offset in `usize` with overflow checking: a bogus
        // header must not wrap around and yield a reversed slice range.
        let raw_start = section.virtual_address as usize;
        let Some(raw_end) = raw_start.checked_add(section.virtual_size as usize) else {
            continue;
        };
        // `get` rejects ranges that run past the end of the image.
        let Some(bytes) = pe.get(raw_start..raw_end) else { continue; };

        // The scanners compute `va_base + offset` in `u32`; make sure the
        // whole section is addressable before handing it over.
        let va_limit = u64::from(image_base)
            + u64::from(section.virtual_address)
            + u64::from(section.virtual_size);
        if va_limit > u64::from(u32::MAX) + 1 { continue; }
        let va_base = image_base + section.virtual_address;

        scan_ascii(bytes, va_base, &mut out);
        scan_utf16le(bytes, va_base, &mut out);
        scan_shift_jis(bytes, va_base, &mut out);
        scan_utf8(bytes, va_base, &mut out);
    }

    let elapsed_ms = started.elapsed().as_millis() as f64;
    let count_of = |encoding: &str| out.iter().filter(|s| s.encoding == encoding).count();
    let n_ascii = count_of("ascii");
    let n_utf16 = count_of("utf16le");
    let n_sjis = count_of("shift_jis");
    let n_utf8 = count_of("utf8");
    metrics::histogram!("analysis.phase_ms", "phase" => "strings").record(elapsed_ms);
    tracing::info!(
        ascii = n_ascii,
        utf16le = n_utf16,
        shift_jis = n_sjis,
        utf8 = n_utf8,
        total = out.len(),
        elapsed_ms,
        "string scan complete"
    );
    out
}

/// Minimum run length for a candidate to be reported as a string. The ASCII,
/// Shift_JIS and UTF-8 passes compare it against a byte count; the UTF-16LE
/// pass compares it against the number of 16-bit code units.
const MIN_LEN: usize = 6;

/// Returns `true` for bytes that may appear inside a text run: space, the
/// visible ASCII glyphs (0x21..=0x7E), and tab / line feed / carriage return.
fn is_printable_ascii(b: u8) -> bool {
    let is_text_whitespace = b == b' ' || b == b'\t' || b == b'\n' || b == b'\r';
    is_text_whitespace || b.is_ascii_graphic()
}

/// Appends every NUL-terminated run of at least `MIN_LEN` printable ASCII
/// bytes in `bytes` to `out`.
///
/// `va_base` is the absolute address of `bytes[0]`. Runs that end at a
/// non-NUL byte or at the end of the buffer are ignored.
fn scan_ascii(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
    let mut cursor = 0;
    while cursor < bytes.len() {
        // Length of the printable run that begins at `cursor` (possibly zero).
        let run_len = bytes[cursor..]
            .iter()
            .take_while(|&&b| is_printable_ascii(b))
            .count();
        if run_len == 0 {
            cursor += 1;
            continue;
        }

        let end = cursor + run_len;
        let terminated = bytes.get(end) == Some(&0);
        if terminated && run_len >= MIN_LEN {
            let text = std::str::from_utf8(&bytes[cursor..end]).unwrap_or("");
            out.push(DetectedString {
                address: va_base + cursor as u32,
                encoding: "ascii",
                length: run_len as u32,
                content: text.to_string(),
            });
        }

        // Resume after the run, stepping over its terminator when present.
        cursor = if terminated { end + 1 } else { end };
    }
}

/// Appends every NUL-terminated UTF-16LE run of at least `MIN_LEN` code
/// units to `out`, where each code unit is a printable ASCII byte followed
/// by a zero high byte.
///
/// `va_base` is the absolute address of `bytes[0]`. Only even offsets
/// relative to the start of `bytes` are considered, so a wide string that
/// begins on an odd offset is not reported. The reported `length` is in
/// bytes, excluding the two-byte terminator.
fn scan_utf16le(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
    /// A code unit qualifies when it is printable ASCII with a zero high byte.
    fn is_wide_ascii(unit: &[u8]) -> bool {
        unit[1] == 0 && is_printable_ascii(unit[0])
    }

    // `i` starts at 0 and only ever advances by whole code units, so it stays
    // even without an explicit alignment check.
    let mut i = 0;
    while i + 2 <= bytes.len() {
        let unit_count = bytes[i..]
            .chunks_exact(2)
            .take_while(|unit| is_wide_ascii(unit))
            .count();
        if unit_count == 0 {
            i += 2;
            continue;
        }

        let start = i;
        i += unit_count * 2;

        // Require a NUL u16 terminator.
        let nul_terminated = bytes.get(i..i + 2) == Some(&[0u8, 0u8][..]);
        if unit_count >= MIN_LEN && nul_terminated {
            // Every code unit is plain ASCII, so the low bytes are the text.
            // Decoding is deferred until here so rejected runs cost nothing.
            let content: String = bytes[start..i]
                .iter()
                .step_by(2)
                .map(|&lo| char::from(lo))
                .collect();
            out.push(DetectedString {
                address: va_base + start as u32,
                encoding: "utf16le",
                length: (i - start) as u32,
                content,
            });
        }
        // Skip past the terminator.
        if nul_terminated { i += 2; }
    }
}

/// Per JIS X 0208: Shift_JIS first byte ∈ [0x81, 0x9F] ∪ [0xE0, 0xEF];
/// trail byte ∈ [0x40, 0x7E] ∪ [0x80, 0xFC]. Single-byte ASCII and JIS
/// half-width katakana (0xA1..=0xDF) are passed through.
///
/// Returns `true` when `b` falls in the first-byte ranges above.
fn is_sjis_lead(b: u8) -> bool {
    matches!(b, 0x81..=0x9F | 0xE0..=0xEF)
}
/// Returns `true` when `b` can be the second byte of a Shift_JIS double-byte
/// character: 0x40..=0x7E or 0x80..=0xFC (0x7F is excluded).
fn is_sjis_trail(b: u8) -> bool {
    matches!(b, 0x40..=0x7E | 0x80..=0xFC)
}
/// Returns `true` when `b` stands alone in Shift_JIS text: a half-width
/// katakana byte (0xA1..=0xDF) or anything `is_printable_ascii` accepts.
fn is_sjis_singlebyte(b: u8) -> bool {
    matches!(b, 0xA1..=0xDF) || is_printable_ascii(b)
}

/// Scan for Shift_JIS strings — NUL-terminated runs of ≥ `MIN_LEN` bytes
/// consisting of valid SJIS code units (single-byte ASCII / half-width
/// katakana, OR a lead+trail pair). At least one double-byte pair must be
/// present so we don't double-count strings that are purely single-byte.
///
/// `va_base` is the absolute address of `bytes[0]`. The reported `content`
/// is a diagnostic rendering, not a full decode: ASCII is copied through,
/// half-width katakana map to their Unicode code points (U+FF61..=U+FF9F),
/// and each double-byte character becomes a literal `\xHH\xLL` escape.
fn scan_shift_jis(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
    let mut i = 0;
    while i < bytes.len() {
        let start = i;
        // Offset of the first double-byte character in this run, if any.
        let mut first_pair: Option<usize> = None;
        while i < bytes.len() {
            let b = bytes[i];
            if is_sjis_lead(b) && i + 1 < bytes.len() && is_sjis_trail(bytes[i + 1]) {
                if first_pair.is_none() { first_pair = Some(i); }
                i += 2;
            } else if is_sjis_singlebyte(b) {
                i += 1;
            } else {
                break;
            }
        }
        let nbytes = i - start;
        let nul_terminated = i < bytes.len() && bytes[i] == 0;

        // Require NUL terminator + min length + at least one double-byte char.
        if first_pair.is_some() && nbytes >= MIN_LEN && nul_terminated {
            // We don't ship a full SJIS decoder: double-byte characters are
            // rendered as `\xHH\xLL` so the string stays identifiable, and
            // downstream tooling can re-decode from the address if needed.
            let raw = &bytes[start..i];
            let mut s = String::with_capacity(raw.len() * 4);
            let mut p = 0;
            while p < raw.len() {
                let b = raw[p];
                if is_sjis_lead(b) && p + 1 < raw.len() && is_sjis_trail(raw[p + 1]) {
                    s.push_str(&format!("\\x{:02X}\\x{:02X}", b, raw[p + 1]));
                    p += 2;
                } else if (0xA1..=0xDF).contains(&b) {
                    // JIS X 0201 half-width katakana sit at U+FF61..=U+FF9F;
                    // a plain `b as char` would yield Latin-1 mojibake.
                    let kana = char::from_u32(0xFEC0 + u32::from(b))
                        .unwrap_or(char::REPLACEMENT_CHARACTER);
                    s.push(kana);
                    p += 1;
                } else {
                    s.push(char::from(b));
                    p += 1;
                }
            }
            out.push(DetectedString {
                address: va_base + start as u32,
                encoding: "shift_jis",
                length: nbytes as u32,
                content: s,
            });
            i += 1; // skip NUL
        } else {
            // A rejected run stays rejected when leading single-byte units
            // are dropped: it only gets shorter and keeps the same pairs and
            // terminator. So instead of retrying from every byte (quadratic
            // on long ASCII runs) jump straight to the next offset that can
            // parse differently: the trail byte of the first pair, or the
            // end of the run when it held no pair at all.
            i = match first_pair {
                Some(pair_at) => pair_at + 1,
                None if i > start => i,
                None => start + 1,
            };
            if i < bytes.len() && bytes[i] == 0 { i += 1; }
        }
    }
}

/// Scan for UTF-8 strings carrying multi-byte sequences (pure-ASCII runs are
/// already covered by `scan_ascii`). Recognises 2- and 3-byte sequences;
/// 4-byte (supplementary plane) sequences end the run.
///
/// A run is reported when it is NUL-terminated, at least `MIN_LEN` bytes
/// long, contains at least one multi-byte sequence, and passes
/// `std::str::from_utf8` (which rejects overlong forms and surrogates that
/// the structural check lets through). `va_base` is the absolute address of
/// `bytes[0]`.
fn scan_utf8(bytes: &[u8], va_base: u32, out: &mut Vec<DetectedString>) {
    // True when `idx` is in bounds and holds a continuation byte (10xxxxxx).
    let is_cont = |idx: usize| bytes.get(idx).is_some_and(|&c| (c & 0xC0) == 0x80);

    let mut i = 0;
    while i < bytes.len() {
        let start = i;
        // Offset of the first multi-byte sequence in this run, if any.
        let mut first_multibyte: Option<usize> = None;
        while i < bytes.len() {
            let b = bytes[i];
            // Width of the unit starting at `i`; 0 means the run ends here.
            let width = match b {
                0x00..=0x7F if is_printable_ascii(b) => 1,
                // 2-byte: 110xxxxx 10xxxxxx
                0xC0..=0xDF if is_cont(i + 1) => 2,
                // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx
                0xE0..=0xEF if is_cont(i + 1) && is_cont(i + 2) => 3,
                _ => 0,
            };
            if width == 0 { break; }
            if width > 1 && first_multibyte.is_none() { first_multibyte = Some(i); }
            i += width;
        }
        let nbytes = i - start;
        let nul_terminated = i < bytes.len() && bytes[i] == 0;

        let decoded = if first_multibyte.is_some() && nbytes >= MIN_LEN && nul_terminated {
            std::str::from_utf8(&bytes[start..i]).ok()
        } else {
            None
        };

        if let Some(text) = decoded {
            out.push(DetectedString {
                address: va_base + start as u32,
                encoding: "utf8",
                length: nbytes as u32,
                content: text.to_string(),
            });
            i += 1; // skip NUL
        } else {
            // A rejected run stays rejected when leading ASCII bytes are
            // dropped: length only shrinks, and the multi-byte content,
            // terminator and UTF-8 validity are unchanged. So instead of
            // retrying from every byte (quadratic on long ASCII runs) jump
            // to the next offset that can parse differently: the byte after
            // the first multi-byte lead, or the end of the run when it was
            // pure ASCII.
            i = match first_multibyte {
                Some(lead_at) => lead_at + 1,
                None if i > start => i,
                None => start + 1,
            };
            if i < bytes.len() && bytes[i] == 0 { i += 1; }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Image base shared by every fixture.
    const IMAGE_BASE: u32 = 0x82000000;
    /// RVA and size of the single `.rdata` section in every fixture.
    const RDATA_VA: u32 = 0x1000;
    const RDATA_SIZE: u32 = 0x100;

    fn mk_section(name: &str, va: u32, size: u32) -> PeSection {
        PeSection {
            name: name.into(),
            virtual_address: va,
            virtual_size: size,
            raw_offset: va,
            raw_size: size,
            flags: 0x4000_0040,
        }
    }

    /// Builds a zeroed image whose `.rdata` section starts with `payload` and
    /// is otherwise filled with `padding`, then runs the full analysis on it.
    fn scan_rdata(payload: &[u8], padding: u8) -> Vec<DetectedString> {
        let begin = RDATA_VA as usize;
        let end = begin + RDATA_SIZE as usize;
        let mut pe = vec![0u8; end];
        pe[begin..end].fill(padding);
        pe[begin..begin + payload.len()].copy_from_slice(payload);
        let sections = vec![mk_section(".rdata", RDATA_VA, RDATA_SIZE)];
        analyze(&pe, IMAGE_BASE, &sections)
    }

    /// Borrows the hits that were reported with the given encoding tag.
    fn with_encoding<'a>(found: &'a [DetectedString], encoding: &str) -> Vec<&'a DetectedString> {
        found.iter().filter(|s| s.encoding == encoding).collect()
    }

    #[test]
    fn detects_ascii_string() {
        let strings = scan_rdata(b"Hello, world!\0", 0);
        assert_eq!(strings.len(), 1);
        let hit = &strings[0];
        assert_eq!(hit.encoding, "ascii");
        assert_eq!(hit.content, "Hello, world!");
        assert_eq!(hit.address, IMAGE_BASE + 0x1000);
    }

    #[test]
    fn rejects_short_runs() {
        // "Hi" is below the minimum length; only the second string counts.
        let strings = scan_rdata(b"Hi\0longer string here\0", 0);
        assert_eq!(strings.len(), 1);
        assert_eq!(strings[0].content, "longer string here");
    }

    #[test]
    fn detects_utf16le_string() {
        // "Hello!" in UTF-16LE + NUL u16. The interleaved zero bytes keep the
        // ASCII pass from seeing a contiguous run, so only the wide pass is
        // expected to report it.
        let strings = scan_rdata(b"H\0e\0l\0l\0o\0!\0\0\0", 0);
        let utf16 = with_encoding(&strings, "utf16le");
        assert!(utf16.iter().any(|s| s.content == "Hello!"));
    }

    #[test]
    fn detects_shift_jis_string() {
        // "ABC" + (SJIS hiragana 'a' = 0x82 0xA0) + (SJIS 'i' = 0x82 0xA2) + NUL
        let strings = scan_rdata(b"ABC\x82\xA0\x82\xA2\0", 0);
        let sjis = with_encoding(&strings, "shift_jis");
        assert_eq!(sjis.len(), 1);
        assert!(sjis[0].content.contains("ABC"));
        assert!(sjis[0].content.contains("\\x82\\xA0"));
    }

    #[test]
    fn detects_utf8_multibyte_string() {
        // "Caf" + 0xC3 0xA9 (é), then more ASCII to reach the minimum length.
        let strings = scan_rdata(b"Caf\xC3\xA9eteria\0", 0);
        let u8s = with_encoding(&strings, "utf8");
        assert_eq!(u8s.len(), 1);
        assert_eq!(u8s[0].content, "Caféeteria");
    }

    #[test]
    fn requires_nul_terminator() {
        // The rest of the section is 0xFF, so the run ends cleanly but
        // without a NUL — it should NOT be detected.
        let strings = scan_rdata(b"abcdefghij", 0xFF);
        assert_eq!(strings.len(), 0);
    }
}