//! String / constant-pool detection in `.rdata`. //! //! Scans the `.rdata` section for runs of printable ASCII or null-terminated //! UTF-16LE characters of length ≥ 6, emitting one row per discovered string. //! Cross-references against `xrefs.target` are computed by the caller — //! this module only finds the strings; downstream queries can join. //! //! ### What this layer does NOT do //! //! - No UTF-8 multibyte detection — Xbox 360 game binaries reliably use //! ASCII for debug strings and UTF-16LE for localised text. //! - Strings in `.data` (mutable globals) are not scanned by default. //! - Wide strings on Xbox 360 are little-endian (compiler convention even //! on this big-endian platform); we do NOT try big-endian UTF-16. //! - No language detection / classification beyond encoding. //! //! Extends the original ASCII / UTF-16LE pass with Shift_JIS detection //! (Sylpheed is originally Japanese — likely yields mission/UI text //! invisible to ASCII-only) and UTF-8 multi-byte detection. //! //! Reference: `objdump -s` `.rdata` walks rely on the same heuristic; //! Shift_JIS lead/trail byte ranges per JIS X 0208. use xenia_xex::pe::PeSection; /// One detected string. #[derive(Debug, Clone)] pub struct DetectedString { /// Absolute VA of the first byte. pub address: u32, /// `"ascii"` | `"utf16le"` | `"shift_jis"` | `"utf8"`. pub encoding: &'static str, /// Length in bytes (excluding the NUL terminator). pub length: u32, /// UTF-8 representation of the string content. pub content: String, } /// Scan all `.rdata` sections (and any other read-only data section the user /// configures) for ASCII and UTF-16LE strings. #[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))] pub fn analyze(pe: &[u8], image_base: u32, sections: &[PeSection]) -> Vec { let started = std::time::Instant::now(); let mut out: Vec = Vec::new(); for section in sections { if section.name != ".rdata" { continue; } let raw_start = section.virtual_address as usize; let raw_end = (section.virtual_address + section.virtual_size) as usize; if raw_end > pe.len() { continue; } let bytes = &pe[raw_start..raw_end.min(pe.len())]; let va_base = image_base + section.virtual_address; scan_ascii(bytes, va_base, &mut out); scan_utf16le(bytes, va_base, &mut out); scan_shift_jis(bytes, va_base, &mut out); scan_utf8(bytes, va_base, &mut out); } let elapsed_ms = started.elapsed().as_millis() as f64; let n_ascii = out.iter().filter(|s| s.encoding == "ascii").count(); let n_utf16 = out.iter().filter(|s| s.encoding == "utf16le").count(); let n_sjis = out.iter().filter(|s| s.encoding == "shift_jis").count(); let n_utf8 = out.iter().filter(|s| s.encoding == "utf8").count(); metrics::histogram!("analysis.phase_ms", "phase" => "strings").record(elapsed_ms); tracing::info!( ascii = n_ascii, utf16le = n_utf16, shift_jis = n_sjis, utf8 = n_utf8, total = out.len(), elapsed_ms, "string scan complete" ); out } const MIN_LEN: usize = 6; fn is_printable_ascii(b: u8) -> bool { // Printable + the common whitespace characters used in real strings. matches!(b, 0x20..=0x7E | b'\t' | b'\n' | b'\r') } fn scan_ascii(bytes: &[u8], va_base: u32, out: &mut Vec) { let mut i = 0; while i < bytes.len() { if !is_printable_ascii(bytes[i]) { i += 1; continue; } let start = i; while i < bytes.len() && is_printable_ascii(bytes[i]) { i += 1; } let run_len = i - start; // Require NUL termination and minimum length. if run_len >= MIN_LEN && i < bytes.len() && bytes[i] == 0 { let s = std::str::from_utf8(&bytes[start..i]).unwrap_or(""); out.push(DetectedString { address: va_base + start as u32, encoding: "ascii", length: run_len as u32, content: s.to_string(), }); } // Skip the NUL (if any) before continuing. if i < bytes.len() && bytes[i] == 0 { i += 1; } } } fn scan_utf16le(bytes: &[u8], va_base: u32, out: &mut Vec) { // UTF-16LE strings are 2-byte aligned in MSVC output. Walk on even // offsets to avoid misaligned hits. let mut i = 0; while i + 2 <= bytes.len() { if !i.is_multiple_of(2) { i += 1; continue; } let lo = bytes[i]; let hi = bytes[i + 1]; // Restrict scan-start to printable ASCII range with a zero high byte — // this is what real Xbox 360 wide strings look like. if hi != 0 || !is_printable_ascii(lo) { i += 2; continue; } let start = i; let mut codeunits: Vec = Vec::new(); while i + 2 <= bytes.len() { let l = bytes[i]; let h = bytes[i + 1]; if h != 0 || !is_printable_ascii(l) { break; } codeunits.push((h as u16) << 8 | l as u16); i += 2; } // Require NUL u16 terminator. let nul_terminated = i + 2 <= bytes.len() && bytes[i] == 0 && bytes[i + 1] == 0; if codeunits.len() >= MIN_LEN && nul_terminated { let s: String = String::from_utf16_lossy(&codeunits); out.push(DetectedString { address: va_base + start as u32, encoding: "utf16le", length: ((i - start) as u32), content: s, }); } // Skip past the terminator. if nul_terminated { i += 2; } } } /// Per JIS X 0208: Shift_JIS first byte ∈ [0x81, 0x9F] ∪ [0xE0, 0xEF]; /// trail byte ∈ [0x40, 0x7E] ∪ [0x80, 0xFC]. Single-byte ASCII and JIS /// half-width katakana (0xA1..=0xDF) are passed through. fn is_sjis_lead(b: u8) -> bool { (0x81..=0x9F).contains(&b) || (0xE0..=0xEF).contains(&b) } fn is_sjis_trail(b: u8) -> bool { (0x40..=0x7E).contains(&b) || (0x80..=0xFC).contains(&b) } fn is_sjis_singlebyte(b: u8) -> bool { is_printable_ascii(b) || (0xA1..=0xDF).contains(&b) } /// Scan for Shift_JIS strings — runs of ≥ 6 bytes consisting of valid /// SJIS code units (single-byte ASCII / half-width katakana, OR a /// lead+trail pair). At least one multi-byte pair must be present so we /// don't double-count strings that are purely ASCII. fn scan_shift_jis(bytes: &[u8], va_base: u32, out: &mut Vec) { let mut i = 0; while i < bytes.len() { let start = i; let mut has_multibyte = false; let mut nbytes = 0; while i < bytes.len() { let b = bytes[i]; if is_sjis_lead(b) && i + 1 < bytes.len() && is_sjis_trail(bytes[i + 1]) { has_multibyte = true; nbytes += 2; i += 2; } else if is_sjis_singlebyte(b) { nbytes += 1; i += 1; } else { break; } } // Require NUL terminator + min length + at least one multi-byte char. if has_multibyte && nbytes >= MIN_LEN && i < bytes.len() && bytes[i] == 0 { // Decode SJIS → UTF-8 best-effort. We don't ship a full // SJIS decoder; keep the bytes as a `\u{XX}\u{YY}…` style // rendering for diagnostic readability, and let downstream // tooling re-decode if needed. let raw = &bytes[start..i]; let mut s = String::with_capacity(raw.len() * 4); let mut p = 0; while p < raw.len() { let b = raw[p]; if is_sjis_lead(b) && p + 1 < raw.len() && is_sjis_trail(raw[p + 1]) { // Render as SJIS hex pair so the string is identifiable // even without a decoder. Real Japanese decoding is a // future enhancement. s.push_str(&format!("\\x{:02X}\\x{:02X}", b, raw[p + 1])); p += 2; } else { s.push(b as char); p += 1; } } out.push(DetectedString { address: va_base + start as u32, encoding: "shift_jis", length: nbytes as u32, content: s, }); i += 1; // skip NUL } else { // Advance past whatever didn't match. i = start + 1; if i < bytes.len() && bytes[i] == 0 { i += 1; } } } } /// Scan for UTF-8 strings carrying multi-byte sequences (we already /// catch pure-ASCII via `scan_ascii`). Validates 2/3-byte sequences; /// 4-byte (supplementary plane) is uncommon in game text and skipped. fn scan_utf8(bytes: &[u8], va_base: u32, out: &mut Vec) { let mut i = 0; while i < bytes.len() { let start = i; let mut has_multibyte = false; let mut nbytes = 0; while i < bytes.len() { let b = bytes[i]; if b < 0x80 { if !is_printable_ascii(b) { break; } nbytes += 1; i += 1; } else if (b & 0xE0) == 0xC0 { // 2-byte: 110xxxxx 10xxxxxx if i + 1 >= bytes.len() || (bytes[i + 1] & 0xC0) != 0x80 { break; } has_multibyte = true; nbytes += 2; i += 2; } else if (b & 0xF0) == 0xE0 { // 3-byte: 1110xxxx 10xxxxxx 10xxxxxx if i + 2 >= bytes.len() || (bytes[i + 1] & 0xC0) != 0x80 || (bytes[i + 2] & 0xC0) != 0x80 { break; } has_multibyte = true; nbytes += 3; i += 3; } else { break; } } if has_multibyte && nbytes >= MIN_LEN && i < bytes.len() && bytes[i] == 0 && let Ok(s) = std::str::from_utf8(&bytes[start..i]) { out.push(DetectedString { address: va_base + start as u32, encoding: "utf8", length: nbytes as u32, content: s.to_string(), }); i += 1; // skip NUL } else { i = start + 1; if i < bytes.len() && bytes[i] == 0 { i += 1; } } } } #[cfg(test)] mod tests { use super::*; fn mk_section(name: &str, va: u32, size: u32) -> PeSection { PeSection { name: name.into(), virtual_address: va, virtual_size: size, raw_offset: va, raw_size: size, flags: 0x4000_0040, } } #[test] fn detects_ascii_string() { let image_base = 0x82000000u32; let mut pe = vec![0u8; 0x1100]; let off = 0x1000usize; let s = b"Hello, world!\0"; pe[off..off + s.len()].copy_from_slice(s); let sections = vec![mk_section(".rdata", 0x1000, 0x100)]; let strings = analyze(&pe, image_base, §ions); assert_eq!(strings.len(), 1); assert_eq!(strings[0].encoding, "ascii"); assert_eq!(strings[0].content, "Hello, world!"); assert_eq!(strings[0].address, image_base + 0x1000); } #[test] fn rejects_short_runs() { let image_base = 0x82000000u32; let mut pe = vec![0u8; 0x1100]; let off = 0x1000usize; let s = b"Hi\0longer string here\0"; pe[off..off + s.len()].copy_from_slice(s); let sections = vec![mk_section(".rdata", 0x1000, 0x100)]; let strings = analyze(&pe, image_base, §ions); assert_eq!(strings.len(), 1); assert_eq!(strings[0].content, "longer string here"); } #[test] fn detects_utf16le_string() { let image_base = 0x82000000u32; let mut pe = vec![0u8; 0x1100]; let off = 0x1000usize; // "Hello!" in UTF-16LE + NUL u16 let s: &[u8] = b"H\0e\0l\0l\0o\0!\0\0\0"; pe[off..off + s.len()].copy_from_slice(s); let sections = vec![mk_section(".rdata", 0x1000, 0x100)]; let strings = analyze(&pe, image_base, §ions); // Both ASCII and UTF-16 may detect — UTF-16 should find it as wide; // ASCII pass scans bytes and won't see this as a contiguous run // because of the interleaved 0 bytes (non-printable). let utf16: Vec<_> = strings.iter().filter(|s| s.encoding == "utf16le").collect(); assert!(utf16.iter().any(|s| s.content == "Hello!")); } #[test] fn detects_shift_jis_string() { let image_base = 0x82000000u32; let mut pe = vec![0u8; 0x1100]; let off = 0x1000usize; // "ABC" + (SJIS hiragana 'a' = 0x82 0xA0) + (SJIS 'i' = 0x82 0xA2) + NUL let s: &[u8] = b"ABC\x82\xA0\x82\xA2\0"; pe[off..off + s.len()].copy_from_slice(s); let sections = vec![mk_section(".rdata", 0x1000, 0x100)]; let strings = analyze(&pe, image_base, §ions); let sjis: Vec<_> = strings.iter().filter(|s| s.encoding == "shift_jis").collect(); assert_eq!(sjis.len(), 1); assert!(sjis[0].content.contains("ABC")); assert!(sjis[0].content.contains("\\x82\\xA0")); } #[test] fn detects_utf8_multibyte_string() { let image_base = 0x82000000u32; let mut pe = vec![0u8; 0x1100]; let off = 0x1000usize; // "Café" = 'C', 'a', 'f', 0xC3 0xA9 (é), then more ASCII to reach min length let s: &[u8] = b"Caf\xC3\xA9eteria\0"; pe[off..off + s.len()].copy_from_slice(s); let sections = vec![mk_section(".rdata", 0x1000, 0x100)]; let strings = analyze(&pe, image_base, §ions); let u8s: Vec<_> = strings.iter().filter(|s| s.encoding == "utf8").collect(); assert_eq!(u8s.len(), 1); assert_eq!(u8s[0].content, "Café".to_string() + "eteria"); } #[test] fn requires_nul_terminator() { let image_base = 0x82000000u32; let mut pe = vec![0u8; 0x1100]; // No trailing NUL — should NOT be detected. let off = 0x1000usize; let s = b"abcdefghij"; pe[off..off + s.len()].copy_from_slice(s); // Fill rest of section with 0xFF so the run terminates cleanly without NUL. for j in off + s.len()..off + 0x100 { pe[j] = 0xFF; } let sections = vec![mk_section(".rdata", 0x1000, 0x100)]; let strings = analyze(&pe, image_base, §ions); assert_eq!(strings.len(), 0); } }