""" Scrapes xenia-rs source files for per-instruction references and snippets of the interpreter semantics. Outputs produced for each mnemonic: - opcode_line: line in crates/xenia-cpu/src/opcode.rs where the PpcOpcode variant is declared (1-indexed) - decoder_line: line in crates/xenia-cpu/src/decoder.rs where the variant is produced from raw bits - interp_start: line in crates/xenia-cpu/src/interpreter.rs where the match arm `PpcOpcode:: =>` begins - interp_end: line where the arm closes (matching brace, naive) - interp_body: raw text of the arm body (for reviewer reference) The xenia-rs opcode identifier often has trailing `x` preserved (PpcOpcode::addx) — this scraper matches on the XML mnemonic directly plus a stripped alternative without trailing 'x' and the xenia-style identifier forms. """ from __future__ import annotations from dataclasses import dataclass from pathlib import Path import re @dataclass class RustRef: mnem: str opcode_line: int | None = None decoder_line: int | None = None interp_start: int | None = None interp_end: int | None = None interp_body: str = "" # PpcOpcode identifiers in xenia-rs match the XML mnemonic *exactly* except # that '.' is illegal in Rust identifiers. Mnemonics ending in '.' appear as # a trailing 'x' replacement in some cases but the codebase seems to keep the # XML name verbatim (e.g. addic. → addicx OR addic_). Check the codebase. def _rust_ident(mnem: str) -> str: """Convert XML mnemonic to the xenia-rs PpcOpcode variant name.""" # Xenia-rs uses the same name as xenia-canary's opcode enum, which # mirrors ppc-instructions.xml directly. '.' is replaced with 'x' in # the opcode enum (e.g. 'addic.' → 'addicx'), but the XML entry is # already 'addic.'. We only need to handle that single case. return mnem.replace(".", "x") class RustScraper: def __init__(self, repo_root: Path): self.repo_root = repo_root self.cpu_root = repo_root / "xenia-rs" / "crates" / "xenia-cpu" / "src" self._opcode_lines = self._read_lines(self.cpu_root / "opcode.rs") self._decoder_lines = self._read_lines(self.cpu_root / "decoder.rs") self._interp_lines = self._read_lines(self.cpu_root / "interpreter.rs") self._opcode_index: dict[str, int] = self._index_opcode_enum() self._decoder_index: dict[str, int] = self._index_decoder() self._interp_index: dict[str, tuple[int, int]] = self._index_interpreter() @staticmethod def _read_lines(path: Path) -> list[str]: if not path.is_file(): return [] return path.read_text(encoding="utf-8").splitlines() def _index_opcode_enum(self) -> dict[str, int]: """Map rust-identifier → 1-indexed line in opcode.rs. The enum uses comma-separated identifiers (often many per line) so we extract every identifier match inside the enum body.""" idx: dict[str, int] = {} token = re.compile(r"\b([A-Za-z_][A-Za-z0-9_]*)\b") in_enum = False for i, line in enumerate(self._opcode_lines, start=1): if "pub enum PpcOpcode" in line: in_enum = True continue if not in_enum: continue if line.startswith("}"): break stripped = line.strip() # skip blank / comment-only lines if not stripped or stripped.startswith("//"): continue # split off any trailing line comment code = stripped.split("//", 1)[0] for m in token.finditer(code): idx.setdefault(m.group(1), i) return idx def _index_decoder(self) -> dict[str, int]: """Map rust-identifier → 1-indexed line of its `PpcOpcode::` producer.""" idx: dict[str, int] = {} pat = re.compile(r"PpcOpcode::([A-Za-z_][A-Za-z0-9_]*)") for i, line in enumerate(self._decoder_lines, start=1): for m in pat.finditer(line): name = m.group(1) # keep the FIRST occurrence (the match-arm line where it's # produced, not any later references) idx.setdefault(name, i) return idx def _index_interpreter(self) -> dict[str, tuple[int, int]]: """Map rust-identifier → (start, end) lines of the match arm. An arm starts at `PpcOpcode::` and ends at the closing `}` at the same indentation level. We accept multi-variant arms of the form `PpcOpcode::a | PpcOpcode::b => {` by recording the same (start, end) for every named variant. """ arm_header = re.compile(r"^(\s*)((?:PpcOpcode::[A-Za-z_][A-Za-z0-9_]*\s*\|\s*)*PpcOpcode::[A-Za-z_][A-Za-z0-9_]*)\s*=>\s*\{?\s*$") # Some arms use no leading whitespace quirks — adjusted regex: arm_header = re.compile( r"^(\s*)" # indent r"((?:PpcOpcode::[A-Za-z_][A-Za-z0-9_]*" # first variant r"(?:\s*\|\s*PpcOpcode::[A-Za-z_][A-Za-z0-9_]*)*))" # more variants r"\s*=>\s*\{?\s*$" ) var_re = re.compile(r"PpcOpcode::([A-Za-z_][A-Za-z0-9_]*)") idx: dict[str, tuple[int, int]] = {} i = 0 n = len(self._interp_lines) while i < n: line = self._interp_lines[i] m = arm_header.match(line) if not m: i += 1 continue indent = m.group(1) names = var_re.findall(m.group(2)) # Find the closing '}' at the same indentation. The arm body # starts on line i (which ends with '{') and ends at a line # whose content (after `indent`) is '}' (with optional trailing # comma). start = i + 1 # 1-indexed end = start j = i + 1 depth = 1 if line.rstrip().endswith("{") else 0 if depth == 0: # Single-expression arm like `... => foo(),` — treat the line # itself as start=end. end = start j = i + 1 else: while j < n: l = self._interp_lines[j] # A naive brace counter suffices for this codebase — the # interpreter arms use balanced braces and no string # literals containing stray braces. depth += l.count("{") - l.count("}") if depth == 0: end = j + 1 # 1-indexed break j += 1 for name in names: idx.setdefault(name, (start, end)) i = j + 1 return idx def lookup(self, mnem: str) -> RustRef: ident = _rust_ident(mnem) ref = RustRef(mnem=mnem) ref.opcode_line = self._opcode_index.get(ident) ref.decoder_line = self._decoder_index.get(ident) rng = self._interp_index.get(ident) if rng: ref.interp_start, ref.interp_end = rng body = "\n".join(self._interp_lines[ref.interp_start - 1: ref.interp_end]) ref.interp_body = body return ref if __name__ == "__main__": root = Path(__file__).resolve().parent.parent.parent s = RustScraper(root) for m in ("addx", "addic.", "lwz", "bclrx", "mfspr", "stvx", "vaddfp", "vaddfp128", "faddx", "lvsl"): r = s.lookup(m) print(f"{m:12s} opcode@{r.opcode_line} decoder@{r.decoder_line} " f"interp@{r.interp_start}-{r.interp_end}")