xenia-rs/audit-runs/audit-027-v40-mem-diff/diff_v40.py

#!/usr/bin/env python3
"""Comprehensive dword-level diff of canary's v40 vs ours.

For every 4-byte-aligned big-endian dword (index i) in the dump of [0x40000000, 0x7F000000):
  - canary_dw = canary[i*4..i*4+4] interpreted as BE u32
  - ours_dw   = ours  [i*4..i*4+4] interpreted as BE u32

Records:
  CASE A (primary): canary_dw in 0x82000000..0x82A00000 (game-code addr) AND ours_dw != canary_dw
  CASE B (inverse): ours_dw   in 0x82000000..0x82A00000 AND canary_dw != ours_dw

Produces:
  diff.txt     full sorted A-list  (truncated for very large outputs)
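  diff-b.txt   inverse B-list  (truncated the same way as diff.txt)
  histogram.txt   A-list entry count per 0x1000-aligned bucket of the canary PC value
  l1-hits.txt     A-list entries whose canary PC is a renderer cluster L1 PC
                  (from cluster_l1_pcs.txt), plus a hand-picked narrow subset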
  tables.txt      runs of >=4 consecutive dwords with canary-PC where ours differs
  anchors.txt     diff entries within +/-0x100 of named anchor addresses (e.g. 0x40BA9A80)
  pages.txt       which 64K pages contain divergences and how many
"""
import struct
import sys
import os
from collections import defaultdict

# Address window covered by each dump file: [V40_BASE, V40_BASE + V40_LEN).
# Byte offset i in a dump corresponds to address V40_BASE + i.
V40_BASE = 0x4000_0000
V40_LEN = 0x3F00_0000

# Half-open range [PC_LO, PC_HI): a dword whose value falls in here is
# treated as a game-code address ("PC") by the diff.
PC_LO = 0x8200_0000
PC_HI = 0x82A0_0000

# Addresses whose +/-0x100 neighborhood is reported in anchors.txt,
# mapped to a human-readable label.
NAMED_ANCHORS = {0x40BA_9A80: "audit-016 listener struct (heap)"}

def _load_cluster_pcs(path):
    """Parse ``<hex addr> [name]`` lines from *path* into ``{addr: name}``.

    Returns an empty dict when the file does not exist. Blank lines and
    lines starting with ``#`` are skipped; an address without a name gets
    the default name ``sub_<ADDR>``.
    """
    pcs = {}
    if not os.path.exists(path):
        return pcs
    with open(path) as f:
        for ln in f:
            ln = ln.strip()
            if not ln or ln.startswith("#"):
                continue
            parts = ln.split()
            addr = int(parts[0], 16)
            pcs[addr] = parts[1] if len(parts) > 1 else f"sub_{addr:08X}"
    return pcs


def _read_dump(path, expected_len):
    """Read a whole dump file and check that it is exactly *expected_len* bytes."""
    with open(path, "rb") as f:
        data = f.read()
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`.
    if len(data) != expected_len:
        raise ValueError(
            f"{path}: expected {expected_len:#x} bytes, got {len(data):#x}"
        )
    return data


def _scan_divergences(canary, ours, base, pc_lo, pc_hi, page_size):
    """Collect differing dwords where either side holds a value in [pc_lo, pc_hi).

    Returns ``(a_list, b_list, page_a_count)``; list entries are
    ``(addr, canary_dw, ours_dw)`` in ascending address order, and
    ``page_a_count`` maps a page-aligned address to its number of A entries.
    """
    a_list = []  # canary has PC, ours different
    b_list = []  # ours has PC, canary different
    page_a_count = defaultdict(int)
    page_mask = ~(page_size - 1)
    for start in range(0, len(canary), page_size):
        c_chunk = canary[start:start + page_size]
        o_chunk = ours[start:start + page_size]
        # Byte-identical chunks cannot contain a differing dword; skipping
        # them avoids decoding the (typically large) matching regions.
        if c_chunk == o_chunk:
            continue
        chunk_addr = base + start
        words = zip(struct.iter_unpack(">I", c_chunk),
                    struct.iter_unpack(">I", o_chunk))
        for idx, ((cdw,), (odw,)) in enumerate(words):
            if cdw == odw:
                continue
            addr = chunk_addr + idx * 4
            if pc_lo <= cdw < pc_hi:
                a_list.append((addr, cdw, odw))
                page_a_count[addr & page_mask] += 1
            if pc_lo <= odw < pc_hi:
                b_list.append((addr, cdw, odw))
    return a_list, b_list, page_a_count


def _find_runs(a_list, min_len=4):
    """Group A entries at consecutive dword addresses into runs.

    Returns ``(base_addr, length, zeros_in_ours, entries)`` tuples for runs of
    at least *min_len* entries, longest first (ties keep address order).
    """
    runs = []

    def flush(run):
        if len(run) >= min_len:
            zero_count = sum(1 for _a, _c, o in run if o == 0)
            runs.append((run[0][0], len(run), zero_count, run))

    current = []
    for entry in sorted(a_list, key=lambda e: e[0]):
        # A gap in addresses closes the run in progress.
        if current and entry[0] != current[-1][0] + 4:
            flush(current)
            current = []
        current.append(entry)
    flush(current)
    runs.sort(key=lambda r: -r[1])
    return runs


def _format_entry(addr, c, o):
    """Render one diff entry in the shared addr/canary/ours text form."""
    return f"addr={addr:#010x}  canary={c:#010x}  ours={o:#010x}"


def _write_entries(path, header_lines, entries):
    """Write the header lines, then one formatted line per diff entry."""
    with open(path, "w") as f:
        for header in header_lines:
            f.write(header + "\n")
        f.writelines(_format_entry(*e) + "\n" for e in entries)


def main(data_dir=None, *, base=None, length=None, pc_lo=None, pc_hi=None,
         anchors=None):
    """Diff canary-v40.bin against ours-v40.bin and write the report files.

    Every optional argument defaults to the corresponding module-level value,
    so ``main()`` behaves as the plain script entry point.

    Args:
        data_dir: directory holding the two dumps (and, optionally,
            cluster_l1_pcs.txt); reports are written there too. Defaults to
            the directory containing this script.
        base: address of dump offset 0 (default ``V40_BASE``).
        length: required size in bytes of each dump (default ``V40_LEN``).
        pc_lo, pc_hi: half-open range of dword values treated as game-code
            addresses (defaults ``PC_LO`` / ``PC_HI``).
        anchors: ``{address: label}`` for anchors.txt (default
            ``NAMED_ANCHORS``).

    Raises:
        ValueError: a dump is not exactly *length* bytes long.
        OSError: a dump is missing or a report cannot be written.
    """
    here = os.path.dirname(os.path.abspath(__file__)) if data_dir is None else data_dir
    base = V40_BASE if base is None else base
    length = V40_LEN if length is None else length
    pc_lo = PC_LO if pc_lo is None else pc_lo
    pc_hi = PC_HI if pc_hi is None else pc_hi
    anchors = NAMED_ANCHORS if anchors is None else anchors

    # Cluster L1 PCs (full set); parsed first so a malformed list fails
    # before the expensive dump scan.
    cluster_pcs = _load_cluster_pcs(os.path.join(here, "cluster_l1_pcs.txt"))
    # Hand-curated narrow set (for emphasis if cluster_pcs is empty)
    NARROW = {
        0x822919C8: "sub_822919C8",
        0x82293448: "sub_82293448",
        0x82288028: "sub_82288028",
        0x82292D80: "sub_82292d80",
        0x822851E0: "sub_822851e0",
        0x82286BC8: "sub_82286bc8",
    }

    canary = _read_dump(os.path.join(here, "canary-v40.bin"), length)
    ours = _read_dump(os.path.join(here, "ours-v40.bin"), length)

    page_size = 65536
    a_list, b_list, page_a_count = _scan_divergences(
        canary, ours, base, pc_lo, pc_hi, page_size)
    # The raw dumps are no longer needed; drop them before building reports.
    del canary, ours

    print(f"[i] case A divergences (canary has PC, ours differs): {len(a_list)}")
    print(f"[i] case B divergences (ours has PC, canary differs): {len(b_list)}")

    LIMIT = 200000  # cap on entries written to diff.txt / diff-b.txt
    truncation_note = f"# (truncated to first {LIMIT} if larger)"
    _write_entries(
        os.path.join(here, "diff.txt"),
        [f"# A-list: {len(a_list)} entries -- canary has 0x82xxxxxx PC, ours differs",
         truncation_note],
        a_list[:LIMIT],
    )
    _write_entries(
        os.path.join(here, "diff-b.txt"),
        [f"# B-list: {len(b_list)} entries -- ours has 0x82xxxxxx PC, canary differs",
         truncation_note],
        b_list[:LIMIT],
    )

    # Histogram by canary PC value (0x1000-aligned)
    bucket = defaultdict(int)
    for _addr, c, _o in a_list:
        bucket[c & ~0xFFF] += 1
    sorted_b = sorted(bucket.items(), key=lambda x: -x[1])
    with open(os.path.join(here, "histogram.txt"), "w") as f:
        f.write("# canary PC value bucket (0x1000-aligned) -> count of A-list entries\n")
        for k, v in sorted_b:
            f.write(f"{k:#010x}  {v}\n")
    print("[i] top 20 PC buckets (canary value):")
    for k, v in sorted_b[:20]:
        print(f"    {k:#010x}  {v}")

    # L1 PC hits (broad cluster list + narrow hand-picked set)
    l1_hits = [(addr, c, o, cluster_pcs[c])
               for addr, c, o in a_list if c in cluster_pcs]
    narrow_hits = [(addr, c, o, NARROW[c])
                   for addr, c, o in a_list if c in NARROW]
    with open(os.path.join(here, "l1-hits.txt"), "w") as f:
        f.write(f"# Renderer cluster L1 PC hits in canary's v40 (broad set, count={len(l1_hits)})\n")
        for addr, c, o, name in l1_hits:
            f.write(f"{_format_entry(addr, c, o)}  // {name}\n")
        f.write(f"\n# Narrow hand-picked subset (count={len(narrow_hits)})\n")
        for addr, c, o, name in narrow_hits:
            f.write(f"{_format_entry(addr, c, o)}  // {name}\n")
    # Report the real set sizes rather than hard-coded function counts.
    print(f"[i] L1 PC hits (broad {len(cluster_pcs)}-fn cluster): {len(l1_hits)}")
    print(f"[i] L1 PC hits (narrow {len(NARROW)}-fn picks):    {len(narrow_hits)}")
    for addr, c, o, name in l1_hits[:30]:
        print(f"    addr={addr:#010x}  canary={c:#010x}  // {name}")

    # Table detection: runs of 4+ consecutive 4-byte dwords where canary
    # has a PC value and ours differs.
    runs = _find_runs(a_list, 4)
    with open(os.path.join(here, "tables.txt"), "w") as f:
        f.write(f"# Consecutive A-list runs (>=4 dwords): {len(runs)} runs\n\n")
        for run_base, run_len, zeros, entries in runs[:120]:
            f.write(f"=== run base={run_base:#010x} length={run_len} zeros_in_ours={zeros} ===\n")
            for addr, c, o in entries[:64]:
                f.write(f"  +{addr-run_base:#06x}: canary={c:#010x}  ours={o:#010x}\n")
            if run_len > 64:
                f.write(f"  ... and {run_len-64} more\n")
            f.write("\n")
    print(f"[i] table-shaped runs (>=4 consecutive A-list dwords): {len(runs)}")
    for run_base, run_len, zeros, _ in runs[:12]:
        print(f"    base={run_base:#010x}  length={run_len}  zeros={zeros}")

    # Pages with divergences
    page_sorted = sorted(page_a_count.items(), key=lambda x: -x[1])
    with open(os.path.join(here, "pages.txt"), "w") as f:
        f.write(f"# 64K pages with A-list divergences (count={len(page_sorted)})\n")
        for pg, cnt in page_sorted:
            f.write(f"page={pg:#010x}  count={cnt}\n")
    print(f"[i] pages with divergences: {len(page_sorted)}")
    for pg, cnt in page_sorted[:10]:
        print(f"    page={pg:#010x}  divergent_dwords={cnt}")

    # Anchor neighborhood
    with open(os.path.join(here, "anchors.txt"), "w") as f:
        f.write("# Diff entries within +/-0x100 of named anchor addresses\n\n")
        for anchor, name in anchors.items():
            f.write(f"=== {anchor:#010x} ({name}) ===\n")
            for addr, c, o in a_list:
                if abs(addr - anchor) <= 0x100:
                    f.write(f"  {_format_entry(addr, c, o)}\n")
            f.write("\n")

# Run the diff only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()