#!/usr/bin/env python3
"""Per-tid truncation for canary JSONL logs.

Canary's full boot log can exceed 800 MB; the diff tool loads the entire
file into RAM.  We only need enough events per tid to walk past the first
divergence — anything beyond is dead weight.  Cap each tid at a configurable
max (default: 250k for tid=6 main, 20k for others).

Usage:
    SCRIPT SRC DST [--main-tid TID] [--main-cap N] [--sister-cap N]
"""

import argparse
import json
import sys
from pathlib import Path
from typing import Any, Dict, Iterable, Optional, Sequence, TextIO, Tuple

MAIN_TID = 6  # canary's main chain — mapped to ours tid=1
MAIN_CAP = 250_000  # default cap for MAIN_TID
SISTER_CAP = 20_000  # default cap for everything else

# Sentinel for "this line is not a usable event".  A dedicated object is
# needed because None is itself a legal tid value (JSON null).
_UNUSABLE = object()


def _event_tid(line: str) -> Any:
    """Return the tid of the JSON event on *line*, or ``_UNUSABLE``.

    A line is unusable when it is not valid JSON, when it decodes to
    something other than an object, or when its ``tid`` is a container and
    therefore cannot serve as a counter key.  Events without a ``tid`` key
    are attributed to tid 0.
    """
    try:
        event = json.loads(line)
    except json.JSONDecodeError:
        return _UNUSABLE
    if not isinstance(event, dict):
        return _UNUSABLE
    tid = event.get("tid", 0)
    # json.loads only ever yields dict/list as unhashable values.
    if isinstance(tid, (dict, list)):
        return _UNUSABLE
    return tid


def _tid_sort_key(tid: Any) -> Tuple[int, int, str]:
    """Order integer tids numerically, then any other tid by its text form."""
    if isinstance(tid, int):
        return (0, tid, "")
    return (1, 0, str(tid))


def _truncate(
    fin: Iterable[str],
    fout: TextIO,
    main_tid: int,
    main_cap: int,
    sister_cap: int,
) -> Tuple[Dict[Any, int], int, int, int]:
    """Copy *fin* to *fout*, keeping at most a capped number of events per tid.

    The first line is copied through unchanged and is not counted as an
    event (presumably a header record — confirm against the log producer).

    Returns ``(counts, kept, total, skipped)``: per-tid kept counts, number
    of events written, number of event lines read, and number of lines that
    could not be interpreted as events.
    """
    counts: Dict[Any, int] = {}
    kept = total = skipped = 0

    lines = iter(fin)
    header = next(lines, None)
    if header is not None:
        fout.write(header)

    for line in lines:
        total += 1
        tid = _event_tid(line)
        if tid is _UNUSABLE:
            skipped += 1
            continue
        cap = main_cap if tid == main_tid else sister_cap
        seen = counts.get(tid, 0)
        if seen >= cap:
            continue
        counts[tid] = seen + 1
        fout.write(line)
        kept += 1

    return counts, kept, total, skipped


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser (SRC, DST, and optional cap overrides)."""
    parser = argparse.ArgumentParser(
        description="Per-tid truncation for canary JSONL logs.",
    )
    parser.add_argument("src", type=Path, help="input JSONL log")
    parser.add_argument("dst", type=Path, help="truncated JSONL output")
    parser.add_argument(
        "--main-tid", type=int, default=MAIN_TID,
        help="tid that receives the main cap (default: %(default)s)",
    )
    parser.add_argument(
        "--main-cap", type=int, default=MAIN_CAP,
        help="max events kept for the main tid (default: %(default)s)",
    )
    parser.add_argument(
        "--sister-cap", type=int, default=SISTER_CAP,
        help="max events kept for every other tid (default: %(default)s)",
    )
    return parser


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Truncate SRC into DST and print a per-tid summary.

    Args:
        argv: Command-line arguments without the program name.  ``None``
            (the default) reads them from ``sys.argv``.

    Returns:
        0 on success, 2 if DST refers to the same file as SRC.

    Raises:
        SystemExit: from argparse when the arguments are missing or invalid.
        OSError: if SRC cannot be read or DST cannot be written.
    """
    args = _build_parser().parse_args(argv)
    src, dst = args.src, args.dst

    # Opening DST with "w" truncates it immediately; if it is the same file
    # as SRC the input would be destroyed before a single line is read.
    if dst.exists() and src.samefile(dst):
        print(
            f"error: {dst} is the same file as {src}; "
            "refusing to overwrite the input",
            file=sys.stderr,
        )
        return 2

    with src.open("r", encoding="utf-8") as fin, \
            dst.open("w", encoding="utf-8") as fout:
        counts, kept, total, skipped = _truncate(
            fin, fout, args.main_tid, args.main_cap, args.sister_cap
        )

    print(f"kept {kept}/{total} events across {len(counts)} tids")
    for tid in sorted(counts, key=_tid_sort_key):
        # ":4d" only accepts integers; fall back to a right-aligned string so
        # an unexpected tid type cannot crash the summary.
        label = f"{tid:4d}" if isinstance(tid, int) else f"{tid!s:>4}"
        print(f" tid={label} {counts[tid]}")
    if skipped:
        # stderr keeps the stdout summary format unchanged.
        print(f"skipped {skipped} unparseable lines", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())