#!/usr/bin/env python3
"""
Phase Non-match Investigation: per-tid profile builder.

Streams a large canary cold jsonl (one JSON object per line) and writes the
following files into the output directory:

  - tid-event-counts.csv      per-tid event counts
  - thread-creates.json       thread.create info keyed by child tid
                              (creator tid, event index, host_ns, raw payload)
  - thread-exits.json         thread.exit info keyed by tid (if any)
  - excreate-events.json      every ExCreateThread / ExCreateThreadEx call
  - create-thread-events.json every thread.create event, in stream order
  - handle-create.json        first handle.create seen per raw handle id
  - tid-top-calls.txt         top 20 import.call / kernel.call names per tid
  - tid-ntset-handles.txt     NtSetEvent handle distribution per tid
  - tid-wait-handles.txt      wait.begin handle distribution per tid
  - spawn-chain.json          for each thread.create, the creator's most
                              recent ExCreateThread* call at or before it

Usage:
    python3 build_profiles.py <input.jsonl> <out_dir>
"""
import bisect
import collections
import json
import os
import sys
import time
from pathlib import Path

# Emit a progress line to stderr every this many input lines.
PROGRESS_EVERY = 500_000

# Payload keys that may carry the new thread's id, in order of preference.
_CHILD_TID_KEYS = ('child_tid', 'new_tid', 'thread_id')

# Call names that are recorded as thread-spawn calls.
_SPAWN_CALLS = ('ExCreateThread', 'ExCreateThreadEx')


def _payload_of(event: dict) -> dict:
    """Return the event's payload if it is a dict, otherwise an empty dict."""
    payload = event.get('payload')
    return payload if isinstance(payload, dict) else {}


def _normalize_handle(value) -> str:
    """Return a lower-case string form of a handle id, or '' if unusable.

    Integers are rendered as ``0x%08x`` so that an id logged as a number in
    one event matches the same id logged as a hex string in another.
    """
    if not value:
        return ''
    if isinstance(value, int):
        return f'0x{value:08x}'
    if isinstance(value, str):
        return value.lower()
    return ''


def _mixed_sort_key(key):
    """Sort key that orders None, then numbers, then everything else by str."""
    if key is None:
        return (0, 0, '')
    if isinstance(key, (int, float)):
        return (1, key, '')
    return (2, 0, str(key))


def _key_sorted(obj):
    """Recursively rebuild dicts with their keys in a type-safe sorted order.

    ``json.dump(sort_keys=True)`` compares raw keys and raises TypeError when
    e.g. ``None`` and ``int`` keys coexist; pre-sorting here avoids that.
    """
    if isinstance(obj, dict):
        ordered = sorted(obj.items(), key=lambda kv: _mixed_sort_key(kv[0]))
        return {k: _key_sorted(v) for k, v in ordered}
    if isinstance(obj, list):
        return [_key_sorted(v) for v in obj]
    return obj


def _dump_json(path: Path, obj, *, sort_keys: bool = False) -> None:
    """Write ``obj`` as indented JSON; unknown value types are stringified."""
    if sort_keys:
        obj = _key_sorted(obj)
    with open(path, 'w', encoding='utf-8') as fout:
        json.dump(obj, fout, indent=2, default=str)


def _write_handle_distribution(path: Path, label: str, per_tid: dict,
                               handle_create: dict) -> None:
    """Write the top-10 handles per tid, busiest tid first.

    ``label`` is the event name shown in each section header. Each handle row
    is annotated with what handle_create recorded for that raw id (if any).
    """
    ranked = sorted(per_tid.items(), key=lambda item: -sum(item[1].values()))
    with open(path, 'w', encoding='utf-8') as fout:
        for tid, counter in ranked:
            if not counter:
                continue
            total = sum(counter.values())
            fout.write(f'=== tid={tid} {label} total={total:,} ===\n')
            for raw, n in counter.most_common(10):
                hc = handle_create.get(raw, {})
                fout.write(f' {n:>8,} {raw} obj_type={hc.get("object_type")} sid={hc.get("sid")} first_seen_tid={hc.get("first_seen_tid")}\n')
            fout.write('\n')


def _match_spawn_chain(create_thread_events: list, excreate_events: list) -> list:
    """Pair each thread.create with the creator's latest ExCreateThread* call.

    For every thread.create the match is the newest spawn call on the same
    creator tid whose host_ns is <= the create's host_ns. Events without a
    host_ns cannot be ordered, so they never match (``parent_excreate`` is
    None for such creates).
    """
    calls_by_tid = collections.defaultdict(list)
    for ev in excreate_events:
        if ev['host_ns'] is not None:
            calls_by_tid[ev['tid']].append(ev)

    # Parallel list of timestamps per tid so bisect can search it directly.
    times_by_tid = {}
    for tid, calls in calls_by_tid.items():
        calls.sort(key=lambda ev: ev['host_ns'])
        times_by_tid[tid] = [ev['host_ns'] for ev in calls]

    chain = []
    for tc in create_thread_events:
        creator = tc['creator_tid']
        hns = tc['host_ns']
        best = None
        times = times_by_tid.get(creator)
        if times and hns is not None:
            pos = bisect.bisect_right(times, hns)
            if pos:
                best = calls_by_tid[creator][pos - 1]
        chain.append({
            'child_tid': tc['child_tid'],
            'creator_tid': creator,
            'child_host_ns': hns,
            'child_payload': tc['payload'],
            'parent_excreate': best,
        })
    return chain


def main():
    """Stream the input jsonl named in argv[1] and write profiles to argv[2].

    Exits with status 1 and a usage message when the argument count is wrong.
    Blank lines are ignored; lines that are not a JSON object are skipped and
    their count is reported on stderr at the end.
    """
    if len(sys.argv) != 3:
        print("usage: build_profiles.py <input.jsonl> <out_dir>", file=sys.stderr)
        sys.exit(1)
    src = sys.argv[1]
    out_dir = Path(sys.argv[2])
    out_dir.mkdir(parents=True, exist_ok=True)

    # Per-tid aggregations.
    tid_event_count = collections.Counter()
    tid_thread_create = {}   # child tid -> {creator_tid, event_idx, host_ns, payload}
    tid_thread_exit = {}     # tid -> {event_idx, host_ns, payload}
    tid_call_names = collections.defaultdict(collections.Counter)     # tid -> Counter[fn_name]
    tid_ntset_handles = collections.defaultdict(collections.Counter)  # tid -> Counter[raw_handle]
    tid_wait_handles = collections.defaultdict(collections.Counter)   # tid -> Counter[raw_handle]

    # Spawn-chain capture: spawn calls and thread.create events are collected
    # separately here and paired by _match_spawn_chain once the scan is done.
    excreate_events = []       # list of {tid, idx, host_ns, name, payload}
    create_thread_events = []  # list of {creator_tid, child_tid, host_ns, payload}

    # raw_handle (lower-case string) -> details of the first handle.create seen.
    handle_create = {}

    total_lines = 0
    skipped_lines = 0
    t0 = time.monotonic()

    with open(src, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            total_lines += 1
            if total_lines % PROGRESS_EVERY == 0:
                elapsed = time.monotonic() - t0
                rate = total_lines / elapsed if elapsed > 0 else 0
                print(f" lines={total_lines:>10,} elapsed={elapsed:6.1f}s rate={rate:,.0f}/s", file=sys.stderr)

            if not line.strip():
                continue
            try:
                e = json.loads(line)
            except (ValueError, RecursionError):
                # Best effort: a truncated or corrupt line must not abort the scan.
                skipped_lines += 1
                continue
            if not isinstance(e, dict):
                # Valid JSON but not an event object; nothing to aggregate.
                skipped_lines += 1
                continue

            tid = e.get('tid')
            kind = e.get('kind')
            tid_event_count[tid] += 1

            if kind == 'thread.create':
                p = _payload_of(e)
                # Explicit None checks so a child tid of 0 is not skipped.
                child_tid = next(
                    (p[k] for k in _CHILD_TID_KEYS if p.get(k) is not None),
                    None,
                )
                tid_thread_create[child_tid] = {
                    'creator_tid': tid,
                    'event_idx': e.get('tid_event_idx'),
                    'host_ns': e.get('host_ns'),
                    'payload': p,
                }
                create_thread_events.append({
                    'creator_tid': tid,
                    'child_tid': child_tid,
                    'host_ns': e.get('host_ns'),
                    'payload': p,
                })
            elif kind == 'thread.exit':
                tid_thread_exit[tid] = {
                    'event_idx': e.get('tid_event_idx'),
                    'host_ns': e.get('host_ns'),
                    'payload': e.get('payload', {}),
                }
            elif kind == 'handle.create':
                p = _payload_of(e)
                raw = _normalize_handle(p.get('raw_handle_id'))
                if raw:
                    # setdefault keeps the first creation seen for a handle id.
                    handle_create.setdefault(raw, {
                        'object_type': p.get('object_type'),
                        'sid': p.get('handle_semantic_id'),
                        'object_name': p.get('object_name'),
                        'first_seen_tid': tid,
                        'first_seen_host_ns': e.get('host_ns'),
                    })
            elif kind == 'wait.begin':
                p = _payload_of(e)
                raw = _normalize_handle(p.get('raw_handle_id') or p.get('handle_id'))
                if raw:
                    tid_wait_handles[tid][raw] += 1
            elif kind in ('import.call', 'kernel.call'):
                p = _payload_of(e)
                name = p.get('name') or p.get('import_name') or p.get('function')
                if name:
                    tid_call_names[tid][name] += 1
                    if name in _SPAWN_CALLS:
                        excreate_events.append({
                            'tid': tid,
                            'idx': e.get('tid_event_idx'),
                            'host_ns': e.get('host_ns'),
                            'name': name,
                            'payload': p,
                        })
                    if name == 'NtSetEvent':
                        raw = _normalize_handle(
                            p.get('handle') or p.get('handle_id') or p.get('raw_handle_id')
                        )
                        if raw:
                            tid_ntset_handles[tid][raw] += 1

    if skipped_lines:
        print(f" skipped {skipped_lines:,} line(s) that were not JSON objects", file=sys.stderr)

    # Save raw aggregates.
    with open(out_dir / 'tid-event-counts.csv', 'w', encoding='utf-8') as fout:
        fout.write('tid,event_count\n')
        for tid, n in tid_event_count.most_common():
            fout.write(f'{tid},{n}\n')

    _dump_json(out_dir / 'thread-creates.json', tid_thread_create, sort_keys=True)
    _dump_json(out_dir / 'thread-exits.json', tid_thread_exit, sort_keys=True)
    _dump_json(out_dir / 'excreate-events.json', excreate_events)
    _dump_json(out_dir / 'create-thread-events.json', create_thread_events)
    _dump_json(out_dir / 'handle-create.json', handle_create)

    # Per-tid call counts top-20, busiest tid first.
    with open(out_dir / 'tid-top-calls.txt', 'w', encoding='utf-8') as fout:
        for tid, count in tid_event_count.most_common():
            fout.write(f'=== tid={tid} total_events={count:,} ===\n')
            for name, n in tid_call_names[tid].most_common(20):
                fout.write(f' {n:>10,} {name}\n')
            fout.write('\n')

    # Per-tid NtSetEvent and wait.begin handle distributions.
    _write_handle_distribution(
        out_dir / 'tid-ntset-handles.txt', 'NtSetEvent', tid_ntset_handles, handle_create)
    _write_handle_distribution(
        out_dir / 'tid-wait-handles.txt', 'wait.begin', tid_wait_handles, handle_create)

    # Spawn-chain matching.
    spawn_chain = _match_spawn_chain(create_thread_events, excreate_events)
    _dump_json(out_dir / 'spawn-chain.json', spawn_chain)

    print(f"\nDone. lines={total_lines:,} tids={len(tid_event_count)} outputs at {out_dir}")


if __name__ == '__main__':
    main()