//! Interpreter throughput micro-benchmarks.
//!
//! Custom `harness = false` main — no extra dev-deps. Run via
//! `cargo bench -p xenia-cpu` (or `cargo bench -p xenia-cpu --bench interpreter`
//! to run only this bench target).
//!
//! Three workloads, each measuring `step_cached` throughput in MIPS:
//!
//!   - `tight_alu_loop`  — pure dispatch + ALU + decode-cache hit.
//!   - `loadstore_loop`  — alternating `lwz`/`stw` against main RAM. Stresses
//!                         every load/store path and `find_mmio` dispatch.
//!   - `mmio_storm`      — same shape as `loadstore_loop` but the address is
//!                         in a registered MMIO aperture. Sanity-checks that
//!                         MMIO reads and writes still dispatch correctly.
//!
//! These are not statistically rigorous — no warmup, no variance — they're
//! just enough to detect 2x-class wins or regressions on the perf-track
//! changes (MMIO fast-reject, threaded dispatch, block cache). Numbers go
//! into commit messages; there is no automated baseline file.

use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Instant;

use xenia_cpu::context::PpcContext;
use xenia_cpu::decoder::DecodeCache;
use xenia_cpu::interpreter::{step_cached, StepResult};
use xenia_memory::{GuestMemory, MemoryAccess, MmioRegion};
use xenia_memory::page_table::MemoryProtect;

// PPC instruction encoders — minimal subset needed by the benches.

/// Encode `addi rd, ra, simm` as a raw 32-bit instruction word.
///
/// Primary opcode 14 goes in the top six bits, `rd` is shifted to bit 21,
/// `ra` to bit 16, and the signed immediate is reinterpreted as an unsigned
/// 16-bit value in the low halfword. Register numbers are not range-checked.
#[inline]
fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 {
    const PRIMARY_OPCODE: u32 = 14;
    // Two's-complement reinterpretation: -1 becomes 0xFFFF, not sign-extended.
    let imm16 = u32::from(simm as u16);
    let regs = (rd << 21) | (ra << 16);
    (PRIMARY_OPCODE << 26) | regs | imm16
}

/// Encode `lwz rd, d(ra)` as a raw 32-bit instruction word.
///
/// Primary opcode 32 goes in the top six bits, `rd` is shifted to bit 21,
/// `ra` to bit 16, and the signed displacement is reinterpreted as an
/// unsigned 16-bit value in the low halfword. Register numbers are not
/// range-checked.
#[inline]
fn enc_lwz(rd: u32, ra: u32, d: i16) -> u32 {
    const PRIMARY_OPCODE: u32 = 32;
    // Two's-complement reinterpretation of the displacement.
    let disp16 = u32::from(d as u16);
    let regs = (rd << 21) | (ra << 16);
    (PRIMARY_OPCODE << 26) | regs | disp16
}

/// Encode `stw rs, d(ra)` as a raw 32-bit instruction word.
///
/// Primary opcode 36 goes in the top six bits, `rs` is shifted to bit 21,
/// `ra` to bit 16, and the signed displacement is reinterpreted as an
/// unsigned 16-bit value in the low halfword. Register numbers are not
/// range-checked.
#[inline]
fn enc_stw(rs: u32, ra: u32, d: i16) -> u32 {
    const PRIMARY_OPCODE: u32 = 36;
    // Two's-complement reinterpretation of the displacement.
    let disp16 = u32::from(d as u16);
    let regs = (rs << 21) | (ra << 16);
    (PRIMARY_OPCODE << 26) | regs | disp16
}

/// Build a fresh `GuestMemory` containing a single read/write allocation.
///
/// The allocation is requested at `code_base` with length `code_size` and is
/// shared by the bench's code and data.
///
/// # Panics
///
/// Panics if creating the `GuestMemory` or allocating the region fails.
fn make_mem(code_base: u32, code_size: u32) -> GuestMemory {
    let mut guest = GuestMemory::new().expect("reserve 4GB");
    let protect = MemoryProtect::READ | MemoryProtect::WRITE;
    guest
        .alloc(code_base, code_size, protect)
        .expect("alloc bench region");
    guest
}

/// Store `instrs` into guest memory as consecutive 32-bit words.
///
/// Word `i` is written with `write_u32` at `base + 4 * i`.
fn write_program(mem: &GuestMemory, base: u32, instrs: &[u32]) {
    instrs.iter().enumerate().for_each(|(index, &word)| {
        let offset = index as u32 * 4;
        mem.write_u32(base + offset, word);
    });
}

/// Execute exactly `total_instrs` calls to `step_cached` and time them.
///
/// The program is assumed to occupy `n` consecutive 4-byte words starting at
/// `base`. PC starts at `base`; after each step, if PC has reached or passed
/// `base + n * 4` it is reset to `base`, so the program repeats indefinitely.
///
/// Returns the wall-clock time spent in the stepping loop. Setup (the initial
/// PC assignment) happens before the timer starts.
fn run_loop(
    ctx: &mut PpcContext,
    mem: &GuestMemory,
    cache: &mut DecodeCache,
    base: u32,
    n: u32,
    total_instrs: u64,
) -> std::time::Duration {
    let wrap_at = base + n * 4;
    ctx.pc = base;

    let started = Instant::now();
    let mut remaining = total_instrs;
    while remaining > 0 {
        // The page version is looked up for the current PC on every step and
        // handed to `step_cached` alongside the decode cache.
        let version = mem.page_version(ctx.pc);
        let outcome = step_cached(ctx, mem, cache, version);
        // Checked in debug builds only, so release timings carry no assert cost.
        debug_assert!(matches!(outcome, StepResult::Continue));
        if ctx.pc >= wrap_at {
            ctx.pc = base;
        }
        remaining -= 1;
    }
    started.elapsed()
}

/// Print one result line for a benchmark.
///
/// Shows the label, the instruction count, the elapsed time in seconds, and
/// the derived throughput in millions of instructions per second.
fn report(label: &str, total_instrs: u64, elapsed: std::time::Duration) {
    let seconds = elapsed.as_secs_f64();
    let instrs_per_sec = total_instrs as f64 / seconds;
    let mips = instrs_per_sec / 1.0e6;
    println!(
        "{:<24} {:>12} instrs in {:>7.3}s = {:>7.2} MIPS",
        label, total_instrs, seconds, mips
    );
}

/// Benchmark: a straight-line run of `addi` instructions, looped.
///
/// Measures `step_cached` throughput when every instruction is a simple ALU
/// op, and prints the result via `report`.
fn bench_tight_alu_loop() {
    const BASE: u32 = 0x1000;
    const N: u32 = 256;
    const TOTAL: u64 = 50_000_000;

    // Nothing below needs exclusive access to guest memory: `write_program`
    // and `run_loop` both take `&GuestMemory`, so the binding stays immutable.
    let mem = make_mem(BASE, 0x1000);
    // 256 × `addi r3, r3, 1` — register + immediate ALU op, no memory touch
    // beyond instruction fetch.
    let prog: Vec<u32> = vec![enc_addi(3, 3, 1); N as usize];
    write_program(&mem, BASE, &prog);

    let mut ctx = PpcContext::new();
    let mut cache = DecodeCache::new();

    let elapsed = run_loop(&mut ctx, &mem, &mut cache, BASE, N, TOTAL);
    report("tight_alu_loop", TOTAL, elapsed);
}

/// Benchmark: alternating `stw`/`lwz` against ordinary guest RAM.
///
/// Measures `step_cached` throughput for a load/store-only instruction
/// stream and prints the result via `report`.
fn bench_loadstore_loop() {
    const CODE_BASE: u32 = 0x1000;
    const DATA_BASE: u32 = 0x2000;
    const N: u32 = 256;
    const TOTAL: u64 = 30_000_000;

    // One 0x2000-byte region starting at CODE_BASE, so DATA_BASE (0x2000)
    // lands inside the same allocation as the code. The binding is immutable
    // because `write_program` and `run_loop` only need `&GuestMemory`.
    let mem = make_mem(CODE_BASE, 0x2000);
    // 128 pairs of `stw r3, 0(r4); lwz r5, 0(r4)` — exercises the
    // load/store path through `read_u32`/`write_u32` (incl. `find_mmio`).
    let pair = [enc_stw(3, 4, 0), enc_lwz(5, 4, 0)];
    let mut prog = Vec::with_capacity(N as usize);
    for _ in 0..(N / 2) {
        prog.extend_from_slice(&pair);
    }
    write_program(&mem, CODE_BASE, &prog);

    let mut ctx = PpcContext::new();
    ctx.gpr[3] = 0xDEAD_BEEF; // value stored by every `stw`
    ctx.gpr[4] = DATA_BASE as u64; // address used by both `stw` and `lwz`
    let mut cache = DecodeCache::new();

    let elapsed = run_loop(&mut ctx, &mem, &mut cache, CODE_BASE, N, TOTAL);
    report("loadstore_loop", TOTAL, elapsed);
}

/// Benchmark: alternating `stw`/`lwz` against a registered MMIO region.
///
/// Same instruction stream as the RAM load/store bench, but `r4` points into
/// an MMIO region whose callbacks only count invocations. After timing, the
/// counters are checked so that broken MMIO dispatch fails the bench run.
///
/// # Panics
///
/// Panics if the number of observed MMIO reads or writes is not `TOTAL / 2`.
fn bench_mmio_storm() {
    const CODE_BASE: u32 = 0x1000;
    const MMIO_BASE: u32 = 0xEA00_0000;
    const N: u32 = 64;
    // MMIO is slower per access — keep total smaller so the bench stays
    // under a few seconds.
    const TOTAL: u64 = 2_000_000;

    // `mut` is kept only for `add_mmio_region`; everything afterwards borrows
    // the memory immutably.
    let mut mem = make_mem(CODE_BASE, 0x1000);

    let writes = Arc::new(AtomicU64::new(0));
    let reads = Arc::new(AtomicU32::new(0));
    let writes_clone = Arc::clone(&writes);
    let reads_clone = Arc::clone(&reads);
    mem.add_mmio_region(MmioRegion {
        base_address: MMIO_BASE,
        mask: 0xFFFF_0000,
        size: 0x0001_0000,
        // Reads always return 0; only the call count matters.
        read_callback: Box::new(move |_a| {
            reads_clone.fetch_add(1, Ordering::Relaxed);
            0
        }),
        write_callback: Box::new(move |_a, _v| {
            writes_clone.fetch_add(1, Ordering::Relaxed);
        }),
    });

    // N / 2 pairs of `stw r3, 0(r4); lwz r5, 0(r4)`.
    let pair = [enc_stw(3, 4, 0), enc_lwz(5, 4, 0)];
    let mut prog = Vec::with_capacity(N as usize);
    for _ in 0..(N / 2) {
        prog.extend_from_slice(&pair);
    }
    write_program(&mem, CODE_BASE, &prog);

    let mut ctx = PpcContext::new();
    ctx.gpr[3] = 0x1234_5678;
    ctx.gpr[4] = MMIO_BASE as u64;
    let mut cache = DecodeCache::new();

    let elapsed = run_loop(&mut ctx, &mem, &mut cache, CODE_BASE, N, TOTAL);
    report("mmio_storm", TOTAL, elapsed);

    // Sanity assertions — fail loudly if a refactor breaks MMIO dispatch.
    // The program length N is even, so wrapping keeps the strict
    // stw/lwz alternation; with TOTAL even, exactly half of the executed
    // instructions are stores and half are loads.
    let w = writes.load(Ordering::Relaxed);
    let r = reads.load(Ordering::Relaxed);
    assert_eq!(w, TOTAL / 2, "expected MMIO writes to be dispatched");
    assert_eq!(u64::from(r), TOTAL / 2, "expected MMIO reads to be dispatched");
}

fn main() {
    println!("xenia-cpu interpreter bench");
    println!("  build: {}", if cfg!(debug_assertions) { "debug" } else { "release" });
    bench_tight_alu_loop();
    bench_loadstore_loop();
    bench_mmio_storm();
}