//! Graphics interrupt + synthetic v-sync bookkeeping (P6).
//!
//! The Xbox 360 graphics driver calls `VdSetGraphicsInterruptCallback` to
//! register a single per-process callback that the OS invokes on:
//!
//! 1. **V-sync** — at 60 Hz; source code 0 (`INTERRUPT_SOURCE_VSYNC`).
//! 2. **Command-processor interrupt** — when `PM4_INTERRUPT` fires from the
//!    guest-issued command stream; source code 1 (`INTERRUPT_SOURCE_CP`).
//!
//! Canary's [xboxkrnl_video.cc:303-310](xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc#L303-L310)
//! dispatches the callback on HW thread 0. We follow the same convention.
//!
//! The delivery model is cooperative: we inject the callback entry into HW
//! thread 0 at the top of a scheduler round when it's safe (not mid-export,
//! not already inside another interrupt). When the callback returns to
//! [`LR_HALT_SENTINEL`] the main loop restores the saved [`PpcContext`]
//! fields and the HW thread picks up where it left off.

use std::collections::VecDeque;
use std::time::{Duration, Instant};

use xenia_cpu::context::{CrField, PpcContext};
use xenia_cpu::ThreadRef;

/// Interrupt source code for the 60 Hz v-sync interrupt.
pub const INTERRUPT_SOURCE_VSYNC: u32 = 0;
/// Interrupt source code for a command-processor interrupt (raised when
/// `PM4_INTERRUPT` fires from the guest-issued command stream).
pub const INTERRUPT_SOURCE_CP: u32 = 1;

/// Guest-registered V-sync / graphics-interrupt callback (from
/// `VdSetGraphicsInterruptCallback`).
///
/// Filled in by [`InterruptState::set_callback`], which stores both values
/// verbatim.
#[derive(Debug, Clone, Copy)]
pub struct GraphicsInterruptCallback {
    /// Guest address of the callback entry point. The roundtrip test in
    /// this file diverts `pc` to this value; the real injector lives
    /// outside this file — confirm there.
    pub callback_pc: u32,
    /// Opaque value registered together with the callback. The roundtrip
    /// test in this file hands it to the callback in `r4`; presumably the
    /// real injector does the same — TODO confirm.
    pub user_data: u32,
}

/// Snapshot of the fields we mutate when diverting a HW thread into an
/// interrupt callback. Restored when the callback returns to
/// `LR_HALT_SENTINEL`.
///
/// We save **all PPC volatile registers** (r0, r2–r12) plus `r1` (SP),
/// `pc`, `lr`, `ctr`, and `cr`. Non-volatile regs (r13–r31) are preserved
/// by the callback's own `__savegprlr_N` prologue/epilogue per the PPC
/// ELF ABI, so they don't need stashing here.
///
/// **SP (`gpr[1]`) is included because the injector decrements it by
/// [`CALLBACK_STACK_PAD`] before the callback runs** — see that constant's
/// docs for why. Without this, the callback's `__savegprlr_N` prologue
/// overwrites the interrupted function's own stack-saved LR (which lives
/// at `[r1 - 8]`), and when the interrupted function later tries to
/// return, `bclr` jumps to `LR_HALT_SENTINEL` and the thread exits
/// prematurely.
#[derive(Debug, Clone, Copy)]
pub struct SavedCallbackCtx {
    /// Program counter of the interrupted code at capture time.
    pub pc: u32,
    /// Link register at capture time.
    pub lr: u64,
    /// Count register at capture time.
    pub ctr: u64,
    /// All PPC volatile GPRs (r0, r2–r12) plus r1 (SP) in index order:
    /// index 0 = r0, 1 = r1, 2 = r2, …, 12 = r12. r13–r31 have no slot
    /// in this array.
    pub gprs: [u64; 13],
    /// All eight condition-register fields at capture time.
    pub cr: [CrField; 8],
    /// Interrupt source code this snapshot was taken for (e.g.
    /// [`INTERRUPT_SOURCE_VSYNC`]). Recorded by `capture`; `restore` does
    /// not write it back into the context.
    pub source: u32,
}

/// Bytes the injector reserves below the interrupted thread's SP before
/// running the ISR callback. Matches Canary's
/// [`Processor::Execute`](../../../../xenia-canary/src/xenia/cpu/processor.cc#L383)
/// which decrements `r[1]` by `64 + 112 = 176` before
/// `function->Call(...)` and restores afterwards. The pad must be larger
/// than any plausible sum of `__savegprlr_N`'s save-area (up to 64 B for
/// r25-r31 + 8 B for LR) plus the callback's own `stwu r1,-N(r1)` frame
/// (the Sylpheed vsync ISR uses 128 B).
///
/// History — before this pad existed, the ISR's `__savegprlr_25` stored
/// the callback's saved LR (= `LR_HALT_SENTINEL`, from injection) at
/// `[r1 - 8]` — exactly where the interrupted thread's current `bl`-saved
/// LR lived. The interrupted function's return site got stomped with
/// `SENTINEL`, so `__restgprlr_N -> bclr` jumped to the halt sentinel and
/// the thread exited through the wrong path. This manifested in Sylpheed
/// as tid=5 (producer for the render queue) terminating at cycle 7.5M,
/// starving both `0x10fc` (main's completion wait) and the PKEVENT that
/// tid=6 polls — no second `VdSwap`, no first pixel.
pub const CALLBACK_STACK_PAD: u32 = 64 + 112;

impl SavedCallbackCtx {
    /// Snapshot the parts of `ctx` that an interrupt injection may clobber.
    ///
    /// Copies `pc`, `lr`, `ctr`, all eight `cr` fields and GPRs r0–r12
    /// (one per slot of `gprs`), and tags the snapshot with `source`.
    /// `ctx` itself is left untouched.
    pub fn capture(ctx: &PpcContext, source: u32) -> Self {
        // The array type fixes how many GPRs are saved, so the count lives
        // in exactly one place (the field declaration's length).
        let gprs: [u64; 13] = std::array::from_fn(|reg| ctx.gpr[reg]);
        Self {
            pc: ctx.pc,
            lr: ctx.lr,
            ctr: ctx.ctr,
            gprs,
            cr: ctx.cr,
            source,
        }
    }

    /// Write the snapshot back into `ctx`, consuming it.
    ///
    /// Restores `pc`, `lr`, `ctr`, `cr` and GPRs r0–r12. GPRs r13–r31 and
    /// every other field of `ctx` are left as they are; `source` is not
    /// written anywhere.
    pub fn restore(self, ctx: &mut PpcContext) {
        ctx.pc = self.pc;
        ctx.lr = self.lr;
        ctx.ctr = self.ctr;
        for (reg, value) in self.gprs.iter().copied().enumerate() {
            ctx.gpr[reg] = value;
        }
        ctx.cr = self.cr;
    }
}

/// Maximum pending sources held in the FIFO queue before new ones are
/// dropped. Four is enough to absorb a short burst (a few v-syncs arriving
/// while HW 0 is mid-callback from a prior one) without letting runaway
/// delivery swamp the guest.
///
/// Enforced by `InterruptState::queue_interrupt`; also used by
/// `InterruptState::tick_vsync_wallclock` as the upper bound on how many
/// v-syncs a single tick attempts to queue.
pub const INTERRUPT_QUEUE_CAP: usize = 4;

/// M2.5 — per-slot pending-IRQ bitmask. Each `AtomicU8` holds one bit per
/// interrupt source (currently 2 sources: VSYNC=bit 0, CP=bit 1) destined
/// for that specific HW slot. Used by the M3 parallel path: T_main (or
/// the GPU thread) sets a bit Release on the target slot's atomic; the
/// target T_cpu_i checks the bit Acquire at its quantum boundary and
/// self-injects without taking another thread's slot lock.
///
/// The fixed-size array has `xenia_cpu::scheduler::HW_THREAD_COUNT`
/// elements, one per HW slot (6 per the original note — confirm against
/// the scheduler).
pub type PendingLocalIrq = [std::sync::atomic::AtomicU8;
    xenia_cpu::scheduler::HW_THREAD_COUNT];

/// All interrupt bookkeeping — single field on `KernelState`.
///
/// **First-Pixels M2 (2026-04-20)** — changed from a single-slot
/// `pending_source: Option<u32>` coalesce to a bounded FIFO so bursts
/// don't drop silently, and dropped `VSYNC_INSTR_PERIOD` from 500k to
/// 150k so cadence approximates 60 Hz at the current ~10 MIPS interpreter
/// throughput. Combined with the `HwState::ServicingIrq` variant added to
/// `xenia-cpu::scheduler`, interrupts can now be delivered even when HW 0
/// is `Blocked(WaitAny)` — the injector stashes the block into the new
/// variant and the restore path re-blocks when the callback returns,
/// unless a `wake()` during the callback resolved the wait.
#[derive(Debug, Default)]
pub struct InterruptState {
    /// Registered callback (set by `VdSetGraphicsInterruptCallback`).
    pub callback: Option<GraphicsInterruptCallback>,
    /// Bounded FIFO of pending interrupt sources awaiting injection.
    /// Push-back on queue, pop-front on inject. Over-cap pushes drop.
    pub pending: VecDeque<u32>,
    /// When `Some`, some HW thread is currently running a callback; on
    /// return-to-sentinel we restore this and clear the flag.
    pub saved: Option<SavedCallbackCtx>,
    /// Which guest thread the current callback was injected into.
    /// Required because we no longer anchor delivery to HW 0 — any
    /// non-Exited thread is a valid target. Meaningful only while
    /// `saved.is_some()`. Stored as a `ThreadRef` so per-slot
    /// runqueues don't get ambiguous addressing.
    pub injected_ref: Option<ThreadRef>,
    /// Monotonic count of delivered interrupts. Not updated by any method
    /// in this file; the code that performs the restore is expected to
    /// bump it.
    pub delivered: u64,
    /// Dropped interrupts (callback unset, queue full, or thread
    /// exited/idle at inject time).
    pub dropped: u64,
    /// Instruction-count accumulator for the synthetic v-sync ticker
    /// (legacy path used by unit tests via `tick_vsync_instr`). Production
    /// uses `tick_vsync_wallclock` instead — see KRNBUG-D08.
    pub vsync_accumulator: u64,
    /// Last observed instruction count for the legacy instruction-count
    /// ticker. `tick_vsync_instr` diffs against this to advance
    /// `vsync_accumulator`.
    pub last_instr_count: u64,
    /// Wall-clock anchor for the production v-sync ticker. `None` until
    /// the first `tick_vsync_wallclock` call (lazy init so unit tests
    /// that never invoke that function don't construct an Instant).
    /// Each call fires `(elapsed / VSYNC_PERIOD)` v-syncs (at most
    /// `INTERRUPT_QUEUE_CAP` per call) and advances the anchor by that
    /// many full periods.
    pub last_vsync_instant: Option<Instant>,
    /// M2.5 — per-slot pending-IRQ bits. Set by the producer (M3's
    /// IRQ-routing logic on `T_main`) with `Release`; consumed by the
    /// target T_cpu_i with `Acquire` at quantum boundary. Unused under
    /// the lockstep path (M2's single-host-thread model still uses
    /// `pending` + `try_inject_graphics_interrupt`); the field is wired
    /// here so M3's per-HW-thread path is a flag flip, not a refactor.
    pub pending_local_irq: PendingLocalIrq,
}

/// How many guest instructions correspond to one synthetic v-sync.
///
/// **Legacy** — drives `tick_vsync_instr` only. Production uses
/// `tick_vsync_wallclock` with [`VSYNC_PERIOD`], because audit M11
/// observed this instruction-count proxy drifting from 629 v-syncs/100M
/// under lockstep down to ~2 under `--parallel`, where the dispatcher
/// executes more PPC instructions per tick call. The constant is kept
/// because unit tests still drive the instruction-count ticker for
/// determinism.
pub const VSYNC_INSTR_PERIOD: u64 = 150_000;

/// Wall-clock period for the **production** v-sync ticker: 16.667 ms,
/// i.e. 1/60 s rounded to the nearest nanosecond, targeting 60 Hz.
/// KRNBUG-D08 — converting from the instruction-count proxy fixes the
/// `--parallel` rate drop while keeping lockstep cadence stable
/// (instruction-count was *also* an approximation; wall-clock is the
/// canonical Xbox 360 v-sync source).
pub const VSYNC_PERIOD: Duration = Duration::from_nanos(16_666_667);

impl InterruptState {
    /// Record a new callback registration, replacing any previous one.
    pub fn set_callback(&mut self, callback_pc: u32, user_data: u32) {
        self.callback = Some(GraphicsInterruptCallback {
            callback_pc,
            user_data,
        });
    }

    /// Queue an interrupt for the next safe injection point.
    ///
    /// The request is discarded and counted in `dropped` when no callback
    /// is registered or when `pending` already holds
    /// [`INTERRUPT_QUEUE_CAP`] entries; otherwise `source` is appended to
    /// the back of the FIFO.
    pub fn queue_interrupt(&mut self, source: u32) {
        if self.callback.is_none() || self.pending.len() >= INTERRUPT_QUEUE_CAP {
            self.dropped += 1;
            return;
        }
        self.pending.push_back(source);
    }

    /// Bulk equivalent of calling `queue_interrupt(source)` `count` times
    /// in a row, in O(free queue slots) instead of O(`count`).
    ///
    /// Leaves `pending` and `dropped` exactly as the repeated calls would
    /// (except that `dropped` saturates instead of overflowing).
    fn queue_burst(&mut self, source: u32, count: u64) {
        let free_slots = if self.callback.is_some() {
            INTERRUPT_QUEUE_CAP.saturating_sub(self.pending.len())
        } else {
            0
        };
        // `free_slots` never exceeds INTERRUPT_QUEUE_CAP, so widening to
        // u64 is lossless.
        let accepted = count.min(free_slots as u64);
        self.pending.extend((0..accepted).map(|_| source));
        self.dropped = self.dropped.saturating_add(count - accepted);
    }

    /// Peek at the next pending source without removing it.
    pub fn peek_next(&self) -> Option<u32> {
        self.pending.front().copied()
    }

    /// Pop the next pending source (called by the injector after it has
    /// committed to dispatching it).
    pub fn take_next(&mut self) -> Option<u32> {
        self.pending.pop_front()
    }

    /// **Legacy** — instruction-count v-sync ticker. Kept for unit tests
    /// that need a deterministic clock source. Production code calls
    /// `tick_vsync_wallclock` instead.
    ///
    /// `current_instr_count` is the running instruction total; the delta
    /// against the previous call is added to `vsync_accumulator`, and one
    /// v-sync is requested per full [`VSYNC_INSTR_PERIOD`] accumulated.
    ///
    /// Returns `true` when at least one full period elapsed, i.e. when at
    /// least one queue attempt was made. The attempts can still all be
    /// dropped (no callback registered, FIFO full); every dropped attempt
    /// is counted in `dropped`.
    pub fn tick_vsync_instr(&mut self, current_instr_count: u64) -> bool {
        let delta = current_instr_count.saturating_sub(self.last_instr_count);
        self.last_instr_count = current_instr_count;
        self.vsync_accumulator = self.vsync_accumulator.saturating_add(delta);

        let periods = self.vsync_accumulator / VSYNC_INSTR_PERIOD;
        if periods == 0 {
            return false;
        }
        self.vsync_accumulator %= VSYNC_INSTR_PERIOD;
        // Account for the whole burst at once: an enormous instruction
        // delta must not turn into one loop iteration per period.
        self.queue_burst(INTERRUPT_SOURCE_VSYNC, periods);
        true
    }

    /// **Production** — wall-clock v-sync ticker. KRNBUG-D08 — replaces
    /// the legacy instruction-count proxy that drifted under `--parallel`.
    ///
    /// The first call only seeds the anchor and returns `false`. Later
    /// calls compute `floor(elapsed / VSYNC_PERIOD)` since the anchor,
    /// advance the anchor by that many full periods (so the fractional
    /// remainder carries over and quick successive calls don't
    /// over-fire), and request at most [`INTERRUPT_QUEUE_CAP`] v-syncs.
    /// Periods beyond that cap are skipped without being counted in
    /// `dropped`.
    ///
    /// Returns `true` when at least one full period elapsed, i.e. when at
    /// least one queue attempt was made. The attempts can still be
    /// dropped by the queue (no callback registered, FIFO full).
    pub fn tick_vsync_wallclock(&mut self) -> bool {
        let now = Instant::now();
        let anchor = match self.last_vsync_instant {
            Some(t) => t,
            None => {
                self.last_vsync_instant = Some(now);
                return false;
            }
        };

        // Clamp instead of truncating if the u128 nanosecond count ever
        // exceeds u64 (~584 years).
        let elapsed_ns = now
            .saturating_duration_since(anchor)
            .as_nanos()
            .min(u128::from(u64::MAX)) as u64;
        // VSYNC_PERIOD is ~16.7 ms, comfortably inside u64 nanoseconds.
        let period_ns = VSYNC_PERIOD.as_nanos() as u64;
        let periods = elapsed_ns / period_ns;
        if periods == 0 {
            return false;
        }

        // Advance the anchor by the full periods consumed, not to `now`,
        // so the partial period already elapsed still counts towards the
        // next v-sync. `periods * period_ns <= elapsed_ns`, so the
        // multiplication cannot overflow.
        let advance = Duration::from_nanos(periods * period_ns);
        self.last_vsync_instant = Some(anchor + advance);

        // Cap the burst so a clock that jumped forward (system suspend)
        // doesn't try to queue more than the FIFO can hold. The clamp is
        // done in u64 so it is also correct where `usize` is 32 bits.
        let burst = periods.min(INTERRUPT_QUEUE_CAP as u64);
        self.queue_burst(INTERRUPT_SOURCE_VSYNC, burst);
        true
    }

    /// Is an interrupt callback currently in flight? True while `saved`
    /// holds the interrupted context, regardless of which thread the
    /// callback was injected into.
    pub fn is_in_callback(&self) -> bool {
        self.saved.is_some()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Pretend the previous wall-clock tick happened `ago` in the past by
    /// back-dating the anchor, and return the anchor that was installed.
    ///
    /// Used instead of `std::thread::sleep` so the wall-clock tests cost
    /// no real time and don't depend on how long the OS actually sleeps.
    fn backdate_anchor(s: &mut InterruptState, ago: Duration) -> Instant {
        let anchor = Instant::now()
            .checked_sub(ago)
            .expect("monotonic clock too young to back-date the v-sync anchor");
        s.last_vsync_instant = Some(anchor);
        anchor
    }

    #[test]
    fn queue_interrupt_drops_without_callback() {
        let mut s = InterruptState::default();
        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
        assert_eq!(s.dropped, 1);
        assert!(s.pending.is_empty());
    }

    #[test]
    fn queue_interrupt_fifo_preserves_order() {
        let mut s = InterruptState::default();
        s.set_callback(0x1000, 0xAB);
        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
        s.queue_interrupt(INTERRUPT_SOURCE_CP);
        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
        assert_eq!(s.dropped, 0);
        // FIFO: take_next hands them out in push order.
        assert_eq!(s.take_next(), Some(INTERRUPT_SOURCE_VSYNC));
        assert_eq!(s.take_next(), Some(INTERRUPT_SOURCE_CP));
        assert_eq!(s.take_next(), Some(INTERRUPT_SOURCE_VSYNC));
        assert_eq!(s.take_next(), None);
    }

    #[test]
    fn queue_interrupt_caps_at_queue_size() {
        let mut s = InterruptState::default();
        s.set_callback(0x1000, 0xAB);
        for _ in 0..INTERRUPT_QUEUE_CAP {
            s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
        }
        // Over-cap: drops rather than evicting the oldest.
        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
        assert_eq!(s.dropped, 2);
        assert_eq!(s.pending.len(), INTERRUPT_QUEUE_CAP);
    }

    #[test]
    fn tick_vsync_instr_fires_at_new_150k_threshold() {
        let mut s = InterruptState::default();
        s.set_callback(0x1000, 0xAB);
        assert_eq!(VSYNC_INSTR_PERIOD, 150_000);
        assert!(!s.tick_vsync_instr(VSYNC_INSTR_PERIOD - 1));
        assert!(s.pending.is_empty());
        assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD));
        assert_eq!(s.peek_next(), Some(INTERRUPT_SOURCE_VSYNC));
    }

    #[test]
    fn tick_vsync_instr_drains_multiple_periods_in_one_call() {
        // Long kernel export → big instr delta → multiple v-syncs must
        // be delivered, not lost.
        let mut s = InterruptState::default();
        s.set_callback(0x1000, 0xAB);
        assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD * 3 + 10));
        assert_eq!(s.pending.len(), 3);
    }

    #[test]
    fn tick_vsync_wallclock_first_call_sets_anchor() {
        // First call seeds the anchor and never fires. KRNBUG-D08:
        // initial wall-clock state has no prior reference, so we can't
        // know the elapsed delta yet.
        let mut s = InterruptState::default();
        s.set_callback(0x1000, 0xAB);
        assert!(!s.tick_vsync_wallclock());
        assert!(s.pending.is_empty());
        assert!(s.last_vsync_instant.is_some());
    }

    #[test]
    fn tick_vsync_wallclock_fires_after_period() {
        // Back-dates the anchor by one full v-sync period (16.667 ms)
        // plus a little slack and verifies exactly one v-sync is queued
        // and the anchor moves forward by exactly one period.
        let mut s = InterruptState::default();
        s.set_callback(0x1000, 0xAB);
        let anchor = backdate_anchor(&mut s, VSYNC_PERIOD + Duration::from_millis(2));
        assert!(s.tick_vsync_wallclock());
        assert_eq!(s.pending.len(), 1);
        assert_eq!(s.peek_next(), Some(INTERRUPT_SOURCE_VSYNC));
        assert_eq!(s.last_vsync_instant, Some(anchor + VSYNC_PERIOD));
    }

    #[test]
    fn tick_vsync_wallclock_caps_burst_at_queue_cap() {
        // A multi-period elapsed window queues at most
        // INTERRUPT_QUEUE_CAP v-syncs (the FIFO can't hold more anyway).
        // Back-date 6 periods (~100 ms), expect INTERRUPT_QUEUE_CAP
        // queued while the anchor still advances by all 6 periods.
        let mut s = InterruptState::default();
        s.set_callback(0x1000, 0xAB);
        let anchor = backdate_anchor(&mut s, VSYNC_PERIOD * 6 + Duration::from_millis(2));
        assert!(s.tick_vsync_wallclock());
        assert_eq!(s.pending.len(), INTERRUPT_QUEUE_CAP);
        assert_eq!(s.last_vsync_instant, Some(anchor + VSYNC_PERIOD * 6));
    }

    /// Simulates what the main loop does: inject, execute guest code up
    /// to the sentinel, restore. Uses a single-instruction `bclr` callback
    /// — the interpreter sees `pc == callback_pc`, steps, and the blr
    /// instruction writes `lr` into `pc`, which equals `LR_HALT_SENTINEL`
    /// → main loop detects and triggers restore.
    #[test]
    fn inject_restore_roundtrip_smoke() {
        let mut ctx = PpcContext::new();
        ctx.pc = 0x1000_0000;
        ctx.lr = 0xCAFE_BABE;
        ctx.gpr[3] = 0x1234;
        ctx.gpr[4] = 0x5678;

        let mut s = InterruptState::default();
        s.set_callback(0x2000_0000, 0xDEAD);

        // Simulate main loop inject: save ctx fields, divert pc/lr/r3/r4.
        let saved = SavedCallbackCtx::capture(&ctx, INTERRUPT_SOURCE_VSYNC);
        s.saved = Some(saved);
        ctx.pc = 0x2000_0000;
        ctx.lr = xenia_cpu::context::LR_HALT_SENTINEL;
        ctx.gpr[3] = INTERRUPT_SOURCE_VSYNC as u64;
        ctx.gpr[4] = 0xDEAD;
        assert!(s.is_in_callback());

        // Guest callback "runs" to the sentinel — simulate by writing
        // pc = lr (what `blr` would do).
        ctx.pc = ctx.lr as u32;

        // Main loop detects pc == LR_HALT_SENTINEL while in_callback:
        let saved = s.saved.take().unwrap();
        saved.restore(&mut ctx);
        s.delivered += 1;

        assert_eq!(ctx.pc, 0x1000_0000);
        assert_eq!(ctx.lr, 0xCAFE_BABE);
        assert_eq!(ctx.gpr[3], 0x1234);
        assert_eq!(ctx.gpr[4], 0x5678);
        assert!(!s.is_in_callback());
        assert_eq!(s.delivered, 1);
    }

    #[test]
    fn saved_ctx_roundtrip() {
        let mut ctx = PpcContext::new();
        ctx.pc = 0x11223344;
        ctx.lr = 0xDEADBEEF;
        ctx.gpr[3] = 0xAAAA;
        ctx.gpr[4] = 0xBBBB;
        let saved = SavedCallbackCtx::capture(&ctx, INTERRUPT_SOURCE_VSYNC);
        ctx.pc = 0;
        ctx.lr = 0;
        ctx.gpr[3] = 0;
        ctx.gpr[4] = 0;
        saved.restore(&mut ctx);
        assert_eq!(ctx.pc, 0x11223344);
        assert_eq!(ctx.lr, 0xDEADBEEF);
        assert_eq!(ctx.gpr[3], 0xAAAA);
        assert_eq!(ctx.gpr[4], 0xBBBB);
    }

    /// Full volatile-GPR + SP roundtrip. Regression test for the
    /// 2026-04-24 IRQ-injection fix: the ISR callback's prologue clobbers
    /// `[r1 - 8]` on the interrupted thread's stack unless the injector
    /// pre-decrements SP by [`CALLBACK_STACK_PAD`] and the saved ctx puts
    /// SP (and the rest of the PPC volatile set) back on return.
    #[test]
    fn saved_ctx_covers_sp_and_all_volatile_gprs() {
        let mut ctx = PpcContext::new();
        ctx.pc = 0xAAAA_BBBB;
        ctx.lr = 0x1111_2222;
        ctx.ctr = 0x3333_4444;
        for i in 0..13 {
            ctx.gpr[i] = 0x1000 + i as u64;
        }
        // r13..r31 are non-volatile and should survive the callback's own
        // save/restore — the saved ctx deliberately does NOT cover them.
        for i in 13..32 {
            ctx.gpr[i] = 0xDEAD_0000 + i as u64;
        }

        let saved = SavedCallbackCtx::capture(&ctx, INTERRUPT_SOURCE_VSYNC);

        // Simulate injector: flip pc/lr/r1/r3/r4 (what the real injector
        // actually does — see try_inject_graphics_interrupt in main.rs).
        ctx.pc = 0xCAFE;
        ctx.lr = xenia_cpu::context::LR_HALT_SENTINEL;
        ctx.gpr[1] = ctx.gpr[1].wrapping_sub(CALLBACK_STACK_PAD as u64);
        ctx.gpr[3] = INTERRUPT_SOURCE_VSYNC as u64;
        ctx.gpr[4] = 0xBEEF;
        // Simulate callback clobbering a few volatile regs that aren't
        // part of the "obviously diverted" set.
        ctx.gpr[0] = 0xFEED_FACE;
        ctx.gpr[7] = 0x9999;
        ctx.gpr[12] = 0xABCD;

        saved.restore(&mut ctx);

        // All volatile GPRs restored to pre-injection.
        for i in 0..13 {
            assert_eq!(
                ctx.gpr[i],
                0x1000 + i as u64,
                "volatile r{} clobbered by callback was not restored",
                i
            );
        }
        // SP specifically back to the pre-pad value.
        assert_eq!(ctx.gpr[1], 0x1001, "SP must be restored to pre-injection");
        // Non-volatile regs were never captured; they stay as the callback
        // left them (here, untouched because we didn't modify 13..32).
        for i in 13..32 {
            assert_eq!(ctx.gpr[i], 0xDEAD_0000 + i as u64);
        }
        assert_eq!(ctx.pc, 0xAAAA_BBBB);
        assert_eq!(ctx.lr, 0x1111_2222);
        assert_eq!(ctx.ctr, 0x3333_4444);
    }
}