use std::cell::Cell;
use std::sync::atomic::{AtomicU64, Ordering};

use crate::access::MemoryAccess;
use crate::mmio::MmioRegion;
use crate::page_table::{AllocationState, MemoryProtect, PageEntry};
use crate::MemoryError;

// Per-host-thread `(tid, pc, lr)` triple. Written by `set_writer_ctx` and
// read back by `writer_ctx`; every thread starts out with `(0, 0, 0)`.
thread_local! {
    static WRITER_CTX: Cell<(u32, u32, u32)> = const { Cell::new((0, 0, 0)) };
}

/// Record the `(tid, pc, lr)` of the instruction currently executing on
/// this host thread.
///
/// [`GuestMemory::check_mem_watch`] reads the triple back when a watched
/// store fires, so the trace line it emits can name the writer. The cost
/// is a single thread-local `Cell` store — no syscalls. While no watch is
/// armed the stored value (default `(0,0,0)`) is never consulted.
pub fn set_writer_ctx(tid: u32, pc: u32, lr: u32) {
    let stamp = (tid, pc, lr);
    WRITER_CTX.set(stamp);
}

/// Return the `(tid, pc, lr)` triple most recently stamped on the calling
/// host thread, or `(0, 0, 0)` if nothing has been stamped yet.
fn writer_ctx() -> (u32, u32, u32) {
    WRITER_CTX.get()
}

/// Size in bytes of one guest page (4 KiB). All page-table and
/// page-version indices in this file are computed as `addr / PAGE_SIZE`.
const PAGE_SIZE: u32 = 4096;
/// Total guest address space: 4GB.
const GUEST_ADDRESS_SPACE: usize = 0x1_0000_0000;
/// Number of 4K pages in the 4GB address space.
const PAGE_COUNT: usize = GUEST_ADDRESS_SPACE / PAGE_SIZE as usize;
/// Physical memory mask (512MB physical address space).
const PHYSICAL_ADDR_MASK: u32 = 0x1FFF_FFFF;

/// Category of guest heap.
///
/// NOTE(review): none of the variants are referenced in this part of the
/// file, so their exact meaning (presumably virtual, XEX image and
/// physical heaps, going by the names) should be confirmed against the
/// code that consumes this enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HeapType {
    GuestVirtual,
    GuestXex,
    GuestPhysical,
}

/// Guest memory for the emulated machine: a 4GB virtual address space
/// obtained via mmap/VirtualAlloc, with page-level tracking and MMIO
/// dispatch layered on top.
pub struct GuestMemory {
    /// Host address of guest address 0 (base of the 4GB span).
    membase: *mut u8,
    /// Allocation state of every 4K page, one bit-packed [`PageEntry`] per
    /// `AtomicU64`.
    ///
    /// The slots are atomic so that [`Self::alloc`] (and friends) can take
    /// `&self` and run concurrently with the [`Self::is_mapped`] checks on
    /// the load/store hot path. An allocation spans many pages; each page
    /// is published by its own Release store and readers
    /// (`is_mapped`/`page_entry`) use Acquire loads. Multi-page atomicity
    /// is not provided — callers get happens-before from export ordering
    /// (alloc completes before any guest access of the new region).
    page_table: Vec<AtomicU64>,
    /// Registered MMIO regions, kept sorted by base address.
    mmio_regions: Vec<MmioRegion>,
    /// Mask half of the cached MMIO prefilter.
    ///
    /// The pair `(mmio_aperture_mask, mmio_aperture_value)` is a
    /// *necessary* condition for an address to fall inside *any*
    /// registered region: `a` can match only if
    /// `(a & mmio_aperture_mask) == mmio_aperture_value`. It is recomputed
    /// in [`add_mmio_region`] by folding every region's
    /// `(mask, base & mask)` pair down to the bits they all agree on.
    ///
    /// With the GPU MMIO at `0x7FC8_0000 / 0xFFFF_0000` as the only
    /// region this is one bit-mask compare per scalar load/store instead
    /// of an O(N) `iter().find` over the region list. With no regions
    /// registered the pair stays at its initial sentinel and the hot path
    /// rejects (practically) every address without scanning the Vec.
    mmio_aperture_mask: u32,
    /// Value half of the cached MMIO prefilter (see `mmio_aperture_mask`).
    mmio_aperture_value: u32,
    /// Whether the memory mapping is owned (should be unmapped on drop).
    owned: bool,
    /// P5 texture-cache invalidation: one write version per 4KB page.
    ///
    /// Every `write_u8/16/32/64` stamps `page_versions[addr >> 12]` with
    /// the next value of the global `writes_total` counter. At bind time
    /// the texture cache takes `max(page_versions[..])` over the texture's
    /// byte footprint and re-decodes when any page has advanced since the
    /// cached entry was built.
    page_versions: Vec<AtomicU64>,
    /// Monotonic global write counter shared by all pages, which keeps
    /// per-page versions cross-comparable even when their indices alias.
    writes_total: AtomicU64,
    /// Sorted guest byte addresses whose stores are logged.
    ///
    /// Filled once through [`Self::arm_mem_watch`] before the run starts
    /// and stable afterwards. Empty by default, so the hot path's
    /// `is_empty()` check is a single cache-resident load.
    mem_watch_addrs: Vec<u32>,
    /// Count of fires observed (for tests / hand-off telemetry).
    mem_watch_count: AtomicU64,
}

/// Merge a new `(mask, value)` pair into the cached MMIO prefilter pair.
///
/// A bit survives in the returned mask only if *both* input masks cover
/// it and the two values agree on it; every other bit is dropped. The
/// returned value is `cur_value` restricted to the surviving bits. Any
/// address that satisfies either input pair therefore also satisfies the
/// result, so the result remains a *necessary* condition for membership
/// in either region. Used by `add_mmio_region`.
#[inline]
fn fold_aperture(
    cur_mask: u32,
    cur_value: u32,
    new_mask: u32,
    new_value: u32,
) -> (u32, u32) {
    // XOR marks the bits on which the two values differ; inverting it
    // leaves the bits on which they agree.
    let differing = cur_value ^ new_value;
    let kept = cur_mask & new_mask & !differing;
    (kept, cur_value & kept)
}

// `GuestMemory` holds a raw `*mut u8` (`membase`), which suppresses the
// auto traits, so `Send`/`Sync` are asserted by hand here.
// NOTE(review): soundness relies on callers honouring the "no concurrent
// read and write of the same byte range from different threads" contract
// documented on `translate_virtual_mut` / `write_bulk`, and on every other
// field (including the MMIO callbacks) being safe to share — confirm.
unsafe impl Send for GuestMemory {}
unsafe impl Sync for GuestMemory {}

impl GuestMemory {
    /// Reserve the 4GB guest address space and build empty bookkeeping:
    /// an all-zero page table, all-zero page versions, no MMIO regions and
    /// no armed memory watches.
    ///
    /// # Errors
    /// Propagates the error from `crate::platform::reserve_address_space`
    /// when the reservation fails.
    pub fn new() -> Result<Self, MemoryError> {
        let membase = crate::platform::reserve_address_space(GUEST_ADDRESS_SPACE)?;

        // One zeroed atomic slot per 4K page.
        let zeroed_slots = || -> Vec<AtomicU64> {
            std::iter::repeat_with(|| AtomicU64::new(0))
                .take(PAGE_COUNT)
                .collect()
        };
        let page_table = zeroed_slots();
        let page_versions = zeroed_slots();

        Ok(Self {
            membase,
            page_table,
            mmio_regions: Vec::new(),
            // Sentinel: `(a & !0) == !0` holds only for `a == u32::MAX`,
            // so `find_mmio` rejects every other address up front; for
            // that single address the scan over the (still empty) region
            // list yields `None` as well. The first registered region
            // overwrites the pair.
            mmio_aperture_mask: u32::MAX,
            mmio_aperture_value: u32::MAX,
            owned: true,
            page_versions,
            writes_total: AtomicU64::new(0),
            mem_watch_addrs: Vec::new(),
            mem_watch_count: AtomicU64::new(0),
        })
    }

    /// Version watermark of the page that contains `addr`; `0` if the page
    /// has never been stamped (or the index is out of range).
    ///
    /// The watermark is bumped by any write through `write_u8/16/32/64`.
    /// MMIO writes do not affect it (those don't touch the backing
    /// texture memory).
    ///
    /// The load is Acquire: a thread that observes value `v` here also
    /// observes every memory write the bumping thread published before its
    /// Release-store of `v` (see [`bump_page_version`]). That is the
    /// synchronizes-with edge the texture cache consumes once the GPU runs
    /// on its own host thread.
    pub fn page_version(&self, addr: u32) -> u64 {
        let page = (addr / PAGE_SIZE) as usize;
        match self.page_versions.get(page) {
            Some(slot) => slot.load(Ordering::Acquire),
            None => 0,
        }
    }

    /// Largest page version found among the pages overlapped by the byte
    /// span `[addr, addr+len)`.
    ///
    /// A zero-length span reports the version of the page holding `addr`.
    /// The end of the span saturates at `u32::MAX` instead of wrapping.
    /// Cost is O(pages) — fast for typical texture sizes
    /// (1 MiB = 256 pages).
    pub fn max_page_version(&self, addr: u32, len: u32) -> u64 {
        if len == 0 {
            return self.page_version(addr);
        }
        let first_page = addr / PAGE_SIZE;
        let final_byte = addr.saturating_add(len.saturating_sub(1));
        let last_page = final_byte / PAGE_SIZE;
        (first_page..=last_page)
            .filter_map(|page| self.page_versions.get(page as usize))
            .map(|slot| slot.load(Ordering::Acquire))
            .max()
            .unwrap_or(0)
    }

    /// Total number of write events observed. Useful for cross-page tie
    /// breaking and HUD-level "is the guest scribbling?" metrics.
    ///
    /// The counter advances once per `bump_page_version` call, i.e. once
    /// per *page* touched: a scalar write that crosses a page boundary
    /// counts twice and `write_bulk` counts once for every page in its
    /// span. Relaxed load — the value is a statistic, not a
    /// synchronisation point.
    pub fn writes_total(&self) -> u64 {
        self.writes_total.load(Ordering::Relaxed)
    }

    /// Advance the global write counter and stamp the new value into the
    /// version slot of the page containing `addr`.
    ///
    /// NOTE(review): the slot is written with a plain store, so when two
    /// threads bump the same page concurrently the older stamp may land
    /// last.
    #[inline]
    fn bump_page_version(&self, addr: u32) {
        // The global tick only has to hand out distinct values, so Relaxed
        // is enough; publication happens through the per-page store below.
        let previous = self.writes_total.fetch_add(1, Ordering::Relaxed);
        let stamp = previous.wrapping_add(1);

        let page = (addr / PAGE_SIZE) as usize;
        let Some(slot) = self.page_versions.get(page) else {
            return;
        };
        // Release: a reader that Acquire-loads this slot and sees `stamp`
        // also sees the data store that preceded this bump (the unsafe
        // store in the surrounding write_*).
        slot.store(stamp, Ordering::Release);
    }

    /// Get the host base pointer for the guest address space, i.e. the
    /// host address that corresponds to guest address 0. Returned as
    /// `*const u8`; no access is performed.
    pub fn membase(&self) -> *const u8 {
        self.membase
    }

    /// Get a mutable host base pointer (same address as [`Self::membase`],
    /// typed `*mut u8`). Requires `&mut self`, unlike
    /// [`Self::translate_virtual_mut`].
    pub fn membase_mut(&mut self) -> *mut u8 {
        self.membase
    }

    /// Translate a guest virtual address to a host pointer.
    ///
    /// Pure pointer arithmetic (`membase + guest_addr`): no MMIO lookup
    /// and no check that the page is committed, so dereferencing the
    /// result is only valid for mapped, non-MMIO addresses.
    pub fn translate_virtual(&self, guest_addr: u32) -> *const u8 {
        // SAFETY: assumes `membase` is the base of a full 4GB reservation
        // (as set up by `new`), so any `u32` offset stays inside it.
        unsafe { self.membase.add(guest_addr as usize) }
    }

    /// Translate a guest virtual address to a mutable host pointer.
    ///
    /// Takes `&self`. The returned pointer is into the shared
    /// `membase` mapping; the soundness contract is the trait-level one
    /// in [`crate::access::MemoryAccess`] — callers must not concurrently
    /// read and write the same byte range from different threads.
    ///
    /// Like [`Self::translate_virtual`], this is pure pointer arithmetic:
    /// it neither consults the MMIO regions nor checks that the page is
    /// committed.
    pub fn translate_virtual_mut(&self, guest_addr: u32) -> *mut u8 {
        // SAFETY: assumes `membase` is the base of a full 4GB reservation
        // (as set up by `new`), so any `u32` offset stays inside it.
        unsafe { self.membase.add(guest_addr as usize) }
    }

    /// Translate a guest physical address to a host pointer.
    ///
    /// The address is first reduced with `PHYSICAL_ADDR_MASK` (low 29
    /// bits, 512MB), then offset from `membase`. No mapping check is made.
    pub fn translate_physical(&self, guest_addr: u32) -> *const u8 {
        let phys = guest_addr & PHYSICAL_ADDR_MASK;
        // SAFETY: `phys` is at most `PHYSICAL_ADDR_MASK`, which lies inside
        // the 4GB span, assuming `membase` is a full reservation from `new`.
        unsafe { self.membase.add(phys as usize) }
    }

    /// Register an MMIO region.
    ///
    /// Two pieces of state are updated:
    /// * the cached aperture prefilter — taken verbatim from the first
    ///   region, and narrowed through [`fold_aperture`] for every later
    ///   one so it stays a necessary condition for all regions;
    /// * `mmio_regions`, where the region is inserted at the position that
    ///   keeps the list ordered by `base_address`.
    pub fn add_mmio_region(&mut self, region: MmioRegion) {
        let region_mask = region.mask;
        let region_value = region.base_address & region_mask;

        let (mask, value) = if self.mmio_regions.is_empty() {
            (region_mask, region_value)
        } else {
            fold_aperture(
                self.mmio_aperture_mask,
                self.mmio_aperture_value,
                region_mask,
                region_value,
            )
        };
        self.mmio_aperture_mask = mask;
        self.mmio_aperture_value = value;

        let key = region.base_address;
        // Found-or-not, the returned index is where the region belongs.
        let position = match self
            .mmio_regions
            .binary_search_by_key(&key, |existing| existing.base_address)
        {
            Ok(at) | Err(at) => at,
        };
        self.mmio_regions.insert(position, region);
    }

    /// Look up the registered MMIO region containing `addr`, if any.
    ///
    /// Tier-3 perf — the cached aperture acts as a prefilter: non-MMIO
    /// addresses (the common case for code fetch and main-RAM data
    /// accesses) fail a single bit-mask compare and never reach the linear
    /// search over `mmio_regions`. Passing the prefilter is necessary but
    /// not sufficient, so surviving addresses still go through the exact
    /// `contains` test; that keeps MMIO semantics intact when several
    /// regions share a prefix or a region's `mask` admits non-contiguous
    /// addresses.
    #[inline]
    fn find_mmio(&self, addr: u32) -> Option<&MmioRegion> {
        let passes_prefilter = (addr & self.mmio_aperture_mask) == self.mmio_aperture_value;
        if passes_prefilter {
            self.mmio_regions
                .iter()
                .find(|region| region.contains(addr))
        } else {
            None
        }
    }

    /// Commit a region of the guest address space and mark its pages as
    /// reserved + committed. Returns `base` on success.
    ///
    /// `size` is rounded up to whole pages. Takes `&self`: `page_table` is
    /// a `Vec<AtomicU64>`, so each page's state is published with a
    /// `Release` store (paired with the `Acquire` loads in
    /// [`Self::is_mapped`] / [`Self::page_entry`]). The kernel provides
    /// happens-before across the alloc-then-use boundary at the export
    /// level (the guest cannot observe the new region until the export
    /// returns), so one Release per page suffices and multi-page
    /// atomicity is not needed.
    ///
    /// # Errors
    /// * `MemoryError::AllocationFailed` if `base` is not page-aligned or
    ///   `base + size` extends past the 4GB guest address space.
    /// * Whatever `crate::platform::commit_memory` reports on failure.
    pub fn alloc(
        &self,
        base: u32,
        size: u32,
        protect: MemoryProtect,
    ) -> Result<u32, MemoryError> {
        if !base.is_multiple_of(PAGE_SIZE) {
            let reason = format!("alloc base {:#x} is not page-aligned", base);
            return Err(MemoryError::AllocationFailed(reason));
        }
        let region_end = u64::from(base).saturating_add(u64::from(size));
        if region_end > GUEST_ADDRESS_SPACE as u64 {
            let reason = format!(
                "alloc range {:#x}+{:#x} exceeds 4GB guest space", base, size
            );
            return Err(MemoryError::AllocationFailed(reason));
        }

        let first_page = (base / PAGE_SIZE) as usize;
        let pages = size.div_ceil(PAGE_SIZE) as usize;

        // `commit_memory` wants a `*mut u8` but needs no exclusive access —
        // the OS-level mapping call is independently thread-safe.
        let host_ptr = unsafe { self.membase.add(base as usize) };
        crate::platform::commit_memory(host_ptr, pages * PAGE_SIZE as usize)?;

        // Every page of the region carries the same descriptor, so build
        // it once and publish the same raw bits into each slot.
        let mut template = PageEntry::default();
        template.set_base_address(first_page as u32);
        template.set_region_page_count(pages as u32);
        template.set_allocation_protect(protect);
        template.set_current_protect(protect);
        template.set_state(AllocationState::RESERVE | AllocationState::COMMIT);
        let bits = template.raw();

        self.page_table
            .iter()
            .skip(first_page)
            .take(pages)
            .for_each(|slot| slot.store(bits, Ordering::Release));

        Ok(base)
    }

    /// Read a slice of bytes from guest memory (bypassing MMIO for bulk reads).
    ///
    /// Copies `buf.len()` bytes starting at guest address `addr` into
    /// `buf`.
    ///
    /// NOTE(review): unlike the scalar `read_*` methods this performs no
    /// `is_mapped` check and no bounds check — the caller must guarantee
    /// that the whole range `[addr, addr + buf.len())` is committed and
    /// lies inside the 4GB guest space.
    pub fn read_bulk(&self, addr: u32, buf: &mut [u8]) {
        let ptr = self.translate_virtual(addr);
        // SAFETY: relies on the caller-side guarantee above; `buf` is a
        // Rust slice and the source is guest memory, so they do not overlap.
        unsafe {
            std::ptr::copy_nonoverlapping(ptr, buf.as_mut_ptr(), buf.len());
        }
    }

    /// Copy `buf` into guest memory at `addr` (bypassing MMIO for bulk
    /// writes). An empty `buf` is a no-op.
    ///
    /// Takes `&self` (matches the trait-level write contract): the store
    /// goes through a raw `*mut u8` derived from `membase`, which has no
    /// Rust aliasing semantics. Callers must respect the trait contract —
    /// no concurrent read/write of the same byte range from different
    /// threads. Used by the XEX loader (init, single-thread) and
    /// `NtReadFile` (mid-execution; the file's destination buffer is
    /// guest-thread-private by construction).
    ///
    /// XMODBUG-002: every page the write touches gets its `page_versions`
    /// entry bumped. Before that fix, callers like `NtReadFile` could
    /// rewrite a page holding texture or shader bytes that a downstream
    /// cache had already keyed on the prior version, and the cache would
    /// keep serving the stale decoded bytes. The scalar `write_*` methods
    /// bump after their store; this is the bulk equivalent.
    /// Reservation-table invalidation for `lwarx`/`stwcx.` remains the
    /// caller's responsibility (the table isn't reachable from
    /// `GuestMemory` without a wider plumbing change).
    pub fn write_bulk(&self, addr: u32, buf: &[u8]) {
        let len = buf.len() as u32;
        // Nothing to copy, nothing to bump, nothing a watch could overlap.
        if buf.is_empty() {
            return;
        }

        let old_lane = self.capture_mem_watch_old(addr, len);
        let dst = self.translate_virtual_mut(addr);
        unsafe {
            std::ptr::copy_nonoverlapping(buf.as_ptr(), dst, buf.len());
        }

        let first_page = addr / PAGE_SIZE;
        let last_page = addr.saturating_add(len).saturating_sub(1) / PAGE_SIZE;
        // `bump_page_version` derives the slot from `addr / PAGE_SIZE`, so
        // the page-aligned address is as good as any address in the page.
        (first_page..=last_page)
            .map(|page| page * PAGE_SIZE)
            .for_each(|page_addr| self.bump_page_version(page_addr));

        self.check_mem_watch(addr, len, old_lane);
    }

    /// Whether the page containing `addr` is committed.
    ///
    /// The Acquire load pairs with the Release store in [`Self::alloc`]:
    /// a thread that sees `COMMIT` here also sees every allocation-side
    /// metadata write that preceded that store. Pages outside the table
    /// count as unmapped.
    pub fn is_mapped(&self, addr: u32) -> bool {
        let page = (addr / PAGE_SIZE) as usize;
        self.page_table.get(page).is_some_and(|slot| {
            let entry = PageEntry::from_raw(slot.load(Ordering::Acquire));
            entry.state().contains(AllocationState::COMMIT)
        })
    }

    /// Snapshot of the page-table entry covering `addr`, or `None` when
    /// the page index is out of range.
    ///
    /// The entry is returned by value: the storage is atomic, so what the
    /// caller gets is the state at the moment of the Acquire load.
    pub fn page_entry(&self, addr: u32) -> Option<PageEntry> {
        let page = (addr / PAGE_SIZE) as usize;
        let slot = self.page_table.get(page)?;
        Some(PageEntry::from_raw(slot.load(Ordering::Acquire)))
    }

    /// Replace the memory watch set with `addrs` (stored sorted and
    /// de-duplicated).
    ///
    /// Every store is checked for byte-exact overlap with each watched
    /// address. A hit emits one `tracing::info!` line at target
    /// `mem_watch` carrying the writer's (tid, pc, lr) — as stamped via
    /// [`set_writer_ctx`] from the interpreter prologue — plus the
    /// previous and the new value. Purely diagnostic: the store itself is
    /// unaffected.
    pub fn arm_mem_watch(&mut self, mut addrs: Vec<u32>) {
        // Plain integers: an unstable sort gives the same ordering.
        addrs.sort_unstable();
        addrs.dedup();
        self.mem_watch_addrs = addrs;
    }

    /// Number of mem-watch fires observed since arming.
    ///
    /// Relaxed load of the counter incremented in `check_mem_watch`, once
    /// per watched address hit by a store. The counter is not reset by
    /// `arm_mem_watch`.
    pub fn mem_watch_count(&self) -> u64 {
        self.mem_watch_count.load(Ordering::Relaxed)
    }

    /// True iff at least one watch address is armed, i.e. the list
    /// installed by `arm_mem_watch` is non-empty.
    #[inline]
    pub fn has_mem_watch(&self) -> bool {
        !self.mem_watch_addrs.is_empty()
    }

    /// Hot-path check (post-store): for every watched byte address inside
    /// `[addr, addr+len)`, emit a one-line record naming the (tid, pc, lr)
    /// of the writer (per [`set_writer_ctx`]), the post-store u32 lane at
    /// the watched address, and the store width.
    ///
    /// `old_lane_at_watch` is the `(watch, lane)` pair the caller captured
    /// BEFORE the store fired. It covers a single watched address; any
    /// other watched address hit by the same store is reported with
    /// `old=0`.
    ///
    /// `mem_watch_addrs` is kept sorted by `arm_mem_watch`, so the first
    /// candidate is located by binary search and the walk stops at the
    /// first address past the store, instead of scanning the whole list
    /// on every store.
    #[inline]
    fn check_mem_watch(&self, addr: u32, len: u32, old_lane_at_watch: Option<(u32, u32)>) {
        if self.mem_watch_addrs.is_empty() {
            return;
        }
        let store_end = addr.saturating_add(len);
        // Index of the first watched address that is >= `addr`.
        let first = self.mem_watch_addrs.partition_point(|&w| w < addr);
        let hits = self.mem_watch_addrs[first..]
            .iter()
            .copied()
            .take_while(|&w| w < store_end);
        for watch in hits {
            // Big-endian u32 lane starting at the watched byte, post-store.
            let new_val = {
                let p = self.translate_virtual(watch) as *const [u8; 4];
                u32::from_be_bytes(unsafe { *p })
            };
            let old_val = old_lane_at_watch
                .and_then(|(w, v)| (w == watch).then_some(v))
                .unwrap_or(0);
            let (tid, pc, lr) = writer_ctx();
            self.mem_watch_count.fetch_add(1, Ordering::Relaxed);
            tracing::info!(
                target: "mem_watch",
                "MEM-WATCH addr={:#010x} old={:#010x} new={:#010x} store_addr={:#010x} store_len={} tid={} pc={:#010x} lr={:#010x}",
                watch, old_val, new_val, addr, len, tid, pc, lr,
            );
        }
    }

    /// Returns `Some((watch, u32_lane))` for the lowest watched address
    /// inside `[addr, addr+len)`, where `u32_lane` is the big-endian u32
    /// currently stored at that address; `None` if no watched address
    /// overlaps the store.
    ///
    /// Used by the write hooks to capture OLD before the store and pass it
    /// to [`Self::check_mem_watch`] post-store. Hot-path early-out when no
    /// watch is armed. `mem_watch_addrs` is kept sorted by
    /// `arm_mem_watch`, so the candidate is found by binary search rather
    /// than a linear scan.
    #[inline]
    fn capture_mem_watch_old(&self, addr: u32, len: u32) -> Option<(u32, u32)> {
        if self.mem_watch_addrs.is_empty() {
            return None;
        }
        let store_end = addr.saturating_add(len);
        // First watched address >= `addr`; it overlaps the store only if
        // it also lies below `store_end`.
        let idx = self.mem_watch_addrs.partition_point(|&w| w < addr);
        let watch = self
            .mem_watch_addrs
            .get(idx)
            .copied()
            .filter(|&w| w < store_end)?;
        let p = self.translate_virtual(watch) as *const [u8; 4];
        let v = u32::from_be_bytes(unsafe { *p });
        Some((watch, v))
    }
}

impl MemoryAccess for GuestMemory {
    // Tier-3 perf: `#[inline]` on the hot read/write paths lets LLVM
    // fold the MMIO + mapping checks into the interpreter's load/store
    // handlers, hoisting the "not-MMIO, mapped" branch out of the loop
    // body for consecutive same-page accesses.
    /// Read one byte from guest address `addr`.
    ///
    /// MMIO takes precedence: a byte read at an MMIO-mapped address goes
    /// to the region's read callback (result truncated to `u8`), never to
    /// backing memory. Unmapped addresses read as 0.
    #[inline]
    fn read_u8(&self, addr: u32) -> u8 {
        match self.find_mmio(addr) {
            Some(region) => (region.read_callback)(addr) as u8,
            None if self.is_mapped(addr) => unsafe { *self.translate_virtual(addr) },
            None => 0,
        }
    }

    #[inline]
    fn read_u16(&self, addr: u32) -> u16 {
        if let Some(mmio) = self.find_mmio(addr) {
            (mmio.read_callback)(addr) as u16
        } else if !self.is_mapped(addr) {
            0
        } else {
            let ptr = self.translate_virtual(addr) as *const [u8; 2];
            u16::from_be_bytes(unsafe { *ptr })
        }
    }

    #[inline]
    fn read_u32(&self, addr: u32) -> u32 {
        if let Some(mmio) = self.find_mmio(addr) {
            (mmio.read_callback)(addr)
        } else if !self.is_mapped(addr) {
            0
        } else {
            let ptr = self.translate_virtual(addr) as *const [u8; 4];
            u32::from_be_bytes(unsafe { *ptr })
        }
    }

    #[inline]
    fn read_u64(&self, addr: u32) -> u64 {
        if let Some(mmio) = self.find_mmio(addr) {
            let hi = (mmio.read_callback)(addr) as u64;
            let lo = (mmio.read_callback)(addr.wrapping_add(4)) as u64;
            (hi << 32) | lo
        } else if !self.is_mapped(addr) {
            0
        } else {
            let ptr = self.translate_virtual(addr) as *const [u8; 8];
            u64::from_be_bytes(unsafe { *ptr })
        }
    }

    /// Write one byte to guest address `addr`.
    ///
    /// MMIO takes precedence: a byte write at an MMIO-mapped address goes
    /// to the region's write callback (value widened to `u32`), never to
    /// backing memory. Writes to unmapped addresses are dropped. A write
    /// that reaches memory bumps the page version and runs the mem-watch
    /// hooks.
    fn write_u8(&self, addr: u32, val: u8) {
        if let Some(region) = self.find_mmio(addr) {
            (region.write_callback)(addr, val as u32);
        } else if self.is_mapped(addr) {
            let old_lane = self.capture_mem_watch_old(addr, 1);
            unsafe { *self.translate_virtual_mut(addr) = val };
            self.bump_page_version(addr);
            self.check_mem_watch(addr, 1, old_lane);
        }
    }

    /// Write `val` as a big-endian `u16` at guest address `addr`.
    ///
    /// MMIO addresses are forwarded to the region's write callback (value
    /// widened to `u32`). Otherwise the bytes go to backing memory; the
    /// write is dropped when any byte of it lies in an uncommitted page or
    /// past the top of the 32-bit address space. A write that reaches
    /// memory bumps the version of every page it touches and runs the
    /// mem-watch hooks.
    fn write_u16(&self, addr: u32, val: u16) {
        if let Some(mmio) = self.find_mmio(addr) {
            (mmio.write_callback)(addr, val as u32);
            return;
        }
        // The access covers `[addr, last]`; both ends must be committed.
        let Some(last) = addr.checked_add(1) else {
            return;
        };
        let crosses_page = last / PAGE_SIZE != addr / PAGE_SIZE;
        if !self.is_mapped(addr) || (crosses_page && !self.is_mapped(last)) {
            return;
        }
        let old_lane = self.capture_mem_watch_old(addr, 2);
        let ptr = self.translate_virtual_mut(addr);
        // SAFETY: both ends of the 2-byte span are in committed pages; the
        // source is a temporary array, so the ranges cannot overlap.
        unsafe {
            std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 2);
        }
        self.bump_page_version(addr);
        // A 16-bit write can cross a page boundary; bump the neighbour
        // too so the texture cache sees the write even if it's looking
        // at the next page's version.
        if crosses_page {
            self.bump_page_version(last);
        }
        self.check_mem_watch(addr, 2, old_lane);
    }

    /// Write `val` as a big-endian `u32` at guest address `addr`.
    ///
    /// MMIO addresses are forwarded to the region's write callback.
    /// Otherwise the bytes go to backing memory; the write is dropped when
    /// any byte of it lies in an uncommitted page or past the top of the
    /// 32-bit address space. A write that reaches memory bumps the version
    /// of every page it touches and runs the mem-watch hooks.
    fn write_u32(&self, addr: u32, val: u32) {
        if let Some(mmio) = self.find_mmio(addr) {
            (mmio.write_callback)(addr, val);
            return;
        }
        // The access covers `[addr, last]`; both ends must be committed.
        let Some(last) = addr.checked_add(3) else {
            return;
        };
        let crosses_page = last / PAGE_SIZE != addr / PAGE_SIZE;
        if !self.is_mapped(addr) || (crosses_page && !self.is_mapped(last)) {
            return;
        }
        let old_lane = self.capture_mem_watch_old(addr, 4);
        let ptr = self.translate_virtual_mut(addr);
        // SAFETY: both ends of the 4-byte span are in committed pages; the
        // source is a temporary array, so the ranges cannot overlap.
        unsafe {
            std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 4);
        }
        self.bump_page_version(addr);
        // The tail of the write landed in the next page: bump that one too.
        if crosses_page {
            self.bump_page_version(last);
        }
        self.check_mem_watch(addr, 4, old_lane);
    }

    /// Write `val` as a big-endian `u64` at guest address `addr`.
    ///
    /// At an MMIO address the value is split into two 32-bit callback
    /// writes: the high word to `addr`, the low word to `addr + 4`
    /// (wrapping). Otherwise the bytes go to backing memory; the write is
    /// dropped when any byte of it lies in an uncommitted page or past the
    /// top of the 32-bit address space. A write that reaches memory bumps
    /// the version of every page it touches and runs the mem-watch hooks.
    fn write_u64(&self, addr: u32, val: u64) {
        if let Some(mmio) = self.find_mmio(addr) {
            (mmio.write_callback)(addr, (val >> 32) as u32);
            (mmio.write_callback)(addr.wrapping_add(4), val as u32);
            return;
        }
        // The access covers `[addr, last]`; both ends must be committed.
        let Some(last) = addr.checked_add(7) else {
            return;
        };
        let crosses_page = last / PAGE_SIZE != addr / PAGE_SIZE;
        if !self.is_mapped(addr) || (crosses_page && !self.is_mapped(last)) {
            return;
        }
        let old_lane = self.capture_mem_watch_old(addr, 8);
        let ptr = self.translate_virtual_mut(addr);
        // SAFETY: both ends of the 8-byte span are in committed pages; the
        // source is a temporary array, so the ranges cannot overlap.
        unsafe {
            std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 8);
        }
        self.bump_page_version(addr);
        // The tail of the write landed in the next page: bump that one too.
        if crosses_page {
            self.bump_page_version(last);
        }
        self.check_mem_watch(addr, 8, old_lane);
    }

    /// Host pointer for `addr`, or `None` when the address belongs to an
    /// MMIO region or its page is not committed.
    fn translate(&self, addr: u32) -> Option<*const u8> {
        let directly_readable = self.find_mmio(addr).is_none() && self.is_mapped(addr);
        directly_readable.then(|| self.translate_virtual(addr))
    }

    /// Mutable host pointer for `addr`, or `None` when the address belongs
    /// to an MMIO region or its page is not committed.
    ///
    /// Applies the same checks as `translate`, so a caller never receives
    /// a pointer into an uncommitted page.
    fn translate_mut(&self, addr: u32) -> Option<*mut u8> {
        if self.find_mmio(addr).is_some() || !self.is_mapped(addr) {
            None
        } else {
            Some(self.translate_virtual_mut(addr))
        }
    }

    /// Override the default impl to hand the xenia-cpu `DecodeCache` a
    /// real per-page version. Zero means "never written" which the cache
    /// treats as a valid version; first write bumps to 1 (via the
    /// global `writes_total` counter already maintained).
    ///
    /// Delegates to the inherent [`GuestMemory::page_version`]; the fully
    /// qualified call makes explicit that it is the inherent method, not
    /// this trait method, that is invoked.
    #[inline]
    fn page_version(&self, addr: u32) -> u64 {
        GuestMemory::page_version(self, addr)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicU32, Ordering};
    use std::sync::Arc;

    /// Fresh `GuestMemory` with nothing allocated and no MMIO regions.
    /// Panics if the 4GB reservation fails.
    fn empty_mem() -> GuestMemory {
        let reserved = GuestMemory::new();
        reserved.expect("reserve 4GB")
    }

    /// A base address that is not a multiple of the page size must be
    /// rejected with `AllocationFailed`.
    #[test]
    fn alloc_rejects_unaligned_base() {
        // `alloc` takes `&self`, so the binding does not need `mut`.
        let mem = empty_mem();
        let err = mem
            .alloc(0x1001, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE)
            .unwrap_err();
        assert!(matches!(err, MemoryError::AllocationFailed(_)));
    }

    /// A range whose end lies beyond the 4GB guest space must be rejected
    /// with `AllocationFailed`.
    #[test]
    fn alloc_rejects_overflow_past_4gb() {
        // `alloc` takes `&self`, so the binding does not need `mut`.
        let mem = empty_mem();
        let err = mem
            .alloc(0xFFFF_0000, 0x0002_0000, MemoryProtect::READ | MemoryProtect::WRITE)
            .unwrap_err();
        assert!(matches!(err, MemoryError::AllocationFailed(_)));
    }

    /// A page-aligned, in-range allocation returns its base and marks
    /// exactly the covered pages as mapped.
    #[test]
    fn alloc_succeeds_for_valid_region() {
        // `alloc` and `is_mapped` take `&self`, so no `mut` is needed.
        let mem = empty_mem();
        let base = mem
            .alloc(0x1000, 0x2000, MemoryProtect::READ | MemoryProtect::WRITE)
            .expect("alloc ok");
        assert_eq!(base, 0x1000);
        // First and last byte of the two-page region are mapped...
        assert!(mem.is_mapped(0x1000));
        assert!(mem.is_mapped(0x2FFF));
        // ...the byte right after it is not.
        assert!(!mem.is_mapped(0x3000));
    }

    /// `page_entry` on an in-range but never-allocated page yields an
    /// entry that reports itself free.
    #[test]
    fn page_entry_returns_none_out_of_range() {
        // `page_entry` takes a u32 and every u32 indexes into the 4GB page
        // table, so the `None` branch cannot be reached through `addr`.
        // What can be checked is the `Some` side: an unmapped, in-range
        // page has an entry, and that entry is free.
        let mem = empty_mem();
        let entry = mem.page_entry(0xDEAD_BEEF);
        assert!(entry.expect("in-range").is_free());
    }

    /// A byte read inside a registered MMIO region must call the region's
    /// read callback with the accessed address and return its value.
    #[test]
    fn read_u8_dispatches_to_mmio() {
        let last_read_addr = Arc::new(AtomicU32::new(0));
        let recorder = Arc::clone(&last_read_addr);
        let region = MmioRegion {
            base_address: 0xEA00_0000,
            mask: 0xFFFF_FF00,
            size: 0x100,
            read_callback: Box::new(move |a| {
                recorder.store(a, Ordering::SeqCst);
                0x42
            }),
            write_callback: Box::new(|_, _| {}),
        };

        let mut mem = empty_mem();
        mem.add_mmio_region(region);

        assert_eq!(mem.read_u8(0xEA00_0008), 0x42);
        assert_eq!(last_read_addr.load(Ordering::SeqCst), 0xEA00_0008);
    }

    /// A byte write inside a registered MMIO region must reach the
    /// region's write callback with the written value.
    #[test]
    fn write_u8_dispatches_to_mmio() {
        let last_written = Arc::new(AtomicU32::new(0));
        let recorder = Arc::clone(&last_written);
        let region = MmioRegion {
            base_address: 0xEB00_0000,
            mask: 0xFFFF_FF00,
            size: 0x100,
            read_callback: Box::new(|_| 0),
            write_callback: Box::new(move |_, v| {
                recorder.store(v, Ordering::SeqCst);
            }),
        };

        let mut mem = empty_mem();
        mem.add_mmio_region(region);

        mem.write_u8(0xEB00_0004, 0xAB);
        assert_eq!(last_written.load(Ordering::SeqCst), 0xAB);
    }

    /// A `u32` written to mapped memory reads back unchanged and is laid
    /// out most-significant byte first.
    #[test]
    fn u32_read_write_roundtrip_is_big_endian() {
        // Every method used here takes `&self`, so no `mut` is needed.
        let mem = empty_mem();
        mem.alloc(0x2000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE).unwrap();
        mem.write_u32(0x2000, 0xDEAD_BEEF);
        assert_eq!(mem.read_u32(0x2000), 0xDEAD_BEEF);
        // And verify byte layout is big-endian (PPC native order).
        assert_eq!(mem.read_u8(0x2000), 0xDE);
        assert_eq!(mem.read_u8(0x2001), 0xAD);
        assert_eq!(mem.read_u8(0x2002), 0xBE);
        assert_eq!(mem.read_u8(0x2003), 0xEF);
    }

    /// Writes advance the version of the page they hit (and only that
    /// page), and `max_page_version` reports the newest version in a span.
    #[test]
    fn page_versions_bump_on_write() {
        // Every method used here takes `&self`, so no `mut` is needed.
        let mem = empty_mem();
        mem.alloc(0x8000, 0x2000, MemoryProtect::READ | MemoryProtect::WRITE)
            .unwrap();
        let v0 = mem.page_version(0x8000);
        assert_eq!(v0, 0);
        mem.write_u32(0x8000, 0xDEAD_BEEF);
        let v1 = mem.page_version(0x8000);
        assert!(v1 > v0, "page version should advance on write");
        // A write to a different page advances only that page.
        mem.write_u8(0x9000, 0xAB);
        assert_eq!(mem.page_version(0x8000), v1);
        assert!(mem.page_version(0x9000) > v1);
        // `max_page_version` across the span picks up the later write.
        let span_max = mem.max_page_version(0x8000, 0x1001);
        assert_eq!(span_max, mem.page_version(0x9000));
    }

    /// With a region registered in the GPU MMIO aperture, accesses to an
    /// unrelated main-RAM address must not be intercepted: the write has
    /// to land in backing memory and bump the page version, and the read
    /// must not reach the MMIO read callback.
    #[test]
    fn mmio_fast_path_skips_non_mmio_address() {
        let mmio_reads = Arc::new(AtomicU32::new(0));
        let counter = Arc::clone(&mmio_reads);
        let gpu_region = MmioRegion {
            base_address: 0x7FC8_0000,
            mask: 0xFFFF_0000,
            size: 0x0001_0000,
            read_callback: Box::new(move |_| {
                counter.fetch_add(1, Ordering::SeqCst);
                0
            }),
            write_callback: Box::new(|_, _| {}),
        };

        let mut mem = empty_mem();
        mem.alloc(0x2000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE)
            .unwrap();
        mem.add_mmio_region(gpu_region);

        let version_before = mem.page_version(0x2000);
        mem.write_u32(0x2000, 0xCAFE_F00D);
        assert_eq!(mem.read_u32(0x2000), 0xCAFE_F00D);
        assert!(mem.page_version(0x2000) > version_before);
        assert_eq!(mmio_reads.load(Ordering::SeqCst), 0,
            "non-MMIO read must not have hit the MMIO callback");
    }

    /// Addresses inside the registered aperture must still reach the
    /// callbacks now that the fast-path prefilter sits in front of them.
    #[test]
    fn mmio_fast_path_dispatches_for_aperture() {
        let write_hits = Arc::new(AtomicU32::new(0));
        let read_hits = Arc::new(AtomicU32::new(0));
        let write_counter = Arc::clone(&write_hits);
        let read_counter = Arc::clone(&read_hits);
        let gpu_region = MmioRegion {
            base_address: 0x7FC8_0000,
            mask: 0xFFFF_0000,
            size: 0x0001_0000,
            read_callback: Box::new(move |_| {
                read_counter.fetch_add(1, Ordering::SeqCst);
                0xAA
            }),
            write_callback: Box::new(move |_, _| {
                write_counter.fetch_add(1, Ordering::SeqCst);
            }),
        };

        let mut mem = empty_mem();
        mem.add_mmio_region(gpu_region);

        // One write: exactly one write-callback invocation.
        mem.write_u32(0x7FC8_0420, 0x1234);
        assert_eq!(write_hits.load(Ordering::SeqCst), 1);

        // One read: the callback's value comes back, invoked exactly once.
        let value = mem.read_u32(0x7FC8_0008);
        assert_eq!(value, 0xAA);
        assert_eq!(read_hits.load(Ordering::SeqCst), 1);
    }

    #[test]
    fn mmio_fast_path_handles_two_disjoint_regions() {
        // Registers two non-overlapping MMIO regions, each with its own
        // write counter. A store into either region must reach only that
        // region's callback; a store to plain RAM must reach neither.
        let mut guest = empty_mem();
        guest
            .alloc(0x2000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE)
            .unwrap();

        let first_hits = Arc::new(AtomicU32::new(0));
        let second_hits = Arc::new(AtomicU32::new(0));
        // Same registration order as listed: 0x7FC8_0000 first, then
        // 0xEA00_0000. Both share mask and size.
        for (base, counter) in [(0x7FC8_0000, &first_hits), (0xEA00_0000, &second_hits)] {
            let hits = Arc::clone(counter);
            guest.add_mmio_region(MmioRegion {
                base_address: base,
                mask: 0xFFFF_0000,
                size: 0x0001_0000,
                read_callback: Box::new(|_| 0),
                write_callback: Box::new(move |_, _| {
                    hits.fetch_add(1, Ordering::SeqCst);
                }),
            });
        }

        // One store per region; each counter advances to exactly one.
        guest.write_u32(0x7FC8_0008, 1);
        guest.write_u32(0xEA00_0008, 2);
        assert_eq!(first_hits.load(Ordering::SeqCst), 1);
        assert_eq!(second_hits.load(Ordering::SeqCst), 1);

        // A RAM store leaves both counters untouched, bumps the page
        // version, and reads back from backing memory.
        let version_before = guest.page_version(0x2000);
        guest.write_u32(0x2000, 0xDEAD_BEEF);
        assert_eq!(first_hits.load(Ordering::SeqCst), 1);
        assert_eq!(second_hits.load(Ordering::SeqCst), 1);
        assert!(version_before < guest.page_version(0x2000));
        assert_eq!(guest.read_u32(0x2000), 0xDEAD_BEEF);
    }

    #[test]
    fn mmio_fold_aperture_idempotent_for_identical_regions() {
        // Regression guard: folding a pair with an identical pair (the
        // same region registered twice) has to return that pair
        // unchanged. A collapsed aperture would push every address that
        // should be fast-rejected back through the linear
        // `iter().find` lookup.
        let mask_in = 0xFFFF_0000;
        let value_in = 0x7FC8_0000;
        let (mask_out, value_out) =
            super::fold_aperture(mask_in, value_in, mask_in, value_in);
        assert_eq!(mask_out, 0xFFFF_0000);
        assert_eq!(value_out, 0x7FC8_0000);
    }

    #[test]
    fn mmio_fold_aperture_widens_for_disjoint_regions() {
        // The fold of two disjoint regions is only a *necessary*
        // condition (the per-region contains() remains the sufficient
        // check). So the folded pair must match an address from each
        // region, and must still reject an address that lies in neither.
        let (fold_mask, fold_value) = super::fold_aperture(
            0xFFFF_0000, 0x7FC8_0000,
            0xFFFF_0000, 0xEA00_0000,
        );
        // One sample address from each region passes the mask compare.
        for in_region in [0x7FC8_0420u32, 0xEA00_0008u32] {
            assert_eq!(in_region & fold_mask, fold_value);
        }
        // 0x2000 belongs to neither region and must fail the compare.
        assert_ne!(0x0000_2000u32 & fold_mask, fold_value);
    }

    #[test]
    fn page_versions_ignore_mmio_writes() {
        // A store that lands in a registered MMIO region must leave the
        // page version of that address exactly as it was.
        let mut guest = empty_mem();
        guest.add_mmio_region(MmioRegion {
            base_address: 0xEC00_0000,
            mask: 0xFFFF_FF00,
            size: 0x100,
            read_callback: Box::new(|_| 0),
            write_callback: Box::new(|_, _| {}),
        });

        let version_before = guest.page_version(0xEC00_0000);
        guest.write_u32(0xEC00_0004, 0x1234);
        let version_after = guest.page_version(0xEC00_0000);
        assert_eq!(version_after, version_before);
    }

    #[test]
    fn u64_read_write_roundtrip_is_big_endian() {
        // A 64-bit value must round-trip, and its bytes must sit in
        // big-endian order: most-significant byte at the lowest address,
        // least-significant byte at offset +7.
        let mut guest = empty_mem();
        guest
            .alloc(0x3000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE)
            .unwrap();

        guest.write_u64(0x3000, 0x1122_3344_5566_7788);
        assert_eq!(guest.read_u64(0x3000), 0x1122_3344_5566_7788);

        // (address, expected byte) for the first and last byte of the value.
        for (addr, expected) in [(0x3000, 0x11), (0x3007, 0x88)] {
            assert_eq!(guest.read_u8(addr), expected);
        }
    }

    #[test]
    fn mem_watch_fires_on_overlapping_store() {
        // Arms a single watch at 0x4010 and walks through stores of
        // different widths. The watch count is cumulative, so every
        // assertion below depends on the ones before it.
        let mut guest = empty_mem();
        guest
            .alloc(0x4000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE)
            .unwrap();
        guest.arm_mem_watch(vec![0x4010]);
        super::set_writer_ctx(7, 0x8200_0000, 0x8200_0004);

        // 32-bit store starting at the watched address: fires (count 1).
        guest.write_u32(0x4010, 0xDEAD_BEEF);
        assert_eq!(guest.mem_watch_count(), 1);

        // Single-byte store to the watched byte: fires (count 2).
        guest.write_u8(0x4010, 0x11);
        assert_eq!(guest.mem_watch_count(), 2);

        // Single-byte store two bytes past the watch: the watch is
        // byte-exact, so the count stays at 2.
        guest.write_u8(0x4012, 0x22);
        assert_eq!(guest.mem_watch_count(), 2);

        // 16-bit store at 0x4014 does not touch the watched byte either.
        guest.write_u16(0x4014, 0xCAFE);
        assert_eq!(guest.mem_watch_count(), 2);

        // 0x20-byte bulk write from 0x4000 covers 0x4010: one more fire.
        guest.write_bulk(0x4000, &[0u8; 0x20]);
        assert_eq!(guest.mem_watch_count(), 3);
    }

    #[test]
    fn mem_watch_empty_set_zero_overhead_path() {
        // No watch is armed here. A store must behave as usual (the
        // value reads back) and the watch counter must stay at zero.
        let mut guest = empty_mem();
        guest
            .alloc(0x5000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE)
            .unwrap();

        guest.write_u32(0x5000, 0x1234_5678);
        let read_back = guest.read_u32(0x5000);
        assert_eq!(read_back, 0x1234_5678);
        assert_eq!(guest.mem_watch_count(), 0);
    }

    #[test]
    fn mem_watch_arm_dedups_and_sorts() {
        // The watch list is handed over out of order and with each
        // address repeated. What this test observes is the fire count:
        // one 64-bit store covering both addresses must count 2, i.e.
        // once per distinct watched address rather than once per entry.
        let mut guest = empty_mem();
        guest
            .alloc(0x6000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE)
            .unwrap();
        guest.arm_mem_watch(vec![0x6008, 0x6004, 0x6008, 0x6004]);

        // The store spans [0x6004, 0x600C), which contains both 0x6004
        // and 0x6008.
        guest.write_u64(0x6004, 0x1111_2222_3333_4444);
        let fires = guest.mem_watch_count();
        assert_eq!(fires, 2);
    }
}

impl Drop for GuestMemory {
    /// Hands the guest address space back to the platform layer when this
    /// instance owns it.
    ///
    /// Does nothing when `owned` is false or `membase` is null.
    fn drop(&mut self) {
        if !self.owned || self.membase.is_null() {
            return;
        }
        // SAFETY: NOTE(review) — this relies on `membase` being the base of
        // a reservation of `GUEST_ADDRESS_SPACE` bytes that this instance
        // owns and has not already released; confirm against the
        // constructor and `crate::platform::release_address_space`.
        unsafe {
            crate::platform::release_address_space(self.membase, GUEST_ADDRESS_SPACE);
        }
    }
}