/// Single choke point for guest memory traffic.
///
/// Every guest load and store is routed through this trait, which lets MMIO
/// checks and debugger observation happen on each access. Having this one
/// abstraction is what removes the need for MMIO exception handlers.
///
/// **Every method takes `&self`.** The write methods depend on interior
/// mutability (atomics in [`crate::heap::GuestMemory`], `Cell` in test
/// mocks). The byte stores into the backing memory themselves are
/// unsynchronized, so callers must not read and write the same byte range
/// concurrently from different threads. The per-page write version returned
/// by [`Self::page_version`] is a coarse cache-invalidation signal; the
/// writer publishes it with `Release` ordering, so readers that load it with
/// `Acquire` (e.g. the texture cache and the interpreter decode cache) get a
/// synchronizes-with edge to the corresponding data store.
pub trait MemoryAccess {
    /// Load a `u8` from guest address `addr`.
    fn read_u8(&self, addr: u32) -> u8;
    /// Load a `u16` from guest address `addr`.
    fn read_u16(&self, addr: u32) -> u16;
    /// Load a `u32` from guest address `addr`.
    fn read_u32(&self, addr: u32) -> u32;
    /// Load a `u64` from guest address `addr`.
    fn read_u64(&self, addr: u32) -> u64;

    /// Load an `f32` from `addr` by reinterpreting the bits returned by
    /// [`Self::read_u32`].
    fn read_f32(&self, addr: u32) -> f32 {
        let raw = self.read_u32(addr);
        f32::from_bits(raw)
    }

    /// Load an `f64` from `addr` by reinterpreting the bits returned by
    /// [`Self::read_u64`].
    fn read_f64(&self, addr: u32) -> f64 {
        let raw = self.read_u64(addr);
        f64::from_bits(raw)
    }

    /// Store the `u8` `val` at guest address `addr`.
    fn write_u8(&self, addr: u32, val: u8);
    /// Store the `u16` `val` at guest address `addr`.
    fn write_u16(&self, addr: u32, val: u16);
    /// Store the `u32` `val` at guest address `addr`.
    fn write_u32(&self, addr: u32, val: u32);
    /// Store the `u64` `val` at guest address `addr`.
    fn write_u64(&self, addr: u32, val: u64);

    /// Store an `f32` at `addr` by handing its bit pattern to
    /// [`Self::write_u32`].
    fn write_f32(&self, addr: u32, val: f32) {
        let raw = val.to_bits();
        self.write_u32(addr, raw);
    }

    /// Store an `f64` at `addr` by handing its bit pattern to
    /// [`Self::write_u64`].
    fn write_f64(&self, addr: u32, val: f64) {
        let raw = val.to_bits();
        self.write_u64(addr, raw);
    }

    /// Fill `buf` with consecutive guest bytes starting at `addr`.
    ///
    /// The default implementation issues one [`Self::read_u8`] per byte in
    /// ascending order; the guest address wraps around at the top of the
    /// 32-bit address space.
    fn read_bytes(&self, addr: u32, buf: &mut [u8]) {
        let mut cursor = addr;
        for slot in buf.iter_mut() {
            *slot = self.read_u8(cursor);
            // Wrapping step keeps the address arithmetic modulo 2^32.
            cursor = cursor.wrapping_add(1);
        }
    }

    /// Copy `buf` into consecutive guest bytes starting at `addr`.
    ///
    /// The default implementation issues one [`Self::write_u8`] per byte in
    /// ascending order; the guest address wraps around at the top of the
    /// 32-bit address space.
    fn write_bytes(&self, addr: u32, buf: &[u8]) {
        let mut cursor = addr;
        for byte in buf.iter().copied() {
            self.write_u8(cursor, byte);
            // Wrapping step keeps the address arithmetic modulo 2^32.
            cursor = cursor.wrapping_add(1);
        }
    }

    /// Direct host pointer backing guest address `addr`.
    ///
    /// Returns `None` when the address is invalid or lies in an MMIO region.
    fn translate(&self, addr: u32) -> Option<*const u8>;

    /// Mutable direct host pointer backing guest address `addr`.
    fn translate_mut(&self, addr: u32) -> Option<*mut u8>;

    /// Monotonic write-version of the 4 KiB page that contains `addr`.
    ///
    /// The interpreter's decode cache (xenia-cpu `DecodeCache`) consults
    /// this to drop entries when the guest rewrites code pages.
    ///
    /// The default implementation returns the constant `1`: a non-zero
    /// value that suits mock memories in tests, because the decode cache
    /// treats a version that never changes as "never invalidated". Real
    /// memory (`xenia-memory::GuestMemory`) overrides this with its
    /// per-page counter.
    fn page_version(&self, _addr: u32) -> u64 {
        // Constant, non-zero version reported by the default implementation.
        const DEFAULT_PAGE_VERSION: u64 = 1;
        DEFAULT_PAGE_VERSION
    }

    /// M1.8 — fenced 32-bit write.
    ///
    /// Used by the GPU's `PM4_EVENT_WRITE_SHD` to publish a fence value
    /// into guest memory after one or more data writes that the CPU thread
    /// will read once it observes the fence. A `Release` fence is emitted
    /// before the data store, so earlier writes by the calling thread
    /// happen-before any thread that performs the matching `Acquire` load
    /// through [`Self::read_u32_fence`].
    ///
    /// On x86_64 (TSO) the `Release` fence compiles to a no-op; weaker
    /// targets get the appropriate barrier. The store itself is 32-bit
    /// aligned and naturally atomic on x86_64 (single-copy atomicity) —
    /// that property is relied upon, and only the surrounding stores are
    /// fenced, not the store itself.
    fn write_u32_fence(&self, addr: u32, val: u32) {
        use std::sync::atomic::{fence, Ordering};

        // Order all prior writes ahead of the fence-value store.
        fence(Ordering::Release);
        self.write_u32(addr, val);
    }

    /// M1.8 — fenced 32-bit read.
    ///
    /// Used by guest fence-poll loops that busy-spin on a location the GPU
    /// writes through [`Self::write_u32_fence`]. An `Acquire` fence is
    /// emitted after the load, so reads the calling thread issues *after*
    /// this call see every write the producer issued *before* its
    /// `write_u32_fence`.
    fn read_u32_fence(&self, addr: u32) -> u32 {
        use std::sync::atomic::{fence, Ordering};

        let observed = self.read_u32(addr);
        // Keep later reads from being ordered before the load above.
        fence(Ordering::Acquire);
        observed
    }
}