diff --git a/crates/xenia-memory/src/access.rs b/crates/xenia-memory/src/access.rs
index 7ee3855..7359b0d 100644
--- a/crates/xenia-memory/src/access.rs
+++ b/crates/xenia-memory/src/access.rs
@@ -1,6 +1,16 @@
 /// Trait for all guest memory access. Every load/store goes through this,
 /// enabling MMIO checking and debugger observation on every access.
 /// This is the key abstraction that eliminates the need for MMIO exception handlers.
+///
+/// **All methods take `&self`.** Write methods rely on interior mutability
+/// (atomics in [`crate::heap::GuestMemory`], `Cell` in test mocks). The
+/// actual byte stores into the backing memory are unsynchronized; callers
+/// must not concurrently read and write the same byte range from different
+/// threads. The per-page write version exposed by [`Self::page_version`] is
+/// a coarse cache-invalidation signal and is published with `Release`
+/// ordering by the writer — readers using `Acquire` (e.g. the texture
+/// cache and the interpreter decode cache) get a synchronizes-with edge to
+/// the corresponding data store.
 pub trait MemoryAccess {
     fn read_u8(&self, addr: u32) -> u8;
     fn read_u16(&self, addr: u32) -> u16;
@@ -13,14 +23,14 @@ pub trait MemoryAccess {
         f64::from_bits(self.read_u64(addr))
     }
 
-    fn write_u8(&mut self, addr: u32, val: u8);
-    fn write_u16(&mut self, addr: u32, val: u16);
-    fn write_u32(&mut self, addr: u32, val: u32);
-    fn write_u64(&mut self, addr: u32, val: u64);
-    fn write_f32(&mut self, addr: u32, val: f32) {
+    fn write_u8(&self, addr: u32, val: u8);
+    fn write_u16(&self, addr: u32, val: u16);
+    fn write_u32(&self, addr: u32, val: u32);
+    fn write_u64(&self, addr: u32, val: u64);
+    fn write_f32(&self, addr: u32, val: f32) {
         self.write_u32(addr, val.to_bits());
     }
-    fn write_f64(&mut self, addr: u32, val: f64) {
+    fn write_f64(&self, addr: u32, val: f64) {
         self.write_u64(addr, val.to_bits());
     }
 
@@ -32,7 +42,7 @@ pub trait MemoryAccess {
     }
 
     /// Write a block of bytes to guest memory.
-    fn write_bytes(&mut self, addr: u32, buf: &[u8]) {
+    fn write_bytes(&self, addr: u32, buf: &[u8]) {
         for (i, &byte) in buf.iter().enumerate() {
             self.write_u8(addr.wrapping_add(i as u32), byte);
         }
@@ -43,5 +53,47 @@ pub trait MemoryAccess {
     fn translate(&self, addr: u32) -> Option<*const u8>;
 
     /// Get a mutable direct host pointer for the given guest address.
-    fn translate_mut(&mut self, addr: u32) -> Option<*mut u8>;
+    fn translate_mut(&self, addr: u32) -> Option<*mut u8>;
+
+    /// Monotonic write-version of the 4 KiB page containing `addr`.
+    /// Used by the interpreter's decode cache (xenia-cpu `DecodeCache`)
+    /// to invalidate entries when the guest rewrites code pages.
+    ///
+    /// Default impl returns `1` — a constant non-zero value that works
+    /// for mock memories in tests (the decode cache treats
+    /// constant-version runs as "never invalidated"). Real memory
+    /// (`xenia-memory::GuestMemory`) overrides this with its
+    /// per-page counter.
+    fn page_version(&self, _addr: u32) -> u64 {
+        1
+    }
+
+    /// M1.8 — fenced 32-bit write. Used by the GPU's
+    /// `PM4_EVENT_WRITE_SHD` to publish a fence value into guest memory
+    /// after one or more data writes the CPU thread will read once it
+    /// observes the fence. Emits a `Release` fence, then stores `val`
+    /// through [`Self::write_u32`]; the intent is that earlier writes by
+    /// the caller happen-before any thread that observes `val` via
+    /// [`Self::read_u32_fence`].
+    ///
+    /// NOTE(review): `std::sync::atomic::fence` only synchronizes through
+    /// atomic accesses, and `write_u32` is not required to be one — so
+    /// this ordering leans on target behaviour (x86_64 TSO, naturally
+    /// atomic aligned 32-bit stores), not on the Rust memory model.
+    /// Nothing here checks that `addr` is 4-byte aligned — TODO confirm.
+    fn write_u32_fence(&self, addr: u32, val: u32) {
+        std::sync::atomic::fence(std::sync::atomic::Ordering::Release);
+        self.write_u32(addr, val);
+    }
+
+    /// M1.8 — fenced 32-bit read. Used by guest fence-poll loops that
+    /// busy-spin on a memory location the GPU writes via
+    /// [`Self::write_u32_fence`]. Emits an `Acquire` fence after the
+    /// load: any reads the calling thread issues *after* this call see
+    /// every write the producer issued *before* its `write_u32_fence`.
+    fn read_u32_fence(&self, addr: u32) -> u32 {
+        let v = self.read_u32(addr);
+        std::sync::atomic::fence(std::sync::atomic::Ordering::Acquire);
+        v
+    }
 }
diff --git a/crates/xenia-memory/src/heap.rs b/crates/xenia-memory/src/heap.rs
index b34a33f..18cd16f 100644
--- a/crates/xenia-memory/src/heap.rs
+++ b/crates/xenia-memory/src/heap.rs
@@ -1,3 +1,5 @@
+use std::sync::atomic::{AtomicU64, Ordering};
+
 use crate::access::MemoryAccess;
 use crate::mmio::MmioRegion;
 use crate::page_table::{AllocationState, MemoryProtect, PageEntry};
@@ -23,12 +25,66 @@ pub enum HeapType {
 pub struct GuestMemory {
     /// Host pointer to the base of the 4GB guest address space.
     membase: *mut u8,
-    /// Page table tracking allocation state for each 4K page.
-    page_table: Vec<PageEntry>,
+    /// Page table tracking allocation state for each 4K page. Each entry is
+    /// an `AtomicU64` carrying the bit-packed [`PageEntry`] representation.
+    /// Atomic so [`Self::alloc`] (and friends) can take `&self` and run
+    /// concurrently with the load/store hot path's [`Self::is_mapped`]
+    /// checks. Allocation crosses many pages but each per-page Release store
+    /// is independently published; readers (`is_mapped`/`page_entry`) use
+    /// Acquire loads. Multi-page atomicity is not provided — callers ensure
+    /// happens-before via export ordering (alloc completes before any guest
+    /// access of the new region).
+    page_table: Vec<std::sync::atomic::AtomicU64>,
     /// Registered MMIO regions (sorted by base address for binary search).
     mmio_regions: Vec<MmioRegion>,
+    /// Cached *necessary* condition for an address to fall inside *any*
+    /// registered MMIO region: an address `a` can match only if
+    /// `(a & mmio_aperture_mask) == mmio_aperture_value`. Recomputed
+    /// inside [`Self::add_mmio_region`] by keeping only the bits every
+    /// region's `(mask, base & mask)` pair covers and agrees on.
+    ///
+    /// With the GPU MMIO at `0x7FC8_0000 / 0xFFFF_0000` as the only
+    /// registered region, this is a single bit-mask compare per scalar
+    /// load/store — eliminating the prior O(N) `iter().find` over the
+    /// region list on every access. With zero regions registered the
+    /// pair stays at a sentinel only `u32::MAX` can pass, so the hot
+    /// path returns `None` without touching the Vec for other addresses.
+    mmio_aperture_mask: u32,
+    mmio_aperture_value: u32,
     /// Whether the memory mapping is owned (should be unmapped on drop).
     owned: bool,
+    /// P5 texture-cache invalidation: per-4KB-page monotonic write
+    /// version. Every `write_u8/16/32/64` bumps
+    /// `page_versions[addr >> 12]`, and a global `writes_total` counter
+    /// (shared by all pages) gets stamped into each page. The texture
+    /// cache computes `max(page_versions[..])` over the texture's byte
+    /// footprint at bind time and re-decodes if any page has advanced
+    /// since the cached entry.
+    page_versions: Vec<AtomicU64>,
+    /// Monotonic global write counter — makes per-page versions
+    /// cross-comparable even when their indices alias.
+    writes_total: AtomicU64,
+}
+
+/// Folds a new region's `(mask, value)` pair into the cached aperture pair.
+/// The result keeps only the bits that both masks cover *and* on which the
+/// two values agree, so `(a & m) == v` still holds for every address `a`
+/// that satisfied either input pair — i.e. the returned `(m, v)` is always
+/// a *necessary* (never sufficient) condition for membership in either region.
+#[inline]
+fn fold_aperture(
+    cur_mask: u32,
+    cur_value: u32,
+    new_mask: u32,
+    new_value: u32,
+) -> (u32, u32) {
+    // Bits that both masks cover AND on which both values agree are the
+    // only bits we can keep. Disagreement on any covered bit collapses
+    // that bit out of the cache.
+    let common_mask = cur_mask & new_mask;
+    let agreed = !(cur_value ^ new_value);
+    let m = common_mask & agreed;
+    (m, cur_value & m)
 }
 
 unsafe impl Send for GuestMemory {}
@@ -40,12 +96,80 @@ impl GuestMemory {
         let membase = crate::platform::reserve_address_space(GUEST_ADDRESS_SPACE)?;
         Ok(Self {
             membase,
-            page_table: vec![PageEntry::default(); PAGE_COUNT],
+            page_table: (0..PAGE_COUNT).map(|_| std::sync::atomic::AtomicU64::new(0)).collect(),
             mmio_regions: Vec::new(),
+            // Sentinel: `(a & !0) == !0` holds only for `a == u32::MAX`
+            // (which then finds no region in the empty list), so
+            // `find_mmio` yields `None` until a region is registered.
+            mmio_aperture_mask: u32::MAX,
+            mmio_aperture_value: u32::MAX,
             owned: true,
+            page_versions: (0..PAGE_COUNT).map(|_| AtomicU64::new(0)).collect(),
+            writes_total: AtomicU64::new(0),
         })
     }
 
+    /// Current version watermark for the page containing `addr`. Bumped by
+    /// any write through `write_u8/16/32/64`. Not affected by MMIO writes
+    /// (those don't touch the backing texture memory).
+    ///
+    /// Acquire load: any thread observing a value `v` here also observes
+    /// every memory write the bumping thread published before its
+    /// Release-store of `v` (see [`bump_page_version`]). This is the
+    /// synchronizes-with edge consumed by the texture cache once the GPU
+    /// runs on its own host thread.
+    pub fn page_version(&self, addr: u32) -> u64 {
+        let idx = (addr / PAGE_SIZE) as usize;
+        self.page_versions
+            .get(idx)
+            .map(|a| a.load(Ordering::Acquire))
+            .unwrap_or(0)
+    }
+
+    /// Maximum page version across the byte span `[addr, addr+len)`.
+    /// O(pages) — fast for typical texture sizes (1 MiB = 256 pages).
+    pub fn max_page_version(&self, addr: u32, len: u32) -> u64 {
+        if len == 0 {
+            return self.page_version(addr);
+        }
+        let first = addr / PAGE_SIZE;
+        let last = addr.saturating_add(len.saturating_sub(1)) / PAGE_SIZE;
+        let mut m = 0u64;
+        for p in first..=last {
+            if let Some(slot) = self.page_versions.get(p as usize) {
+                let v = slot.load(Ordering::Acquire);
+                if v > m {
+                    m = v;
+                }
+            }
+        }
+        m
+    }
+
+    /// Total number of write events observed. Useful for cross-page tie
+    /// breaking and HUD-level "is the guest scribbling?" metrics.
+    pub fn writes_total(&self) -> u64 {
+        self.writes_total.load(Ordering::Relaxed)
+    }
+
+    #[inline]
+    fn bump_page_version(&self, addr: u32) {
+        // Stamps the page holding `addr` with a fresh global write tick.
+        // Relaxed suffices for the tick itself: the publish edge is the
+        // Release read-modify-write on the per-page slot below.
+        let stamp = self
+            .writes_total
+            .fetch_add(1, Ordering::Relaxed)
+            .wrapping_add(1);
+        let idx = (addr / PAGE_SIZE) as usize;
+        if let Some(slot) = self.page_versions.get(idx) {
+            // Release `fetch_max` (not `store`): writers racing on one page
+            // can arrive out of tick order, and a store would let the older
+            // stamp win. Acquire readers still see the preceding data store.
+            slot.fetch_max(stamp, Ordering::Release);
+        }
+    }
+
     /// Get the host base pointer for the guest address space.
     pub fn membase(&self) -> *const u8 {
         self.membase
@@ -62,7 +186,12 @@ impl GuestMemory {
     }
 
     /// Translate a guest virtual address to a mutable host pointer.
-    pub fn translate_virtual_mut(&mut self, guest_addr: u32) -> *mut u8 {
+    ///
+    /// Takes `&self`. The returned pointer is into the shared
+    /// `membase` mapping; the soundness contract is the trait-level one
+    /// in [`crate::access::MemoryAccess`] — callers must not concurrently
+    /// read and write the same byte range from different threads.
+    pub fn translate_virtual_mut(&self, guest_addr: u32) -> *mut u8 {
         unsafe { self.membase.add(guest_addr as usize) }
     }
 
@@ -74,6 +203,21 @@ impl GuestMemory {
 
     /// Register an MMIO region.
     pub fn add_mmio_region(&mut self, region: MmioRegion) {
+        let new_mask = region.mask;
+        let new_value = region.base_address & region.mask;
+        if self.mmio_regions.is_empty() {
+            self.mmio_aperture_mask = new_mask;
+            self.mmio_aperture_value = new_value;
+        } else {
+            let (m, v) = fold_aperture(
+                self.mmio_aperture_mask,
+                self.mmio_aperture_value,
+                new_mask,
+                new_value,
+            );
+            self.mmio_aperture_mask = m;
+            self.mmio_aperture_value = v;
+        }
         let base = region.base_address;
         let idx = self
             .mmio_regions
@@ -83,34 +227,74 @@ impl GuestMemory {
     }
 
     /// Check if an address is in a registered MMIO region.
+    ///
+    /// Tier-3 perf — non-MMIO addresses (the common case for code fetch
+    /// and main-RAM data accesses) get rejected by a single bit-mask
+    /// compare against the cached aperture, skipping the linear search
+    /// over `mmio_regions`. The `iter().find` fallback only runs for
+    /// addresses that pass the necessary-but-not-sufficient prefilter,
+    /// preserving exact MMIO semantics when multiple regions share a
+    /// prefix or when a region's `mask` admits non-contiguous addresses.
+    #[inline]
     fn find_mmio(&self, addr: u32) -> Option<&MmioRegion> {
+        if (addr & self.mmio_aperture_mask) != self.mmio_aperture_value {
+            return None;
+        }
         self.mmio_regions.iter().find(|r| r.contains(addr))
     }
 
     /// Allocate a region in the guest address space.
+    ///
+    /// Validates that `base` is page-aligned and that `base + size` does not
+    /// overflow the 4GB guest address space. Takes `&self` — `page_table`
+    /// is `Vec<AtomicU64>` so per-page state updates use atomic stores
+    /// (`Release` ordering, paired with `Acquire` loads in
+    /// [`Self::is_mapped`] / [`Self::page_entry`]). The kernel ensures
+    /// happens-before across the alloc-then-use boundary at the export
+    /// level (the guest cannot observe the new region until the export
+    /// returns), so a single Release per page suffices and we don't need
+    /// multi-page atomicity.
     pub fn alloc(
-        &mut self,
+        &self,
         base: u32,
         size: u32,
         protect: MemoryProtect,
     ) -> Result<u32, MemoryError> {
+        if !base.is_multiple_of(PAGE_SIZE) {
+            return Err(MemoryError::AllocationFailed(format!(
+                "alloc base {:#x} is not page-aligned", base
+            )));
+        }
+        let end = (base as u64).saturating_add(size as u64);
+        if end > GUEST_ADDRESS_SPACE as u64 {
+            return Err(MemoryError::AllocationFailed(format!(
+                "alloc range {:#x}+{:#x} exceeds 4GB guest space", base, size
+            )));
+        }
+
         let page_start = (base / PAGE_SIZE) as usize;
-        let page_count = ((size + PAGE_SIZE - 1) / PAGE_SIZE) as usize;
+        let page_count = size.div_ceil(PAGE_SIZE) as usize;
 
-        // Commit pages via platform
+        // Commit pages via platform. `commit_memory` takes `*mut u8` but
+        // doesn't actually need exclusive access — the OS-level mmap call
+        // is independently thread-safe.
         let host_ptr = unsafe { self.membase.add(base as usize) };
-        crate::platform::commit_memory(host_ptr, (page_count * PAGE_SIZE as usize) as usize)?;
+        crate::platform::commit_memory(host_ptr, page_count * PAGE_SIZE as usize)?;
 
-        // Update page table
+        // Build a single `PageEntry` once, then Release-store it into each
+        // affected slot. Using a fresh `PageEntry::default()` per page
+        // would yield the same bits but at higher cost.
+        let mut entry = PageEntry::default();
+        entry.set_base_address(page_start as u32);
+        entry.set_region_page_count(page_count as u32);
+        entry.set_allocation_protect(protect);
+        entry.set_current_protect(protect);
+        entry.set_state(AllocationState::RESERVE | AllocationState::COMMIT);
+        let raw = entry.raw();
         for i in 0..page_count {
             let idx = page_start + i;
-            if idx < self.page_table.len() {
-                let entry = &mut self.page_table[idx];
-                entry.set_base_address(page_start as u32);
-                entry.set_region_page_count(page_count as u32);
-                entry.set_allocation_protect(protect);
-                entry.set_current_protect(protect);
-                entry.set_state(AllocationState::RESERVE | AllocationState::COMMIT);
+            if let Some(slot) = self.page_table.get(idx) {
+                slot.store(raw, std::sync::atomic::Ordering::Release);
             }
         }
 
@@ -126,36 +310,64 @@ impl GuestMemory {
     }
 
     /// Write a slice of bytes to guest memory (bypassing MMIO for bulk writes).
-    pub fn write_bulk(&mut self, addr: u32, buf: &[u8]) {
+    ///
+    /// Takes `&self` (matches the trait-level write contract): the actual
+    /// store goes through a raw `*mut u8` derived from `membase`, which
+    /// has no Rust aliasing semantics. Callers must respect the trait
+    /// contract — no concurrent read/write of the same byte range from
+    /// different threads. Used by the XEX loader (init, single-thread)
+    /// and `NtReadFile` (mid-execution; the file's destination buffer is
+    /// guest-thread-private by construction).
+    pub fn write_bulk(&self, addr: u32, buf: &[u8]) {
         let ptr = self.translate_virtual_mut(addr);
         unsafe {
             std::ptr::copy_nonoverlapping(buf.as_ptr(), ptr, buf.len());
         }
     }
 
-    /// Check if a guest address has been allocated/committed.
+    /// Check if a guest address has been allocated/committed. Acquire load
+    /// pairs with the Release store in [`Self::alloc`] — any thread that
+    /// observes `state.contains(COMMIT)` here also observes every
+    /// allocation-side metadata write that preceded the store.
     pub fn is_mapped(&self, addr: u32) -> bool {
         let page = (addr / PAGE_SIZE) as usize;
         if page >= self.page_table.len() {
             return false;
         }
-        self.page_table[page].state().contains(AllocationState::COMMIT)
+        let raw = self.page_table[page].load(std::sync::atomic::Ordering::Acquire);
+        PageEntry::from_raw(raw)
+            .state()
+            .contains(AllocationState::COMMIT)
     }
 
-    /// Get a page table entry for a given address.
-    pub fn page_entry(&self, addr: u32) -> &PageEntry {
+    /// Get a page table entry for a given address, or None if out of range.
+    /// Returns by value (the storage is now atomic; we publish a snapshot).
+    pub fn page_entry(&self, addr: u32) -> Option<PageEntry> {
         let page = (addr / PAGE_SIZE) as usize;
-        &self.page_table[page]
+        self.page_table
+            .get(page)
+            .map(|a| PageEntry::from_raw(a.load(std::sync::atomic::Ordering::Acquire)))
     }
 }
 
 impl MemoryAccess for GuestMemory {
+    // Tier-3 perf: `#[inline]` on the hot read/write paths lets LLVM
+    // fold the MMIO + mapping checks into the interpreter's load/store
+    // handlers, hoisting the "not-MMIO, mapped" branch out of the loop
+    // body for consecutive same-page accesses.
+    #[inline]
     fn read_u8(&self, addr: u32) -> u8 {
+        // MMIO dispatch must come first — a byte read at an MMIO-mapped
+        // address should invoke the callback, not the backing memory.
+        if let Some(mmio) = self.find_mmio(addr) {
+            return (mmio.read_callback)(addr) as u8;
+        }
         if !self.is_mapped(addr) { return 0; }
         let ptr = self.translate_virtual(addr);
         unsafe { *ptr }
     }
 
+    #[inline]
     fn read_u16(&self, addr: u32) -> u16 {
         if let Some(mmio) = self.find_mmio(addr) {
             (mmio.read_callback)(addr) as u16
@@ -167,6 +379,7 @@ impl MemoryAccess for GuestMemory {
         }
     }
 
+    #[inline]
     fn read_u32(&self, addr: u32) -> u32 {
         if let Some(mmio) = self.find_mmio(addr) {
             (mmio.read_callback)(addr)
@@ -178,6 +391,7 @@ impl MemoryAccess for GuestMemory {
         }
     }
 
+    #[inline]
     fn read_u64(&self, addr: u32) -> u64 {
         if let Some(mmio) = self.find_mmio(addr) {
             let hi = (mmio.read_callback)(addr) as u64;
@@ -191,49 +405,68 @@ impl MemoryAccess for GuestMemory {
         }
     }
 
-    fn write_u8(&mut self, addr: u32, val: u8) {
+    fn write_u8(&self, addr: u32, val: u8) {
+        // MMIO dispatch first — a byte write at an MMIO-mapped address
+        // must invoke the callback, not the backing memory.
+        if let Some(mmio) = self.find_mmio(addr) {
+            (mmio.write_callback)(addr, val as u32);
+            return;
+        }
         if !self.is_mapped(addr) { return; }
         let ptr = self.translate_virtual_mut(addr);
         unsafe { *ptr = val };
+        self.bump_page_version(addr);
     }
 
-    fn write_u16(&mut self, addr: u32, val: u16) {
+    fn write_u16(&self, addr: u32, val: u16) {
         if let Some(mmio) = self.find_mmio(addr) {
             (mmio.write_callback)(addr, val as u32);
         } else if !self.is_mapped(addr) {
-            return;
         } else {
             let ptr = self.translate_virtual_mut(addr);
             unsafe {
                 std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 2);
             }
+            self.bump_page_version(addr);
+            // A 16-bit write can cross a page boundary; bump the neighbour
+            // too so the texture cache sees the write even if it's looking
+            // at the next page's version.
+            if (addr & 0xFFF) >= (PAGE_SIZE - 1) {
+                self.bump_page_version(addr.wrapping_add(1));
+            }
         }
     }
 
-    fn write_u32(&mut self, addr: u32, val: u32) {
+    fn write_u32(&self, addr: u32, val: u32) {
         if let Some(mmio) = self.find_mmio(addr) {
             (mmio.write_callback)(addr, val);
         } else if !self.is_mapped(addr) {
-            return;
         } else {
             let ptr = self.translate_virtual_mut(addr);
             unsafe {
                 std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 4);
             }
+            self.bump_page_version(addr);
+            if (addr & 0xFFF) >= (PAGE_SIZE - 3) {
+                self.bump_page_version(addr.wrapping_add(3));
+            }
         }
     }
 
-    fn write_u64(&mut self, addr: u32, val: u64) {
+    fn write_u64(&self, addr: u32, val: u64) {
         if let Some(mmio) = self.find_mmio(addr) {
             (mmio.write_callback)(addr, (val >> 32) as u32);
             (mmio.write_callback)(addr.wrapping_add(4), val as u32);
         } else if !self.is_mapped(addr) {
-            return;
         } else {
             let ptr = self.translate_virtual_mut(addr);
             unsafe {
                 std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 8);
             }
+            self.bump_page_version(addr);
+            if (addr & 0xFFF) >= (PAGE_SIZE - 7) {
+                self.bump_page_version(addr.wrapping_add(7));
+            }
         }
     }
 
@@ -245,13 +478,288 @@ impl MemoryAccess for GuestMemory {
         }
     }
 
-    fn translate_mut(&mut self, addr: u32) -> Option<*mut u8> {
+    fn translate_mut(&self, addr: u32) -> Option<*mut u8> {
         if self.find_mmio(addr).is_some() {
             None
         } else {
             Some(self.translate_virtual_mut(addr))
         }
     }
+
+    /// Override the default impl to hand the xenia-cpu `DecodeCache` a
+    /// real per-page version. Zero means "never written" which the cache
+    /// treats as a valid version; first write bumps to 1 (via the
+    /// global `writes_total` counter already maintained).
+    #[inline]
+    fn page_version(&self, addr: u32) -> u64 {
+        GuestMemory::page_version(self, addr)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::atomic::{AtomicU32, Ordering};
+    use std::sync::Arc;
+
+    fn empty_mem() -> GuestMemory { GuestMemory::new().expect("reserve 4GB") }
+
+    #[test]
+    fn alloc_rejects_unaligned_base() {
+        let mem = empty_mem(); // `alloc` takes `&self`; no `mut` binding needed
+        let err = mem.alloc(0x1001, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE).unwrap_err();
+        assert!(matches!(err, MemoryError::AllocationFailed(_)));
+    }
+
+    #[test]
+    fn alloc_rejects_overflow_past_4gb() {
+        let mem = empty_mem(); // `alloc` takes `&self`; no `mut` binding needed
+        let err = mem.alloc(0xFFFF_0000, 0x0002_0000, MemoryProtect::READ | MemoryProtect::WRITE).unwrap_err();
+        assert!(matches!(err, MemoryError::AllocationFailed(_)));
+    }
+
+    #[test]
+    fn alloc_succeeds_for_valid_region() {
+        let mem = empty_mem(); // `alloc` takes `&self`; no `mut` binding needed
+        let base = mem.alloc(0x1000, 0x2000, MemoryProtect::READ | MemoryProtect::WRITE).expect("alloc ok");
+        assert_eq!(base, 0x1000);
+        assert!(mem.is_mapped(0x1000));
+        assert!(mem.is_mapped(0x2FFF));
+        assert!(!mem.is_mapped(0x3000));
+    }
+
+    #[test]
+    fn page_entry_returns_none_out_of_range() {
+        let mem = empty_mem();
+        // page_entry takes u32; all u32 values fit in the 4GB page table,
+        // so OOB-via-addr isn't reachable. Verify the Option behavior on an
+        // unmapped but in-range page: entry exists but is free.
+        let e = mem.page_entry(0xDEAD_BEEF).expect("in-range");
+        assert!(e.is_free());
+    }
+
+    #[test]
+    fn read_u8_dispatches_to_mmio() {
+        let mut mem = empty_mem();
+        let seen_addr = Arc::new(AtomicU32::new(0));
+        let seen_clone = seen_addr.clone();
+        mem.add_mmio_region(MmioRegion {
+            base_address: 0xEA00_0000,
+            mask: 0xFFFF_FF00,
+            size: 0x100,
+            read_callback: Box::new(move |a| {
+                seen_clone.store(a, Ordering::SeqCst);
+                0x42
+            }),
+            write_callback: Box::new(|_, _| {}),
+        });
+        let v = mem.read_u8(0xEA00_0008);
+        assert_eq!(v, 0x42);
+        assert_eq!(seen_addr.load(Ordering::SeqCst), 0xEA00_0008);
+    }
+
+    #[test]
+    fn write_u8_dispatches_to_mmio() {
+        let mut mem = empty_mem();
+        let captured = Arc::new(AtomicU32::new(0));
+        let captured_clone = captured.clone();
+        mem.add_mmio_region(MmioRegion {
+            base_address: 0xEB00_0000,
+            mask: 0xFFFF_FF00,
+            size: 0x100,
+            read_callback: Box::new(|_| 0),
+            write_callback: Box::new(move |_, v| {
+                captured_clone.store(v, Ordering::SeqCst);
+            }),
+        });
+        mem.write_u8(0xEB00_0004, 0xAB);
+        assert_eq!(captured.load(Ordering::SeqCst), 0xAB);
+    }
+
+    #[test]
+    fn u32_read_write_roundtrip_is_big_endian() {
+        let mem = empty_mem(); // `alloc`/`write_*` take `&self`; no `mut` needed
+        mem.alloc(0x2000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE).unwrap();
+        mem.write_u32(0x2000, 0xDEAD_BEEF);
+        assert_eq!(mem.read_u32(0x2000), 0xDEAD_BEEF);
+        // And verify byte layout is big-endian (PPC native order).
+        assert_eq!(mem.read_u8(0x2000), 0xDE);
+        assert_eq!(mem.read_u8(0x2001), 0xAD);
+        assert_eq!(mem.read_u8(0x2002), 0xBE);
+        assert_eq!(mem.read_u8(0x2003), 0xEF);
+    }
+
+    #[test]
+    fn page_versions_bump_on_write() {
+        let mem = empty_mem(); // `alloc`/`write_*` take `&self`; no `mut` needed
+        mem.alloc(0x8000, 0x2000, MemoryProtect::READ | MemoryProtect::WRITE)
+            .unwrap();
+        let v0 = mem.page_version(0x8000);
+        assert_eq!(v0, 0);
+        mem.write_u32(0x8000, 0xDEAD_BEEF);
+        let v1 = mem.page_version(0x8000);
+        assert!(v1 > v0, "page version should advance on write");
+        // A write to a different page advances only that page.
+        mem.write_u8(0x9000, 0xAB);
+        assert_eq!(mem.page_version(0x8000), v1);
+        assert!(mem.page_version(0x9000) > v1);
+        // `max_page_version` across the span picks up the later write.
+        let span_max = mem.max_page_version(0x8000, 0x1001);
+        assert_eq!(span_max, mem.page_version(0x9000));
+    }
+
+    #[test]
+    fn mmio_fast_path_skips_non_mmio_address() {
+        // After registering a region in the GPU MMIO aperture, a write
+        // to an unrelated main-RAM address must NOT be intercepted —
+        // it must hit backing memory and bump page_version.
+        let mut mem = empty_mem();
+        mem.alloc(0x2000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE)
+            .unwrap();
+        let dispatched = Arc::new(AtomicU32::new(0));
+        let dispatched_clone = dispatched.clone();
+        mem.add_mmio_region(MmioRegion {
+            base_address: 0x7FC8_0000,
+            mask: 0xFFFF_0000,
+            size: 0x0001_0000,
+            read_callback: Box::new(move |_| {
+                dispatched_clone.fetch_add(1, Ordering::SeqCst);
+                0
+            }),
+            write_callback: Box::new(|_, _| {}),
+        });
+        let v0 = mem.page_version(0x2000);
+        mem.write_u32(0x2000, 0xCAFE_F00D);
+        assert_eq!(mem.read_u32(0x2000), 0xCAFE_F00D);
+        assert!(mem.page_version(0x2000) > v0);
+        assert_eq!(dispatched.load(Ordering::SeqCst), 0,
+            "non-MMIO read must not have hit the MMIO callback");
+    }
+
+    #[test]
+    fn mmio_fast_path_dispatches_for_aperture() {
+        // Addresses inside the registered aperture must still hit the
+        // callback after the fast-path landed.
+        let mut mem = empty_mem();
+        let writes = Arc::new(AtomicU32::new(0));
+        let reads = Arc::new(AtomicU32::new(0));
+        let writes_clone = writes.clone();
+        let reads_clone = reads.clone();
+        mem.add_mmio_region(MmioRegion {
+            base_address: 0x7FC8_0000,
+            mask: 0xFFFF_0000,
+            size: 0x0001_0000,
+            read_callback: Box::new(move |_| {
+                reads_clone.fetch_add(1, Ordering::SeqCst);
+                0xAA
+            }),
+            write_callback: Box::new(move |_, _| {
+                writes_clone.fetch_add(1, Ordering::SeqCst);
+            }),
+        });
+        mem.write_u32(0x7FC8_0420, 0x1234);
+        assert_eq!(writes.load(Ordering::SeqCst), 1);
+        let v = mem.read_u32(0x7FC8_0008);
+        assert_eq!(v, 0xAA);
+        assert_eq!(reads.load(Ordering::SeqCst), 1);
+    }
+
+    #[test]
+    fn mmio_fast_path_handles_two_disjoint_regions() {
+        // Two disjoint MMIO regions — both must dispatch, and a
+        // non-MMIO address still must not.
+        let mut mem = empty_mem();
+        mem.alloc(0x2000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE)
+            .unwrap();
+        let a_writes = Arc::new(AtomicU32::new(0));
+        let b_writes = Arc::new(AtomicU32::new(0));
+        let a_clone = a_writes.clone();
+        let b_clone = b_writes.clone();
+        mem.add_mmio_region(MmioRegion {
+            base_address: 0x7FC8_0000,
+            mask: 0xFFFF_0000,
+            size: 0x0001_0000,
+            read_callback: Box::new(|_| 0),
+            write_callback: Box::new(move |_, _| {
+                a_clone.fetch_add(1, Ordering::SeqCst);
+            }),
+        });
+        mem.add_mmio_region(MmioRegion {
+            base_address: 0xEA00_0000,
+            mask: 0xFFFF_0000,
+            size: 0x0001_0000,
+            read_callback: Box::new(|_| 0),
+            write_callback: Box::new(move |_, _| {
+                b_clone.fetch_add(1, Ordering::SeqCst);
+            }),
+        });
+        // Both regions still dispatch.
+        mem.write_u32(0x7FC8_0008, 1);
+        mem.write_u32(0xEA00_0008, 2);
+        assert_eq!(a_writes.load(Ordering::SeqCst), 1);
+        assert_eq!(b_writes.load(Ordering::SeqCst), 1);
+        // Non-MMIO write still bypasses both callbacks.
+        let v0 = mem.page_version(0x2000);
+        mem.write_u32(0x2000, 0xDEAD_BEEF);
+        assert_eq!(a_writes.load(Ordering::SeqCst), 1);
+        assert_eq!(b_writes.load(Ordering::SeqCst), 1);
+        assert!(mem.page_version(0x2000) > v0);
+        assert_eq!(mem.read_u32(0x2000), 0xDEAD_BEEF);
+    }
+
+    #[test]
+    fn mmio_fold_aperture_idempotent_for_identical_regions() {
+        // Regression: re-registering the same region must not collapse
+        // the cached aperture (which would force every fast-rejected
+        // address back through the linear iter().find).
+        let (m, v) = super::fold_aperture(
+            0xFFFF_0000, 0x7FC8_0000,
+            0xFFFF_0000, 0x7FC8_0000,
+        );
+        assert_eq!(m, 0xFFFF_0000);
+        assert_eq!(v, 0x7FC8_0000);
+    }
+
+    #[test]
+    fn mmio_fold_aperture_widens_for_disjoint_regions() {
+        // Folding two disjoint regions yields a *necessary*-only mask.
+        // The cached pair must accept both region addresses (the inner
+        // contains() is the sufficient check) and reject something
+        // outside both.
+        let (m, v) = super::fold_aperture(
+            0xFFFF_0000, 0x7FC8_0000,
+            0xFFFF_0000, 0xEA00_0000,
+        );
+        assert_eq!((0x7FC8_0420u32 & m), v);
+        assert_eq!((0xEA00_0008u32 & m), v);
+        // 0x2000 is outside both; the fold-mask compare must reject it.
+        assert_ne!((0x0000_2000u32 & m), v);
+    }
+
+    #[test]
+    fn page_versions_ignore_mmio_writes() {
+        let mut mem = empty_mem();
+        mem.add_mmio_region(MmioRegion {
+            base_address: 0xEC00_0000,
+            mask: 0xFFFF_FF00,
+            size: 0x100,
+            read_callback: Box::new(|_| 0),
+            write_callback: Box::new(|_, _| {}),
+        });
+        let before = mem.page_version(0xEC00_0000);
+        mem.write_u32(0xEC00_0004, 0x1234);
+        assert_eq!(mem.page_version(0xEC00_0000), before);
+    }
+
+    #[test]
+    fn u64_read_write_roundtrip_is_big_endian() {
+        let mem = empty_mem(); // `alloc`/`write_*` take `&self`; no `mut` needed
+        mem.alloc(0x3000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE).unwrap();
+        mem.write_u64(0x3000, 0x1122_3344_5566_7788);
+        assert_eq!(mem.read_u64(0x3000), 0x1122_3344_5566_7788);
+        assert_eq!(mem.read_u8(0x3000), 0x11);
+        assert_eq!(mem.read_u8(0x3007), 0x88);
+    }
 }
 
 impl Drop for GuestMemory {
diff --git a/crates/xenia-memory/src/page_table.rs b/crates/xenia-memory/src/page_table.rs
index 8d116eb..d26bc5b 100644
--- a/crates/xenia-memory/src/page_table.rs
+++ b/crates/xenia-memory/src/page_table.rs
@@ -6,6 +6,19 @@ use bitflags::bitflags;
 pub struct PageEntry(u64);
 
 impl PageEntry {
+    /// Reconstruct a [`PageEntry`] from its packed `u64` representation.
+    /// Used by [`crate::GuestMemory::is_mapped`] and `page_entry` after an
+    /// atomic load from the page table.
+    pub fn from_raw(raw: u64) -> Self {
+        Self(raw)
+    }
+
+    /// The packed `u64` representation, ready to atomically Release-store
+    /// into the page table.
+    pub fn raw(&self) -> u64 {
+        self.0
+    }
+
     /// Base address of the allocated region in 4K pages (20 bits).
     pub fn base_address(&self) -> u32 {
         (self.0 & 0xFFFFF) as u32