xenia-memory: interior-mutable writes, page versioning, fenced ops

Re-shape MemoryAccess so write methods take &self and rely on interior mutability (atomics in GuestMemory, Cell in test mocks). This unblocks the &Arc<KernelState>-only execution model the CPU/HLE crates moved to. GuestMemory grows: a per-4 KiB-page write-version counter (page_version) that the CPU's decode cache and the texture cache observe via Acquire; fenced 32-bit/64-bit read/write helpers (Release on writer / Acquire on reader) that PM4_EVENT_WRITE_SHD and the matching CPU consumers use to synchronize fence publication; and broader page-table / heap accounting needed by the new HLE allocators.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 16:27:13 +02:00
parent e2b8860e10
commit e9b2b57a44
3 changed files with 611 additions and 38 deletions
--- a/crates/xenia-memory/src/access.rs
+++ b/crates/xenia-memory/src/access.rs
@@ -1,6 +1,16 @@
 /// Trait for all guest memory access. Every load/store goes through this,
 /// enabling MMIO checking and debugger observation on every access.
 /// This is the key abstraction that eliminates the need for MMIO exception handlers.
+///
+/// **All methods take `&self`.** Write methods rely on interior mutability
+/// (atomics in [`crate::heap::GuestMemory`], `Cell` in test mocks). The
+/// actual byte stores into the backing memory are unsynchronized; callers
+/// must not concurrently read and write the same byte range from different
+/// threads, except for fence words polled via [`Self::read_u32_fence`]. The
+/// per-page write version ([`Self::page_version`]) is a coarse
+/// cache-invalidation signal published with `Release` ordering by the
+/// writer — readers using `Acquire` (e.g. the texture cache and the
+/// interpreter decode cache) synchronize with it and so see the data store.
 pub trait MemoryAccess {
    fn read_u8(&self, addr: u32) -> u8;
    fn read_u16(&self, addr: u32) -> u16;
@@ -13,14 +23,14 @@ pub trait MemoryAccess {
        f64::from_bits(self.read_u64(addr))
    }

-    fn write_u8(&mut self, addr: u32, val: u8);
-    fn write_u16(&mut self, addr: u32, val: u16);
-    fn write_u32(&mut self, addr: u32, val: u32);
-    fn write_u64(&mut self, addr: u32, val: u64);
-    fn write_f32(&mut self, addr: u32, val: f32) {
+    fn write_u8(&self, addr: u32, val: u8);
+    fn write_u16(&self, addr: u32, val: u16);
+    fn write_u32(&self, addr: u32, val: u32);
+    fn write_u64(&self, addr: u32, val: u64);
+    fn write_f32(&self, addr: u32, val: f32) {
        self.write_u32(addr, val.to_bits());
    }
-    fn write_f64(&mut self, addr: u32, val: f64) {
+    fn write_f64(&self, addr: u32, val: f64) {
        self.write_u64(addr, val.to_bits());
    }

@@ -32,7 +42,7 @@ pub trait MemoryAccess {
    }

    /// Write a block of bytes to guest memory.
-    fn write_bytes(&mut self, addr: u32, buf: &[u8]) {
+    fn write_bytes(&self, addr: u32, buf: &[u8]) {
        for (i, &byte) in buf.iter().enumerate() {
            self.write_u8(addr.wrapping_add(i as u32), byte);
        }
@@ -43,5 +53,47 @@ pub trait MemoryAccess {
    fn translate(&self, addr: u32) -> Option<*const u8>;

    /// Get a mutable direct host pointer for the given guest address.
-    fn translate_mut(&mut self, addr: u32) -> Option<*mut u8>;
+    fn translate_mut(&self, addr: u32) -> Option<*mut u8>;
+
+    /// Monotonic write-version of the 4 KiB page containing `addr`.
+    /// Used by the interpreter's decode cache (xenia-cpu `DecodeCache`)
+    /// to invalidate entries when the guest rewrites code pages.
+    ///
+    /// The default implementation returns `1` — a constant non-zero value
+    /// suitable for mock memories in tests: a version that never changes
+    /// means the decode cache never sees a reason to invalidate. Real
+    /// memory ([`crate::heap::GuestMemory`]) overrides this with its
+    /// per-page counter.
+    fn page_version(&self, _addr: u32) -> u64 {
+        1
+    }
+
+    /// M1.8 — fenced 32-bit write. Used by the GPU's
+    /// `PM4_EVENT_WRITE_SHD` to publish a fence value into guest memory
+    /// after one or more data writes the CPU thread will read once it
+    /// observes the fence. Emits a `Release` fence before the data
+    /// store: any earlier writes by the calling thread happen-before
+    /// any thread that performs a matching `Acquire` load via
+    /// [`Self::read_u32_fence`].
+    ///
+    /// On x86_64 (TSO) the `Release` fence emits no instruction but still
+    /// stops the compiler reordering across it; weaker targets get a real
+    /// barrier. The store itself is assumed 4-byte aligned, hence naturally
+    /// atomic on x86_64 (single-copy atomicity) — we rely on that and only
+    /// fence the surrounding stores, not the store itself.
+    fn write_u32_fence(&self, addr: u32, val: u32) {
+        std::sync::atomic::fence(std::sync::atomic::Ordering::Release);
+        self.write_u32(addr, val);
+    }
+
+    /// M1.8 — fenced 32-bit read. Used by guest fence-poll loops that
+    /// busy-spin on a memory location the GPU writes via
+    /// [`Self::write_u32_fence`]. Emits an `Acquire` fence after the load:
+    /// once the load returns the published value, later reads by the calling
+    /// thread see every write the producer issued *before* `write_u32_fence`.
+    fn read_u32_fence(&self, addr: u32) -> u32 {
+        let v = self.read_u32(addr);
+        std::sync::atomic::fence(std::sync::atomic::Ordering::Acquire);
+        v
+    }
 }