diff --git a/crates/xenia-memory/src/access.rs b/crates/xenia-memory/src/access.rs index 7ee3855..7359b0d 100644 --- a/crates/xenia-memory/src/access.rs +++ b/crates/xenia-memory/src/access.rs @@ -1,6 +1,16 @@ /// Trait for all guest memory access. Every load/store goes through this, /// enabling MMIO checking and debugger observation on every access. /// This is the key abstraction that eliminates the need for MMIO exception handlers. +/// +/// **All methods take `&self`.** Write methods rely on interior mutability +/// (atomics in [`crate::heap::GuestMemory`], `Cell` in test mocks). The +/// actual byte stores into the backing memory are unsynchronized; callers +/// must not concurrently read and write the same byte range from different +/// threads. The per-page write version exposed by [`Self::page_version`] is +/// a coarse cache-invalidation signal and is published with `Release` +/// ordering by the writer — readers using `Acquire` (e.g. the texture +/// cache and the interpreter decode cache) get a synchronizes-with edge to +/// the corresponding data store. pub trait MemoryAccess { fn read_u8(&self, addr: u32) -> u8; fn read_u16(&self, addr: u32) -> u16; @@ -13,14 +23,14 @@ pub trait MemoryAccess { f64::from_bits(self.read_u64(addr)) } - fn write_u8(&mut self, addr: u32, val: u8); - fn write_u16(&mut self, addr: u32, val: u16); - fn write_u32(&mut self, addr: u32, val: u32); - fn write_u64(&mut self, addr: u32, val: u64); - fn write_f32(&mut self, addr: u32, val: f32) { + fn write_u8(&self, addr: u32, val: u8); + fn write_u16(&self, addr: u32, val: u16); + fn write_u32(&self, addr: u32, val: u32); + fn write_u64(&self, addr: u32, val: u64); + fn write_f32(&self, addr: u32, val: f32) { self.write_u32(addr, val.to_bits()); } - fn write_f64(&mut self, addr: u32, val: f64) { + fn write_f64(&self, addr: u32, val: f64) { self.write_u64(addr, val.to_bits()); } @@ -32,7 +42,7 @@ pub trait MemoryAccess { } /// Write a block of bytes to guest memory. 
- fn write_bytes(&mut self, addr: u32, buf: &[u8]) { + fn write_bytes(&self, addr: u32, buf: &[u8]) { for (i, &byte) in buf.iter().enumerate() { self.write_u8(addr.wrapping_add(i as u32), byte); } @@ -43,5 +53,47 @@ pub trait MemoryAccess { fn translate(&self, addr: u32) -> Option<*const u8>; /// Get a mutable direct host pointer for the given guest address. - fn translate_mut(&mut self, addr: u32) -> Option<*mut u8>; + fn translate_mut(&self, addr: u32) -> Option<*mut u8>; + + /// Monotonic write-version of the 4 KiB page containing `addr`. + /// Used by the interpreter's decode cache (xenia-cpu `DecodeCache`) + /// to invalidate entries when the guest rewrites code pages. + /// + /// Default impl returns `1` — a constant non-zero value that works + /// for mock memories in tests (the decode cache treats + /// constant-version runs as "never invalidated"). Real memory + /// (`xenia-memory::GuestMemory`) overrides this with its + /// per-page counter. + fn page_version(&self, _addr: u32) -> u64 { + 1 + } + + /// M1.8 — fenced 32-bit write. Used by the GPU's + /// `PM4_EVENT_WRITE_SHD` to publish a fence value into guest memory + /// after one or more data writes the CPU thread will read once it + /// observes the fence. Emits a `Release` fence before the data + /// store: any earlier writes by the calling thread happen-before + /// any thread that performs a matching `Acquire` load via + /// [`Self::read_u32_fence`]. + /// + /// On x86_64 (TSO) the `Release` fence compiles to a no-op; on + /// weaker targets it emits the appropriate barrier. The store + /// itself is 32-bit aligned and naturally atomic on x86_64 + /// (single-copy atomicity) — we rely on that and only fence the + /// surrounding stores, not the store itself. + fn write_u32_fence(&self, addr: u32, val: u32) { + std::sync::atomic::fence(std::sync::atomic::Ordering::Release); + self.write_u32(addr, val); + } + + /// M1.8 — fenced 32-bit read. 
Used by guest fence-poll loops that + /// busy-spin on a memory location the GPU writes via + /// [`Self::write_u32_fence`]. Emits an `Acquire` fence after the + /// load: any reads the calling thread issues *after* this call see + /// every write the producer issued *before* its `write_u32_fence`. + fn read_u32_fence(&self, addr: u32) -> u32 { + let v = self.read_u32(addr); + std::sync::atomic::fence(std::sync::atomic::Ordering::Acquire); + v + } } diff --git a/crates/xenia-memory/src/heap.rs b/crates/xenia-memory/src/heap.rs index b34a33f..18cd16f 100644 --- a/crates/xenia-memory/src/heap.rs +++ b/crates/xenia-memory/src/heap.rs @@ -1,3 +1,5 @@ +use std::sync::atomic::{AtomicU64, Ordering}; + use crate::access::MemoryAccess; use crate::mmio::MmioRegion; use crate::page_table::{AllocationState, MemoryProtect, PageEntry}; @@ -23,12 +25,66 @@ pub enum HeapType { pub struct GuestMemory { /// Host pointer to the base of the 4GB guest address space. membase: *mut u8, - /// Page table tracking allocation state for each 4K page. - page_table: Vec, + /// Page table tracking allocation state for each 4K page. Each entry is + /// an `AtomicU64` carrying the bit-packed [`PageEntry`] representation. + /// Atomic so [`Self::alloc`] (and friends) can take `&self` and run + /// concurrently with the load/store hot path's [`Self::is_mapped`] + /// checks. Allocation crosses many pages but each per-page Release store + /// is independently published; readers (`is_mapped`/`page_entry`) use + /// Acquire loads. Multi-page atomicity is not provided — callers ensure + /// happens-before via export ordering (alloc completes before any guest + /// access of the new region). + page_table: Vec, /// Registered MMIO regions (sorted by base address for binary search). mmio_regions: Vec, + /// Cached *necessary* condition for an address to fall inside *any* + /// registered MMIO region: an address `a` can match only if + /// `(a & mmio_aperture_mask) == mmio_aperture_value`. 
Recomputed + /// inside [`add_mmio_region`] as the union (greatest common + /// prefix) of every region's `(mask, base & mask)` pair. + /// + /// With the GPU MMIO at `0x7FC8_0000 / 0xFFFF_0000` as the only + /// registered region, this is a single bit-mask compare per scalar + /// load/store — eliminating the prior O(N) `iter().find` over the + /// region list on every access. With zero regions registered the + /// flag stays at the "match nothing" sentinel and the hot path + /// returns `None` without touching the Vec. + mmio_aperture_mask: u32, + mmio_aperture_value: u32, /// Whether the memory mapping is owned (should be unmapped on drop). owned: bool, + /// P5 texture-cache invalidation: per-4KB-page monotonic write + /// version. Every `write_u8/16/32/64` bumps + /// `page_versions[addr >> 12]`, and a global `writes_total` counter + /// (shared by all pages) gets stamped into each page. The texture + /// cache computes `max(page_versions[..])` over the texture's byte + /// footprint at bind time and re-decodes if any page has advanced + /// since the cached entry. + page_versions: Vec, + /// Monotonic global write counter — makes per-page versions + /// cross-comparable even when their indices alias. + writes_total: AtomicU64, +} + +/// Folds two `(mask, value)` aperture pairs into one: the result mask keeps +/// only the bits set in both `cur_mask` and `new_mask` on which the values agree. +/// Used by `add_mmio_region` to fold a new region into the cached +/// fast-reject pair without losing soundness — the result is always a +/// *necessary* condition for membership in either region. +#[inline] +fn fold_aperture( + cur_mask: u32, + cur_value: u32, + new_mask: u32, + new_value: u32, +) -> (u32, u32) { + // Bits that both masks cover AND on which both values agree are the + // only bits we can keep. Disagreement on any covered bit collapses + // that bit out of the cache. 
+ let common_mask = cur_mask & new_mask; + let agreed = !(cur_value ^ new_value); + let m = common_mask & agreed; + (m, cur_value & m) } unsafe impl Send for GuestMemory {} @@ -40,12 +96,80 @@ impl GuestMemory { let membase = crate::platform::reserve_address_space(GUEST_ADDRESS_SPACE)?; Ok(Self { membase, - page_table: vec![PageEntry::default(); PAGE_COUNT], + page_table: (0..PAGE_COUNT).map(|_| std::sync::atomic::AtomicU64::new(0)).collect(), mmio_regions: Vec::new(), + // Sentinel "match (almost) nothing" — `(a & !0) == !0` holds only + // for `a == u32::MAX`, and that lookup then scans an empty region + // list, so `find_mmio` yields `None` until a region is registered. + mmio_aperture_mask: u32::MAX, + mmio_aperture_value: u32::MAX, owned: true, + page_versions: (0..PAGE_COUNT).map(|_| AtomicU64::new(0)).collect(), + writes_total: AtomicU64::new(0), }) } + /// Current version watermark for the page containing `addr`. Bumped by + /// any write through `write_u8/16/32/64`. Not affected by MMIO writes + /// (those don't touch the backing texture memory). + /// + /// Acquire load: any thread observing a value `v` here also observes + /// every memory write the bumping thread published before its + /// Release-store of `v` (see [`bump_page_version`]). This is the + /// synchronizes-with edge consumed by the texture cache once the GPU + /// runs on its own host thread. + pub fn page_version(&self, addr: u32) -> u64 { + let idx = (addr / PAGE_SIZE) as usize; + self.page_versions + .get(idx) + .map(|a| a.load(Ordering::Acquire)) + .unwrap_or(0) + } + + /// Maximum page version across the byte span `[addr, addr+len)`. + /// O(pages) — fast for typical texture sizes (1 MiB = 256 pages). 
+ pub fn max_page_version(&self, addr: u32, len: u32) -> u64 { + if len == 0 { + return self.page_version(addr); + } + let first = addr / PAGE_SIZE; + let last = addr.saturating_add(len.saturating_sub(1)) / PAGE_SIZE; + let mut m = 0u64; + for p in first..=last { + if let Some(slot) = self.page_versions.get(p as usize) { + let v = slot.load(Ordering::Acquire); + if v > m { + m = v; + } + } + } + m + } + + /// Total number of write events observed. Useful for cross-page tie + /// breaking and HUD-level "is the guest scribbling?" metrics. + pub fn writes_total(&self) -> u64 { + self.writes_total.load(Ordering::Relaxed) + } + + #[inline] + fn bump_page_version(&self, addr: u32) { + // Relaxed is sufficient for the global tick — the only payload + // that depends on a particular value is the per-page slot below, + // and the publish-edge there is its own Release store. + let stamp = self + .writes_total + .fetch_add(1, Ordering::Relaxed) + .wrapping_add(1); + let idx = (addr / PAGE_SIZE) as usize; + if let Some(slot) = self.page_versions.get(idx) { + // Release: any reader that Acquire-loads this slot and sees + // `stamp` also observes the data store that preceded this + // bump (the unsafe `*ptr = val` in the surrounding write_*). + slot.store(stamp, Ordering::Release); + } + } + /// Get the host base pointer for the guest address space. pub fn membase(&self) -> *const u8 { self.membase @@ -62,7 +186,12 @@ impl GuestMemory { } /// Translate a guest virtual address to a mutable host pointer. - pub fn translate_virtual_mut(&mut self, guest_addr: u32) -> *mut u8 { + /// + /// Takes `&self`. The returned pointer is into the shared + /// `membase` mapping; the soundness contract is the trait-level one + /// in [`crate::access::MemoryAccess`] — callers must not concurrently + /// read and write the same byte range from different threads. 
+ pub fn translate_virtual_mut(&self, guest_addr: u32) -> *mut u8 { unsafe { self.membase.add(guest_addr as usize) } } @@ -74,6 +203,21 @@ impl GuestMemory { /// Register an MMIO region. pub fn add_mmio_region(&mut self, region: MmioRegion) { + let new_mask = region.mask; + let new_value = region.base_address & region.mask; + if self.mmio_regions.is_empty() { + self.mmio_aperture_mask = new_mask; + self.mmio_aperture_value = new_value; + } else { + let (m, v) = fold_aperture( + self.mmio_aperture_mask, + self.mmio_aperture_value, + new_mask, + new_value, + ); + self.mmio_aperture_mask = m; + self.mmio_aperture_value = v; + } let base = region.base_address; let idx = self .mmio_regions @@ -83,34 +227,74 @@ impl GuestMemory { } /// Check if an address is in a registered MMIO region. + /// + /// Tier-3 perf — non-MMIO addresses (the common case for code fetch + /// and main-RAM data accesses) get rejected by a single bit-mask + /// compare against the cached aperture, skipping the linear search + /// over `mmio_regions`. The `iter().find` fallback only runs for + /// addresses that pass the necessary-but-not-sufficient prefilter, + /// preserving exact MMIO semantics when multiple regions share a + /// prefix or when a region's `mask` admits non-contiguous addresses. + #[inline] fn find_mmio(&self, addr: u32) -> Option<&MmioRegion> { + if (addr & self.mmio_aperture_mask) != self.mmio_aperture_value { + return None; + } self.mmio_regions.iter().find(|r| r.contains(addr)) } /// Allocate a region in the guest address space. + /// + /// Validates that `base` is page-aligned and that `base + size` does not + /// overflow the 4GB guest address space. Takes `&self` — `page_table` + /// is `Vec` so per-page state updates use atomic stores + /// (`Release` ordering, paired with `Acquire` loads in + /// [`Self::is_mapped`] / [`Self::page_entry`]). 
The kernel ensures + /// happens-before across the alloc-then-use boundary at the export + /// level (the guest cannot observe the new region until the export + /// returns), so a single Release per page suffices and we don't need + /// multi-page atomicity. pub fn alloc( - &mut self, + &self, base: u32, size: u32, protect: MemoryProtect, ) -> Result { + if !base.is_multiple_of(PAGE_SIZE) { + return Err(MemoryError::AllocationFailed(format!( + "alloc base {:#x} is not page-aligned", base + ))); + } + let end = (base as u64).saturating_add(size as u64); + if end > GUEST_ADDRESS_SPACE as u64 { + return Err(MemoryError::AllocationFailed(format!( + "alloc range {:#x}+{:#x} exceeds 4GB guest space", base, size + ))); + } + let page_start = (base / PAGE_SIZE) as usize; - let page_count = ((size + PAGE_SIZE - 1) / PAGE_SIZE) as usize; + let page_count = size.div_ceil(PAGE_SIZE) as usize; - // Commit pages via platform + // Commit pages via platform. `commit_memory` takes `*mut u8` but + // doesn't actually need exclusive access — the OS-level mmap call + // is independently thread-safe. let host_ptr = unsafe { self.membase.add(base as usize) }; - crate::platform::commit_memory(host_ptr, (page_count * PAGE_SIZE as usize) as usize)?; + crate::platform::commit_memory(host_ptr, page_count * PAGE_SIZE as usize)?; - // Update page table + // Build a single `PageEntry` once, then Release-store it into each + // affected slot. Using a fresh `PageEntry::default()` per page + // would yield the same bits but at higher cost. 
+ let mut entry = PageEntry::default(); + entry.set_base_address(page_start as u32); + entry.set_region_page_count(page_count as u32); + entry.set_allocation_protect(protect); + entry.set_current_protect(protect); + entry.set_state(AllocationState::RESERVE | AllocationState::COMMIT); + let raw = entry.raw(); for i in 0..page_count { let idx = page_start + i; - if idx < self.page_table.len() { - let entry = &mut self.page_table[idx]; - entry.set_base_address(page_start as u32); - entry.set_region_page_count(page_count as u32); - entry.set_allocation_protect(protect); - entry.set_current_protect(protect); - entry.set_state(AllocationState::RESERVE | AllocationState::COMMIT); + if let Some(slot) = self.page_table.get(idx) { + slot.store(raw, std::sync::atomic::Ordering::Release); } } @@ -126,36 +310,64 @@ impl GuestMemory { } /// Write a slice of bytes to guest memory (bypassing MMIO for bulk writes). - pub fn write_bulk(&mut self, addr: u32, buf: &[u8]) { + /// + /// Takes `&self` (matches the trait-level write contract): the actual + /// store goes through a raw `*mut u8` derived from `membase`, which + /// has no Rust aliasing semantics. Callers must respect the trait + /// contract — no concurrent read/write of the same byte range from + /// different threads. Used by the XEX loader (init, single-thread) + /// and `NtReadFile` (mid-execution; the file's destination buffer is + /// guest-thread-private by construction). + pub fn write_bulk(&self, addr: u32, buf: &[u8]) { let ptr = self.translate_virtual_mut(addr); unsafe { std::ptr::copy_nonoverlapping(buf.as_ptr(), ptr, buf.len()); } } - /// Check if a guest address has been allocated/committed. + /// Check if a guest address has been allocated/committed. Acquire load + /// pairs with the Release store in [`Self::alloc`] — any thread that + /// observes `state.contains(COMMIT)` here also observes every + /// allocation-side metadata write that preceded the store. 
pub fn is_mapped(&self, addr: u32) -> bool { let page = (addr / PAGE_SIZE) as usize; if page >= self.page_table.len() { return false; } - self.page_table[page].state().contains(AllocationState::COMMIT) + let raw = self.page_table[page].load(std::sync::atomic::Ordering::Acquire); + PageEntry::from_raw(raw) + .state() + .contains(AllocationState::COMMIT) } - /// Get a page table entry for a given address. - pub fn page_entry(&self, addr: u32) -> &PageEntry { + /// Get a page table entry for a given address, or None if out of range. + /// Returns by value (the storage is now atomic; we publish a snapshot). + pub fn page_entry(&self, addr: u32) -> Option { let page = (addr / PAGE_SIZE) as usize; - &self.page_table[page] + self.page_table + .get(page) + .map(|a| PageEntry::from_raw(a.load(std::sync::atomic::Ordering::Acquire))) } } impl MemoryAccess for GuestMemory { + // Tier-3 perf: `#[inline]` on the hot read/write paths lets LLVM + // fold the MMIO + mapping checks into the interpreter's load/store + // handlers, hoisting the "not-MMIO, mapped" branch out of the loop + // body for consecutive same-page accesses. + #[inline] fn read_u8(&self, addr: u32) -> u8 { + // MMIO dispatch must come first — a byte read at an MMIO-mapped + // address should invoke the callback, not the backing memory. 
+ if let Some(mmio) = self.find_mmio(addr) { + return (mmio.read_callback)(addr) as u8; + } if !self.is_mapped(addr) { return 0; } let ptr = self.translate_virtual(addr); unsafe { *ptr } } + #[inline] fn read_u16(&self, addr: u32) -> u16 { if let Some(mmio) = self.find_mmio(addr) { (mmio.read_callback)(addr) as u16 @@ -167,6 +379,7 @@ impl MemoryAccess for GuestMemory { } } + #[inline] fn read_u32(&self, addr: u32) -> u32 { if let Some(mmio) = self.find_mmio(addr) { (mmio.read_callback)(addr) @@ -178,6 +391,7 @@ impl MemoryAccess for GuestMemory { } } + #[inline] fn read_u64(&self, addr: u32) -> u64 { if let Some(mmio) = self.find_mmio(addr) { let hi = (mmio.read_callback)(addr) as u64; @@ -191,49 +405,68 @@ impl MemoryAccess for GuestMemory { } } - fn write_u8(&mut self, addr: u32, val: u8) { + fn write_u8(&self, addr: u32, val: u8) { + // MMIO dispatch first — a byte write at an MMIO-mapped address + // must invoke the callback, not the backing memory. + if let Some(mmio) = self.find_mmio(addr) { + (mmio.write_callback)(addr, val as u32); + return; + } if !self.is_mapped(addr) { return; } let ptr = self.translate_virtual_mut(addr); unsafe { *ptr = val }; + self.bump_page_version(addr); } - fn write_u16(&mut self, addr: u32, val: u16) { + fn write_u16(&self, addr: u32, val: u16) { if let Some(mmio) = self.find_mmio(addr) { (mmio.write_callback)(addr, val as u32); } else if !self.is_mapped(addr) { - return; } else { let ptr = self.translate_virtual_mut(addr); unsafe { std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 2); } + self.bump_page_version(addr); + // A 16-bit write can cross a page boundary; bump the neighbour + // too so the texture cache sees the write even if it's looking + // at the next page's version. 
+ if (addr & 0xFFF) >= (PAGE_SIZE - 1) { + self.bump_page_version(addr.wrapping_add(1)); + } } } - fn write_u32(&mut self, addr: u32, val: u32) { + fn write_u32(&self, addr: u32, val: u32) { if let Some(mmio) = self.find_mmio(addr) { (mmio.write_callback)(addr, val); } else if !self.is_mapped(addr) { - return; } else { let ptr = self.translate_virtual_mut(addr); unsafe { std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 4); } + self.bump_page_version(addr); + if (addr & 0xFFF) >= (PAGE_SIZE - 3) { + self.bump_page_version(addr.wrapping_add(3)); + } } } - fn write_u64(&mut self, addr: u32, val: u64) { + fn write_u64(&self, addr: u32, val: u64) { if let Some(mmio) = self.find_mmio(addr) { (mmio.write_callback)(addr, (val >> 32) as u32); (mmio.write_callback)(addr.wrapping_add(4), val as u32); } else if !self.is_mapped(addr) { - return; } else { let ptr = self.translate_virtual_mut(addr); unsafe { std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 8); } + self.bump_page_version(addr); + if (addr & 0xFFF) >= (PAGE_SIZE - 7) { + self.bump_page_version(addr.wrapping_add(7)); + } } } @@ -245,13 +478,288 @@ impl MemoryAccess for GuestMemory { } } - fn translate_mut(&mut self, addr: u32) -> Option<*mut u8> { + fn translate_mut(&self, addr: u32) -> Option<*mut u8> { if self.find_mmio(addr).is_some() { None } else { Some(self.translate_virtual_mut(addr)) } } + + /// Override the default impl to hand the xenia-cpu `DecodeCache` a + /// real per-page version. Zero means "never written" which the cache + /// treats as a valid version; first write bumps to 1 (via the + /// global `writes_total` counter already maintained). 
+ #[inline] + fn page_version(&self, addr: u32) -> u64 { + GuestMemory::page_version(self, addr) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::{AtomicU32, Ordering}; + use std::sync::Arc; + + fn empty_mem() -> GuestMemory { GuestMemory::new().expect("reserve 4GB") } + + #[test] + fn alloc_rejects_unaligned_base() { + let mut mem = empty_mem(); + let err = mem.alloc(0x1001, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE).unwrap_err(); + assert!(matches!(err, MemoryError::AllocationFailed(_))); + } + + #[test] + fn alloc_rejects_overflow_past_4gb() { + let mut mem = empty_mem(); + let err = mem.alloc(0xFFFF_0000, 0x0002_0000, MemoryProtect::READ | MemoryProtect::WRITE).unwrap_err(); + assert!(matches!(err, MemoryError::AllocationFailed(_))); + } + + #[test] + fn alloc_succeeds_for_valid_region() { + let mut mem = empty_mem(); + let base = mem.alloc(0x1000, 0x2000, MemoryProtect::READ | MemoryProtect::WRITE).expect("alloc ok"); + assert_eq!(base, 0x1000); + assert!(mem.is_mapped(0x1000)); + assert!(mem.is_mapped(0x2FFF)); + assert!(!mem.is_mapped(0x3000)); + } + + #[test] + fn page_entry_returns_none_out_of_range() { + let mem = empty_mem(); + // page_entry takes u32; all u32 values fit in the 4GB page table, + // so OOB-via-addr isn't reachable. Verify the Option behavior on an + // unmapped but in-range page: entry exists but is free. 
+ let e = mem.page_entry(0xDEAD_BEEF).expect("in-range"); + assert!(e.is_free()); + } + + #[test] + fn read_u8_dispatches_to_mmio() { + let mut mem = empty_mem(); + let seen_addr = Arc::new(AtomicU32::new(0)); + let seen_clone = seen_addr.clone(); + mem.add_mmio_region(MmioRegion { + base_address: 0xEA00_0000, + mask: 0xFFFF_FF00, + size: 0x100, + read_callback: Box::new(move |a| { + seen_clone.store(a, Ordering::SeqCst); + 0x42 + }), + write_callback: Box::new(|_, _| {}), + }); + let v = mem.read_u8(0xEA00_0008); + assert_eq!(v, 0x42); + assert_eq!(seen_addr.load(Ordering::SeqCst), 0xEA00_0008); + } + + #[test] + fn write_u8_dispatches_to_mmio() { + let mut mem = empty_mem(); + let captured = Arc::new(AtomicU32::new(0)); + let captured_clone = captured.clone(); + mem.add_mmio_region(MmioRegion { + base_address: 0xEB00_0000, + mask: 0xFFFF_FF00, + size: 0x100, + read_callback: Box::new(|_| 0), + write_callback: Box::new(move |_, v| { + captured_clone.store(v, Ordering::SeqCst); + }), + }); + mem.write_u8(0xEB00_0004, 0xAB); + assert_eq!(captured.load(Ordering::SeqCst), 0xAB); + } + + #[test] + fn u32_read_write_roundtrip_is_big_endian() { + let mut mem = empty_mem(); + mem.alloc(0x2000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE).unwrap(); + mem.write_u32(0x2000, 0xDEAD_BEEF); + assert_eq!(mem.read_u32(0x2000), 0xDEAD_BEEF); + // And verify byte layout is big-endian (PPC native order). + assert_eq!(mem.read_u8(0x2000), 0xDE); + assert_eq!(mem.read_u8(0x2001), 0xAD); + assert_eq!(mem.read_u8(0x2002), 0xBE); + assert_eq!(mem.read_u8(0x2003), 0xEF); + } + + #[test] + fn page_versions_bump_on_write() { + let mut mem = empty_mem(); + mem.alloc(0x8000, 0x2000, MemoryProtect::READ | MemoryProtect::WRITE) + .unwrap(); + let v0 = mem.page_version(0x8000); + assert_eq!(v0, 0); + mem.write_u32(0x8000, 0xDEAD_BEEF); + let v1 = mem.page_version(0x8000); + assert!(v1 > v0, "page version should advance on write"); + // A write to a different page advances only that page. 
+ mem.write_u8(0x9000, 0xAB); + assert_eq!(mem.page_version(0x8000), v1); + assert!(mem.page_version(0x9000) > v1); + // `max_page_version` across the span picks up the later write. + let span_max = mem.max_page_version(0x8000, 0x1001); + assert_eq!(span_max, mem.page_version(0x9000)); + } + + #[test] + fn mmio_fast_path_skips_non_mmio_address() { + // After registering a region in the GPU MMIO aperture, a write + // to an unrelated main-RAM address must NOT be intercepted — + // it must hit backing memory and bump page_version. + let mut mem = empty_mem(); + mem.alloc(0x2000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE) + .unwrap(); + let dispatched = Arc::new(AtomicU32::new(0)); + let dispatched_clone = dispatched.clone(); + mem.add_mmio_region(MmioRegion { + base_address: 0x7FC8_0000, + mask: 0xFFFF_0000, + size: 0x0001_0000, + read_callback: Box::new(move |_| { + dispatched_clone.fetch_add(1, Ordering::SeqCst); + 0 + }), + write_callback: Box::new(|_, _| {}), + }); + let v0 = mem.page_version(0x2000); + mem.write_u32(0x2000, 0xCAFE_F00D); + assert_eq!(mem.read_u32(0x2000), 0xCAFE_F00D); + assert!(mem.page_version(0x2000) > v0); + assert_eq!(dispatched.load(Ordering::SeqCst), 0, + "non-MMIO read must not have hit the MMIO callback"); + } + + #[test] + fn mmio_fast_path_dispatches_for_aperture() { + // Addresses inside the registered aperture must still hit the + // callback after the fast-path landed. 
+ let mut mem = empty_mem(); + let writes = Arc::new(AtomicU32::new(0)); + let reads = Arc::new(AtomicU32::new(0)); + let writes_clone = writes.clone(); + let reads_clone = reads.clone(); + mem.add_mmio_region(MmioRegion { + base_address: 0x7FC8_0000, + mask: 0xFFFF_0000, + size: 0x0001_0000, + read_callback: Box::new(move |_| { + reads_clone.fetch_add(1, Ordering::SeqCst); + 0xAA + }), + write_callback: Box::new(move |_, _| { + writes_clone.fetch_add(1, Ordering::SeqCst); + }), + }); + mem.write_u32(0x7FC8_0420, 0x1234); + assert_eq!(writes.load(Ordering::SeqCst), 1); + let v = mem.read_u32(0x7FC8_0008); + assert_eq!(v, 0xAA); + assert_eq!(reads.load(Ordering::SeqCst), 1); + } + + #[test] + fn mmio_fast_path_handles_two_disjoint_regions() { + // Two disjoint MMIO regions — both must dispatch, and a + // non-MMIO address still must not. + let mut mem = empty_mem(); + mem.alloc(0x2000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE) + .unwrap(); + let a_writes = Arc::new(AtomicU32::new(0)); + let b_writes = Arc::new(AtomicU32::new(0)); + let a_clone = a_writes.clone(); + let b_clone = b_writes.clone(); + mem.add_mmio_region(MmioRegion { + base_address: 0x7FC8_0000, + mask: 0xFFFF_0000, + size: 0x0001_0000, + read_callback: Box::new(|_| 0), + write_callback: Box::new(move |_, _| { + a_clone.fetch_add(1, Ordering::SeqCst); + }), + }); + mem.add_mmio_region(MmioRegion { + base_address: 0xEA00_0000, + mask: 0xFFFF_0000, + size: 0x0001_0000, + read_callback: Box::new(|_| 0), + write_callback: Box::new(move |_, _| { + b_clone.fetch_add(1, Ordering::SeqCst); + }), + }); + // Both regions still dispatch. + mem.write_u32(0x7FC8_0008, 1); + mem.write_u32(0xEA00_0008, 2); + assert_eq!(a_writes.load(Ordering::SeqCst), 1); + assert_eq!(b_writes.load(Ordering::SeqCst), 1); + // Non-MMIO write still bypasses both callbacks. 
+ let v0 = mem.page_version(0x2000); + mem.write_u32(0x2000, 0xDEAD_BEEF); + assert_eq!(a_writes.load(Ordering::SeqCst), 1); + assert_eq!(b_writes.load(Ordering::SeqCst), 1); + assert!(mem.page_version(0x2000) > v0); + assert_eq!(mem.read_u32(0x2000), 0xDEAD_BEEF); + } + + #[test] + fn mmio_fold_aperture_idempotent_for_identical_regions() { + // Regression: re-registering the same region must not collapse + // the cached aperture (which would force every fast-rejected + // address back through the linear iter().find). + let (m, v) = super::fold_aperture( + 0xFFFF_0000, 0x7FC8_0000, + 0xFFFF_0000, 0x7FC8_0000, + ); + assert_eq!(m, 0xFFFF_0000); + assert_eq!(v, 0x7FC8_0000); + } + + #[test] + fn mmio_fold_aperture_widens_for_disjoint_regions() { + // Folding two disjoint regions yields a *necessary*-only mask. + // The cached pair must accept both region addresses (the inner + // contains() is the sufficient check) and reject something + // outside both. + let (m, v) = super::fold_aperture( + 0xFFFF_0000, 0x7FC8_0000, + 0xFFFF_0000, 0xEA00_0000, + ); + assert_eq!((0x7FC8_0420u32 & m), v); + assert_eq!((0xEA00_0008u32 & m), v); + // 0x2000 is outside both; the fold-mask compare must reject it. 
+ assert_ne!((0x0000_2000u32 & m), v); + } + + #[test] + fn page_versions_ignore_mmio_writes() { + let mut mem = empty_mem(); + mem.add_mmio_region(MmioRegion { + base_address: 0xEC00_0000, + mask: 0xFFFF_FF00, + size: 0x100, + read_callback: Box::new(|_| 0), + write_callback: Box::new(|_, _| {}), + }); + let before = mem.page_version(0xEC00_0000); + mem.write_u32(0xEC00_0004, 0x1234); + assert_eq!(mem.page_version(0xEC00_0000), before); + } + + #[test] + fn u64_read_write_roundtrip_is_big_endian() { + let mut mem = empty_mem(); + mem.alloc(0x3000, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE).unwrap(); + mem.write_u64(0x3000, 0x1122_3344_5566_7788); + assert_eq!(mem.read_u64(0x3000), 0x1122_3344_5566_7788); + assert_eq!(mem.read_u8(0x3000), 0x11); + assert_eq!(mem.read_u8(0x3007), 0x88); + } } impl Drop for GuestMemory { diff --git a/crates/xenia-memory/src/page_table.rs b/crates/xenia-memory/src/page_table.rs index 8d116eb..d26bc5b 100644 --- a/crates/xenia-memory/src/page_table.rs +++ b/crates/xenia-memory/src/page_table.rs @@ -6,6 +6,19 @@ use bitflags::bitflags; pub struct PageEntry(u64); impl PageEntry { + /// Reconstruct a [`PageEntry`] from its packed `u64` representation. + /// Used by [`crate::GuestMemory::is_mapped`] and `page_entry` after an + /// atomic load from the page table. + pub fn from_raw(raw: u64) -> Self { + Self(raw) + } + + /// The packed `u64` representation, ready to atomically Release-store + /// into the page table. + pub fn raw(&self) -> u64 { + self.0 + } + /// Base address of the allocated region in 4K pages (20 bits). pub fn base_address(&self) -> u32 { (self.0 & 0xFFFFF) as u32