diff --git a/crates/xenia-gpu/src/gpu_system.rs b/crates/xenia-gpu/src/gpu_system.rs
index 079759f..e14ed39 100644
--- a/crates/xenia-gpu/src/gpu_system.rs
+++ b/crates/xenia-gpu/src/gpu_system.rs
@@ -522,15 +522,27 @@ impl GpuSystem {
     /// per-round GPU hook: the guest may have advanced `CP_RB_WPTR` since
     /// we last ran, and we in turn reflect our read-pointer back to the
     /// mirror register so the guest sees progress.
+    ///
+    /// GPUBUG-006: under `--parallel`, the producer (the guest CP_RB_WPTR
+    /// MMIO write) uses `Release` to publish prior ring-memory writes;
+    /// the consumer here must `Acquire`-load to pair correctly. With
+    /// a Relaxed load, our subsequent ring reads are not ordered after
+    /// the guest's pre-WPTR-bump ring writes and may observe stale data
+    /// — leading to garbage PM4 packet contents. The producer side at
+    /// `mmio_region.rs:78` already uses Release; the consumer's Relaxed
+    /// was the missing half. Symmetrically, the RPTR mirror store
+    /// publishes our read progress to the guest, so it uses Release to
+    /// keep our prior ring reads ordered before the store.
     pub fn sync_with_mmio(&mut self) {
-        let wptr_dwords = self.mmio.cp_rb_wptr.load(Ordering::Relaxed);
+        let wptr_dwords = self.mmio.cp_rb_wptr.load(Ordering::Acquire);
         if wptr_dwords != self.ring.write_offset_dwords && self.ring.size_dwords != 0 {
             self.ring.write_offset_dwords = wptr_dwords % self.ring.size_dwords;
         }
-        // Mirror our read pointer.
+        // Mirror our read pointer (Release pairs with any guest-side
+        // Acquire-load of CP_RB_RPTR for ring writeback bookkeeping).
         self.mmio
             .cp_rb_rptr
-            .store(self.ring.read_offset_dwords, Ordering::Relaxed);
+            .store(self.ring.read_offset_dwords, Ordering::Release);
     }
 
     /// True iff `execute_one` is expected to make progress without blocking.