diff --git a/crates/xenia-gpu/Cargo.toml b/crates/xenia-gpu/Cargo.toml index fe02e00..ca1a775 100644 --- a/crates/xenia-gpu/Cargo.toml +++ b/crates/xenia-gpu/Cargo.toml @@ -11,3 +11,11 @@ tracing = { workspace = true } thiserror = { workspace = true } anyhow = { workspace = true } byteorder = { workspace = true } +metrics = { workspace = true } +bytemuck = { workspace = true } +crossbeam-channel = { workspace = true } + +[dev-dependencies] +# Used to validate bundled WGSL placeholders compile cleanly. Matches the +# wgpu-22 transitive dep so we don't pull in a second naga version. +naga = { version = "22", features = ["wgsl-in"] } diff --git a/crates/xenia-gpu/src/draw_state.rs b/crates/xenia-gpu/src/draw_state.rs new file mode 100644 index 0000000..0b078a8 --- /dev/null +++ b/crates/xenia-gpu/src/draw_state.rs @@ -0,0 +1,1113 @@ +//! Extract draw state from the Xenos register file at `PM4_DRAW_INDX` time. +//! +//! This is the "what are we drawing?" snapshot: primitive type, vertex count, +//! index buffer (if any), viewport, scissor, blend, depth state, and enough +//! handles for a future translator / uber-shader to pull fetch constants + +//! shader blobs. Ground truth: `xenia-canary/src/xenia/gpu/draw_util.h` and +//! the PM4 handler at `pm4_command_processor_implement.h:1128-1151`. +//! +//! We only extract what the P3 uber-shader actually consumes; the rest is +//! reserved for later phases. + +use crate::register_file::RegisterFile; + +/// Primitive type (Xenos `PrimitiveType` enum from `xenos.h`). 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum PrimitiveType { + None, + PointList, + LineList, + LineStrip, + TriangleList, + TriangleFan, + TriangleStrip, + RectangleList, + QuadList, + Unknown(u8), +} + +impl PrimitiveType { + pub fn from_bits(b: u32) -> Self { + match b & 0x3F { + 0 => PrimitiveType::None, + 1 => PrimitiveType::PointList, + 2 => PrimitiveType::LineList, + 3 => PrimitiveType::LineStrip, + 4 => PrimitiveType::TriangleList, + 5 => PrimitiveType::TriangleFan, + 6 => PrimitiveType::TriangleStrip, + 8 => PrimitiveType::RectangleList, + 13 => PrimitiveType::QuadList, + other => PrimitiveType::Unknown(other as u8), + } + } +} + +/// How the draw was issued per `VGT_DRAW_INITIATOR.source_select`: +/// 0=DMA, 1=Immediate (in-packet indices), 2=AutoIndex. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum IndexSource { + /// Index buffer fetched from `VGT_DMA_BASE` / `VGT_DMA_SIZE`. + Dma { + base_address: u32, + size_dwords: u32, + index_size: IndexSize, + }, + /// Indices follow the `DRAW_INDX_2` packet header inline. + Immediate { index_size: IndexSize }, + /// No index buffer; generate `0..vertex_count - 1` on the host. + AutoIndex, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum IndexSize { + /// 16-bit indices. + Sixteen, + /// 32-bit indices. + ThirtyTwo, +} + +/// Snapshot of one draw call's state, sampled from the register file. +#[derive(Debug, Clone, Copy)] +pub struct DrawState { + pub primitive: PrimitiveType, + pub vertex_count: u32, + pub index_source: IndexSource, + pub viewport: Viewport, + pub scissor: Scissor, + /// RB_COLOR_INFO for each of the 4 possible color render targets; `None` + /// where the target is not bound. + pub color_info: [Option; 4], + pub depth_info: Option, + pub rb_modecontrol: u32, + pub rb_colorcontrol: u32, + pub rb_depthcontrol: u32, + /// P4: per-color-target blend state. Index matches `color_info`. + pub rb_blendcontrol: [u32; 4], + /// P4: stencil state. 
+ pub rb_stencilrefmask: u32, + pub rb_stencilrefmask_bf: u32, + /// P4: pixel offset applied at rasterization. + pub pa_sc_window_offset: u32, + /// P4: resolve destination registers (`RB_COPY_*`). These are set by + /// the guest just before triggering a TILE_FLUSH event and describe + /// where an EDRAM→texture copy should land. + pub rb_copy_control: u32, + pub rb_copy_dest_base: u32, + pub rb_copy_dest_pitch: u32, + pub rb_copy_dest_info: u32, + /// Key of the VS blob that was active at draw time (from + /// `GpuSystem::active_vs_key`). `None` = no VS loaded yet; the draw is + /// meaningless and will be rejected by the dispatcher. + pub vs_blob_key: Option, + /// Key of the PS blob that was active at draw time. + pub ps_blob_key: Option, +} + +#[derive(Debug, Clone, Copy, Default)] +pub struct Viewport { + pub scale_x: f32, + pub scale_y: f32, + pub scale_z: f32, + pub offset_x: f32, + pub offset_y: f32, + pub offset_z: f32, +} + +#[derive(Debug, Clone, Copy, Default)] +pub struct Scissor { + pub tl_x: u16, + pub tl_y: u16, + pub br_x: u16, + pub br_y: u16, +} + +#[derive(Debug, Clone, Copy)] +pub struct ColorTargetInfo { + /// EDRAM tile base for this color target (`RB_COLOR_INFO.base_tiles`). + pub base_tiles: u16, + /// Color format (`RB_COLOR_INFO.color_format`). + pub format: u8, +} + +#[derive(Debug, Clone, Copy)] +pub struct DepthTargetInfo { + /// EDRAM tile base for depth/stencil. + pub base_tiles: u16, + /// 0=D24S8, 1=D24FS8 (per `xenos.h:404-408`). + pub format: u8, +} + +/// Resolve source: either one of four color render targets or the depth RT. +/// Packed into `RB_COPY_CONTROL.copy_src_select` (bits [2:0]): 0..=3 pick +/// color0..3, 4 picks depth. Canary `registers.h:853`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ResolveSource { + Color(u8), + Depth, +} + +/// Resolve rectangle in pixel coordinates at the destination resolution, +/// 8-pixel aligned per Canary's `kResolveAlignmentPixels = 8`. 
MSAA scaling +/// is kept separate — `sample_count_log2_x/y` tell the resolve how many +/// samples to step per destination pixel. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct ResolveCoordinates { + pub x0: u32, + pub y0: u32, + pub width: u32, + pub height: u32, + /// 1 iff 4x MSAA (samples laid out 2x wider than pixels). + pub sample_count_log2_x: u32, + /// 1 iff 2x+ MSAA (samples laid out 2x taller than pixels). + pub sample_count_log2_y: u32, +} + +/// Decoded resolve state — describes how a `TILE_FLUSH` event should copy +/// EDRAM bytes to a guest-memory tiled texture. Canary equivalent: +/// `draw_util::ResolveInfo` at `draw_util.h:627`. Bit-field layout in +/// `RB_COPY_CONTROL / RB_COPY_DEST_INFO / RB_COPY_DEST_PITCH` comes from +/// `registers.h:853-897`. +#[derive(Debug, Clone, Copy)] +pub struct ResolveInfo { + /// Which source RT (0..=3=color, 4=depth). Raw register bits. + pub copy_src_select: u8, + /// Sample selector for MSAA sources. See `xenos::CopySampleSelect`. + pub copy_sample_select: u8, + /// Enable clear of the source render target after the copy. + pub color_clear_enable: bool, + pub depth_clear_enable: bool, + /// 0 = raw tile copy (same format), 1 = convert to `copy_dest_format`. + /// 2 = constantOne, 3 = null (no copy). + pub copy_command: u8, + /// Guest-memory destination address, already masked to the 29-bit + /// Xenon physical range (`& 0x1FFF_FFFF`). + pub dest_base: u32, + /// Destination pitch in pixels (0..=16383). Byte pitch = pitch * bpp + /// after the caller pitch-aligns to `kStoragePitchHeightAlignmentBlocks + /// = 32`. + pub dest_pitch_pixels: u32, + pub dest_height_pixels: u32, + /// Destination format (`xenos::ColorFormat`, 6 bits). + pub dest_format: u8, + /// Byte-swap mode applied before the write (`xenos::Endian128`, 0..=5). + pub dest_endian: u8, + /// Signed [-32, 31] exponent bias applied during conversion. + pub dest_exp_bias: i8, + /// Decoded resolve source (color0..3 or depth). 
+ pub source: ResolveSource, + /// 8-pixel-aligned resolve rectangle. + pub coords: ResolveCoordinates, + /// Source format: `ColorRenderTargetFormat` when color, + /// `DepthRenderTargetFormat` when depth. + pub source_format: u8, + /// EDRAM tile origin of the source RT (from `RB_COLOR_INFO.color_base` + /// or `RB_DEPTH_INFO.depth_base`, 11-bit mod 2048). + pub source_base_tiles: u16, + /// `GetSurfacePitchTiles(surface_pitch, msaa, is_64bpp)` — how many + /// 80-sample-wide tiles make up one EDRAM row. + pub surface_pitch_tiles: u32, + /// MSAA mode from `RB_SURFACE_INFO`. + pub msaa: crate::render_target_cache::MsaaSamples, + /// True iff the source color format is 64bpp (doubles EDRAM pitch/base). + pub source_is_64bpp: bool, + /// `RB_COLOR_CLEAR` — constant written into EDRAM when + /// `color_clear_enable` is set. + pub color_clear_value: u32, + /// `RB_COLOR_CLEAR_LO` — second 32-bit lane for 64bpp clear. + pub color_clear_value_lo: u32, + /// `RB_DEPTH_CLEAR` — constant written into EDRAM depth tiles on + /// `depth_clear_enable`. + pub depth_clear_value: u32, + /// `RB_COPY_DEST_INFO.copy_dest_array` — 2D (false) vs 3D/stacked (true). + pub copy_dest_array: bool, +} + +/// `GetSurfacePitchTiles(pitch_pixels, msaa, is_64bpp)` — ported from +/// `xenos.h:465-476`. Returns the number of 80-sample-wide EDRAM tiles +/// that make up one row of a surface with `pitch_pixels`-pixel pitch. +/// +/// At 4x MSAA samples span twice the pixel width, so the sample pitch +/// doubles. 64bpp formats pack two EDRAM tiles per color value, so the +/// effective tile pitch doubles again. 
+#[inline] +pub fn surface_pitch_tiles( + pitch_pixels: u32, + msaa: crate::render_target_cache::MsaaSamples, + is_64bpp: bool, +) -> u32 { + use crate::render_target_cache::MsaaSamples; + const EDRAM_TILE_WIDTH_SAMPLES: u32 = 80; + let pitch_samples = pitch_pixels << u32::from(msaa == MsaaSamples::X4); + let pitch_tiles = pitch_samples.div_ceil(EDRAM_TILE_WIDTH_SAMPLES); + pitch_tiles << u32::from(is_64bpp) +} + +/// Canary `ColorRenderTargetFormat` is 64bpp iff its numeric value is one +/// of {5, 7, 15} — i.e. `k_16_16_16_16`, `k_16_16_16_16_FLOAT`, or +/// `k_32_32_FLOAT`. `xenos.h:297-317` + the enum's `IsColorRenderTarget +/// Format64bpp` helper. +#[inline] +pub fn color_render_target_format_is_64bpp(fmt: u8) -> bool { + matches!(fmt, 5 | 7 | 15) +} + +/// `kResolveAlignmentPixels` from Canary (`draw_util.cc:925` area). +pub const RESOLVE_ALIGNMENT_PIXELS: u32 = 8; + +/// Clamp a raw resolve rectangle to the `PA_SC_WINDOW_SCISSOR_*` registers +/// and align to the 8-pixel grid. Caller passes `i32` because the VF0 +/// derivation can produce negative bounding-box values; this helper clamps +/// them to the non-negative window defined by the scissor. +/// +/// Returns `(x0, y0, width, height)` in pixels, all non-negative, all +/// 8-pixel-aligned, `width`/`height` already `>= 0`. Width/height of 0 +/// signals "empty resolve; skip". +pub fn resolve_rect_apply_scissor_and_align_8( + rf: &RegisterFile, + x0_in: i32, + y0_in: i32, + x1_in: i32, + y1_in: i32, +) -> (u32, u32, u32, u32) { + let tl = rf.read(reg::PA_SC_WINDOW_SCISSOR_TL); + let br = rf.read(reg::PA_SC_WINDOW_SCISSOR_BR); + let tl_x = (tl & 0x3FFF) as i32; + let tl_y = ((tl >> 16) & 0x3FFF) as i32; + let br_x = (br & 0x3FFF) as i32; + let br_y = ((br >> 16) & 0x3FFF) as i32; + + // Clamp only when the scissor is a non-degenerate window; otherwise + // leave the input rect alone (Canary's `kResolveAlignmentPixels` will + // still 8-align it below). 
+ let (mut x0, mut y0, mut x1, mut y1) = (x0_in, y0_in, x1_in, y1_in); + if br_x > tl_x && br_y > tl_y { + let clamp = |v: i32, lo: i32, hi: i32| v.max(lo).min(hi); + x0 = clamp(x0, tl_x, br_x); + y0 = clamp(y0, tl_y, br_y); + x1 = clamp(x1, tl_x, br_x); + y1 = clamp(y1, tl_y, br_y); + } + if x1 < x0 { + x1 = x0; + } + if y1 < y0 { + y1 = y0; + } + + // 8-pixel align. Floor top-left; ceil bottom-right. + let align_mask = (RESOLVE_ALIGNMENT_PIXELS as i32) - 1; + x0 &= !align_mask; + y0 &= !align_mask; + x1 = (x1 + align_mask) & !align_mask; + y1 = (y1 + align_mask) & !align_mask; + + let x0u = x0.max(0) as u32; + let y0u = y0.max(0) as u32; + let x1u = x1.max(0) as u32; + let y1u = y1.max(0) as u32; + ( + x0u, + y0u, + x1u.saturating_sub(x0u), + y1u.saturating_sub(y0u), + ) +} + +/// Parse vertex fetch constant 0 (Canary `xe_gpu_vertex_fetch_t`, +/// `xenos.h:1158-1172`) and derive the resolve bounding-box in pixel units. +/// Returns `None` when the fetch isn't the 6-float vertex buffer the +/// resolve shader expects (type != kVertex or size != 6). +/// +/// This mirrors `draw_util.cc:950-1014` minus window-offset and half-pixel +/// nudging — the pitfalls there are (a) handling endian via `GpuSwap` and +/// (b) Fixed16p8 top-left rounding `(v + 127) >> 8`. Both are replicated. +/// +/// The returned rect is in *pixel* coordinates, *pre-scissor-clamp* and +/// *pre-alignment*. Caller feeds it through +/// [`resolve_rect_apply_scissor_and_align_8`]. +pub fn vertex_fetch_0_rect( + rf: &RegisterFile, + mem: &dyn xenia_memory::access::MemoryAccess, +) -> Option<(i32, i32, i32, i32)> { + const CONST_BASE_FETCH: u32 = 0x4800; + let dword_0 = rf.read(CONST_BASE_FETCH); + let dword_1 = rf.read(CONST_BASE_FETCH + 1); + + // type:2 at bits [1:0]; kVertex = 3 per xenos.h:1147-1152. + let fetch_type = dword_0 & 0x3; + if fetch_type != 3 { + return None; + } + // size:24 at bits [25:2] of dword_1 — in dwords; expect 6 (3 × vec2). 
+ let size = (dword_1 >> 2) & 0x00FF_FFFF; + if size != 6 { + return None; + } + // address:30 at bits [31:2] of dword_0 — in dwords. + let address_bytes = dword_0 & 0xFFFF_FFFC; + // endian:2 at bits [1:0] of dword_1 — xenos::Endian (kNone/k8in16/k8in32/k16in32). + let fetch_endian = (dword_1 & 0x3) as u8; + + // Read 6 floats from guest memory. `mem.read_u32` stores BE bytes as a + // u32 value; to mirror Canary's "raw LE bytes → u32 → GpuSwap" we have + // to re-interpret the memory as LE (flipping what `read_u32` did). + let floats: [f32; 6] = std::array::from_fn(|i| { + let be_u32 = mem.read_u32(address_bytes.wrapping_add(i as u32 * 4)); + // `be_u32` was composed from bytes `[b0,b1,b2,b3]` as + // `(b0<<24)|...|b3`. Canary reads those same bytes in host-LE, + // producing `(b3<<24)|...|b0`. That's `be_u32.swap_bytes()`. + let canary_le = be_u32.swap_bytes(); + let swapped = gpu_swap_u32(canary_le, fetch_endian); + f32::from_bits(swapped) + }); + + // PA_SU_VTX_CNTL::pix_center: bit 0, 0 = kD3DZero (+0.5 half-pixel), 1 = kOpenGL (no offset). + // Register index 0x2083 per register_table.inc (PA_SU_VTX_CNTL). + const PA_SU_VTX_CNTL: u32 = 0x2083; + let half_pixel_offset = if rf.read(PA_SU_VTX_CNTL) & 1 == 0 { + 0.5f32 + } else { + 0.0f32 + }; + + // Convert each to Fixed16p8 (multiply by 256, round). + let fixed: [i32; 6] = std::array::from_fn(|i| { + ((floats[i] + half_pixel_offset) * 256.0).round() as i32 + }); + + let x0 = fixed[0].min(fixed[2]).min(fixed[4]); + let y0 = fixed[1].min(fixed[3]).min(fixed[5]); + let x1 = fixed[0].max(fixed[2]).max(fixed[4]); + let y1 = fixed[1].max(fixed[3]).max(fixed[5]); + + // Top-left rounding: `(v + 127) >> 8` for both corners. + let round = |v: i32| (v + 127) >> 8; + Some((round(x0), round(y0), round(x1), round(y1))) +} + +/// Canary `GpuSwapInline` on a u32. Exposed here so the vertex-fetch path +/// can apply the same byte-order transform Canary's `GpuSwap` applies +/// to vertex data. `xenos.h:1077-1114`. 
+#[inline] +fn gpu_swap_u32(value: u32, endian: u8) -> u32 { + match endian & 0x3 { + // kNone. + 0 => value, + // k8in16: swap bytes within each 16-bit word. + 1 => ((value & 0xFF00FF00) >> 8) | ((value & 0x00FF00FF) << 8), + // k8in32: full byte reversal. + 2 => value.swap_bytes(), + // k16in32: swap 16-bit halves. + _ => value.rotate_left(16), + } +} + +impl ResolveInfo { + /// Legacy entrypoint used when the caller already has a `DrawState`. It + /// fills only the narrow register bits that live in `DrawState` — the + /// wider coordinate / EDRAM fields require the full register file. + /// + /// Kept for tests that construct resolve decoders from captured draw + /// states. `from_register_file` is the canonical path. + pub fn from_draw_state(ds: &DrawState) -> Self { + use crate::render_target_cache::MsaaSamples; + let c = ds.rb_copy_control; + let p = ds.rb_copy_dest_pitch; + let i = ds.rb_copy_dest_info; + // Sign-extend the 6-bit exp_bias from `copy_dest_info[21:16]`. + let exp_raw = (i >> 16) & 0x3F; + let exp_sign = ((exp_raw & 0x20) != 0) as i8; + let exp_bias = (exp_raw as i8) - (exp_sign * 64); + let src_sel = (c & 0x7) as u8; + let source = if src_sel >= 4 { + ResolveSource::Depth + } else { + ResolveSource::Color(src_sel) + }; + Self { + copy_src_select: src_sel, + copy_sample_select: ((c >> 4) & 0x7) as u8, + color_clear_enable: ((c >> 8) & 1) != 0, + depth_clear_enable: ((c >> 9) & 1) != 0, + copy_command: ((c >> 20) & 0x3) as u8, + dest_base: ds.rb_copy_dest_base & 0x1FFF_FFFF, + dest_pitch_pixels: p & 0x3FFF, + dest_height_pixels: (p >> 16) & 0x3FFF, + dest_format: ((i >> 7) & 0x3F) as u8, + dest_endian: (i & 0x7) as u8, + dest_exp_bias: exp_bias, + source, + coords: ResolveCoordinates::default(), + source_format: 0, + source_base_tiles: 0, + surface_pitch_tiles: 0, + msaa: MsaaSamples::X1, + source_is_64bpp: false, + color_clear_value: 0, + color_clear_value_lo: 0, + depth_clear_value: 0, + copy_dest_array: ((i >> 3) & 1) != 0, + } + } + + /// 
Canonical resolve decoder — reads live register values and derives the + /// full rectangle / EDRAM layout. Mirrors canary `draw_util.cc:926-1318` + /// `GetResolveInfo` with the following simplifications (all scoped in + /// the landing plan and will be expanded as needs arise): + /// + /// * The rectangle is derived from the scissor window and + /// `RB_COPY_DEST_PITCH` rather than fetched from vertex fetch 0. + /// Sylpheed's splash uses a clear-resolve — there's no draw ahead + /// of it — so vertex-fetch-derived geometry is not available. + /// * `copy_sample_select` is kept as-is; sample averaging for 2x/4x + /// MSAA is not yet applied on the read side. + /// * `PA_SC_WINDOW_OFFSET` is not applied — not needed for Sylpheed + /// and canary only applies it when `PA_SU_SC_MODE_CNTL.vtx_window + /// _offset_enable` is set, which requires a live draw. + pub fn from_register_file(rf: &RegisterFile) -> Self { + use crate::render_target_cache::MsaaSamples; + let c = rf.read(reg::RB_COPY_CONTROL); + let i = rf.read(reg::RB_COPY_DEST_INFO); + let p = rf.read(reg::RB_COPY_DEST_PITCH); + let dest_base_raw = rf.read(reg::RB_COPY_DEST_BASE); + + // Sign-extend 6-bit exp_bias from copy_dest_info[21:16]. + let exp_raw = (i >> 16) & 0x3F; + let exp_sign = ((exp_raw & 0x20) != 0) as i8; + let exp_bias = (exp_raw as i8) - (exp_sign * 64); + + let src_sel = (c & 0x7) as u8; + let source = if src_sel >= 4 { + ResolveSource::Depth + } else { + ResolveSource::Color(src_sel & 0x3) + }; + + let rb_surface_info = rf.read(reg::RB_SURFACE_INFO); + let surface_pitch_pixels = rb_surface_info & 0x3FFF; + let msaa = MsaaSamples::from_raw((rb_surface_info >> 16) & 0x3); + + // Source format + base tiles depend on which RT we're reading. 
+ let (source_format, source_base_tiles, source_is_64bpp) = match source { + ResolveSource::Color(idx) => { + let rb = match idx { + 0 => rf.read(reg::RB_COLOR_INFO_0), + 1 => rf.read(reg::RB_COLOR_INFO_1), + 2 => rf.read(reg::RB_COLOR_INFO_2), + _ => rf.read(reg::RB_COLOR_INFO_3), + }; + let fmt = ((rb >> 16) & 0xF) as u8; + let base = (rb & 0xFFF) as u16; + (fmt, base, color_render_target_format_is_64bpp(fmt)) + } + ResolveSource::Depth => { + let rb = rf.read(reg::RB_DEPTH_INFO); + let fmt = ((rb >> 16) & 0x1) as u8; + let base = (rb & 0xFFF) as u16; + (fmt, base, false) + } + }; + + let pitch_tiles = surface_pitch_tiles(surface_pitch_pixels, msaa, source_is_64bpp); + + // --- Rectangle derivation --- + // Default extent is (0, 0, dest_pitch, dest_height); subject to + // scissor clamp + 8-pixel alignment. + let dest_pitch = p & 0x3FFF; + let dest_height = (p >> 16) & 0x3FFF; + let coords_no_msaa = resolve_rect_apply_scissor_and_align_8( + rf, + 0, + 0, + dest_pitch as i32, + dest_height as i32, + ); + let coords = ResolveCoordinates { + x0: coords_no_msaa.0, + y0: coords_no_msaa.1, + width: coords_no_msaa.2, + height: coords_no_msaa.3, + sample_count_log2_x: u32::from(msaa == MsaaSamples::X4), + sample_count_log2_y: u32::from(msaa != MsaaSamples::X1), + }; + + Self { + copy_src_select: src_sel, + copy_sample_select: ((c >> 4) & 0x7) as u8, + color_clear_enable: ((c >> 8) & 1) != 0, + depth_clear_enable: ((c >> 9) & 1) != 0, + copy_command: ((c >> 20) & 0x3) as u8, + dest_base: dest_base_raw & 0x1FFF_FFFF, + dest_pitch_pixels: dest_pitch, + dest_height_pixels: dest_height, + dest_format: ((i >> 7) & 0x3F) as u8, + dest_endian: (i & 0x7) as u8, + dest_exp_bias: exp_bias, + source, + coords, + source_format, + source_base_tiles, + surface_pitch_tiles: pitch_tiles, + msaa, + source_is_64bpp, + color_clear_value: rf.read(reg::RB_COLOR_CLEAR), + color_clear_value_lo: rf.read(reg::RB_COLOR_CLEAR_LO), + depth_clear_value: rf.read(reg::RB_DEPTH_CLEAR), + 
copy_dest_array: ((i >> 3) & 1) != 0, + } + } + + /// Memory-aware variant: if vertex fetch 0 contains the D3D9-hack + /// "resolve rectangle" vertices (3 vec2 floats, Canary `draw_util.cc + /// :950-1014`), use its bounding box as the resolve extent. Falls back + /// to the scissor + `RB_COPY_DEST_PITCH/HEIGHT` rect when VF0 isn't a + /// 6-dword vertex buffer. + /// + /// Used from the live TILE_FLUSH path; tests can stick with + /// `from_register_file` when they don't want to program VF0. + pub fn from_register_file_and_memory( + rf: &RegisterFile, + mem: &dyn xenia_memory::access::MemoryAccess, + ) -> Self { + let mut info = Self::from_register_file(rf); + if let Some((x0, y0, x1, y1)) = vertex_fetch_0_rect(rf, mem) { + let (rx0, ry0, rw, rh) = + resolve_rect_apply_scissor_and_align_8(rf, x0, y0, x1, y1); + // Only override when the VF0 rect is non-empty — an empty VF0 + // means the game hasn't set one up yet and we should keep the + // scissor+dest default. + if rw > 0 && rh > 0 { + info.coords.x0 = rx0; + info.coords.y0 = ry0; + info.coords.width = rw; + info.coords.height = rh; + } + } + info + } +} + +/// Register indices from `xenia-canary/src/xenia/gpu/registers.h`. Only what +/// the extractor reads is named here. 
+pub mod reg { + pub const VGT_DRAW_INITIATOR: u32 = 0x2281; + pub const VGT_DMA_BASE: u32 = 0x2282; + pub const VGT_DMA_SIZE: u32 = 0x2283; + pub const PA_CL_VPORT_XSCALE: u32 = 0x210F; + pub const PA_CL_VPORT_XOFFSET: u32 = 0x2110; + pub const PA_CL_VPORT_YSCALE: u32 = 0x2111; + pub const PA_CL_VPORT_YOFFSET: u32 = 0x2112; + pub const PA_CL_VPORT_ZSCALE: u32 = 0x2113; + pub const PA_CL_VPORT_ZOFFSET: u32 = 0x2114; + pub const PA_SC_WINDOW_SCISSOR_TL: u32 = 0x200E; + pub const PA_SC_WINDOW_SCISSOR_BR: u32 = 0x200F; + pub const RB_MODECONTROL: u32 = 0x2208; + pub const RB_SURFACE_INFO: u32 = 0x2000; + pub const RB_COLOR_INFO_0: u32 = 0x2001; + pub const RB_COLOR_INFO_1: u32 = 0x2010; + pub const RB_COLOR_INFO_2: u32 = 0x2011; + pub const RB_COLOR_INFO_3: u32 = 0x2012; + pub const RB_DEPTH_INFO: u32 = 0x2002; + pub const RB_COLORCONTROL: u32 = 0x2202; + pub const RB_DEPTHCONTROL: u32 = 0x2200; + // P4 additions — per-RT blend + stencil + window offset + resolve dst. + pub const RB_BLENDCONTROL_0: u32 = 0x2201; + pub const RB_BLENDCONTROL_1: u32 = 0x2209; + pub const RB_BLENDCONTROL_2: u32 = 0x220A; + pub const RB_BLENDCONTROL_3: u32 = 0x220B; + pub const RB_STENCILREFMASK: u32 = 0x210D; + pub const RB_STENCILREFMASK_BF: u32 = 0x210C; + pub const PA_SC_WINDOW_OFFSET: u32 = 0x2080; + pub const RB_COPY_CONTROL: u32 = 0x2318; + pub const RB_COPY_DEST_BASE: u32 = 0x2319; + pub const RB_COPY_DEST_PITCH: u32 = 0x231A; + pub const RB_COPY_DEST_INFO: u32 = 0x231B; + pub const RB_DEPTH_CLEAR: u32 = 0x231D; + pub const RB_COLOR_CLEAR: u32 = 0x231E; + pub const RB_COLOR_CLEAR_LO: u32 = 0x231F; +} + +/// Build a [`DrawState`] from a `VGT_DRAW_INITIATOR` value + the current +/// register file. `extra_dma_base`/`extra_dma_size` can override the +/// DMA fields if the caller has them from the PM4 packet payload (canary +/// passes them inline with `DRAW_INDX`). 
+pub fn extract( + register_file: &RegisterFile, + vgt_draw_initiator: u32, + dma_base: Option, + dma_size: Option, +) -> DrawState { + // `VGT_DRAW_INITIATOR` bit layout (per canary): + // [5:0] prim_type + // [7:6] source_select (0=DMA, 1=immediate, 2=auto) + // [8] index_size (0=16-bit, 1=32-bit) + // [31:16] num_indices + let prim_bits = vgt_draw_initiator & 0x3F; + let source_select = (vgt_draw_initiator >> 6) & 0x3; + let index_size_bit = (vgt_draw_initiator >> 8) & 0x1; + let num_indices = (vgt_draw_initiator >> 16) & 0xFFFF; + let index_size = if index_size_bit == 0 { + IndexSize::Sixteen + } else { + IndexSize::ThirtyTwo + }; + + let index_source = match source_select { + 0 => IndexSource::Dma { + base_address: dma_base.unwrap_or_else(|| register_file.read(reg::VGT_DMA_BASE)), + size_dwords: dma_size.unwrap_or_else(|| register_file.read(reg::VGT_DMA_SIZE)), + index_size, + }, + 1 => IndexSource::Immediate { index_size }, + _ => IndexSource::AutoIndex, + }; + + let f = |r: u32| f32::from_bits(register_file.read(r)); + let viewport = Viewport { + scale_x: f(reg::PA_CL_VPORT_XSCALE), + scale_y: f(reg::PA_CL_VPORT_YSCALE), + scale_z: f(reg::PA_CL_VPORT_ZSCALE), + offset_x: f(reg::PA_CL_VPORT_XOFFSET), + offset_y: f(reg::PA_CL_VPORT_YOFFSET), + offset_z: f(reg::PA_CL_VPORT_ZOFFSET), + }; + + let tl = register_file.read(reg::PA_SC_WINDOW_SCISSOR_TL); + let br = register_file.read(reg::PA_SC_WINDOW_SCISSOR_BR); + let scissor = Scissor { + tl_x: (tl & 0x7FFF) as u16, + tl_y: ((tl >> 16) & 0x7FFF) as u16, + br_x: (br & 0x7FFF) as u16, + br_y: ((br >> 16) & 0x7FFF) as u16, + }; + + let rb_modecontrol = register_file.read(reg::RB_MODECONTROL); + let color_mask = rb_modecontrol & 0xF; + let ci = |reg: u32, present: bool| { + if !present { + return None; + } + let raw = register_file.read(reg); + Some(ColorTargetInfo { + base_tiles: (raw & 0xFFF) as u16, + format: ((raw >> 16) & 0xF) as u8, + }) + }; + let color_info = [ + ci(reg::RB_COLOR_INFO_0, (color_mask & 0x1) != 
0), + ci(reg::RB_COLOR_INFO_1, (color_mask & 0x2) != 0), + ci(reg::RB_COLOR_INFO_2, (color_mask & 0x4) != 0), + ci(reg::RB_COLOR_INFO_3, (color_mask & 0x8) != 0), + ]; + let depth_raw = register_file.read(reg::RB_DEPTH_INFO); + // Depth-surface "present" = the RB_MODECONTROL depth-enable bit at bit 4. + let depth_present = (rb_modecontrol & 0x10) != 0; + let depth_info = if depth_present { + Some(DepthTargetInfo { + base_tiles: (depth_raw & 0xFFF) as u16, + format: ((depth_raw >> 16) & 0x1) as u8, + }) + } else { + None + }; + + DrawState { + primitive: PrimitiveType::from_bits(prim_bits), + vertex_count: num_indices, + index_source, + viewport, + scissor, + color_info, + depth_info, + rb_modecontrol, + rb_colorcontrol: register_file.read(reg::RB_COLORCONTROL), + rb_depthcontrol: register_file.read(reg::RB_DEPTHCONTROL), + rb_blendcontrol: [ + register_file.read(reg::RB_BLENDCONTROL_0), + register_file.read(reg::RB_BLENDCONTROL_1), + register_file.read(reg::RB_BLENDCONTROL_2), + register_file.read(reg::RB_BLENDCONTROL_3), + ], + rb_stencilrefmask: register_file.read(reg::RB_STENCILREFMASK), + rb_stencilrefmask_bf: register_file.read(reg::RB_STENCILREFMASK_BF), + pa_sc_window_offset: register_file.read(reg::PA_SC_WINDOW_OFFSET), + rb_copy_control: register_file.read(reg::RB_COPY_CONTROL), + rb_copy_dest_base: register_file.read(reg::RB_COPY_DEST_BASE), + rb_copy_dest_pitch: register_file.read(reg::RB_COPY_DEST_PITCH), + rb_copy_dest_info: register_file.read(reg::RB_COPY_DEST_INFO), + // P3b M1: the kernel-side caller is expected to populate these + // via `DrawState { ..extract(...), vs_blob_key, ps_blob_key }` so + // the pure-register extraction stays decoupled from `GpuSystem` + // state. Default to None so a bare `extract()` stays valid for + // unit tests. 
+ vs_blob_key: None, + ps_blob_key: None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn rf() -> RegisterFile { + RegisterFile::new() + } + + #[test] + fn extract_basic_triangle_list_no_rt() { + let rf = rf(); + // prim_type=4 (TriangleList), source=2 (auto), num_indices=6 + let vgt = (6u32 << 16) | (2 << 6) | 4; + let ds = extract(&rf, vgt, None, None); + assert_eq!(ds.primitive, PrimitiveType::TriangleList); + assert_eq!(ds.vertex_count, 6); + assert!(matches!(ds.index_source, IndexSource::AutoIndex)); + assert!(ds.color_info.iter().all(|c| c.is_none())); + assert!(ds.depth_info.is_none()); + } + + #[test] + fn extract_dma_indices_uses_override() { + let rf = rf(); + let vgt = (3u32 << 16) | (0 << 6) | 4; // prim=TriList, source=DMA + let ds = extract(&rf, vgt, Some(0xDEAD_0000), Some(6)); + match ds.index_source { + IndexSource::Dma { + base_address, + size_dwords, + index_size, + } => { + assert_eq!(base_address, 0xDEAD_0000); + assert_eq!(size_dwords, 6); + assert_eq!(index_size, IndexSize::Sixteen); + } + other => panic!("expected Dma, got {other:?}"), + } + } + + #[test] + fn color_and_depth_enabled_bits_are_honored() { + let mut rf = rf(); + // rb_modecontrol: color0 + depth enabled (bit0 + bit4) + rf.write(reg::RB_MODECONTROL, 0x11); + rf.write(reg::RB_COLOR_INFO_0, (2 << 16) | 0x64); // format=2, tile=0x64 + rf.write(reg::RB_DEPTH_INFO, (1 << 16) | 0x32); + let ds = extract(&rf, 4, None, None); + let c = ds.color_info[0].unwrap(); + assert_eq!(c.format, 2); + assert_eq!(c.base_tiles, 0x64); + let d = ds.depth_info.unwrap(); + assert_eq!(d.format, 1); + assert_eq!(d.base_tiles, 0x32); + } + + /// `RB_COPY_DEST_BASE` is a raw 32-bit register, but a Xenon physical + /// address is 29-bit (`& 0x1FFF_FFFF`). `ResolveInfo::from_register_file` + /// must mask before writes to prevent out-of-range memory accesses. 
+ #[test] + fn resolve_info_masks_dest_base_to_physical() { + let mut rf = rf(); + rf.write(reg::RB_COPY_DEST_BASE, 0xDEAD_BEEF); + let info = ResolveInfo::from_register_file(&rf); + assert_eq!(info.dest_base, 0x1EAD_BEEF); + } + + /// Scissor ∩ (0, 0, dest_pitch, dest_height), then 8-pixel-aligned per + /// Canary `kResolveAlignmentPixels`. Verify that the scissor actually + /// tightens the rect (not just degenerates it). + #[test] + fn resolve_info_derives_8px_aligned_rect_from_scissor_and_dest_pitch() { + let mut rf = rf(); + // Dest pitch/height 1280×720; scissor (5, 5) -> (1000, 717). + rf.write(reg::RB_COPY_DEST_PITCH, (720u32 << 16) | 1280u32); + rf.write(reg::PA_SC_WINDOW_SCISSOR_TL, (5u32 << 16) | 5u32); + rf.write(reg::PA_SC_WINDOW_SCISSOR_BR, (717u32 << 16) | 1000u32); + let info = ResolveInfo::from_register_file(&rf); + // x0 floors to 0 (was 5 -> &!7 = 0), y0 same. + // x1 = min(1280, 1000) = 1000; ceil-to-8 = 1000. y1 = min(720, 717) = 717, ceil = 720. + assert_eq!(info.coords.x0, 0); + assert_eq!(info.coords.y0, 0); + assert_eq!(info.coords.width, 1000); + assert_eq!(info.coords.height, 720); + } + + /// Non-degenerate scissor outside `dest_pitch/height` clamps to the + /// destination extent. + #[test] + fn resolve_info_scissor_cannot_widen_past_dest() { + let mut rf = rf(); + rf.write(reg::RB_COPY_DEST_PITCH, (16u32 << 16) | 16u32); + rf.write(reg::PA_SC_WINDOW_SCISSOR_BR, (1000u32 << 16) | 1000u32); + let info = ResolveInfo::from_register_file(&rf); + assert_eq!(info.coords.width, 16); + assert_eq!(info.coords.height, 16); + } + + /// Source decoding: `copy_src_select >= 4` → depth; otherwise Color(idx). 
+ #[test] + fn resolve_info_decodes_source_select() { + let mut rf = rf(); + rf.write(reg::RB_COPY_CONTROL, 2); // src_select = 2 (color2) + let info = ResolveInfo::from_register_file(&rf); + assert_eq!(info.source, ResolveSource::Color(2)); + assert_eq!(info.copy_src_select, 2); + + rf.write(reg::RB_COPY_CONTROL, 4); // src_select = 4 -> depth + let info = ResolveInfo::from_register_file(&rf); + assert_eq!(info.source, ResolveSource::Depth); + } + + /// `copy_dest_info` fields: endian (bits 2:0), format (bits 12:7), + /// exp_bias (bits 21:16, signed 6-bit), array (bit 3). + #[test] + fn resolve_info_decodes_copy_dest_info_fields() { + let mut rf = rf(); + // endian=2 (k8in32), format=6 (k_8_8_8_8), exp_bias=-1 (0x3F), array=1 + let val = 2u32 | (1u32 << 3) | (6u32 << 7) | (0x3Fu32 << 16); + rf.write(reg::RB_COPY_DEST_INFO, val); + let info = ResolveInfo::from_register_file(&rf); + assert_eq!(info.dest_endian, 2); + assert_eq!(info.dest_format, 6); + assert_eq!(info.dest_exp_bias, -1); + assert!(info.copy_dest_array); + } + + /// Positive and negative exp_bias round-trip through the 6-bit + /// sign-extension. + #[test] + fn resolve_info_exp_bias_sign_extends() { + let mut rf = rf(); + rf.write(reg::RB_COPY_DEST_INFO, 1u32 << 16); // exp_bias = +1 + assert_eq!(ResolveInfo::from_register_file(&rf).dest_exp_bias, 1); + rf.write(reg::RB_COPY_DEST_INFO, 0x20u32 << 16); // exp_bias = -32 + assert_eq!(ResolveInfo::from_register_file(&rf).dest_exp_bias, -32); + rf.write(reg::RB_COPY_DEST_INFO, 0x1Fu32 << 16); // exp_bias = +31 + assert_eq!(ResolveInfo::from_register_file(&rf).dest_exp_bias, 31); + } + + /// `RB_SURFACE_INFO`: surface_pitch (bits 13:0) and msaa_samples (bits 17:16) + /// feed `surface_pitch_tiles`. 1280 px divides by 80 exactly → 16 tiles + /// at 1x MSAA / 32bpp; 4x MSAA doubles the sample pitch. 
+ #[test] + fn resolve_info_computes_surface_pitch_tiles() { + let mut rf = rf(); + rf.write(reg::RB_COPY_CONTROL, 0); // color0 + rf.write(reg::RB_COLOR_INFO_0, 0u32 << 16); // k_8_8_8_8 -> 32bpp + rf.write(reg::RB_SURFACE_INFO, 1280); // msaa=1x, pitch=1280 + let info = ResolveInfo::from_register_file(&rf); + assert_eq!(info.surface_pitch_tiles, 16); + assert!(!info.source_is_64bpp); + + // 4x MSAA widens the sample pitch by 2x. + rf.write(reg::RB_SURFACE_INFO, 1280 | (2u32 << 16)); + let info = ResolveInfo::from_register_file(&rf); + assert_eq!(info.surface_pitch_tiles, 32); + + // Non-aligned pitch rounds up. + rf.write(reg::RB_SURFACE_INFO, 1281); + let info = ResolveInfo::from_register_file(&rf); + assert_eq!(info.surface_pitch_tiles, 17); + } + + /// `color_render_target_format_is_64bpp` matches the 64bpp enum values + /// in `xenos::ColorRenderTargetFormat`: k_16_16_16_16 (5), + /// k_16_16_16_16_FLOAT (7), k_32_32_FLOAT (15). + #[test] + fn color_format_64bpp_table_is_correct() { + assert!(!color_render_target_format_is_64bpp(0)); + assert!(!color_render_target_format_is_64bpp(4)); + assert!(color_render_target_format_is_64bpp(5)); + assert!(!color_render_target_format_is_64bpp(6)); + assert!(color_render_target_format_is_64bpp(7)); + assert!(!color_render_target_format_is_64bpp(14)); + assert!(color_render_target_format_is_64bpp(15)); + } + + /// `surface_pitch_tiles` helper: exact arithmetic including the 64bpp + /// doubling. `xenos.h:465-476`. + #[test] + fn surface_pitch_tiles_matches_canary_helper() { + use crate::render_target_cache::MsaaSamples; + // 80 px, 1x, 32bpp -> 1 tile exactly. + assert_eq!(surface_pitch_tiles(80, MsaaSamples::X1, false), 1); + // 81 px, 1x, 32bpp -> 2 tiles (round up). + assert_eq!(surface_pitch_tiles(81, MsaaSamples::X1, false), 2); + // 80 px, 1x, 64bpp -> 2 tiles (64bpp doubles). + assert_eq!(surface_pitch_tiles(80, MsaaSamples::X1, true), 2); + // 80 px, 2x, 32bpp -> 1 tile (2x MSAA doesn't widen X). 
+ assert_eq!(surface_pitch_tiles(80, MsaaSamples::X2, false), 1); + // 80 px, 4x, 32bpp -> 2 tiles (4x MSAA widens X 2x). + assert_eq!(surface_pitch_tiles(80, MsaaSamples::X4, false), 2); + // 80 px, 4x, 64bpp -> 4 tiles. + assert_eq!(surface_pitch_tiles(80, MsaaSamples::X4, true), 4); + } + + /// The color-source branch reads from `RB_COLOR_INFO_` based on + /// `copy_src_select`. Verify that index-3 color targets are addressed. + #[test] + fn resolve_info_color_source_selects_correct_color_info() { + let mut rf = rf(); + rf.write(reg::RB_COPY_CONTROL, 3); // color3 + rf.write(reg::RB_COLOR_INFO_3, (5u32 << 16) | 0x123); // k_16_16_16_16, base=0x123 + let info = ResolveInfo::from_register_file(&rf); + assert_eq!(info.source, ResolveSource::Color(3)); + assert_eq!(info.source_format, 5); + assert_eq!(info.source_base_tiles, 0x123); + assert!(info.source_is_64bpp); + } + + /// Depth-source branch reads from `RB_DEPTH_INFO` and parses its + /// 1-bit format. + #[test] + fn resolve_info_depth_source_reads_depth_info() { + let mut rf = rf(); + rf.write(reg::RB_COPY_CONTROL, 4); // depth + rf.write(reg::RB_DEPTH_INFO, (1u32 << 16) | 0x55); // kD24FS8, base=0x55 + let info = ResolveInfo::from_register_file(&rf); + assert_eq!(info.source, ResolveSource::Depth); + assert_eq!(info.source_format, 1); + assert_eq!(info.source_base_tiles, 0x55); + assert!(!info.source_is_64bpp); // depth always 32bpp + } + + // ---- Vertex fetch 0 rectangle tests ------------------------------- + + /// Helper: seed a triangle covering the rectangle `(x0, y0) → (x1, y1)` + /// into guest memory at `vb_addr` and program VF0 to read 6 dwords + /// from it with endian = k8in32 (the standard D3D-vertex-buffer case). 
+ fn seed_vertex_fetch_0( + rf: &mut RegisterFile, + mem: &xenia_memory::GuestMemory, + vb_addr: u32, + x0: f32, + y0: f32, + x1: f32, + y1: f32, + ) { + use xenia_memory::MemoryAccess; + // Three (x, y) float pairs covering the rect — exactly the D3D9 + // resolve triangle layout Canary expects. + // (x0, y0), (x1, y0), (x0, y1) + let floats = [x0, y0, x1, y0, x0, y1]; + for (i, f) in floats.iter().enumerate() { + // Write float as BE (PPC `stfs` semantics). `mem.write_u32` + // already stores BE bytes; pass the raw u32 bit pattern. + mem.write_u32( + vb_addr + i as u32 * 4, + f.to_bits(), + ); + } + + // VF0 dword 0: address (bits 31:2, in dwords) + type (bits 1:0 = 3). + let addr_dwords = vb_addr / 4; + let dword_0 = (addr_dwords << 2) | 3; + // VF0 dword 1: size (bits 25:2 = 6) + endian (bits 1:0 = 2 = k8in32). + let dword_1 = (6u32 << 2) | 2; + rf.write(0x4800, dword_0); + rf.write(0x4801, dword_1); + } + + fn fresh_mem_for_vf0() -> xenia_memory::GuestMemory { + use xenia_memory::page_table::MemoryProtect; + let mut mem = xenia_memory::GuestMemory::new().expect("guest memory"); + mem.alloc( + 0x5000_0000, + 0x1_0000, + MemoryProtect::READ | MemoryProtect::WRITE, + ) + .expect("alloc"); + mem + } + + #[test] + fn vf0_rect_returns_none_when_no_vertex_buffer() { + let rf = rf(); + let mem = fresh_mem_for_vf0(); + assert!(vertex_fetch_0_rect(&rf, &mem).is_none()); + } + + #[test] + fn vf0_rect_returns_none_for_wrong_size() { + let mut rf = rf(); + let mem = fresh_mem_for_vf0(); + // type=3 (kVertex), size=4 (wrong — should be 6), endian=2. + rf.write(0x4800, (0x5000_0000u32) | 3); + rf.write(0x4801, (4u32 << 2) | 2); + assert!(vertex_fetch_0_rect(&rf, &mem).is_none()); + } + + #[test] + fn vf0_rect_derives_rectangle_from_three_vertices() { + let mut rf = rf(); + let mut mem = fresh_mem_for_vf0(); + // D3D9 pixel center: +0.5 half-pixel offset applied before Fixed16p8. + // Leave PA_SU_VTX_CNTL at 0 (kD3DZero). 
+ // Triangle at (0, 0) → (100, 50) → vertex 2 = (0, 50). + seed_vertex_fetch_0(&mut rf, &mut mem, 0x5000_0000, 0.0, 0.0, 100.0, 50.0); + + let (x0, y0, x1, y1) = vertex_fetch_0_rect(&rf, &mem).expect("VF0 present"); + // (0 + 0.5) * 256 = 128. (128 + 127) >> 8 = 0. So x0/y0 = 0. + // (100 + 0.5) * 256 = 25728. (25728 + 127) >> 8 = 100. + // (50 + 0.5) * 256 = 12928. (12928 + 127) >> 8 = 50. + assert_eq!(x0, 0); + assert_eq!(y0, 0); + assert_eq!(x1, 100); + assert_eq!(y1, 50); + } + + #[test] + fn from_register_file_and_memory_prefers_vf0_rect() { + let mut rf = rf(); + let mut mem = fresh_mem_for_vf0(); + // Without VF0: dest_pitch/height defaults produce (0, 0, 1280, 720). + rf.write(reg::RB_COPY_DEST_PITCH, (720u32 << 16) | 1280u32); + // With VF0 pointing at a 256×128 triangle, override to that. + seed_vertex_fetch_0(&mut rf, &mut mem, 0x5000_0000, 0.0, 0.0, 256.0, 128.0); + + let info = ResolveInfo::from_register_file_and_memory(&rf, &mem); + assert_eq!(info.coords.x0, 0); + assert_eq!(info.coords.y0, 0); + assert_eq!(info.coords.width, 256); + assert_eq!(info.coords.height, 128); + } + + /// If VF0 is absent, fall back to the scissor+dest default. + #[test] + fn from_register_file_and_memory_falls_back_without_vf0() { + let mut rf = rf(); + let mem = fresh_mem_for_vf0(); + rf.write(reg::RB_COPY_DEST_PITCH, (720u32 << 16) | 1280u32); + let info = ResolveInfo::from_register_file_and_memory(&rf, &mem); + assert_eq!(info.coords.width, 1280); + assert_eq!(info.coords.height, 720); + } + + /// `resolve_rect_apply_scissor_and_align_8` with no scissor just + /// 8-aligns. + #[test] + fn scissor_helper_8_aligns_with_no_scissor() { + let rf = rf(); + let (x0, y0, w, h) = resolve_rect_apply_scissor_and_align_8(&rf, 5, 5, 1001, 17); + assert_eq!(x0, 0); + assert_eq!(y0, 0); + // 1001 ceil-to-8 = 1008; 17 ceil-to-8 = 24. 
+ assert_eq!(w, 1008); + assert_eq!(h, 24); + } + + /// Negative bounding-box (VF0 can produce these) clamps to the scissor + /// top-left without going below zero. + #[test] + fn scissor_helper_clamps_negative_to_zero() { + let mut rf = rf(); + // Small scissor at (0,0)..(128, 64). + rf.write(reg::PA_SC_WINDOW_SCISSOR_BR, (64u32 << 16) | 128u32); + let (x0, y0, w, h) = resolve_rect_apply_scissor_and_align_8(&rf, -50, -50, 80, 32); + assert_eq!(x0, 0); + assert_eq!(y0, 0); + // x1 clamped from 80 -> 80, ceil8 -> 80. y1 32 -> 32. + assert_eq!(w, 80); + assert_eq!(h, 32); + } +} diff --git a/crates/xenia-gpu/src/edram.rs b/crates/xenia-gpu/src/edram.rs new file mode 100644 index 0000000..720569a --- /dev/null +++ b/crates/xenia-gpu/src/edram.rs @@ -0,0 +1,506 @@ +//! CPU-side shadow of the Xenos GPU's 10 MiB EDRAM. +//! +//! The real console has 10 MiB of embedded DRAM organised as 2048 tiles, +//! each 80 × 16 samples wide at 32 bits per sample (`xenos.h:223-285`, +//! `kEdramTileCount = 2048`). 64-bpp formats pack two adjacent EDRAM tiles +//! per color value. +//! +//! xenia-rs does not currently render through a real EDRAM (host draws go +//! straight to wgpu attachments), but the resolve path still needs a +//! concrete byte source. We keep a linear 10 MiB `Vec` here so: +//! +//! * clear-resolves can paint `RB_COLOR_CLEAR` / `RB_DEPTH_CLEAR` into the +//! source tiles, which the resolve loop then copies into guest memory +//! (this is the Sylpheed-first-pixels path); +//! * future host→EDRAM readback code has a place to deposit pixels without +//! touching the resolve API. +//! +//! Byte layout inside one tile: row-major, `80 * 16 * bpp` bytes. At 32bpp, +//! offset `= y * 80 * 4 + x * 4` from the tile base. Samples are stored in +//! native-u32 byte order; any Xenon big-endian vs little-endian shuffling +//! happens at the resolve write boundary, not inside EDRAM. +//! +//! Indexing wraps mod 2048 (`XE_GPU_REGISTER` `RB_COLOR_INFO.color_base` is +//! 11-bit). 
/// Number of tiles in EDRAM. `xenos::kEdramTileCount`.
pub const EDRAM_TILE_COUNT: u32 = 2048;

/// Samples per tile along X. `xenos::kEdramTileWidthSamples`.
pub const EDRAM_TILE_WIDTH_SAMPLES: u32 = 80;

/// Samples per tile along Y. `xenos::kEdramTileHeightSamples`.
pub const EDRAM_TILE_HEIGHT_SAMPLES: u32 = 16;

/// Bytes per tile at 32bpp: 80 × 16 × 4 = 5120.
pub const EDRAM_TILE_BYTES_32BPP: u32 =
    EDRAM_TILE_WIDTH_SAMPLES * EDRAM_TILE_HEIGHT_SAMPLES * 4;

/// Bytes per tile at 64bpp: 80 × 16 × 8 = 10_240 (two adjacent 32bpp tiles).
pub const EDRAM_TILE_BYTES_64BPP: u32 = EDRAM_TILE_BYTES_32BPP * 2;

/// Total EDRAM size in bytes: 2048 × 5120 = 10_485_760 (exactly 10 MiB).
pub const EDRAM_SIZE_BYTES: usize = (EDRAM_TILE_COUNT * EDRAM_TILE_BYTES_32BPP) as usize;

/// 10 MiB shadow of the console's EDRAM. Owned by `GpuSystem` and lives for
/// the lifetime of the GPU; no per-frame allocation.
pub struct ShadowEdram {
    bytes: Vec<u8>,
}

impl Default for ShadowEdram {
    fn default() -> Self {
        Self::new()
    }
}

impl ShadowEdram {
    /// Allocates a zero-filled shadow buffer of [`EDRAM_SIZE_BYTES`] bytes.
    pub fn new() -> Self {
        Self {
            bytes: vec![0u8; EDRAM_SIZE_BYTES],
        }
    }

    /// Raw byte offset of a tile within the shadow buffer, wrapped mod 2048.
    #[inline]
    fn tile_byte_offset(tile_index: u32) -> usize {
        ((tile_index % EDRAM_TILE_COUNT) * EDRAM_TILE_BYTES_32BPP) as usize
    }

    /// The whole shadow buffer as raw bytes.
    pub fn as_bytes(&self) -> &[u8] {
        &self.bytes
    }

    /// The [`EDRAM_TILE_BYTES_32BPP`] bytes of tile `tile_index` (wrapped
    /// mod 2048).
    pub fn tile(&self, tile_index: u32) -> &[u8] {
        let off = Self::tile_byte_offset(tile_index);
        &self.bytes[off..off + EDRAM_TILE_BYTES_32BPP as usize]
    }

    /// Mutable counterpart of [`Self::tile`].
    pub fn tile_mut(&mut self, tile_index: u32) -> &mut [u8] {
        let off = Self::tile_byte_offset(tile_index);
        &mut self.bytes[off..off + EDRAM_TILE_BYTES_32BPP as usize]
    }

    /// Sample-space byte offset within the shadow buffer for one 32bpp
    /// sample at `(x, y)` in a surface whose EDRAM origin is `base_tiles`
    /// and whose row pitch is `pitch_tiles` 32bpp tiles. Returns `None` for
    /// a zero pitch.
    ///
    /// Tile layout: a surface of pitch `P` tiles is laid out as a row of
    /// `P` tiles followed by the next 16-sample-tall row, etc. Sample
    /// `(x, y)` lives in tile `(y/16)*P + (x/80)`, at row `y % 16` and
    /// column `x % 80` within that tile.
    #[inline]
    fn sample_offset_32bpp(base_tiles: u16, pitch_tiles: u32, x: u32, y: u32) -> Option<usize> {
        if pitch_tiles == 0 {
            return None;
        }
        let tile_row = y / EDRAM_TILE_HEIGHT_SAMPLES;
        let tile_col = x / EDRAM_TILE_WIDTH_SAMPLES;
        let within_y = y % EDRAM_TILE_HEIGHT_SAMPLES;
        let within_x = x % EDRAM_TILE_WIDTH_SAMPLES;
        // The tile index is reduced mod 2048 by `tile_byte_offset`, and
        // 2^32 is a multiple of 2048, so wrapping u32 arithmetic yields the
        // exact result while never panicking on overflow for a large
        // `tile_row * pitch_tiles` product.
        let tile_index = u32::from(base_tiles)
            .wrapping_add(tile_row.wrapping_mul(pitch_tiles))
            .wrapping_add(tile_col);
        // At most (15 * 80 + 79) * 4 = 5116, so this cannot overflow.
        let within_tile = (within_y * EDRAM_TILE_WIDTH_SAMPLES + within_x) * 4;
        Some(Self::tile_byte_offset(tile_index) + within_tile as usize)
    }

    /// Byte range of one 32bpp sample inside `self.bytes`, or `None` when
    /// the pitch is zero or the range would fall outside the buffer.
    #[inline]
    fn sample_range_32bpp(
        &self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
    ) -> Option<std::ops::Range<usize>> {
        let start = Self::sample_offset_32bpp(base_tiles, pitch_tiles, x, y)?;
        let end = start.checked_add(4)?;
        (end <= self.bytes.len()).then(|| start..end)
    }

    /// Addressing for the 64bpp helpers: `(base, pitch, x)` of the *low*
    /// 32bpp word, with all three doubled. The high word sits at `x + 1`,
    /// which cannot overflow because the doubled `x` is even. Returns `None`
    /// when `x` is too large to double.
    #[inline]
    fn doubled_addressing(
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
    ) -> Option<(u16, u32, u32)> {
        let x_lo = x.checked_mul(2)?;
        Some((
            // Truncation to u16 is harmless: 65536 is a multiple of 2048.
            base_tiles.wrapping_mul(2),
            pitch_tiles_32bpp.saturating_mul(2),
            x_lo,
        ))
    }

    /// Fill a `(w × h)`-sample rectangle at `(x, y)` with a constant 32bpp
    /// pattern. Coordinates are in *sample space* (already scaled through
    /// `sample_count_log2_x/y` for MSAA). Wraps mod 2048 tiles via
    /// `tile_byte_offset`. Samples whose coordinate would overflow `u32`
    /// are skipped.
    ///
    /// The pattern is written as host-native little-endian bytes — the
    /// endian swap in [`crate::resolve::apply_endian_128`] converts to the
    /// byte order expected by the destination.
    #[allow(clippy::too_many_arguments)]
    pub fn fill_rect_32bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        pattern: u32,
    ) {
        if w == 0 || h == 0 {
            return;
        }
        for dy in 0..h {
            let sy = match y.checked_add(dy) {
                Some(sy) => sy,
                None => break,
            };
            for dx in 0..w {
                let sx = match x.checked_add(dx) {
                    Some(sx) => sx,
                    None => break,
                };
                self.write_sample_32bpp(base_tiles, pitch_tiles, sx, sy, pattern);
            }
        }
    }

    /// Read one 32bpp sample at `(x, y)` in sample coordinates. Returns 0
    /// if the surface pitch is zero (degenerate; caller should skip the
    /// resolve).
    pub fn read_sample_32bpp(
        &self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
    ) -> u32 {
        self.sample_range_32bpp(base_tiles, pitch_tiles, x, y)
            .map_or(0, |range| {
                let mut word = [0u8; 4];
                word.copy_from_slice(&self.bytes[range]);
                u32::from_le_bytes(word)
            })
    }

    /// Write one 32bpp sample at `(x, y)` in sample coordinates. Mirror of
    /// [`Self::read_sample_32bpp`]; a zero pitch makes this a no-op. Used by
    /// the wgpu→ShadowEdram readback retile path and unit tests.
    pub fn write_sample_32bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
        sample: u32,
    ) {
        if let Some(range) = self.sample_range_32bpp(base_tiles, pitch_tiles, x, y) {
            self.bytes[range].copy_from_slice(&sample.to_le_bytes());
        }
    }

    /// Bulk write a `(w × h)`-sample rectangle at `(x, y)` from a row-major
    /// linear `samples` buffer. The buffer length must be at least `w * h`
    /// (checked with `debug_assert!`); extra entries are ignored, and in
    /// release builds a short buffer simply truncates the write. Order:
    /// `samples[dy * w + dx]` lands at `(x + dx, y + dy)`. This is the
    /// format the wgpu→ShadowEdram readback path uses after stripping wgpu's
    /// 256-byte row alignment.
    #[allow(clippy::too_many_arguments)]
    pub fn write_rect_32bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        samples: &[u32],
    ) {
        if w == 0 || h == 0 {
            return;
        }
        let needed = (w as usize).saturating_mul(h as usize);
        debug_assert!(samples.len() >= needed, "write_rect_32bpp: samples too short");
        // One chunk per source row; `zip` stops at `h` rows or when the
        // buffer runs out, whichever comes first.
        for (dy, row) in (0..h).zip(samples.chunks(w as usize)) {
            let sy = match y.checked_add(dy) {
                Some(sy) => sy,
                None => break,
            };
            for (dx, &sample) in (0..w).zip(row) {
                let sx = match x.checked_add(dx) {
                    Some(sx) => sx,
                    None => break,
                };
                self.write_sample_32bpp(base_tiles, pitch_tiles, sx, sy, sample);
            }
        }
    }

    // --- 64bpp helpers ----------------------------------------------------
    //
    // 64bpp formats (`k_16_16_16_16`, `k_16_16_16_16_FLOAT`, `k_32_32_FLOAT`)
    // occupy two adjacent EDRAM tiles per logical tile, doubling the row
    // pitch in tiles. Per Canary `xenos.h:321-325 IsColorRenderTargetFormat64bpp`
    // and `draw_util.cc:1260-1262` (`pitch_tiles = surface_pitch_tiles << is_64bpp`).
    //
    // Convention: callers pass the *32bpp-equivalent* `base_tiles` and
    // `pitch_tiles_32bpp` (i.e. the `RB_COLOR_INFO.color_base` and
    // `surface_pitch_tiles` decoded from registers). The 64bpp helpers
    // multiply both by 2 internally so the lo/hi pair lands in adjacent
    // tiles. `lo` is the lower-addressed 32bpp word; `hi` is the upper.

    /// Read one 64bpp sample as `(lo, hi)` u32 pair. Doubled-tile addressing
    /// per Canary's `is_64bpp` convention. Returns `(0, 0)` when `x` is too
    /// large to be doubled.
    pub fn read_sample_64bpp(
        &self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
    ) -> (u32, u32) {
        match Self::doubled_addressing(base_tiles, pitch_tiles_32bpp, x) {
            Some((base64, pitch64, x_lo)) => (
                self.read_sample_32bpp(base64, pitch64, x_lo, y),
                self.read_sample_32bpp(base64, pitch64, x_lo + 1, y),
            ),
            None => (0, 0),
        }
    }

    /// Write one 64bpp sample as `(lo, hi)` u32 pair. A no-op when `x` is
    /// too large to be doubled.
    pub fn write_sample_64bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
        lo: u32,
        hi: u32,
    ) {
        if let Some((base64, pitch64, x_lo)) =
            Self::doubled_addressing(base_tiles, pitch_tiles_32bpp, x)
        {
            self.write_sample_32bpp(base64, pitch64, x_lo, y, lo);
            self.write_sample_32bpp(base64, pitch64, x_lo + 1, y, hi);
        }
    }

    /// Bulk write a 64bpp rectangle from a row-major `(lo, hi)` linear
    /// buffer. A buffer shorter than `w * h` truncates the write.
    #[allow(clippy::too_many_arguments)]
    pub fn write_rect_64bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        samples: &[(u32, u32)],
    ) {
        if w == 0 || h == 0 {
            return;
        }
        for (dy, row) in (0..h).zip(samples.chunks(w as usize)) {
            let sy = match y.checked_add(dy) {
                Some(sy) => sy,
                None => break,
            };
            for (dx, &(lo, hi)) in (0..w).zip(row) {
                let sx = match x.checked_add(dx) {
                    Some(sx) => sx,
                    None => break,
                };
                self.write_sample_64bpp(base_tiles, pitch_tiles_32bpp, sx, sy, lo, hi);
            }
        }
    }

    /// Fill a `(w × h)`-sample rectangle with a constant 64bpp pattern.
    /// `lo` lands at the low-addressed 32bpp word, `hi` at the high one
    /// — i.e. for clears, callers pass `(lo = RB_COLOR_CLEAR_LO,
    /// hi = RB_COLOR_CLEAR)` per Canary `draw_util.cc:1302-1303`.
    #[allow(clippy::too_many_arguments)]
    pub fn fill_rect_64bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        lo: u32,
        hi: u32,
    ) {
        if w == 0 || h == 0 {
            return;
        }
        for dy in 0..h {
            let sy = match y.checked_add(dy) {
                Some(sy) => sy,
                None => break,
            };
            for dx in 0..w {
                let sx = match x.checked_add(dx) {
                    Some(sx) => sx,
                    None => break,
                };
                self.write_sample_64bpp(base_tiles, pitch_tiles_32bpp, sx, sy, lo, hi);
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn shadow_edram_is_exactly_10_mib() {
        assert_eq!(EDRAM_SIZE_BYTES, 10 * 1024 * 1024);
        let e = ShadowEdram::new();
        assert_eq!(e.as_bytes().len(), 10 * 1024 * 1024);
    }

    #[test]
    fn fill_rect_writes_the_whole_first_tile() {
        let mut e = ShadowEdram::new();
        e.fill_rect_32bpp(0, 1, 0, 0, 80, 16, 0x11223344);
        // Every 4-byte sample in tile 0 should be 0x11223344 (LE).
        let expected = 0x11223344u32.to_le_bytes();
        let tile = e.tile(0);
        for chunk in tile.chunks_exact(4) {
            assert_eq!(chunk, expected);
        }
    }

    #[test]
    fn fill_rect_respects_pitch_and_base() {
        let mut e = ShadowEdram::new();
        // Surface: pitch=2 tiles, base=5. A 160x16 fill should land in
        // tiles 5 and 6 — and leave tile 4 / tile 7 / tile 0 untouched.
        e.fill_rect_32bpp(5, 2, 0, 0, 160, 16, 0xAABBCCDD);
        let expected = 0xAABBCCDDu32.to_le_bytes();
        for chunk in e.tile(5).chunks_exact(4) {
            assert_eq!(chunk, expected);
        }
        for chunk in e.tile(6).chunks_exact(4) {
            assert_eq!(chunk, expected);
        }
        assert!(e.tile(4).iter().all(|&b| b == 0));
        assert!(e.tile(7).iter().all(|&b| b == 0));
        assert!(e.tile(0).iter().all(|&b| b == 0));
    }

    #[test]
    fn fill_rect_wraps_mod_2048() {
        let mut e = ShadowEdram::new();
        // base=2047, pitch=2: first tile is 2047, second wraps to 0.
        e.fill_rect_32bpp(2047, 2, 0, 0, 160, 16, 0xDEAD_BEEF);
        let expected = 0xDEAD_BEEFu32.to_le_bytes();
        for chunk in e.tile(2047).chunks_exact(4) {
            assert_eq!(chunk, expected);
        }
        for chunk in e.tile(0).chunks_exact(4) {
            assert_eq!(chunk, expected);
        }
    }

    #[test]
    fn read_sample_roundtrips_fill_rect() {
        let mut e = ShadowEdram::new();
        e.fill_rect_32bpp(3, 1, 0, 0, 80, 16, 0xCAFE_F00D);
        // Sample the two opposite corners of the filled tile.
        assert_eq!(e.read_sample_32bpp(3, 1, 0, 0), 0xCAFE_F00D);
        assert_eq!(e.read_sample_32bpp(3, 1, 79, 15), 0xCAFE_F00D);
        // Untouched neighbouring tile.
        assert_eq!(e.read_sample_32bpp(4, 1, 0, 0), 0);
    }

    #[test]
    fn zero_pitch_is_a_noop_read() {
        let e = ShadowEdram::new();
        assert_eq!(e.read_sample_32bpp(0, 0, 10, 10), 0);
    }

    /// `write_sample_32bpp` round-trips through `read_sample_32bpp`.
    #[test]
    fn write_sample_32bpp_round_trips() {
        let mut e = ShadowEdram::new();
        for x in 0..80u32 {
            for y in 0..16u32 {
                e.write_sample_32bpp(0, 1, x, y, 0xABCD_0000 | (y << 8) | x);
            }
        }
        for x in 0..80u32 {
            for y in 0..16u32 {
                assert_eq!(
                    e.read_sample_32bpp(0, 1, x, y),
                    0xABCD_0000 | (y << 8) | x,
                    "round-trip mismatch at ({x},{y})"
                );
            }
        }
    }

    /// `write_rect_32bpp` writes row-major samples into the right
    /// sample-offsets, including across tile boundaries.
    #[test]
    fn write_rect_32bpp_crosses_tile_boundary() {
        let mut e = ShadowEdram::new();
        // Surface pitch = 2 tiles → x in [0, 160), y in [0, 16). A 100x4
        // rect at (40, 4) crosses x=80 (tile boundary).
        let w = 100u32;
        let h = 4u32;
        let mut samples = Vec::with_capacity((w * h) as usize);
        for dy in 0..h {
            for dx in 0..w {
                samples.push(0x10000 | (dy << 8) | dx);
            }
        }
        e.write_rect_32bpp(0, 2, 40, 4, w, h, &samples);
        // Spot-check both corners: (40, 4) lands in tile 0, and the far
        // corner (139, 7) lands in tile 1.
        assert_eq!(e.read_sample_32bpp(0, 2, 40, 4), 0x1_0000);
        assert_eq!(
            e.read_sample_32bpp(0, 2, 139, 7),
            0x10000 | (3 << 8) | 99
        );
    }

    /// `read_sample_64bpp` round-trips through `write_sample_64bpp` —
    /// doubled-pitch addressing keeps lo/hi adjacent in EDRAM bytes.
    #[test]
    fn write_read_sample_64bpp_roundtrips() {
        let mut e = ShadowEdram::new();
        // 32bpp pitch=1, base=0 → doubled pitch=2, base=0. A 16x4 logical
        // 64bpp region keeps the test small.
        for x in 0..16u32 {
            for y in 0..4u32 {
                e.write_sample_64bpp(0, 1, x, y, 0xAAAA_0000 | x, 0xBBBB_0000 | y);
            }
        }
        for x in 0..16u32 {
            for y in 0..4u32 {
                let (lo, hi) = e.read_sample_64bpp(0, 1, x, y);
                assert_eq!(lo, 0xAAAA_0000 | x);
                assert_eq!(hi, 0xBBBB_0000 | y);
            }
        }
    }

    /// `fill_rect_64bpp` writes both the lo and hi clear words across
    /// a 64bpp surface — matches the `RB_COLOR_CLEAR_LO`/`RB_COLOR_CLEAR`
    /// convention.
    #[test]
    fn fill_rect_64bpp_writes_both_words() {
        let mut e = ShadowEdram::new();
        // 16x4 logical 64bpp samples; 32bpp pitch=1 → doubled pitch=2.
        e.fill_rect_64bpp(0, 1, 0, 0, 16, 4, 0xCAFE_F00D, 0xDEAD_BEEF);
        for x in 0..16u32 {
            for y in 0..4u32 {
                let (lo, hi) = e.read_sample_64bpp(0, 1, x, y);
                assert_eq!(lo, 0xCAFE_F00D);
                assert_eq!(hi, 0xDEAD_BEEF);
            }
        }
    }

    /// 64bpp helpers must respect the doubled tile pitch — adjacent logical
    /// 64bpp samples must land at adjacent 32bpp samples in EDRAM.
    #[test]
    fn sixty_four_bpp_uses_doubled_pitch() {
        let mut e = ShadowEdram::new();
        e.write_sample_64bpp(0, 1, 5, 0, 0x1111_1111, 0x2222_2222);
        // The lo word must sit at 32bpp x=10 (5 << 1), hi at x=11.
        // Doubled pitch -> base=0, pitch=2 32bpp.
        assert_eq!(e.read_sample_32bpp(0, 2, 10, 0), 0x1111_1111);
        assert_eq!(e.read_sample_32bpp(0, 2, 11, 0), 0x2222_2222);
    }

    /// `write_rect_*` with empty dimensions is a no-op.
    #[test]
    fn write_rect_empty_is_noop() {
        let mut e = ShadowEdram::new();
        e.write_rect_32bpp(0, 1, 0, 0, 0, 5, &[1, 2, 3]);
        e.write_rect_32bpp(0, 1, 0, 0, 5, 0, &[1, 2, 3]);
        e.fill_rect_64bpp(0, 1, 0, 0, 0, 5, 1, 2);
        e.fill_rect_64bpp(0, 1, 0, 0, 5, 0, 1, 2);
        // Nothing should have been written.
        assert!(e.as_bytes().iter().all(|&b| b == 0));
    }
}
/// PM4 `WAIT_REG_MEM` condition. Bits 0..=2 of the wait_info dword encode the
/// comparison (per `xenia-canary/src/xenia/gpu/pm4_command_processor_implement.h:685-696`).
///
/// The encoding starts at `0 = Never`; the comparisons occupy `1..=6` and
/// `7 = Always`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WaitCmp {
    /// Encoding 0: the condition never holds.
    Never,
    /// Encoding 1: value < ref
    Less,
    /// Encoding 2: value <= ref
    LessEq,
    /// Encoding 3: value == ref
    Equal,
    /// Encoding 4: value != ref
    NotEqual,
    /// Encoding 5: value >= ref
    GreaterEq,
    /// Encoding 6: value > ref
    Greater,
    /// Encoding 7: the condition always holds.
    Always,
}

impl WaitCmp {
    /// Interpret the lower 3 bits of `wait_info` per canary's `MatchValueAndRef`.
    /// All higher bits are ignored.
    pub fn from_wait_info(wait_info: u32) -> Self {
        match wait_info & 0x7 {
            0 => WaitCmp::Never,
            1 => WaitCmp::Less,
            2 => WaitCmp::LessEq,
            3 => WaitCmp::Equal,
            4 => WaitCmp::NotEqual,
            5 => WaitCmp::GreaterEq,
            6 => WaitCmp::Greater,
            // Only 7 remains after masking with 0x7.
            _ => WaitCmp::Always,
        }
    }

    /// Apply the comparison to `value` against `reference`. Any masking of
    /// `value` is the caller's job.
    pub fn evaluate(self, value: u32, reference: u32) -> bool {
        match self {
            WaitCmp::Never => false,
            WaitCmp::Less => value < reference,
            WaitCmp::LessEq => value <= reference,
            WaitCmp::Equal => value == reference,
            WaitCmp::NotEqual => value != reference,
            WaitCmp::GreaterEq => value >= reference,
            WaitCmp::Greater => value > reference,
            WaitCmp::Always => true,
        }
    }
}
/// GPU interrupt (fired by `PM4_INTERRUPT`). The kernel dispatches this to
/// the guest callback registered by `VdSetGraphicsInterruptCallback`.
#[derive(Debug, Clone, Copy)]
pub struct PendingInterrupt {
    pub source: InterruptSource,
    pub cpu_mask: u32,
}

/// What raised a [`PendingInterrupt`].
#[derive(Debug, Clone, Copy)]
pub enum InterruptSource {
    CommandProcessor,
    Swap,
}

/// Per-run counters for observability.
#[derive(Debug, Clone, Default)]
pub struct GpuStats {
    pub packets_executed: u64,
    pub draws_seen: u64,
    pub swaps_seen: u64,
    pub interrupts_emitted: u64,
    pub wait_reg_mem_blocks: u64,
    pub indirect_buffer_jumps: u64,
    /// P4: count of EDRAM→memory resolves triggered by `TILE_FLUSH` events
    /// (event code 15). Non-zero means the game is committing rendered
    /// pixels to the frontbuffer / a texture.
    pub resolves_total: u64,
    /// Resolves whose byte copy path ran and wrote at least one sample to
    /// guest memory. Delta against `resolves_total` indicates how many
    /// resolves were skipped for an unsupported format / MSAA mode / 3D
    /// destination.
    pub resolves_copied_total: u64,
    /// Resolves that were skipped by [`crate::resolve::copy_to_memory`] due
    /// to an unsupported format path. Logged at `warn` so the reason is
    /// visible.
    pub resolves_skipped_total: u64,
    /// Total number of 32bpp samples written into guest memory across all
    /// successful resolves. Useful for sanity-checking that a big splash
    /// frame actually made it out (e.g. 1280×720 = 921_600 samples).
    pub resolve_samples_written: u64,
    /// P4: unique render-target keys seen (as managed by the internal
    /// `RenderTargetCache`). Useful HUD metric for multi-target workloads.
    pub unique_render_targets: u64,
}

/// Result of one packet step.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExecOutcome {
    /// Consumed one packet; GPU remains Ready.
    Stepped { dwords_consumed: u32 },
    /// Nothing to do right now.
    Idle,
    /// Parked on a sync primitive; `GpuSystem::pending_block` has details.
    Blocked,
}

// Register-file base addresses of the constant regions and of the general
// register block. Among these, the shader fetch constants are uploaded by the
// game via `PM4_SET_CONSTANT` type=1 into a 256-dword region (`0x4800` up to
// `0x4900`); games then reference them by index when binding textures /
// vertex buffers.
pub const CONST_BASE_ALU: u32 = 0x4000;
pub const CONST_BASE_FETCH: u32 = 0x4800;
pub const CONST_BASE_BOOL: u32 = 0x4900;
pub const CONST_BASE_LOOP: u32 = 0x4908;
pub const CONST_BASE_REGISTERS: u32 = 0x2000;

/// Atomic mailbox for the handful of GPU registers that CROSS the MMIO
/// boundary. Guests write into the `0x7FC80000` register aperture; those
/// writes run through [`crate::mmio_region`] and land in these atomics.
/// Inside `execute_one` / the scheduler's per-round GPU hook we sample them
/// to sync `ring.write_offset_dwords`, reflect progress back to the guest,
/// etc.
///
/// Only the registers listed here need atomic cross-thread access (plus the
/// worker wake flag and thread handle). Everything else lives in
/// [`GpuSystem::register_file`] which is CPU-thread-local.
///
/// Every field is behind an `Arc`, so `clone()` yields a second handle to
/// the *same* cells rather than a copy of their values.
#[derive(Debug, Clone)]
pub struct GpuMmio {
    /// `CP_RB_WPTR` — guest writes dword offset of the write pointer.
    pub cp_rb_wptr: Arc<AtomicU32>,
    /// `CP_RB_RPTR` — we mirror our internal `ring.read_offset_dwords` here
    /// so guests polling the register see progress.
    pub cp_rb_rptr: Arc<AtomicU32>,
    /// `CP_INT_STATUS` — bit set when an interrupt is pending.
    pub cp_int_status: Arc<AtomicU32>,
    /// `CP_INT_ACK` — guest clears the bit after handling.
    pub cp_int_ack: Arc<AtomicU32>,
    /// `D1MODE_VBLANK_VLINE_STATUS` (register `0x1951`, MMIO offset `0x6544`).
    /// Bit 0 = `VBLANK_INT_OCCURRED` — set by the GPU when vblank fires,
    /// cleared by the guest via write-1-to-clear. Sylpheed's vsync callback
    /// gates *all* its work on reading bit 0 as set: `lwz; rlwinm. r,r,0,31,31;
    /// bc 12,2,skip`. Without this bit toggling across vsyncs the callback
    /// always skips, so the PKEVENT that feeds the render dispatcher
    /// (user_data + 0x3B28) never gets signaled and the worker loops
    /// forever.
    pub d1mode_vblank_vline_status: Arc<AtomicU32>,
    /// M1.7 parker — set by producers (guest WPTR writes, shutdown) so
    /// the GPU worker thread does not park when work is pending. The
    /// worker swaps to `false` on entering its park decision and
    /// re-checks predicates; if a producer raced between the swap and
    /// the actual `park_timeout`, the producer's `unpark()` returns the
    /// park immediately via std's token semantics. Inline mode never
    /// reads this; the cost is one extra atomic store per WPTR write.
    pub wake_pending: Arc<AtomicBool>,
    /// Handle to the GPU worker thread, populated by `GpuWorker::run` on
    /// startup. The MMIO write callback for `CP_RB_WPTR` `unpark()`s it
    /// after every guest WPTR write so the worker proceeds without
    /// waiting for its `park_timeout`. `None` in inline mode (no worker
    /// to wake), in which case the unpark site is a one-mutex-lock
    /// no-op.
    // NOTE(review): the type parameters were lost in the text this was
    // recovered from; `Option<std::thread::Thread>` is reconstructed from the
    // `unpark()` usage described above — confirm against the real file.
    pub worker_thread: Arc<std::sync::Mutex<Option<std::thread::Thread>>>,
}

impl GpuMmio {
    /// Fresh mailbox: every register reads 0, no wake is pending and no
    /// worker thread is registered.
    pub fn new() -> Self {
        Self {
            cp_rb_wptr: Arc::new(AtomicU32::new(0)),
            cp_rb_rptr: Arc::new(AtomicU32::new(0)),
            cp_int_status: Arc::new(AtomicU32::new(0)),
            cp_int_ack: Arc::new(AtomicU32::new(0)),
            d1mode_vblank_vline_status: Arc::new(AtomicU32::new(0)),
            wake_pending: Arc::new(AtomicBool::new(false)),
            worker_thread: Arc::new(std::sync::Mutex::new(None)),
        }
    }
}

impl Default for GpuMmio {
    fn default() -> Self {
        Self::new()
    }
}
+ pub shader_blobs: HashMap, + /// P8 — FIFO of blob keys for bounded eviction. On `IM_LOAD*` the + /// new key is pushed to the back; if the blob count exceeds + /// [`SHADER_BLOB_CAP`], the front is popped and removed from + /// `shader_blobs`. Prevents long-running guests from growing the + /// cache without bound. The two *active* keys (`active_vs_key` + + /// `active_ps_key`) are never evicted — safeguard in `evict_oldest`. + pub shader_blob_order: std::collections::VecDeque, + /// Monotonic frame counter (bumped on `PM4_XE_SWAP`). + pub swap_counter: u64, + /// Most recent swap notification; the kernel polls this after `execute_one` + /// to decide whether to push a UI swap event. + pub last_swap: Option, + /// Queue of interrupts not yet delivered to the guest. Private so that + /// callers go through [`Self::take_pending_interrupts`] — M1 step 6 + /// then redirects this drain into a `crossbeam_channel::Sender` without + /// re-touching every call site. + pending_interrupts: Vec, + /// Current stall reason, if any. + pub pending_block: Option, + pub stats: GpuStats, + /// For the 64-bit bin mask/select we split hi/lo writes. + pub bin_mask: u64, + pub bin_select: u64, + /// Shared-atomic mailbox for the `0x7FC80000` MMIO aperture. Cloned into + /// the [`crate::mmio_region`] callbacks; clone-ownership keeps the bus + /// side and the executor side in sync without locks. + pub mmio: GpuMmio, + /// Most recent draw state extracted at a `PM4_DRAW_INDX*` packet. The + /// uber-shader pipeline in P3+ reads this to build its wgpu draw call. + pub last_draw: Option, + /// Most recent processed primitive — index rewrite + host topology + /// decision. Separate from `last_draw` because its `rewritten_indices` + /// may be large and callers may want to drop it after consumption. + pub last_primitive: Option, + /// Key in `shader_blobs` of the currently-active vertex shader. 
Set by + /// `PM4_IM_LOAD[_IMMEDIATE]` for `ShaderType::kVertex` and read at + /// `PM4_DRAW_INDX*` time so the host side can upload the matching + /// microcode bytes before dispatching. + pub active_vs_key: Option, + /// Key in `shader_blobs` of the currently-active pixel shader. Set by + /// `PM4_IM_LOAD[_IMMEDIATE]` for `ShaderType::kPixel`. + pub active_ps_key: Option, + /// P4: EDRAM tile-ownership bookkeeping + per-key RT descriptors. Updated + /// at each `PM4_DRAW_INDX*` (`bind` + `claim_tiles`) and queried by + /// `TILE_FLUSH` event handling to decide resolve sources. + pub rt_cache: crate::render_target_cache::RenderTargetCache, + /// P4: most recent `ResolveInfo` observed when `TILE_FLUSH` fired. The UI + /// bridge surfaces this in the HUD so users can tell when a game is + /// resolving to the frontbuffer versus an off-screen target. + pub last_resolve: Option, + /// P5: CPU-side decoded-texture cache (shared across draws within a + /// frame; trimmed implicitly by insertion). `ensure_cached` hits this + /// on every texture-fetch resolution; the UI thread sees the decoded + /// bytes via `UiBridge::publish_texture`. + pub texture_cache: crate::texture_cache::TextureCache, + /// 10 MiB shadow of the Xenos EDRAM. Written by clear-resolves and + /// (future) host-render-target readback; read by the resolve byte-copy + /// path that writes tiled pixels into guest memory. Allocated once at + /// `GpuSystem::new` and lives for the whole GPU lifetime — no + /// per-frame churn. 
+ pub edram: crate::edram::ShadowEdram, +} + +impl GpuSystem { + pub fn new() -> Self { + Self { + register_file: RegisterFile::new(), + ring: RingBufferView::new(), + ib_stack: Vec::new(), + shader_blobs: HashMap::new(), + shader_blob_order: std::collections::VecDeque::with_capacity(SHADER_BLOB_CAP + 1), + swap_counter: 0, + last_swap: None, + pending_interrupts: Vec::new(), + pending_block: None, + stats: GpuStats::default(), + bin_mask: 0, + bin_select: 0, + mmio: GpuMmio::new(), + last_draw: None, + last_primitive: None, + active_vs_key: None, + active_ps_key: None, + rt_cache: crate::render_target_cache::RenderTargetCache::new(), + last_resolve: None, + texture_cache: crate::texture_cache::TextureCache::new(), + edram: crate::edram::ShadowEdram::new(), + } + } + + /// P8 — insert a shader blob + bump the FIFO so long-running games + /// don't grow `shader_blobs` without bound. Caps at [`SHADER_BLOB_CAP`]. + /// Never evicts the currently-active VS/PS blobs (if they ended up at + /// the front of the queue, we skip past them). + fn insert_shader_blob(&mut self, key: u32, blob: ShaderBlob) { + let already_present = self.shader_blobs.contains_key(&key); + self.shader_blobs.insert(key, blob); + if !already_present { + self.shader_blob_order.push_back(key); + metrics::counter!("gpu.shader.blob_seen").increment(1); + } + while self.shader_blobs.len() > SHADER_BLOB_CAP { + // Pop the oldest key that isn't one of the active ones. + let mut evicted = None; + for _ in 0..self.shader_blob_order.len() { + if let Some(candidate) = self.shader_blob_order.pop_front() { + if Some(candidate) == self.active_vs_key + || Some(candidate) == self.active_ps_key + { + self.shader_blob_order.push_back(candidate); + continue; + } + self.shader_blobs.remove(&candidate); + evicted = Some(candidate); + break; + } + } + if evicted.is_some() { + metrics::counter!("gpu.shader.blob_evicted").increment(1); + } else { + // All remaining blobs are active — can't evict, stop. 
+ break; + } + } + } + + /// P4: event dispatch helper called by every `PM4_EVENT_WRITE*` variant. + /// `event_code` is the low 6 bits of the initiator word (see canary's + /// `xenos::Event` enum — code 15 is `TILE_FLUSH`, the resolve trigger). + /// + /// Handle a `PM4_EVENT_WRITE*` initiator. For `TILE_FLUSH` (event 15) + /// we decode the live `RB_*` register state into a [`ResolveInfo`], + /// paint any clear values into the shadow EDRAM, and then copy bytes + /// from the source render target into guest memory at + /// `RB_COPY_DEST_BASE`. That last step is what `VdSwap` needs to see + /// non-empty front-buffer pixels — see `memory/project_xenia_rs_edram + /// _resolve_gap.md` for the history of this path. + fn handle_event_initiator(&mut self, event_code: u32, mem: &dyn MemoryAccess) { + const EVENT_TILE_FLUSH: u32 = 15; + if event_code != EVENT_TILE_FLUSH { + return; + } + let info = draw_state::ResolveInfo::from_register_file_and_memory( + &self.register_file, + mem, + ); + self.stats.resolves_total += 1; + metrics::counter!( + "gpu.resolve", + "src" => format!("{}", info.copy_src_select), + "fmt" => format!("{}", info.dest_format), + "cmd" => format!("{}", info.copy_command), + ) + .increment(1); + tracing::info!( + src = info.copy_src_select, + dst_base = format_args!("{:#010x}", info.dest_base), + w = info.coords.width, + h = info.coords.height, + pitch = info.dest_pitch_pixels, + fmt = info.dest_format, + endian = info.dest_endian, + clear_color = info.color_clear_enable, + clear_depth = info.depth_clear_enable, + "gpu: TILE_FLUSH resolve" + ); + + // Paint clear values into the shadow EDRAM at the source tile + // range *before* the copy. Games often issue a clear-then-resolve + // as a single TILE_FLUSH: the EDRAM is filled with `RB_COLOR_CLEAR` + // by the clear part, and that's what the copy part reads. + // + // Sample coordinates are pixel coordinates scaled up by + // `sample_count_log2_x/y` — for 1x MSAA (Sylpheed) this is the + // identity. 
+ if info.color_clear_enable + && let draw_state::ResolveSource::Color(_) = info.source + && info.surface_pitch_tiles > 0 + { + let sx = info.coords.x0 << info.coords.sample_count_log2_x; + let sy = info.coords.y0 << info.coords.sample_count_log2_y; + let sw = info.coords.width << info.coords.sample_count_log2_x; + let sh = info.coords.height << info.coords.sample_count_log2_y; + // 64bpp clears use both `RB_COLOR_CLEAR_LO` (low 32 bits) and + // `RB_COLOR_CLEAR` (high 32 bits) per Canary `draw_util.cc:1302-1303`. + // 32bpp clears ignore the lo word entirely. + if info.source_is_64bpp { + self.edram.fill_rect_64bpp( + info.source_base_tiles, + info.surface_pitch_tiles, + sx, + sy, + sw, + sh, + info.color_clear_value_lo, + info.color_clear_value, + ); + } else { + self.edram.fill_rect_32bpp( + info.source_base_tiles, + info.surface_pitch_tiles, + sx, + sy, + sw, + sh, + info.color_clear_value, + ); + } + } + if info.depth_clear_enable && info.surface_pitch_tiles > 0 { + let sx = info.coords.x0 << info.coords.sample_count_log2_x; + let sy = info.coords.y0 << info.coords.sample_count_log2_y; + let sw = info.coords.width << info.coords.sample_count_log2_x; + let sh = info.coords.height << info.coords.sample_count_log2_y; + // Depth tiles live at RB_DEPTH_INFO.depth_base regardless of + // which source this resolve selects. + let rb_depth_info = self.register_file.read(draw_state::reg::RB_DEPTH_INFO); + let depth_base = (rb_depth_info & 0xFFF) as u16; + self.edram.fill_rect_32bpp( + depth_base, + info.surface_pitch_tiles, + sx, + sy, + sw, + sh, + info.depth_clear_value, + ); + } + + // Byte copy into guest memory. 
+ let stats = crate::resolve::copy_to_memory(&info, &self.edram, mem); + if stats.supported && stats.samples_written > 0 { + self.stats.resolves_copied_total += 1; + self.stats.resolve_samples_written += stats.samples_written as u64; + } else if !stats.supported { + self.stats.resolves_skipped_total += 1; + } + + self.last_resolve = Some(info); + } + + /// Sync state with the MMIO atomic mailbox. Call once at the top of the + /// per-round GPU hook: the guest may have advanced `CP_RB_WPTR` since + /// we last ran, and we in turn reflect our read-pointer back to the + /// mirror register so the guest sees progress. + pub fn sync_with_mmio(&mut self) { + let wptr_dwords = self.mmio.cp_rb_wptr.load(Ordering::Relaxed); + if wptr_dwords != self.ring.write_offset_dwords && self.ring.size_dwords != 0 { + self.ring.write_offset_dwords = wptr_dwords % self.ring.size_dwords; + } + // Mirror our read pointer. + self.mmio + .cp_rb_rptr + .store(self.ring.read_offset_dwords, Ordering::Relaxed); + } + + /// True iff `execute_one` is expected to make progress without blocking. + pub fn is_ready(&self, mem: &dyn MemoryAccess) -> bool { + if let Some(block) = &self.pending_block { + return block.is_satisfied(mem, &self.register_file); + } + self.ring.has_pending() + } + + /// Execute exactly one PM4 packet. Returns [`ExecOutcome::Idle`] when + /// there's nothing to do, [`ExecOutcome::Blocked`] if a sync primitive + /// stalls the GPU, otherwise [`ExecOutcome::Stepped`] with the number of + /// dwords consumed (counting the header). + pub fn execute_one(&mut self, mem: &dyn MemoryAccess) -> ExecOutcome { + // 0) If currently parked, probe the condition and either wake up or stay blocked. 
+ if let Some(block) = self.pending_block.clone() { + if block.is_satisfied(mem, &self.register_file) { + tracing::debug!(?block, "gpu: wait satisfied — resuming"); + self.pending_block = None; + } else { + return ExecOutcome::Blocked; + } + } + if !self.ring.has_pending() { + // End of current ring. If we were inside an indirect buffer, pop + // and resume the caller. + if let Some(caller) = self.ib_stack.pop() { + self.ring = caller; + if self.ring.has_pending() { + return self.execute_one(mem); + } + } + return ExecOutcome::Idle; + } + let header_addr = self.ring.addr_at_offset(0).unwrap(); + let header_word = mem.read_u32(header_addr); + let packet = pm4::decode(header_word); + tracing::trace!( + header = format_args!("{header_word:#010x}"), + addr = format_args!("{header_addr:#010x}"), + ?packet.kind, + "gpu: packet" + ); + let consumed = match packet.kind { + PacketKind::Type0 { base_index, count, write_one } => { + self.handle_type0(mem, base_index, count, write_one, packet.total_dwords) + } + PacketKind::Type1 { reg_index_1, reg_index_2 } => { + self.handle_type1(mem, reg_index_1, reg_index_2) + } + PacketKind::Type2 => 1, + PacketKind::Type3 { + opcode, + count, + predicated, + } => match self.handle_type3(mem, opcode, count, predicated, packet.total_dwords) { + Type3Result::Consumed(n) => n, + Type3Result::Blocked { rewind_to_header } => { + // Re-park on this packet so the resume path re-reads it. + if rewind_to_header { + // We haven't moved read ptr yet, so this is a no-op — + // documented to keep intent explicit. + } + return ExecOutcome::Blocked; + } + }, + }; + self.ring.advance_read(consumed); + self.writeback_read_ptr(mem); + self.stats.packets_executed += 1; + ExecOutcome::Stepped { + dwords_consumed: consumed, + } + } + + /// First-Pixels M2b — kernel-side commit point for `VdSwap`. 
Prior to + /// M2b the kernel's [`vd_swap`] wrote a synthetic `PM4_XE_SWAP` packet + /// at the guest-provided `buffer_ptr` + advanced our ring `wptr` by 64 + /// dwords, expecting the drain to pick it up. That mechanism misaligned: + /// the drain reads from `ring.base + rptr * 4` forward, not from the + /// game's out-of-band `buffer_ptr`. 512 ring packets executed through + /// 1 B guest instructions but `swaps_seen` stayed at 0. + /// + /// `VdSwap` is the kernel's commit point by definition — we don't need + /// to launder the event through the ring. Call this directly from the + /// kernel-side handler; the `PM4_XE_SWAP` opcode path still works for + /// the (rare) case of a game that emits the packet through its own ring + /// writes. + pub fn notify_xe_swap(&mut self, frontbuffer_phys: u32, width: u32, height: u32) { + self.stats.swaps_seen += 1; + self.swap_counter = self.swap_counter.wrapping_add(1); + self.last_swap = Some(SwapNotification { + frame_index: self.swap_counter, + frontbuffer_phys, + width, + height, + }); + self.pending_interrupts.push(PendingInterrupt { + source: InterruptSource::Swap, + cpu_mask: 0x1, + }); + tracing::info!( + frame = self.swap_counter, + fb = format_args!("{frontbuffer_phys:#010x}"), + width, + height, + "gpu: XE_SWAP (kernel-direct)" + ); + } + + /// Called by `VdInitializeRingBuffer` to give us the primary ring. + pub fn initialize_ring_buffer(&mut self, base: u32, size_log2: u32) { + let size_bytes = 1u32 << size_log2.min(31); + self.ring.base = base; + self.ring.size_dwords = size_bytes / 4; + self.ring.read_offset_dwords = 0; + // `write_offset` is driven by the guest — start at 0 so the ring + // appears empty until MMIO writes advance it. 
+ self.ring.write_offset_dwords = 0; + tracing::info!( + base = format_args!("{base:#010x}"), + size_bytes, + size_dwords = self.ring.size_dwords, + "gpu: ring initialized" + ); + } + + /// Called by `VdEnableRingBufferRPtrWriteBack` to record where the guest + /// expects us to mirror `read_offset_dwords`. + pub fn enable_rptr_writeback(&mut self, addr: u32, block_log2: u32) { + self.ring.rptr_writeback_addr = addr; + self.ring.rptr_writeback_block_dwords = 1u32 << block_log2.min(31); + tracing::info!( + addr = format_args!("{addr:#010x}"), + block_dwords = self.ring.rptr_writeback_block_dwords, + "gpu: rptr writeback enabled" + ); + } + + /// Drain the pending-interrupt queue. The kernel calls this once per + /// scheduler round and queues each entry into `interrupts.queue_interrupt`. + /// + /// M1 step 6 swaps the `Vec`-backed implementation for a + /// `crossbeam_channel::Sender`. Routing every external + /// reader through this single accessor in step 2 means that swap is a + /// localized change — no call site changes. + /// + /// Returns the previously-queued interrupts and leaves the internal queue + /// empty. Cheap (`Vec::take`); no allocation when the queue is already + /// empty. + pub fn take_pending_interrupts(&mut self) -> Vec { + std::mem::take(&mut self.pending_interrupts) + } + + /// True when the pending-interrupt queue has at least one entry. Used + /// by callers that want to short-circuit an empty drain (saving the + /// `Vec::new()` allocation that `take` would otherwise force on every + /// scheduler round). + pub fn has_pending_interrupts(&self) -> bool { + !self.pending_interrupts.is_empty() + } + + /// Extend the logical write pointer by `dwords` (cumulative). `VdSwap` + /// reserves 64 dwords then calls this; MMIO writes to `CP_RB_WPTR` will + /// do the same in P2+. 
+ pub fn extend_write_ptr(&mut self, new_wptr_dwords: u32) { + if self.ring.size_dwords == 0 { + return; + } + self.ring.write_offset_dwords = new_wptr_dwords % self.ring.size_dwords; + } + + /// Write the current read pointer back to the guest-registered + /// address. M1.8 uses the fenced variant: when the GPU runs on its + /// own host thread, the CPU can poll this RPTR mirror to learn how + /// far the GPU has consumed the ring; the Release fence ensures any + /// upstream packet effects (memory writes, register file updates + /// the guest reads via subsequent MMIO) happen-before the + /// CPU-visible RPTR bump. + fn writeback_read_ptr(&mut self, mem: &dyn MemoryAccess) { + if self.ring.rptr_writeback_addr != 0 && self.ring.is_initialized() { + mem.write_u32_fence( + self.ring.rptr_writeback_addr, + self.ring.read_offset_dwords, + ); + } + } + + // ── Type-0/1 handlers ───────────────────────────────────────────────── + + fn handle_type0( + &mut self, + mem: &dyn MemoryAccess, + base_index: u32, + count: u32, + write_one: bool, + total_dwords: u32, + ) -> u32 { + for i in 0..count { + let dword_addr = self.ring.addr_at_offset(1 + i).unwrap(); + let value = mem.read_u32(dword_addr); + let target = if write_one { base_index } else { base_index + i }; + self.register_file.write(target, value); + } + tracing::trace!( + base = format_args!("{base_index:#x}"), + count, + write_one, + "gpu: Type0 reg write run" + ); + total_dwords + } + + fn handle_type1( + &mut self, + mem: &dyn MemoryAccess, + reg_index_1: u32, + reg_index_2: u32, + ) -> u32 { + let a_addr = self.ring.addr_at_offset(1).unwrap(); + let b_addr = self.ring.addr_at_offset(2).unwrap(); + let a = mem.read_u32(a_addr); + let b = mem.read_u32(b_addr); + self.register_file.write(reg_index_1, a); + self.register_file.write(reg_index_2, b); + tracing::trace!( + r1 = format_args!("{reg_index_1:#x}"), + r2 = format_args!("{reg_index_2:#x}"), + "gpu: Type1 dual reg write" + ); + 3 + } + + // ── Type-3 dispatch 
─────────────────────────────────────────────────── + + fn handle_type3( + &mut self, + mem: &dyn MemoryAccess, + opcode: u8, + count: u32, + predicated: bool, + total_dwords: u32, + ) -> Type3Result { + metrics::counter!("gpu.packet", "opcode" => pm4::type3_opcode_name(opcode)).increment(1); + tracing::trace!( + opcode = format_args!("{opcode:#x}"), + name = pm4::type3_opcode_name(opcode), + count, + predicated, + "gpu: Type3" + ); + // If predicated and the bin mask/select combo evaluates to "skip", + // consume the whole packet (including data dwords) and move on. We + // don't emulate binning so bin_mask & bin_select is always 0 → we + // keep predicated packets in simplest form: execute them anyway. Most + // games don't use binning on Xenos. Observed in canary: + // `pm4_command_processor_implement.h:440-460`. + let _ = predicated; + + match opcode { + pm4::PM4_NOP + | pm4::PM4_WAIT_FOR_IDLE + | pm4::PM4_CONTEXT_UPDATE + | pm4::PM4_INVALIDATE_STATE + | pm4::PM4_ME_INIT + | pm4::PM4_VIZ_QUERY + | pm4::PM4_SET_SHADER_BASES => { + // Classify-and-skip. State side effects (if any) are deferred. + } + pm4::PM4_INDIRECT_BUFFER | pm4::PM4_INDIRECT_BUFFER_PFD => { + self.stats.indirect_buffer_jumps += 1; + let ib_ptr = self.read_payload(mem, 1); + let ib_size = self.read_payload(mem, 2); + // Advance past the IB header + payload before recursing so + // the return location is correct. + self.ring.advance_read(total_dwords); + self.writeback_read_ptr(mem); + // Push current ring, switch to IB view. 
+ let caller = self.ring; + self.ib_stack.push(caller); + self.ring = RingBufferView { + base: ib_ptr & !3, + size_dwords: ib_size, + read_offset_dwords: 0, + write_offset_dwords: ib_size, // IB is fully-written at jump time + rptr_writeback_addr: 0, + rptr_writeback_block_dwords: 0, + }; + tracing::debug!( + ib_ptr = format_args!("{ib_ptr:#010x}"), + ib_size, + "gpu: jump to indirect buffer" + ); + return Type3Result::Consumed(0); // we already advanced + } + pm4::PM4_WAIT_REG_MEM => { + // Canary layout (pm4_command_processor_implement.h:699-755): + // payload[0] = wait_info (bit 4 = is_memory, bits 2:0 = cmp) + // payload[1] = poll address (register idx OR memory addr; bottom 2 bits = endian for memory) + // payload[2] = ref value + // payload[3] = mask + // payload[4] = wait (sleep hint, ignored) + let wait_info = self.read_payload(mem, 1); + let poll_addr_raw = self.read_payload(mem, 2); + let reference = self.read_payload(mem, 3); + let mask = self.read_payload(mem, 4); + let is_memory = (wait_info & 0x10) != 0; + let cmp = WaitCmp::from_wait_info(wait_info); + let poll_addr = if is_memory { + poll_addr_raw & !3 + } else { + poll_addr_raw + }; + let block = GpuBlock::WaitRegMem { + poll_addr, + is_memory, + reference, + mask, + cmp, + }; + if block.is_satisfied(mem, &self.register_file) { + // Condition already true; proceed past this packet. 
+ tracing::trace!(?block, "gpu: WAIT_REG_MEM immediately satisfied"); + } else { + self.stats.wait_reg_mem_blocks += 1; + tracing::debug!(?block, "gpu: WAIT_REG_MEM parking"); + self.pending_block = Some(block); + return Type3Result::Blocked { rewind_to_header: true }; + } + } + pm4::PM4_REG_RMW => { + // payload[0] = rmw_info (bit 31 = and_from_reg, bit 30 = or_from_reg) + // payload[1] = and mask (or register index) + // payload[2] = or mask (or register index) + let rmw_info = self.read_payload(mem, 1); + let and_or_reg = (rmw_info & 0x8000_0000) != 0; + let or_from_reg = (rmw_info & 0x4000_0000) != 0; + let reg_index = rmw_info & 0x1FFF; + let p2 = self.read_payload(mem, 2); + let p3 = self.read_payload(mem, 3); + let and_mask = if and_or_reg { + self.register_file.read(p2 & 0x1FFF) + } else { + p2 + }; + let or_mask = if or_from_reg { + self.register_file.read(p3 & 0x1FFF) + } else { + p3 + }; + let cur = self.register_file.read(reg_index); + let new_value = (cur & and_mask) | or_mask; + self.register_file.write(reg_index, new_value); + tracing::trace!( + reg = format_args!("{reg_index:#x}"), + cur = format_args!("{cur:#x}"), + new = format_args!("{new_value:#x}"), + "gpu: REG_RMW" + ); + } + pm4::PM4_REG_TO_MEM => { + // payload[0] = reg_index, payload[1] = mem addr + let reg_index = self.read_payload(mem, 1) & 0x1FFF; + let dst = self.read_payload(mem, 2) & !3; + let value = self.register_file.read(reg_index); + mem.write_u32(dst, value); + tracing::trace!( + reg = format_args!("{reg_index:#x}"), + dst = format_args!("{dst:#010x}"), + value = format_args!("{value:#x}"), + "gpu: REG_TO_MEM" + ); + } + pm4::PM4_MEM_WRITE => { + // payload[0] = dst, payload[1..=count-1] = values + let mut dst = self.read_payload(mem, 1) & !3; + for i in 2..=count { + let val = self.read_payload(mem, i); + mem.write_u32(dst, val); + dst = dst.wrapping_add(4); + } + } + pm4::PM4_COND_WRITE => { + // payload[0] = wait_info, [1] = poll addr, [2] = ref, [3] = mask, + // [4] = write 
addr/reg, [5] = write data + let wait_info = self.read_payload(mem, 1); + let poll_raw = self.read_payload(mem, 2); + let reference = self.read_payload(mem, 3); + let mask = self.read_payload(mem, 4); + let is_memory = (wait_info & 0x10) != 0; + let cmp = WaitCmp::from_wait_info(wait_info); + let poll_addr = if is_memory { poll_raw & !3 } else { poll_raw }; + let cur_raw = if is_memory { + mem.read_u32(poll_addr) + } else { + self.register_file.read(poll_addr) + }; + if cmp.evaluate(cur_raw & mask, reference) { + let write_addr = self.read_payload(mem, 5); + let write_data = self.read_payload(mem, 6); + if (wait_info & 0x100) != 0 { + mem.write_u32(write_addr & !3, write_data); + } else { + self.register_file + .write(write_addr & 0x1FFF, write_data); + } + } + } + pm4::PM4_EVENT_WRITE => { + // payload[0] = initiator (written to VGT_EVENT_INITIATOR). + let initiator = self.read_payload(mem, 1); + self.register_file + .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F); + self.handle_event_initiator(initiator & 0x3F, mem); + tracing::trace!(initiator = format_args!("{:#x}", initiator & 0x3F), "gpu: EVENT_WRITE"); + } + pm4::PM4_EVENT_WRITE_SHD => { + // payload[0] = initiator (bit 31: write counter, else write `value`) + // payload[1] = address, payload[2] = value + let initiator = self.read_payload(mem, 1); + let address = self.read_payload(mem, 2); + let value = self.read_payload(mem, 3); + self.register_file + .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F); + self.handle_event_initiator(initiator & 0x3F, mem); + let data = if (initiator & 0x8000_0000) != 0 { + self.swap_counter as u32 + } else { + value + }; + // M1.8: fenced write. The CPU thread busy-polls this + // address as a GPU completion fence. The Release fence + // emitted here pairs with `read_u32_fence`'s Acquire on + // the polling side: any earlier writes the worker + // performed (RPTR writeback, resolve target writes, + // etc.) are visible to the CPU once it sees the new + // fence value. 
+ mem.write_u32_fence(address & !3, data); + tracing::trace!( + addr = format_args!("{:#010x}", address & !3), + value = format_args!("{data:#x}"), + "gpu: EVENT_WRITE_SHD" + ); + } + pm4::PM4_EVENT_WRITE_EXT => { + // payload[0] = initiator, [1] = address. Writes 6 u16 extents + // (min/max x/y/z) — we're not tracking scissors yet, so write zeros. + let initiator = self.read_payload(mem, 1); + let address = self.read_payload(mem, 2) & !3; + self.register_file + .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F); + self.handle_event_initiator(initiator & 0x3F, mem); + for i in 0..6u32 { + mem.write_u16(address + i * 2, 0); + } + } + pm4::PM4_EVENT_WRITE_ZPD => { + // Occlusion query writeback — always write zeros (no query). + let initiator = self.read_payload(mem, 1); + self.register_file + .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F); + self.handle_event_initiator(initiator & 0x3F, mem); + } + pm4::PM4_DRAW_INDX | pm4::PM4_DRAW_INDX_2 => { + self.stats.draws_seen += 1; + // Canary (`pm4_command_processor_implement.h:1128-1151`): + // DRAW_INDX: payload[0] = viz_query, payload[1] = vgt_draw_initiator, + // [2] = dma_base (if source=DMA), [3] = dma_size + // DRAW_INDX_2: payload[0] = vgt_draw_initiator (indices follow inline). 
+ let (vgt, dma_base, dma_size) = if opcode == pm4::PM4_DRAW_INDX { + let _viz = self.read_payload(mem, 1); + let vgt = self.read_payload(mem, 2); + let (db, ds) = if count >= 4 { + (Some(self.read_payload(mem, 3)), Some(self.read_payload(mem, 4))) + } else { + (None, None) + }; + (vgt, db, ds) + } else { + (self.read_payload(mem, 1), None, None) + }; + let mut ds = draw_state::extract(&self.register_file, vgt, dma_base, dma_size); + ds.vs_blob_key = self.active_vs_key; + ds.ps_blob_key = self.active_ps_key; + let processed = primitive::process(ds.primitive, ds.vertex_count, None); + metrics::counter!( + "gpu.draw", + "prim" => format!("{:?}", ds.primitive), + ) + .increment(1); + if processed.rejected { + metrics::counter!("gpu.draw.rejected").increment(1); + } + // P4: update the render-target cache with every bound RT + // from this draw. Each bind either inserts a new key or + // refreshes an existing descriptor's bind_count. `msaa` is + // still hardcoded to 1× because we don't yet decode + // `PA_SC_AA_CONFIG`; P4b can add that. + let msaa = crate::render_target_cache::MsaaSamples::X1; + let mut viewport_height = ds.viewport.scale_y.abs() * 2.0; + if viewport_height <= 0.0 { + viewport_height = 720.0; + } + // 16 samples per tile row (64-sample 8×8 macroblocks pack + // 16 vertical samples per EDRAM tile). 
+ let rows_of_tiles = (viewport_height as u32).div_ceil(16); + for (i, ci_opt) in ds.color_info.iter().enumerate() { + if let Some(ci) = ci_opt { + let pitch32 = ds.scissor.br_x.div_ceil(32); + let key = crate::render_target_cache::RenderTargetKey { + base_tiles: ci.base_tiles, + pitch_tiles_at_32bpp: pitch32, + msaa_samples: msaa, + is_depth: false, + resource_format: ci.format & 0xF, + }; + let rt_idx = self.rt_cache.bind(key, self.stats.draws_seen as u32); + self.rt_cache.claim_tiles(rt_idx, rows_of_tiles as u16); + metrics::counter!( + "gpu.rt.bind", + "slot" => format!("{i}"), + "fmt" => format!("{}", ci.format & 0xF), + ) + .increment(1); + } + } + if let Some(depth) = ds.depth_info { + let pitch32 = ds.scissor.br_x.div_ceil(32); + let key = crate::render_target_cache::RenderTargetKey { + base_tiles: depth.base_tiles, + pitch_tiles_at_32bpp: pitch32, + msaa_samples: msaa, + is_depth: true, + resource_format: depth.format & 0xF, + }; + let rt_idx = self.rt_cache.bind(key, self.stats.draws_seen as u32); + self.rt_cache.claim_tiles(rt_idx, rows_of_tiles as u16); + } + self.stats.unique_render_targets = self.rt_cache.len() as u64; + tracing::debug!( + opcode = format_args!("{opcode:#x}"), + prim = ?ds.primitive, + verts = ds.vertex_count, + ?processed.topology, + rewritten = processed.rewritten_indices.is_some(), + "gpu: DRAW_INDX captured" + ); + self.last_draw = Some(ds); + self.last_primitive = Some(processed); + } + pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => { + // payload[0] = offset_type — bits[10:0] index, bits[23:16] type + let offset_type = self.read_payload(mem, 1); + let index = offset_type & 0x7FF; + let const_type = (offset_type >> 16) & 0xFF; + let base = match const_type { + 0 => CONST_BASE_ALU, + 1 => CONST_BASE_FETCH, + 2 => CONST_BASE_BOOL, + 3 => CONST_BASE_LOOP, + 4 => CONST_BASE_REGISTERS, + _ => CONST_BASE_ALU, // defensive default + }; + for i in 0..(count - 1) { + let v = self.read_payload(mem, 2 + i); + 
self.register_file.write(base + index + i, v); + } + } + pm4::PM4_SET_CONSTANT2 => { + // payload[0] = 16-bit index; subsequent payloads write consecutive regs. + let index = self.read_payload(mem, 1) & 0xFFFF; + for i in 0..(count - 1) { + let v = self.read_payload(mem, 2 + i); + self.register_file.write(index + i, v); + } + } + pm4::PM4_LOAD_ALU_CONSTANT => { + // payload[0] = source mem addr, [1] = offset_type, [2] = size_dwords + let src = self.read_payload(mem, 1) & !3; + let offset_type = self.read_payload(mem, 2); + let size_dwords = self.read_payload(mem, 3); + let index = offset_type & 0x7FF; + let const_type = (offset_type >> 16) & 0xFF; + let base = match const_type { + 0 => CONST_BASE_ALU, + 1 => CONST_BASE_FETCH, + 2 => CONST_BASE_BOOL, + 3 => CONST_BASE_LOOP, + 4 => CONST_BASE_REGISTERS, + _ => CONST_BASE_ALU, + }; + for i in 0..size_dwords { + let v = mem.read_u32(src + i * 4); + self.register_file.write(base + index + i, v); + } + } + pm4::PM4_IM_LOAD | pm4::PM4_IM_LOAD_IMMEDIATE => { + // Canary (pm4_command_processor_implement.h:1271-1330): + // IM_LOAD payload: [0] addr_type, [1] start_size + // IM_LOAD_IMMEDIATE payload: [0] shader_type, [1] start_size, [2..count] = microcode + let shader_type = self.read_payload(mem, 1) as u8 & 0x3; + let start_size = self.read_payload(mem, 2); + let size_dwords = start_size & 0xFFFF; + let blob = if opcode == pm4::PM4_IM_LOAD_IMMEDIATE { + let mut v = Vec::with_capacity(size_dwords as usize); + for i in 0..size_dwords { + v.push(self.read_payload(mem, 3 + i)); + } + v + } else { + let addr = self.read_payload(mem, 1) & !3; + let mut v = Vec::with_capacity(size_dwords as usize); + for i in 0..size_dwords { + v.push(mem.read_u32(addr + i * 4)); + } + v + }; + // For IM_LOAD the payload already carries an address that + // uniquely identifies this shader in guest memory, so the + // full `addr_type` dword (address | stage bits) makes a + // good cache key. 
For IM_LOAD_IMMEDIATE payload[0] is just + // the 2-bit shader_type — without a content-derived key + // every immediate upload would collide on 0 or 1 and + // thrash a single slot. Fold the microcode through a + // stable FNV-1a hash so per-content dedup still works. + let key = if opcode == pm4::PM4_IM_LOAD_IMMEDIATE { + fnv1a_u32_dwords(shader_type as u32, &blob) + } else { + self.read_payload(mem, 1) + }; + self.insert_shader_blob( + key, + ShaderBlob { + shader_type, + dwords: blob, + }, + ); + // P3b M1: record which blob is now "active" for the + // current stage. The uber-shader dispatch (xenia-ui) reads + // `active_vs_key`/`active_ps_key` at draw time to upload + // the right microcode. `shader_type`: 0 = vertex, 1 = pixel + // (per Xenos `ShaderType`). + match shader_type { + 0 => self.active_vs_key = Some(key), + 1 => self.active_ps_key = Some(key), + _ => {} + } + metrics::counter!( + "gpu.shader.blob_seen", + "stage" => if shader_type == 0 { "vs" } else { "ps" }, + ) + .increment(1); + tracing::debug!( + shader_type, + size_dwords, + key = format_args!("{key:#x}"), + "gpu: IM_LOAD (shader blob cached)" + ); + } + pm4::PM4_SET_BIN_MASK_LO => { + self.bin_mask = (self.bin_mask & 0xFFFF_FFFF_0000_0000) + | (self.read_payload(mem, 1) as u64); + } + pm4::PM4_SET_BIN_MASK_HI => { + self.bin_mask = (self.bin_mask & 0x0000_0000_FFFF_FFFF) + | ((self.read_payload(mem, 1) as u64) << 32); + } + pm4::PM4_SET_BIN_MASK => { + let lo = self.read_payload(mem, 1) as u64; + let hi = self.read_payload(mem, 2) as u64; + self.bin_mask = (hi << 32) | lo; + } + pm4::PM4_SET_BIN_SELECT_LO => { + self.bin_select = (self.bin_select & 0xFFFF_FFFF_0000_0000) + | (self.read_payload(mem, 1) as u64); + } + pm4::PM4_SET_BIN_SELECT_HI => { + self.bin_select = (self.bin_select & 0x0000_0000_FFFF_FFFF) + | ((self.read_payload(mem, 1) as u64) << 32); + } + pm4::PM4_SET_BIN_SELECT => { + let lo = self.read_payload(mem, 1) as u64; + let hi = self.read_payload(mem, 2) as u64; + 
self.bin_select = (hi << 32) | lo; + } + pm4::PM4_INTERRUPT => { + let cpu_mask = self.read_payload(mem, 1); + self.stats.interrupts_emitted += 1; + self.pending_interrupts.push(PendingInterrupt { + source: InterruptSource::CommandProcessor, + cpu_mask, + }); + tracing::debug!( + cpu_mask = format_args!("{cpu_mask:#x}"), + "gpu: PM4_INTERRUPT queued" + ); + } + pm4::PM4_XE_SWAP => { + // Payload: [0] signature, [1] frontbuffer_phys, [2] width, [3] height + let _signature = self.read_payload(mem, 1); + let frontbuffer_phys = self.read_payload(mem, 2); + let width = self.read_payload(mem, 3); + let height = self.read_payload(mem, 4); + self.notify_xe_swap(frontbuffer_phys, width, height); + } + _ => { + // Unknown opcode — log once per opcode but don't stall. + tracing::warn!( + opcode = format_args!("{opcode:#x}"), + count, + "gpu: unhandled Type3 opcode" + ); + } + } + Type3Result::Consumed(total_dwords) + } + + /// Read dword at payload-relative offset `i` (where `i=0` is the header). + fn read_payload(&self, mem: &dyn MemoryAccess, i: u32) -> u32 { + let addr = self.ring.addr_at_offset(i).unwrap(); + mem.read_u32(addr) + } + + /// Drain up to `max_packets` (used by the kernel's VdSwap handler when we + /// don't yet have MMIO-triggered draining). Returns the number of + /// packets consumed. + pub fn drain(&mut self, mem: &dyn MemoryAccess, max_packets: u32) -> u32 { + let mut n = 0; + for _ in 0..max_packets { + match self.execute_one(mem) { + ExecOutcome::Stepped { .. } => n += 1, + ExecOutcome::Idle | ExecOutcome::Blocked => break, + } + } + n + } +} + +impl Default for GpuSystem { + fn default() -> Self { + Self::new() + } +} + +/// Subset of Xenos registers we reference by name. Full table at +/// `xenia-canary/src/xenia/gpu/registers.h`. +pub mod reg { + //! All values below are Xenos *register indices* (the number you find in + //! canary's `register_table.inc`, i.e. the byte offset within the + //! aperture divided by 4). 
/// 32-bit FNV-1a over the little-endian bytes of `seed` followed by the
/// little-endian bytes of each entry of `dwords`, in order.
///
/// Used to derive a stable, content-based cache key for
/// `IM_LOAD_IMMEDIATE` shader blobs, where the guest supplies no address
/// to key on. The result is deterministic for a given `(seed, dwords)`
/// pair; being a 32-bit hash, distinct inputs can still collide.
fn fnv1a_u32_dwords(seed: u32, dwords: &[u32]) -> u32 {
    const OFFSET_BASIS: u32 = 0x811C_9DC5;
    const PRIME: u32 = 0x0100_0193;
    std::iter::once(seed)
        .chain(dwords.iter().copied())
        .flat_map(u32::to_le_bytes)
        // FNV-1a step: xor the byte in first, then multiply by the prime.
        .fold(OFFSET_BASIS, |acc, byte| {
            (acc ^ u32::from(byte)).wrapping_mul(PRIME)
        })
}
/// Distinguishes "consumed a packet (by N dwords)" from "blocked; don't
/// advance read ptr".
enum Type3Result {
    /// The packet was handled. The payload is the dword count reported by
    /// the handler (the Type-3 handler returns `Consumed(total_dwords)`).
    Consumed(u32),
    /// The packet could not complete yet.
    // NOTE(review): `rewind_to_header` presumably tells the caller whether
    // to reposition the read pointer at the packet header before retrying;
    // confirm against the code that matches on this variant.
    Blocked { rewind_to_header: bool },
}
/// A `WAIT_REG_MEM` packet polling guest memory must report `Blocked`
/// (without moving the read pointer) until the polled dword holds the
/// awaited value, and then be consumed as one 6-dword packet.
#[test]
fn wait_reg_mem_blocks_then_unblocks_when_mem_changes() {
    const RING: u32 = 0x4000_0000;
    const POLL_ADDR: u32 = 0x4000_1000;

    let mut gpu = GpuSystem::new();
    let mut mem = build_mem();
    gpu.initialize_ring_buffer(RING, 10);

    // WAIT_REG_MEM: wait until *0x40001000 == 0x42.
    let packet = [
        // Type-3 header, count field = 5 payload dwords - 1.
        (3u32 << 30) | ((5u32 - 1) << 16) | ((pm4::PM4_WAIT_REG_MEM as u32) << 8),
        // wait_info: is_memory=1 (bit 4), cmp=equal (bits 2:0 = 2)
        0x12,
        // Address being polled.
        POLL_ADDR,
        // Value being waited for.
        0x42,
        0xFFFF_FFFF,
        0,
    ];
    for (slot, dword) in packet.iter().enumerate() {
        mem.write_u32(RING + 4 * slot as u32, *dword);
    }
    gpu.extend_write_ptr(6);

    // First exec: poll addr reads 0 → blocked.
    let first = gpu.execute_one(&mut mem);
    assert_eq!(first, ExecOutcome::Blocked);
    assert_eq!(gpu.ring.read_offset_dwords, 0, "read ptr must not advance while blocked");

    // Make the wait satisfied.
    mem.write_u32(POLL_ADDR, 0x42);
    let second = gpu.execute_one(&mut mem);
    let ExecOutcome::Stepped { dwords_consumed } = second else {
        panic!("expected Stepped after wait satisfied, got {:?}", second);
    };
    // The WAIT_REG_MEM packet is 1 (header) + 5 (count) = 6 dwords.
    assert_eq!(dwords_consumed, 6);
    assert_eq!(gpu.ring.read_offset_dwords, 6);
}
/// A write pointer stored into the MMIO mailbox only becomes visible to
/// the ring after `sync_with_mmio`, which also publishes the read pointer
/// back to the mailbox.
#[test]
fn mmio_write_to_cp_rb_wptr_reflects_into_ring() {
    use std::sync::atomic::Ordering::Relaxed;

    const GUEST_WPTR: u32 = 8;

    let mut gpu = GpuSystem::new();
    let mem = build_mem();
    gpu.initialize_ring_buffer(0x4000_0000, 10);

    // Guest writes wptr=8 via MMIO.
    gpu.mmio.cp_rb_wptr.store(GUEST_WPTR, Relaxed);

    // Before sync, ring has no pending work.
    let ready_before_sync = gpu.is_ready(&mem);
    assert!(!ready_before_sync);

    gpu.sync_with_mmio();
    assert_eq!(gpu.ring.write_offset_dwords, GUEST_WPTR);
    let ready_after_sync = gpu.is_ready(&mem);
    assert!(ready_after_sync);

    // After sync, rptr is mirrored back to mmio for the guest to read.
    let mirrored_rptr = gpu.mmio.cp_rb_rptr.load(Relaxed);
    assert_eq!(mirrored_rptr, 0);
}
/// A single auto-indexed `DRAW_INDX_2` packet must be counted in
/// `stats.draws_seen` and captured into `last_draw` / `last_primitive`.
#[test]
fn draw_indx_2_captures_last_draw() {
    let mut gpu = GpuSystem::new();
    let mut mem = build_mem();
    gpu.initialize_ring_buffer(0x4000_0000, 10);

    // DRAW_INDX_2 packet with 1 payload dword = vgt_draw_initiator:
    // prim=4 (TriangleList), source=2 (auto), count=3 verts.
    let vgt = (3u32 << 16) | (2 << 6) | 4;
    // The header count field (bits 29:16) holds `payload_dwords - 1`, so a
    // single payload dword encodes as 0. Build it once, correctly, instead
    // of composing a wrong count and masking it back out.
    let payload_dwords = 1u32;
    let hdr = (3u32 << 30)
        | ((payload_dwords - 1) << 16)
        | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
    mem.write_u32(0x4000_0000, hdr);
    mem.write_u32(0x4000_0004, vgt);
    gpu.extend_write_ptr(2);

    assert!(matches!(
        gpu.execute_one(&mut mem),
        ExecOutcome::Stepped { .. }
    ));
    assert_eq!(gpu.stats.draws_seen, 1);

    let ds = gpu.last_draw.expect("last_draw set");
    assert_eq!(
        ds.primitive,
        crate::draw_state::PrimitiveType::TriangleList
    );
    assert_eq!(ds.vertex_count, 3);

    let p = gpu.last_primitive.as_ref().expect("last_primitive set");
    assert_eq!(p.topology, crate::primitive::HostTopology::TriangleList);
    assert!(!p.rejected);
}
/// `IM_LOAD_IMMEDIATE` uploads a vertex + pixel shader inline; draw
/// state must then carry whichever keys the executor minted. With the
/// content-hashed key scheme the concrete key values are derived, so the
/// test asserts only that both keys are set, that they differ from each
/// other, and that the subsequent draw snapshot carries exactly those keys.
#[test]
fn im_load_records_active_blob_and_draw_carries_it() {
    let mut gpu = GpuSystem::new();
    let mut mem = build_mem();
    gpu.initialize_ring_buffer(0x4000_0000, 10);

    // PM4_IM_LOAD_IMMEDIATE: 4 data dwords = shader_type + start_size
    // + 2 code. Header count field = data_count - 1 = 3. Both uploads use
    // the same header, so build it once.
    let hdr_im_load =
        (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_IM_LOAD_IMMEDIATE as u32) << 8);

    // Vertex shader upload (5 dwords total).
    mem.write_u32(0x4000_0000, hdr_im_load);
    mem.write_u32(0x4000_0004, 0); // shader_type=0 (vertex)
    mem.write_u32(0x4000_0008, 2); // start_size: size=2
    mem.write_u32(0x4000_000C, 0xAAAA_AAAA);
    mem.write_u32(0x4000_0010, 0xBBBB_BBBB);

    // Pixel shader upload (5 dwords total).
    mem.write_u32(0x4000_0014, hdr_im_load);
    mem.write_u32(0x4000_0018, 1); // shader_type=1 (pixel)
    mem.write_u32(0x4000_001C, 2); // start_size: size=2
    mem.write_u32(0x4000_0020, 0xCCCC_CCCC);
    mem.write_u32(0x4000_0024, 0xDDDD_DDDD);

    // DRAW_INDX_2: 1 data dword, so the count field is 0 and contributes
    // nothing to the header.
    let vgt = (3u32 << 16) | (2 << 6) | 4;
    let hdr_draw = (3u32 << 30) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
    mem.write_u32(0x4000_0028, hdr_draw);
    mem.write_u32(0x4000_002C, vgt);

    // Total dwords consumed by the 3 packets: 5 + 5 + 2 = 12.
    gpu.extend_write_ptr(12);
    // Drain all three packets.
    for _ in 0..3 {
        assert!(matches!(
            gpu.execute_one(&mut mem),
            ExecOutcome::Stepped { .. }
        ));
    }

    let vs_key = gpu.active_vs_key.expect("vs key set by IM_LOAD_IMMEDIATE");
    let ps_key = gpu.active_ps_key.expect("ps key set by IM_LOAD_IMMEDIATE");
    assert_ne!(vs_key, ps_key, "VS and PS keys must be distinct");

    let ds = gpu.last_draw.expect("DRAW_INDX_2 captured");
    assert_eq!(ds.vs_blob_key, Some(vs_key));
    assert_eq!(ds.ps_blob_key, Some(ps_key));
}
/// P4: an `EVENT_WRITE` packet whose initiator is `15` (TILE_FLUSH) must
/// reach the resolve handler. The test checks that `stats.resolves_total`
/// is bumped and that `last_resolve` reflects the pre-seeded copy registers.
#[test]
fn tile_flush_event_records_resolve() {
    let mut gpu = GpuSystem::new();
    let mut mem = build_mem();
    gpu.initialize_ring_buffer(0x4000_0000, 10);

    // Pre-seed the RB_COPY_* registers so ResolveInfo captures
    // recognisable values.
    let seeded_registers = [
        (draw_state::reg::RB_COPY_DEST_BASE, 0xDEAD_0000),
        (
            draw_state::reg::RB_COPY_DEST_PITCH,
            (720u32 << 16) | 1280u32,
        ),
        // copy_dest_format=6 (k_8_8_8_8), copy_dest_endian=0.
        (draw_state::reg::RB_COPY_DEST_INFO, 6u32 << 7),
        (
            draw_state::reg::RB_COPY_CONTROL,
            (1u32 << 20) /* copy_command=1 */ | (1u32 << 8), /* color_clear_enable */
        ),
    ];
    for (register, value) in seeded_registers {
        gpu.register_file.write(register, value);
    }

    // PM4_EVENT_WRITE: 1 data dword — the initiator.
    let hdr = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_EVENT_WRITE as u32) << 8);
    let packet = [hdr, 15 /* TILE_FLUSH */];
    for (slot, dword) in packet.iter().enumerate() {
        mem.write_u32(0x4000_0000 + 4 * slot as u32, *dword);
    }
    gpu.extend_write_ptr(2);

    let outcome = gpu.execute_one(&mut mem);
    assert!(matches!(outcome, ExecOutcome::Stepped { .. }));
    assert_eq!(gpu.stats.resolves_total, 1);

    let info = gpu.last_resolve.expect("TILE_FLUSH captured resolve");
    // `0xDEAD_0000 & 0x1FFF_FFFF = 0x1EAD_0000` — the test expects
    // `dest_base` to come back masked to 29 bits.
    assert_eq!(info.dest_base, 0x1EAD_0000);
    assert_eq!(info.dest_pitch_pixels, 1280);
    assert_eq!(info.dest_height_pixels, 720);
    assert_eq!(info.dest_format, 6);
    assert_eq!(info.copy_command, 1);
    assert!(info.color_clear_enable);
}
/// A `PM4_XE_SWAP` packet (signature, frontbuffer address, width, height)
/// must be recorded in `last_swap` and counted in `stats.swaps_seen`.
#[test]
fn xe_swap_records_notification() {
    const RING: u32 = 0x4000_0000;

    let mut gpu = GpuSystem::new();
    let mut mem = build_mem();
    gpu.initialize_ring_buffer(RING, 10);

    let packet = [
        // Type-3 header, count field = 4 payload dwords - 1.
        (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_XE_SWAP as u32) << 8),
        pm4::SWAP_SIGNATURE,
        0xCAFE_0000, // frontbuffer_phys
        1280,        // width
        720,         // height
    ];
    for (slot, dword) in packet.iter().enumerate() {
        mem.write_u32(RING + 4 * slot as u32, *dword);
    }
    gpu.extend_write_ptr(5);

    let outcome = gpu.execute_one(&mut mem);
    assert!(matches!(outcome, ExecOutcome::Stepped { .. }));

    let swap = gpu.last_swap.unwrap();
    assert_eq!(swap.frame_index, 1);
    assert_eq!(swap.frontbuffer_phys, 0xCAFE_0000);
    assert_eq!(swap.width, 1280);
    assert_eq!(swap.height, 720);
    assert_eq!(gpu.stats.swaps_seen, 1);
}
/// Control-plane RPC the CPU thread sends to the GPU thread. Data-plane
/// signals (WPTR/RPTR/INT_STATUS) ride atomic mailboxes instead — see
/// [`GpuMmio`]. Channels are for events that need ordered delivery and
/// (sometimes) a reply.
#[derive(Debug)]
pub enum GpuCommand {
    /// `VdInitializeRingBuffer(base, size_log2)`. The kernel hands the GPU
    /// the guest-physical base address of the primary ring and the log2 of
    /// its size. Today's [`GpuSystem::initialize_ring_buffer`] does this
    /// synchronously; from step 4 onward the kernel sends this command
    /// instead.
    // NOTE(review): the `gpu_system` tests treat `size_log2 = 10` as
    // "1024 bytes = 256 dwords", i.e. log2 of the byte size — confirm.
    InitializeRing {
        base: u32,
        size_log2: u32,
    },
    /// `VdEnableRingBufferRPtrWriteBack(addr, block_size)`. The kernel
    /// supplies a guest-memory address into which the GPU should mirror its
    /// internal `read_offset_dwords` after each packet.
    EnableRptrWriteback {
        addr: u32,
        block_size_log2: u32,
    },
    /// Block until the GPU's read pointer has caught up with the supplied
    /// `target_wptr` (in dwords). The reply is sent on `reply_tx` once the
    /// drain completes (or the worker hits its internal deadline). Used by
    /// `vd_swap` to preserve the synchronous "GPU has caught up to the
    /// guest's wptr at swap time" semantics.
    DrainFence {
        target_wptr: u32,
        /// Single-shot reply sender; see [`DrainReply`].
        reply_tx: DrainReply,
    },
    /// Bump the swap counter and post a swap interrupt. Sent by `vd_swap`
    /// when the guest commits a frame; the worker's [`GpuSystem::notify_xe_swap`]
    /// updates `swaps_seen`/`last_swap` and pushes an `InterruptSource::Swap`
    /// onto the (M1.6) `int_tx` channel. Fire-and-forget — no reply.
    NotifyXeSwap {
        frontbuffer_phys: u32,
        width: u32,
        height: u32,
    },
    /// Tear-down signal. The worker drains any in-flight reply channels,
    /// drops its `GpuSystem`, and the host thread joins.
    Shutdown,
}
+#[derive(Debug, Clone)] +pub struct GpuHandle { + /// Control-plane sender. `clone()`-able so multiple call sites can post + /// commands; crossbeam's `Sender` is `Send + Sync + Clone`. + pub cmd_tx: Sender, + /// Interrupt drain channel. Step 6 moves `pending_interrupts` onto this. + /// Until then the `int_tx` half on the worker side is unused. + pub int_rx: Receiver, + /// Direct access to the MMIO mailbox arcs. The CP_RB_WPTR write that + /// the guest does inside the MMIO region callback already lands in + /// `mmio.cp_rb_wptr` (an `Arc`); cloning these here lets + /// CPU-side code do the same atomic read/write that the inline path + /// did, without channel hops. + pub mmio: GpuMmio, + /// Read-side snapshot of GPU stats / cache sizes. Refreshed by the + /// worker each outer loop iteration. Read by [`GpuBackend::digest_snapshot`] + /// and the HUD; cheap copy-out under a brief lock acquisition. + pub digest: Arc>, + /// Shared shutdown flag — set by the CPU side during teardown, read + /// by the worker each loop iteration. Cloned here so callers without + /// access to the worker side (e.g. drop guards) can still signal exit. + pub shutdown: Arc, +} + +impl GpuHandle { + /// Post a command to the worker and wake it. Wraps the raw + /// `cmd_tx.send` with the M1.7 parker discipline: set + /// `wake_pending=true` (Release) and `unpark()` the worker thread. + /// Without the wake, channel sends would not surface to a parked + /// worker — `crossbeam_channel::Sender::send` doesn't unpark by + /// itself. + pub fn send_cmd( + &self, + cmd: GpuCommand, + ) -> Result<(), crossbeam_channel::SendError> { + let r = self.cmd_tx.send(cmd); + if r.is_ok() { + self.mmio.wake_pending.store(true, Ordering::Release); + if let Ok(g) = self.mmio.worker_thread.lock() { + if let Some(t) = g.as_ref() { + t.unpark(); + } + } + } + r + } +} + +/// Periodically-refreshed read-side snapshot of GPU state. 
Updated by the +/// worker thread on each outer loop iteration; consumed by the CPU side +/// for the run-digest at end-of-run, the HUD, etc. Held in an +/// `Arc>` shared between worker and handle. +/// +/// Snapshot is intentionally restricted to `u64` counters — they're +/// cheap to copy and survive past the worker's lifetime (so the digest +/// can be computed even after the worker has shut down). For the +/// inline backend [`GpuBackend::digest_snapshot`] computes the same view +/// directly from the live `GpuSystem` without any locking. +#[derive(Debug, Clone, Default)] +pub struct GpuDigestSnapshot { + pub stats: GpuStats, + pub shader_blobs_live: u64, + pub texture_cache_entries: u64, + pub texture_decodes: u64, +} + +/// Reverse end of the command and interrupt channels. The GPU thread +/// (`GpuWorker::run`) reads from `cmd_rx`, pushes onto `int_tx` (M1.6+), +/// owns the actual `GpuSystem`, and refreshes the shared +/// `GpuDigestSnapshot`. Built by [`GpuSystem::into_handle`] alongside its +/// matching [`GpuHandle`]. +/// +/// (No `#[derive(Debug)]` because `GpuSystem` itself isn't `Debug`; we +/// don't need it on the worker for any production purpose.) +pub struct GpuWorker { + /// The GPU subsystem itself. The worker thread is its exclusive + /// owner once spawned. + pub system: GpuSystem, + /// Receive end of the control channel. + pub cmd_rx: Receiver, + /// Send end of the interrupt channel. M1.6 wires this in. + pub int_tx: Sender, + /// Shared digest snapshot, refreshed each outer loop iteration. + pub digest: Arc>, + /// Shutdown flag. Set by `shutdown_and_join_with_timeout`; the worker + /// loop checks `Acquire` each iteration. + pub shutdown: Arc, +} + +impl GpuSystem { + /// Split a freshly-built `GpuSystem` into a `(GpuWorker, GpuHandle)` + /// pair. The handle keeps cloned `Arc` MMIO mailboxes plus + /// the channel sender; the worker keeps the system itself plus the + /// channel receiver and the interrupt sender. 
+ /// + /// Channels are unbounded (`crossbeam_channel::unbounded`) because the + /// CPU side never blocks on a control-plane send — guest-driven export + /// rates are bounded by the interpreter throughput, and interrupts are + /// already coalesced upstream by the kernel. + /// + /// Caller supplies a shared `shutdown: Arc` so the worker + /// and the CPU side can coordinate teardown. For unit tests that don't + /// care about lifecycle, [`Self::into_handle_test`] supplies a fresh + /// flag. + pub fn into_handle_with_shutdown( + self, + shutdown: Arc, + ) -> (GpuWorker, GpuHandle) { + let mmio = self.mmio.clone(); + let (cmd_tx, cmd_rx) = unbounded::(); + let (int_tx, int_rx) = unbounded::(); + let digest = Arc::new(std::sync::Mutex::new(GpuDigestSnapshot::default())); + let worker = GpuWorker { + system: self, + cmd_rx, + int_tx, + digest: digest.clone(), + shutdown: shutdown.clone(), + }; + let handle = GpuHandle { + cmd_tx, + int_rx, + mmio, + digest, + shutdown, + }; + (worker, handle) + } + + /// Convenience for tests: allocate a fresh shutdown flag and split. + pub fn into_handle(self) -> (GpuWorker, GpuHandle) { + self.into_handle_with_shutdown(Arc::new(AtomicBool::new(false))) + } +} + +/// Polling interval for the no-op worker's shutdown check. A short sleep +/// avoids burning a host core while still keeping shutdown latency under +/// 10 ms, well below the 1 s defensive timeout in +/// [`shutdown_and_join_with_timeout`]. +const NOOP_WORKER_POLL: Duration = Duration::from_millis(2); + +/// Maximum time the worker waits in `park_timeout` before re-checking +/// shutdown / commands / ring state. With `unpark()` on every guest WPTR +/// write the typical wake latency is microseconds; this is the upper +/// bound for the shutdown / quiescent-state polling cadence. 16 ms aligns +/// with vsync cadence on a 60 Hz host and bounds shutdown latency at the +/// same value. 
+const WORKER_PARK_TIMEOUT: Duration = Duration::from_millis(16); + +/// Cap on packets executed per outer-loop iteration before the worker +/// re-checks shutdown / commands / digest publish. Mirrors the inline-mode +/// `gpu_runs = max(1, min(64, executed_this_round / 6))` pacer ceiling. +const WORKER_PACKETS_PER_ITER: u32 = 64; + +/// Backend for the kernel's `gpu` field. The two variants share a thin +/// dispatch layer (forwarding methods on this enum) so call sites in +/// `xenia-kernel` exports stay terse. +/// +/// - [`GpuBackend::Inline`] keeps the legacy synchronous path: the CPU +/// thread calls `kernel.gpu.execute_one(mem)` directly each scheduler +/// round. Selected by `--gpu-inline` (rollback flag) or implied by +/// `--ui` until the UI worker is migrated. +/// - [`GpuBackend::Threaded`] (**default at M1.9**) hands `GpuSystem` +/// ownership to a dedicated host thread; the CPU thread holds a +/// [`GpuHandle`] proxy and talks to the worker via channels + the +/// shared MMIO atomics. +/// +/// `GpuBackend` itself is `Send` (the inline variant carries `GpuSystem`, +/// which is Send-able as long as nothing inside it is `!Send` — it isn't); +/// the threaded variant carries a `Send + Sync` handle. +pub enum GpuBackend { + Inline(GpuSystem), + Threaded(GpuHandle), +} + +impl GpuBackend { + /// Read the MMIO mailbox struct (cheap — `GpuMmio` is `Clone` cloning + /// only `Arc`s; we hand back a borrow). The result is the + /// same `Arc` set on either backend, so MMIO region + /// callbacks installed via [`crate::build_mmio_region`] route guest + /// writes to the same atomics the worker reads. + pub fn mmio(&self) -> &GpuMmio { + match self { + GpuBackend::Inline(s) => &s.mmio, + GpuBackend::Threaded(h) => &h.mmio, + } + } + + /// Convenience: borrow the inline `GpuSystem` for code paths that + /// haven't been generalized to the `Threaded` variant yet (vd_swap's + /// drain, the various `state.gpu.X` reads in M1.5+ work). 
Returns + /// `None` in threaded mode; the caller's responsibility is to handle + /// that gracefully — typically by treating the operation as a no-op + /// or routing it through a command (see `vd_swap` notes). + pub fn as_inline(&self) -> Option<&GpuSystem> { + match self { + GpuBackend::Inline(s) => Some(s), + GpuBackend::Threaded(_) => None, + } + } + + /// Mutable counterpart of [`Self::as_inline`]. + pub fn as_inline_mut(&mut self) -> Option<&mut GpuSystem> { + match self { + GpuBackend::Inline(s) => Some(s), + GpuBackend::Threaded(_) => None, + } + } + + /// Forward `VdInitializeRingBuffer`. Inline mode applies it directly; + /// threaded mode posts an `InitializeRing` command onto the worker + /// channel. + pub fn initialize_ring_buffer(&mut self, base: u32, size_log2: u32) { + match self { + GpuBackend::Inline(s) => s.initialize_ring_buffer(base, size_log2), + GpuBackend::Threaded(h) => { + let _ = h.send_cmd(GpuCommand::InitializeRing { base, size_log2 }); + } + } + } + + /// Forward `VdEnableRingBufferRPtrWriteBack`. + pub fn enable_rptr_writeback(&mut self, addr: u32, block_log2: u32) { + match self { + GpuBackend::Inline(s) => s.enable_rptr_writeback(addr, block_log2), + GpuBackend::Threaded(h) => { + let _ = h.send_cmd(GpuCommand::EnableRptrWriteback { + addr, + block_size_log2: block_log2, + }); + } + } + } + + /// Bump `CP_RB_WPTR` by `dwords`. Both backends route the bump through + /// the shared MMIO atomic mailbox (`Acquire`-load → wrap-add → + /// `Release`-store). Inline mode then picks up the new value on its + /// next `sync_with_mmio`; threaded mode's worker observes the same + /// atomic and folds it into its ring view. + /// + /// Note: the value stored is unmodulo'd. The reading side's + /// `sync_with_mmio` does the `% ring.size_dwords` step before + /// updating the local ring view, which is the only place a `size_dwords` + /// reference exists. 
Practical wptr drift before u32 wraps is + /// `2^32 / 64 ≈ 67M` VdSwap-style bumps — safely above any plausible + /// single-run total. + pub fn extend_write_ptr_by(&mut self, dwords: u32) { + let mmio = self.mmio(); + // Relaxed is sufficient for the load — we re-store with Release + // and the readers (worker `sync_with_mmio` / inline next round) + // do their own Acquire. The load here is just a value source. + let cur = mmio.cp_rb_wptr.load(Ordering::Relaxed); + mmio.cp_rb_wptr + .store(cur.wrapping_add(dwords), Ordering::Release); + } + + /// Drain any PM4 packets currently exposed by the ring (i.e., up to + /// the current `CP_RB_WPTR`). Inline mode runs the synchronous + /// drain. Threaded mode posts a [`GpuCommand::DrainFence`] and blocks + /// on the reply channel up to a 1 s defensive timeout — the worker + /// has its own ~900 ms internal deadline so the reply is bounded. + /// + /// The CPU thread blocking here is sound: the only thread that + /// satisfies the reply is the GPU worker, which never tries to + /// acquire any CPU-side primitive (it talks back exclusively through + /// channels and atomics). The lock-ordering argument from the M1.4 + /// plan holds: T_cpu → cmd_tx → T_gpu → reply_tx → T_cpu, no cycle. + pub fn drain_to_current_wptr(&mut self, mem: &dyn xenia_memory::MemoryAccess) -> u32 { + match self { + GpuBackend::Inline(s) => { + s.sync_with_mmio(); + s.drain(mem, 4096) + } + GpuBackend::Threaded(h) => { + let target_wptr = h.mmio.cp_rb_wptr.load(Ordering::Acquire); + let (reply_tx, reply_rx) = bounded::<()>(1); + if h + .send_cmd(GpuCommand::DrainFence { + target_wptr, + reply_tx, + }) + .is_err() + { + // Worker disconnected; treat as drained. + return 0; + } + match reply_rx.recv_timeout(Duration::from_secs(1)) { + Ok(()) => { + // We don't currently track the exact packet count + // drained on the threaded path — the worker drains + // by `is_ready` predicate. 
Return 1 as a "drain + // happened" sentinel; the inline mode's exact + // count is a debug-trace nicety. + 1 + } + Err(_) => { + tracing::warn!( + target: "gpu", + target_wptr, + "vd_swap drain fence timed out at 1s; continuing teardown", + ); + 0 + } + } + } + } + } + + /// Bump `swaps_seen` + record `last_swap` + push a swap interrupt. + /// Inline calls directly. Threaded sends `NotifyXeSwap` over the + /// command channel — fire-and-forget; the worker handles it on its + /// next loop iteration. + pub fn notify_xe_swap(&mut self, frontbuffer_phys: u32, width: u32, height: u32) { + match self { + GpuBackend::Inline(s) => s.notify_xe_swap(frontbuffer_phys, width, height), + GpuBackend::Threaded(h) => { + let _ = h.send_cmd(GpuCommand::NotifyXeSwap { + frontbuffer_phys, + width, + height, + }); + } + } + } + + /// Forward [`GpuSystem::has_pending_interrupts`] under inline mode; + /// under threaded mode peek the `int_rx` channel. + pub fn has_pending_interrupts(&self) -> bool { + match self { + GpuBackend::Inline(s) => s.has_pending_interrupts(), + GpuBackend::Threaded(h) => !h.int_rx.is_empty(), + } + } + + /// Drain pending interrupts. Inline path forwards to + /// [`GpuSystem::take_pending_interrupts`]; threaded path drains the + /// channel non-blockingly. (M1 step 6 fully wires the threaded path + /// when the worker starts pushing onto `int_tx`; for now, the channel + /// is empty in threaded mode at -n 2M, so this still returns an empty + /// `Vec`.) + pub fn take_pending_interrupts(&mut self) -> Vec { + match self { + GpuBackend::Inline(s) => s.take_pending_interrupts(), + GpuBackend::Threaded(h) => { + let mut out = Vec::new(); + while let Ok(pi) = h.int_rx.try_recv() { + out.push(pi); + } + out + } + } + } + + /// End-of-run snapshot used by the run-digest. Inline mode reads + /// directly; threaded mode pulls the latest published mirror under a + /// brief lock. Returns owned data — safe to use after the worker has + /// shut down. 
+ pub fn digest_snapshot(&self) -> GpuDigestSnapshot { + match self { + GpuBackend::Inline(s) => GpuDigestSnapshot { + stats: s.stats.clone(), + shader_blobs_live: s.shader_blobs.len() as u64, + texture_cache_entries: s.texture_cache.len() as u64, + texture_decodes: s.texture_cache.decodes_total, + }, + GpuBackend::Threaded(h) => h + .digest + .lock() + .expect("GpuDigestSnapshot mutex poisoned") + .clone(), + } + } +} + +impl GpuWorker { + /// Run loop body for the GPU host thread. + /// + /// Each iteration: + /// 1. Check the `Acquire`-loaded shutdown flag — exit if set. + /// 2. Drain any pending control-plane commands non-blockingly. + /// 3. Sample MMIO (refreshes WPTR / RPTR mailboxes into the live ring). + /// 4. Execute up to [`WORKER_PACKETS_PER_ITER`] PM4 packets while the + /// ring is non-empty / not blocked. + /// 5. Refresh the shared digest snapshot under a brief lock. + /// 6. If no work was done this iteration, sleep + /// [`WORKER_IDLE_SLEEP`]. Step 7 swaps this for `park_timeout`. + /// + /// `memory: Arc` is shared with the CPU thread. The + /// worker only ever reads through `&*memory`, which deref-coerces to + /// `&GuestMemory` and then to `&dyn MemoryAccess`. All mutations on + /// `MemoryAccess` are `&self` post-trait-flip, so concurrent CPU and + /// GPU writes are sound under the trait's contract (callers must not + /// concurrently read/write the same byte range from different + /// threads — vd_swap's RPTR writeback / EVENT_WRITE_SHD writes target + /// guest-thread-private addresses by construction). + pub fn run(mut self, memory: Arc) { + // M1.7 parker registration: publish our `Thread` handle so the + // MMIO `CP_RB_WPTR` write callback can `unpark()` us. Only one + // worker thread per `GpuMmio`; we replace whatever was there. 
+ if let Ok(mut g) = self.system.mmio.worker_thread.lock() { + *g = Some(thread::current()); + } + loop { + // (1) shutdown + if self.shutdown.load(Ordering::Acquire) { + break; + } + // (2) drain commands + let mut did_work = false; + while let Ok(cmd) = self.cmd_rx.try_recv() { + did_work = true; + match cmd { + GpuCommand::InitializeRing { base, size_log2 } => { + self.system.initialize_ring_buffer(base, size_log2); + } + GpuCommand::EnableRptrWriteback { + addr, + block_size_log2, + } => { + self.system.enable_rptr_writeback(addr, block_size_log2); + } + GpuCommand::DrainFence { + target_wptr: _, + reply_tx, + } => { + // Drain the ring up to whatever WPTR the MMIO + // atomic currently exposes (the CPU side bumped + // it before sending the fence). Bounded by an + // internal 900 ms deadline — 100 ms tighter than + // the CPU's `recv_timeout(1s)` so the timeout + // surfaces on the CPU side as a clean + // `RecvTimeout` rather than a partial drain that + // looks complete. + // + // The drain loop polls `is_ready` after each + // packet; `sync_with_mmio` between packets is + // what folds late guest WPTR writes into the + // local ring view. Loop exits when the ring is + // empty (rptr == wptr after modulo) or a packet + // returns `Idle`/`Blocked`. + self.system.sync_with_mmio(); + let deadline = Instant::now() + Duration::from_millis(900); + while self.system.is_ready(&*memory) { + if Instant::now() >= deadline { + break; + } + match self.system.execute_one(&*memory) { + ExecOutcome::Stepped { .. 
} => { + self.system.sync_with_mmio(); + } + ExecOutcome::Idle | ExecOutcome::Blocked => break, + } + } + let _ = reply_tx.send(()); + } + GpuCommand::NotifyXeSwap { + frontbuffer_phys, + width, + height, + } => { + self.system + .notify_xe_swap(frontbuffer_phys, width, height); + } + GpuCommand::Shutdown => { + self.shutdown.store(true, Ordering::Release); + return; + } + } + } + // (3,4) drive the GPU + self.system.sync_with_mmio(); + let mut budget = WORKER_PACKETS_PER_ITER; + while budget > 0 && self.system.is_ready(&*memory) { + match self.system.execute_one(&*memory) { + ExecOutcome::Stepped { .. } => { + did_work = true; + self.system.sync_with_mmio(); + } + ExecOutcome::Idle | ExecOutcome::Blocked => break, + } + budget -= 1; + } + // (5a) M1.6: forward `PM4_INTERRUPT` / `XE_SWAP` events from + // `system.pending_interrupts` onto `int_tx`. The Vec + // lives on this thread; the channel is the cross-thread + // delivery primitive. Send is non-blocking (unbounded) + // and the receive end is drained by the CPU thread's + // per-round queue at `main.rs::run_execution`. + // `int_tx.send` returns `Err` only if the receiver was + // dropped — which means the CPU side is gone, in which + // case we'd be torn down momentarily anyway. + for pi in self.system.take_pending_interrupts() { + if self.int_tx.send(pi).is_err() { + break; + } + did_work = true; + } + // (5b) publish digest snapshot + if did_work { + let snap = GpuDigestSnapshot { + stats: self.system.stats.clone(), + shader_blobs_live: self.system.shader_blobs.len() as u64, + texture_cache_entries: self.system.texture_cache.len() as u64, + texture_decodes: self.system.texture_cache.decodes_total, + }; + if let Ok(mut g) = self.digest.lock() { + *g = snap; + } + } + // (6) M1.7 parker — `park_timeout` replaces the polling + // sleep. The standard parker idiom defends against the + // producer-races-park lost-wakeup: + // + // 1. Swap `wake_pending` to false (claim "we're going to + // park"). 
If `was_pending` is true, a producer + // signaled us between the last work and now — skip + // the park, loop and re-process. + // 2. Re-check side conditions (cmd channel, shutdown). + // These may have changed after step 1. + // 3. `park_timeout`. If the producer's `unpark()` runs + // between our re-check and our park call, std's token + // records it and the next park returns immediately. + // If neither happens within `WORKER_PARK_TIMEOUT`, we + // wake on our own and re-evaluate. + if !did_work { + let was_pending = + self.system.mmio.wake_pending.swap(false, Ordering::AcqRel); + if !was_pending + && self.cmd_rx.is_empty() + && !self.shutdown.load(Ordering::Acquire) + { + thread::park_timeout(WORKER_PARK_TIMEOUT); + } + } + } + // Clear the wake target on exit so post-shutdown MMIO writes + // don't try to unpark a dead thread (sound — `Thread::unpark` + // on an exited thread is a no-op — but it keeps the invariant + // tidy). + if let Ok(mut g) = self.system.mmio.worker_thread.lock() { + *g = None; + } + } +} + +/// Spawn the real GPU worker thread. Returns its `JoinHandle`; the +/// matching `GpuHandle` (caller's existing one from +/// [`GpuSystem::into_handle_with_shutdown`]) is what the CPU thread keeps. +pub fn spawn_gpu_worker( + worker: GpuWorker, + memory: Arc, +) -> JoinHandle<()> { + thread::Builder::new() + .name("xenia-gpu".to_string()) + .spawn(move || worker.run(memory)) + .expect("spawn xenia-gpu worker thread") +} + +/// M1 step 3 — spawn a placeholder GPU worker thread that does nothing +/// except poll `shutdown` on a short cadence and exit cleanly when it sees +/// `true`. Verifies thread lifecycle, signal propagation, and clean +/// teardown. **Not used in production paths**: in step 4 the worker grows a +/// real `GpuWorker::run` body that owns a `GpuSystem`. Until then this +/// function is the only spawn site, gated behind `--gpu-thread` in the CLI. 
+/// +/// The function returns the `JoinHandle` so the caller can block on +/// teardown via [`shutdown_and_join_with_timeout`]. Ownership of the +/// `Arc` is shared: the caller keeps a clone for signaling, +/// the thread takes another clone for polling. +/// +/// Rationale for `Acquire`/`Release` ordering on the bool: the spawning +/// thread may set up shared state *before* the worker reads it once we +/// expand the worker in step 4 — the `Release` store on shutdown then +/// pairs with the `Acquire` load here so any prior writes the spawner did +/// (e.g. populating channels with farewell messages) are visible to the +/// worker. For the no-op stage there's no shared state, but using the +/// stricter ordering now means step 4 inherits a correctly-fenced +/// shutdown protocol with no further changes. +pub fn spawn_noop_worker(shutdown: Arc) -> JoinHandle<()> { + thread::Builder::new() + .name("xenia-gpu-noop".to_string()) + .spawn(move || { + while !shutdown.load(Ordering::Acquire) { + thread::sleep(NOOP_WORKER_POLL); + } + }) + .expect("spawn xenia-gpu-noop worker thread") +} + +/// Signal `shutdown` to the worker and join its thread, with a defensive +/// timeout so a misbehaving worker can't wedge the entire process. Logs at +/// `error!` if the timeout fires (which would indicate either the worker +/// loop ignoring `shutdown` or being parked on a primitive that wasn't +/// woken — both are bugs the user should hear about). +/// +/// Returns `Ok(())` on a clean join inside the timeout, `Err(())` on +/// timeout. The caller decides whether to continue process teardown anyway +/// (typically yes — the worker's only state is its own stack). +pub fn shutdown_and_join_with_timeout( + shutdown: &Arc, + handle: JoinHandle<()>, + timeout: Duration, +) -> Result<(), ()> { + shutdown.store(true, Ordering::Release); + // No `JoinHandle::join_timeout` in std; emulate via a side-channel + // signal from the polling. 
We use a sentinel-clone of the JoinHandle + // approach: spawn a watchdog that, after the timeout, sets a + // "give up" flag — but `join` is a blocking call we can't preempt. + // Instead use a parking helper: spawn a hop-thread to call join, and + // wait on it via a `crossbeam_channel::after` select. + let (tx, rx) = unbounded::<()>(); + let join_thread = thread::Builder::new() + .name("xenia-gpu-joiner".to_string()) + .spawn(move || { + let _ = handle.join(); + // `_ = tx.send(())` — receiver may already be dropped if + // we timed out, in which case Err is fine. + let _ = tx.send(()); + }) + .expect("spawn xenia-gpu-joiner thread"); + crossbeam_channel::select! { + recv(rx) -> _ => { + // Joiner finished within the budget. Reap it (no work — the + // thread already returned). `join` here is fast. + let _ = join_thread.join(); + Ok(()) + } + recv(crossbeam_channel::after(timeout)) -> _ => { + tracing::error!( + target: "gpu", + ?timeout, + "GPU worker did not exit in time; leaking thread to avoid wedging shutdown", + ); + // Detach the joiner; it leaks but at least we proceed. Will + // get cleaned up when the process exits. + std::mem::drop(join_thread); + Err(()) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Roundtrip an `InitializeRing` command: CPU side sends, worker side + /// receives, payload bytes match. This validates the channel plumbing + /// without touching any GPU semantics. + #[test] + fn initialize_ring_roundtrips_through_handle() { + let (worker, handle) = GpuSystem::new().into_handle(); + let GpuWorker { cmd_rx, .. 
} = worker; + handle + .cmd_tx + .send(GpuCommand::InitializeRing { + base: 0x1000_0000, + size_log2: 18, + }) + .expect("cmd_tx send"); + match cmd_rx.recv().expect("cmd_rx recv") { + GpuCommand::InitializeRing { base, size_log2 } => { + assert_eq!(base, 0x1000_0000); + assert_eq!(size_log2, 18); + } + other => panic!("unexpected cmd: {other:?}"), + } + } + + /// MMIO atomics on the handle and the worker's GpuSystem must be the + /// same Arc (clone). A guest write through the MMIO region callback + /// stores into `handle.mmio.cp_rb_wptr`; the worker observes the same + /// value via `worker.system.mmio.cp_rb_wptr`. If we accidentally + /// allocated a fresh atomic for either side, the worker would never see + /// guest writes. + #[test] + fn mmio_arcs_are_shared_between_handle_and_worker() { + use std::sync::atomic::Ordering; + let (worker, handle) = GpuSystem::new().into_handle(); + handle.mmio.cp_rb_wptr.store(0xC0FFEE, Ordering::Release); + assert_eq!( + worker.system.mmio.cp_rb_wptr.load(Ordering::Acquire), + 0xC0FFEE, + "worker side did not observe handle-side atomic store", + ); + } + + /// `GpuHandle` must be `Send + Sync`. Compile-time assertion via type + /// constraint — won't link if the bound is violated. + #[test] + fn handle_is_send_sync() { + fn assert_send_sync() {} + assert_send_sync::(); + } + + /// `GpuWorker` must be `Send` so we can move it onto a thread in step 3. + /// (`Sync` is not required — only one thread ever owns the worker.) + #[test] + fn worker_is_send() { + fn assert_send() {} + assert_send::(); + } + + /// Dropping the handle (CPU side) must not block recv on the worker + /// side; instead it must surface as `Disconnected`. This is the + /// standard crossbeam guarantee but we pin it down with an explicit + /// test so a future refactor (e.g. switching channel backends) can't + /// silently change semantics. 
+ #[test] + fn dropping_handle_disconnects_command_channel() { + let (worker, handle) = GpuSystem::new().into_handle(); + drop(handle); + let result = worker.cmd_rx.recv(); + assert!( + matches!(result, Err(crossbeam_channel::RecvError)), + "expected Disconnected after handle drop, got {result:?}", + ); + } + + /// Spawn the no-op worker, signal shutdown immediately, and join. Must + /// complete within a generous timeout (the polling cadence is 2 ms, + /// so 250 ms gives plenty of headroom even on a loaded test runner). + #[test] + fn noop_worker_shuts_down_cleanly() { + let shutdown = Arc::new(AtomicBool::new(false)); + let handle = spawn_noop_worker(shutdown.clone()); + // Brief gap so the worker enters its loop at least once before we + // signal — exercises the in-loop exit path rather than the never- + // entered-loop case. + thread::sleep(Duration::from_millis(5)); + let outcome = + shutdown_and_join_with_timeout(&shutdown, handle, Duration::from_millis(250)); + assert_eq!(outcome, Ok(()), "no-op worker did not join in budget"); + } + + /// A worker that never exits must surface as a timeout, not a hang. + /// Validates the `crossbeam_channel::after` budget. We construct a + /// pseudo-worker that ignores `shutdown` for the entire timeout + /// window, then exits — the test budget is short enough to force the + /// timeout path. + #[test] + fn shutdown_join_timeouts_on_misbehaving_worker() { + let shutdown = Arc::new(AtomicBool::new(false)); + let handle = thread::Builder::new() + .name("test-misbehaving-worker".to_string()) + .spawn(|| { + // Sleep longer than the test's join budget. The test + // proves the join helper returns `Err(())` rather than + // blocking the test process indefinitely. 
+ thread::sleep(Duration::from_millis(500)); + }) + .expect("spawn misbehaving worker"); + let outcome = + shutdown_and_join_with_timeout(&shutdown, handle, Duration::from_millis(50)); + assert_eq!(outcome, Err(()), "expected timeout signal"); + } + + /// M1.8 — `write_u32_fence` / `read_u32_fence` ordering test. A + /// producer thread writes a "data" value, then a "fence" value via + /// `write_u32_fence`. A consumer thread spin-reads the fence via + /// `read_u32_fence` and, on observing the producer's update, reads + /// the data via plain `read_u32`. The data must always equal the + /// producer's pre-fence write — never an older value or a torn read. + /// + /// On x86_64 (TSO) this would pass even without the fences; on + /// weaker architectures it would fail without them. We pin down the + /// invariant here so future ports / refactors can't silently weaken + /// it. Uses a small synthetic memory implementing `MemoryAccess`. + #[test] + fn write_u32_fence_publishes_prior_writes() { + use std::sync::atomic::{AtomicBool, AtomicU32}; + use std::time::Instant; + use xenia_memory::MemoryAccess; + + // The test's MemoryAccess impl uses `AtomicU32` slots so the + // multi-byte u32 reads/writes are torn-read-free. The fence + // helper layers Release/Acquire on top — without atomic + // storage, byte-by-byte reads on a writer-racing buffer would + // see torn values regardless of the fence. 
+ const SLOT_COUNT: usize = 16; + struct ScopedMem([AtomicU32; SLOT_COUNT]); + impl ScopedMem { + fn slot(&self, addr: u32) -> &AtomicU32 { + &self.0[(addr / 4) as usize] + } + } + impl MemoryAccess for ScopedMem { + fn read_u8(&self, addr: u32) -> u8 { + let v = self.slot(addr & !3).load(Ordering::Relaxed); + let shift = (addr & 3) * 8; + (v >> shift) as u8 + } + fn read_u16(&self, addr: u32) -> u16 { + u16::from_le_bytes([self.read_u8(addr), self.read_u8(addr + 1)]) + } + fn read_u32(&self, addr: u32) -> u32 { + self.slot(addr).load(Ordering::Relaxed) + } + fn read_u64(&self, addr: u32) -> u64 { + let lo = self.read_u32(addr) as u64; + let hi = self.read_u32(addr + 4) as u64; + lo | (hi << 32) + } + fn write_u8(&self, _addr: u32, _val: u8) { + unimplemented!("test fixture only writes u32") + } + fn write_u16(&self, _addr: u32, _val: u16) { + unimplemented!("test fixture only writes u32") + } + fn write_u32(&self, addr: u32, val: u32) { + self.slot(addr).store(val, Ordering::Relaxed); + } + fn write_u64(&self, addr: u32, val: u64) { + self.write_u32(addr, val as u32); + self.write_u32(addr + 4, (val >> 32) as u32); + } + fn translate(&self, _addr: u32) -> Option<*const u8> { + None + } + fn translate_mut(&self, _addr: u32) -> Option<*mut u8> { + None + } + } + + let mem: Arc = Arc::new(ScopedMem(std::array::from_fn(|_| { + AtomicU32::new(0) + }))); + // Initialize fence and data slots to zero. 
+ mem.write_u32(0, 0); // data + mem.write_u32(16, 0); // fence + + let stop = Arc::new(AtomicBool::new(false)); + + let mem_p = mem.clone(); + let stop_p = stop.clone(); + let producer = thread::Builder::new() + .name("fence-producer".into()) + .spawn(move || { + for i in 1u32..=10_000 { + if stop_p.load(Ordering::Relaxed) { + break; + } + mem_p.write_u32(0, i); // data + mem_p.write_u32_fence(16, i); // fence (Release) + thread::yield_now(); + } + }) + .expect("spawn producer"); + + let mem_c = mem.clone(); + let consumer = thread::Builder::new() + .name("fence-consumer".into()) + .spawn(move || { + let deadline = Instant::now() + Duration::from_millis(500); + let mut last_seen = 0u32; + let mut iters = 0u32; + while Instant::now() < deadline { + let f = mem_c.read_u32_fence(16); // Acquire + if f != last_seen { + let d = mem_c.read_u32(0); + // The data we read after the fence must be at + // least as new as the fence value (producer + // wrote `data = i; fence(i)` in that order). + assert!( + d >= f, + "fence ordering violated: data={d} fence={f}" + ); + last_seen = f; + iters += 1; + } + } + iters + }) + .expect("spawn consumer"); + + let observed = consumer.join().expect("consumer join"); + stop.store(true, Ordering::Relaxed); + let _ = producer.join(); + assert!( + observed > 0, + "consumer never observed a fence transition (race scheduler too unfair?)", + ); + } + + /// Spawning two no-op workers in parallel and joining both must + /// succeed without interference — proves the joiner side-thread + /// pattern doesn't accidentally serialize teardown. 
+ #[test] + fn two_concurrent_noop_workers_both_shut_down() { + let shutdown_a = Arc::new(AtomicBool::new(false)); + let handle_a = spawn_noop_worker(shutdown_a.clone()); + let shutdown_b = Arc::new(AtomicBool::new(false)); + let handle_b = spawn_noop_worker(shutdown_b.clone()); + let r_a = shutdown_and_join_with_timeout( + &shutdown_a, + handle_a, + Duration::from_millis(250), + ); + let r_b = shutdown_and_join_with_timeout( + &shutdown_b, + handle_b, + Duration::from_millis(250), + ); + assert_eq!(r_a, Ok(())); + assert_eq!(r_b, Ok(())); + } +} diff --git a/crates/xenia-gpu/src/lib.rs b/crates/xenia-gpu/src/lib.rs index 8adce9c..74f906f 100644 --- a/crates/xenia-gpu/src/lib.rs +++ b/crates/xenia-gpu/src/lib.rs @@ -1,21 +1,49 @@ +//! Xenos GPU emulation for xenia-rs. +//! +//! Modules: +//! - [`pm4`]: packet format decoder + Type-3 opcode set. +//! - [`ring_view`]: ring-buffer bookkeeping (base/size/read/write pointers). +//! - [`register_file`]: 0x6000-entry register array backing the CP + state. +//! - [`gpu_system`]: top-level `GpuSystem` + PM4 executor running one packet +//! per call (see the plan's P2 for the design rationale). +//! +//! Legacy module `ring_drain` and `command_processor` are retained while P3+ +//! migrations finish; they will be removed once every caller is on +//! [`gpu_system::GpuSystem`]. + pub mod command_processor; +pub mod draw_state; +pub mod edram; +pub mod gpu_system; +pub mod handle; +pub mod mmio_region; +pub mod pm4; +pub mod primitive; pub mod register_file; +pub mod ring_drain; +pub mod ring_view; +pub mod render_target_cache; +pub mod resolve; +pub mod shader_metrics; +pub mod shaders; +pub mod texture_cache; +pub mod tiled_address; +pub mod translator; +pub mod ucode; +pub mod xenos_constants; -/// Stub GPU system for initial implementation. 
-pub struct GpuSystem { - pub register_file: register_file::RegisterFile, -} - -impl GpuSystem { - pub fn new() -> Self { - Self { - register_file: register_file::RegisterFile::new(), - } - } -} - -impl Default for GpuSystem { - fn default() -> Self { - Self::new() - } -} +pub use gpu_system::{ + ExecOutcome, GpuBlock, GpuMmio, GpuStats, GpuSystem, InterruptSource, PendingInterrupt, + ShaderBlob, SwapNotification, WaitCmp, +}; +pub use handle::{ + DrainReply, GpuBackend, GpuCommand, GpuDigestSnapshot, GpuHandle, GpuWorker, + shutdown_and_join_with_timeout, spawn_gpu_worker, spawn_noop_worker, +}; +pub use mmio_region::build_region as build_mmio_region; +pub use pm4::{ + PacketHeader, PacketKind, PM4_INTERRUPT, PM4_NOP, PM4_XE_SWAP, SWAP_SIGNATURE, + type3_opcode_name, +}; +pub use ring_drain::{DrainResult, drain}; +pub use ring_view::RingBufferView; diff --git a/crates/xenia-gpu/src/mmio_region.rs b/crates/xenia-gpu/src/mmio_region.rs new file mode 100644 index 0000000..fe32c62 --- /dev/null +++ b/crates/xenia-gpu/src/mmio_region.rs @@ -0,0 +1,217 @@ +//! Construct an `xenia_memory::MmioRegion` that backs the Xenos GPU register +//! aperture at guest physical `0x7FC80000` (per canary +//! `graphics_system.cc:141-144` — `memory_->AddVirtualMappedRange(0x7FC80000, +//! 0xFFFF0000, 0x0000FFFF, …)`). +//! +//! Only a handful of registers need a round-trip over the bus; everything +//! else (the ALU / fetch constants, the RBBM state machine, …) lives inside +//! `GpuSystem::register_file` and is driven by PM4 packets from the CP on +//! the same host thread. +//! +//! The read/write closures capture `Arc` mailboxes cloned from +//! [`crate::GpuMmio`]; [`crate::GpuSystem::sync_with_mmio`] samples them +//! each scheduler round. + +use std::sync::atomic::Ordering; + +use xenia_memory::MmioRegion; + +use crate::gpu_system::{reg, GpuMmio}; + +/// Xenos GPU register aperture base (guest physical address). Matches +/// canary's `graphics_system.cc:141`. 
+pub const APERTURE_BASE: u32 = 0x7FC8_0000; +/// Mask used by `MmioRegion::contains` so any `0x7FC8xxxx` address hits. +pub const APERTURE_MASK: u32 = 0xFFFF_0000; +/// Total aperture size in bytes (enough for the low 16-bit register window). +pub const APERTURE_SIZE: u32 = 0x0001_0000; + +/// Build the [`MmioRegion`] to install on the guest memory. +pub fn build_region(mmio: &GpuMmio) -> MmioRegion { + let read_wptr = mmio.cp_rb_wptr.clone(); + let read_rptr = mmio.cp_rb_rptr.clone(); + let read_int_status = mmio.cp_int_status.clone(); + let read_int_ack = mmio.cp_int_ack.clone(); + let read_vblank_status = mmio.d1mode_vblank_vline_status.clone(); + let write_wptr = mmio.cp_rb_wptr.clone(); + let write_int_ack = mmio.cp_int_ack.clone(); + let write_vblank_status = mmio.d1mode_vblank_vline_status.clone(); + // M1.7 parker — captured into the WPTR write closure to wake a + // parked GPU worker on every guest WPTR write. In inline mode the + // mutex holds `None`, so the unpark site is a brief lock + no-op. + let wake_pending = mmio.wake_pending.clone(); + let worker_thread = mmio.worker_thread.clone(); + + MmioRegion { + base_address: APERTURE_BASE, + mask: APERTURE_MASK, + size: APERTURE_SIZE, + read_callback: Box::new(move |addr: u32| { + let reg_index = (addr & 0xFFFF) / 4; + match reg_index { + reg::CP_RB_WPTR => read_wptr.load(Ordering::Relaxed), + reg::CP_RB_RPTR => read_rptr.load(Ordering::Relaxed), + reg::CP_INT_STATUS => read_int_status.load(Ordering::Relaxed), + // Games sometimes read-back the ack register to check interrupt ownership + // — serve the last-written value. 
+ reg::CP_INT_ACK => read_int_ack.load(Ordering::Relaxed), + reg::D1MODE_VBLANK_VLINE_STATUS => { + read_vblank_status.load(Ordering::Relaxed) + } + _ => { + tracing::trace!( + reg = format_args!("{reg_index:#x}"), + addr = format_args!("{addr:#010x}"), + "gpu mmio: unmapped read (returning 0)" + ); + 0 + } + } + }), + write_callback: Box::new(move |addr: u32, value: u32| { + let reg_index = (addr & 0xFFFF) / 4; + match reg_index { + reg::CP_RB_WPTR => { + // Release: any prior writes to ring memory the guest + // performed before bumping WPTR must be visible to + // the GPU consumer that Acquire-loads this atomic. + write_wptr.store(value, Ordering::Release); + // M1.7 parker wake: set the pending bit (Release) so + // a worker swapping it on its way to `park_timeout` + // sees `was_pending == true` and skips the park; AND + // unpark the worker if it's already parked. Both are + // necessary to defend against the race window between + // the worker's `swap(false)` and `park_timeout()`. + wake_pending.store(true, Ordering::Release); + if let Ok(g) = worker_thread.lock() { + if let Some(t) = g.as_ref() { + t.unpark(); + } + } + tracing::trace!( + value, + addr = format_args!("{addr:#010x}"), + "gpu mmio: CP_RB_WPTR write" + ); + } + // CP_INT_ACK clears interrupt bits; we just echo the value. + reg::CP_INT_ACK => { + write_int_ack.store(value, Ordering::Relaxed); + } + // D1MODE_VBLANK_VLINE_STATUS is write-1-to-clear per the + // AMD M56 display-controller ref. Clear any bit the guest + // writes a 1 to (leaving other bits untouched). 
+ reg::D1MODE_VBLANK_VLINE_STATUS => { + let prev = write_vblank_status.load(Ordering::Relaxed); + write_vblank_status.store(prev & !value, Ordering::Relaxed); + } + _ => { + tracing::trace!( + reg = format_args!("{reg_index:#x}"), + addr = format_args!("{addr:#010x}"), + value = format_args!("{value:#x}"), + "gpu mmio: unmapped write (dropping)" + ); + } + } + }), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn build() -> (GpuMmio, MmioRegion) { + let mmio = GpuMmio::new(); + let region = build_region(&mmio); + (mmio, region) + } + + /// `D1MODE_VBLANK_VLINE_STATUS` read must surface the atomic's current + /// value — Sylpheed's graphics-interrupt callback reads bit 0 to decide + /// whether vblank actually fired; if we always return 0 the callback + /// silently skips every frame's work. + #[test] + fn vblank_status_read_returns_stored_value() { + let (mmio, region) = build(); + mmio.d1mode_vblank_vline_status + .store(0x1, Ordering::Relaxed); + let offset = APERTURE_BASE + reg::D1MODE_VBLANK_VLINE_STATUS * 4; + assert_eq!((region.read_callback)(offset), 0x1); + } + + /// Guest clears the flag by writing 1 back. Classic write-1-to-clear — + /// AMD M56 display-controller ref and Canary's behavior. We preserve + /// unrelated bits so higher-bit status (VLINE_INT_OCCURRED etc.) can + /// coexist with a concurrent clear of bit 0. + #[test] + fn vblank_status_write_1_to_clear() { + let (mmio, region) = build(); + mmio.d1mode_vblank_vline_status + .store(0b11, Ordering::Relaxed); + let offset = APERTURE_BASE + reg::D1MODE_VBLANK_VLINE_STATUS * 4; + (region.write_callback)(offset, 0b01); + assert_eq!( + mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed), + 0b10, + "bit 0 cleared, bit 1 preserved" + ); + } + + /// Write-0-to-a-bit must NOT clear that bit — classic W1TC semantics. 
+ #[test] + fn vblank_status_write_0_is_noop() { + let (mmio, region) = build(); + mmio.d1mode_vblank_vline_status + .store(0b11, Ordering::Relaxed); + let offset = APERTURE_BASE + reg::D1MODE_VBLANK_VLINE_STATUS * 4; + (region.write_callback)(offset, 0x0); + assert_eq!( + mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed), + 0b11 + ); + } + + /// Regression: prior to the fix, `reg::CP_RB_WPTR` held a byte offset + /// (`0x0714`) while the match arm compared against a *register index* + /// (`(addr & 0xFFFF) / 4 == 0x01C5`). Guest MMIO writes to the WPTR + /// therefore fell through to "unmapped" and the atomic never moved; + /// only `VdInitializeRingBuffer` / `extend_write_ptr` paths worked. + /// + /// Verify every CP register lands in its atomic when the guest writes + /// at the canonical `APERTURE_BASE + index*4` byte address. + #[test] + fn cp_rb_wptr_write_via_mmio_bus_reaches_atomic() { + let (mmio, region) = build(); + let offset = APERTURE_BASE + reg::CP_RB_WPTR * 4; + assert_eq!(offset, 0x7FC8_0714, "byte offset must match Canary CP_RB_WPTR"); + (region.write_callback)(offset, 0x1234_5678); + assert_eq!(mmio.cp_rb_wptr.load(Ordering::Relaxed), 0x1234_5678); + } + + #[test] + fn cp_int_ack_write_via_mmio_bus_reaches_atomic() { + let (mmio, region) = build(); + let offset = APERTURE_BASE + reg::CP_INT_ACK * 4; + assert_eq!(offset, 0x7FC8_07D0, "byte offset must match Canary CP_INT_ACK"); + (region.write_callback)(offset, 0xDEAD_BEEF); + assert_eq!(mmio.cp_int_ack.load(Ordering::Relaxed), 0xDEAD_BEEF); + } + + #[test] + fn cp_rb_rptr_read_via_mmio_bus_returns_atomic() { + let (mmio, region) = build(); + mmio.cp_rb_rptr.store(0xCAFE_F00D, Ordering::Relaxed); + let offset = APERTURE_BASE + reg::CP_RB_RPTR * 4; + assert_eq!((region.read_callback)(offset), 0xCAFE_F00D); + } + + #[test] + fn cp_int_status_read_via_mmio_bus_returns_atomic() { + let (mmio, region) = build(); + mmio.cp_int_status.store(0x0000_0001, Ordering::Relaxed); + let offset = 
APERTURE_BASE + reg::CP_INT_STATUS * 4; + assert_eq!((region.read_callback)(offset), 0x0000_0001); + } +} diff --git a/crates/xenia-gpu/src/pm4.rs b/crates/xenia-gpu/src/pm4.rs new file mode 100644 index 0000000..370b0ba --- /dev/null +++ b/crates/xenia-gpu/src/pm4.rs @@ -0,0 +1,232 @@ +//! PM4 packet format — header decoding + Type-3 opcode set. +//! +//! Xenos PM4 packet layout mirrors `xenia-canary/src/xenia/gpu/packet_disassembler.cc`: +//! +//! - **Type 0** (`packet >> 30 == 0`): register-write run. +//! `count = ((packet >> 16) & 0x3FFF) + 1`. Total dwords = `1 + count`. +//! With `(packet >> 15) & 1 == 1`, all writes target the same register. +//! - **Type 1** (`packet >> 30 == 1`): two-register write. Total dwords = 3. +//! - **Type 2** (`packet >> 30 == 2`): NOP — a single skipped dword. +//! - **Type 3** (`packet >> 30 == 3`): command. +//! `opcode = (packet >> 8) & 0x7F`, +//! `count = ((packet >> 16) & 0x3FFF) + 1`. +//! Total dwords = `1 + count`. + +/// The cookie canary writes alongside `PM4_XE_SWAP` so tooling can recognize +/// swap packets. `'X','E','N','X'` big-endian (`kSwapSignature`). 
+pub const SWAP_SIGNATURE: u32 = 0x584E_4558; + +// ── Named Type-3 opcodes (from xenia-canary/src/xenia/gpu/xenos.h:1617-1679) ── + +pub const PM4_ME_INIT: u8 = 0x48; +pub const PM4_NOP: u8 = 0x10; +pub const PM4_INDIRECT_BUFFER: u8 = 0x3F; +pub const PM4_INDIRECT_BUFFER_PFD: u8 = 0x37; +pub const PM4_WAIT_FOR_IDLE: u8 = 0x26; +pub const PM4_WAIT_REG_MEM: u8 = 0x3C; +pub const PM4_REG_RMW: u8 = 0x21; +pub const PM4_REG_TO_MEM: u8 = 0x3E; +pub const PM4_MEM_WRITE: u8 = 0x3D; +pub const PM4_COND_WRITE: u8 = 0x45; +pub const PM4_EVENT_WRITE: u8 = 0x46; +pub const PM4_EVENT_WRITE_SHD: u8 = 0x58; +pub const PM4_EVENT_WRITE_EXT: u8 = 0x5A; +pub const PM4_EVENT_WRITE_ZPD: u8 = 0x5B; +pub const PM4_DRAW_INDX: u8 = 0x22; +pub const PM4_DRAW_INDX_2: u8 = 0x36; +pub const PM4_VIZ_QUERY: u8 = 0x23; +pub const PM4_SET_CONSTANT: u8 = 0x2D; +pub const PM4_SET_CONSTANT2: u8 = 0x55; +pub const PM4_SET_SHADER_CONSTANTS: u8 = 0x56; +pub const PM4_LOAD_ALU_CONSTANT: u8 = 0x2F; +pub const PM4_IM_LOAD: u8 = 0x27; +pub const PM4_IM_LOAD_IMMEDIATE: u8 = 0x2B; +pub const PM4_LOAD_CONSTANT_CONTEXT: u8 = 0x2E; +pub const PM4_INVALIDATE_STATE: u8 = 0x3B; +pub const PM4_INTERRUPT: u8 = 0x54; +pub const PM4_SET_SHADER_BASES: u8 = 0x4A; +pub const PM4_SET_BIN_MASK_LO: u8 = 0x60; +pub const PM4_SET_BIN_MASK_HI: u8 = 0x61; +pub const PM4_SET_BIN_SELECT_LO: u8 = 0x62; +pub const PM4_SET_BIN_SELECT_HI: u8 = 0x63; +pub const PM4_SET_BIN_MASK: u8 = 0x50; +pub const PM4_SET_BIN_SELECT: u8 = 0x51; +pub const PM4_CONTEXT_UPDATE: u8 = 0x5E; +/// Xenia-specific: `VdSwap` writes this to trigger a present. +pub const PM4_XE_SWAP: u8 = 0x64; + +/// Human-readable name for a Type-3 opcode. Used for tracing spans. 
+pub fn type3_opcode_name(op: u8) -> &'static str { + match op { + PM4_ME_INIT => "ME_INIT", + PM4_NOP => "NOP", + PM4_INDIRECT_BUFFER => "INDIRECT_BUFFER", + PM4_INDIRECT_BUFFER_PFD => "INDIRECT_BUFFER_PFD", + PM4_WAIT_FOR_IDLE => "WAIT_FOR_IDLE", + PM4_WAIT_REG_MEM => "WAIT_REG_MEM", + PM4_REG_RMW => "REG_RMW", + PM4_REG_TO_MEM => "REG_TO_MEM", + PM4_MEM_WRITE => "MEM_WRITE", + PM4_COND_WRITE => "COND_WRITE", + PM4_EVENT_WRITE => "EVENT_WRITE", + PM4_EVENT_WRITE_SHD => "EVENT_WRITE_SHD", + PM4_EVENT_WRITE_EXT => "EVENT_WRITE_EXT", + PM4_EVENT_WRITE_ZPD => "EVENT_WRITE_ZPD", + PM4_DRAW_INDX => "DRAW_INDX", + PM4_DRAW_INDX_2 => "DRAW_INDX_2", + PM4_VIZ_QUERY => "VIZ_QUERY", + PM4_SET_CONSTANT => "SET_CONSTANT", + PM4_SET_CONSTANT2 => "SET_CONSTANT2", + PM4_SET_SHADER_CONSTANTS => "SET_SHADER_CONSTANTS", + PM4_LOAD_ALU_CONSTANT => "LOAD_ALU_CONSTANT", + PM4_LOAD_CONSTANT_CONTEXT => "LOAD_CONSTANT_CONTEXT", + PM4_IM_LOAD => "IM_LOAD", + PM4_IM_LOAD_IMMEDIATE => "IM_LOAD_IMMEDIATE", + PM4_INVALIDATE_STATE => "INVALIDATE_STATE", + PM4_INTERRUPT => "INTERRUPT", + PM4_SET_SHADER_BASES => "SET_SHADER_BASES", + PM4_SET_BIN_MASK_LO => "SET_BIN_MASK_LO", + PM4_SET_BIN_MASK_HI => "SET_BIN_MASK_HI", + PM4_SET_BIN_SELECT_LO => "SET_BIN_SELECT_LO", + PM4_SET_BIN_SELECT_HI => "SET_BIN_SELECT_HI", + PM4_SET_BIN_MASK => "SET_BIN_MASK", + PM4_SET_BIN_SELECT => "SET_BIN_SELECT", + PM4_CONTEXT_UPDATE => "CONTEXT_UPDATE", + PM4_XE_SWAP => "XE_SWAP", + _ => "UNKNOWN", + } +} + +/// Decoded single PM4 packet header. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct PacketHeader { + pub kind: PacketKind, + /// Total size of the packet (including header) in dwords. + pub total_dwords: u32, +} + +/// Classification of a PM4 packet. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum PacketKind { + /// Type-0 register-write run. `base_index` is the first register index + /// (the register offset / 4). 
`write_one` is true if all `count` data + /// dwords write to the same register. + Type0 { + base_index: u32, + count: u32, + write_one: bool, + }, + /// Type-1 two-register write. + Type1 { reg_index_1: u32, reg_index_2: u32 }, + /// Type-2 NOP (a single skipped dword). + Type2, + /// Type-3 command. + Type3 { + opcode: u8, + count: u32, + predicated: bool, + }, +} + +/// Decode a single PM4 packet header. +pub fn decode(header: u32) -> PacketHeader { + match header >> 30 { + 0 => { + let count = ((header >> 16) & 0x3FFF) + 1; + PacketHeader { + kind: PacketKind::Type0 { + base_index: header & 0x7FFF, + count, + write_one: (header >> 15) & 1 != 0, + }, + total_dwords: 1 + count, + } + } + 1 => PacketHeader { + kind: PacketKind::Type1 { + reg_index_1: header & 0x7FF, + reg_index_2: (header >> 11) & 0x7FF, + }, + total_dwords: 3, + }, + 2 => PacketHeader { + kind: PacketKind::Type2, + total_dwords: 1, + }, + 3 => { + let count = ((header >> 16) & 0x3FFF) + 1; + PacketHeader { + kind: PacketKind::Type3 { + opcode: ((header >> 8) & 0x7F) as u8, + count, + predicated: (header & 1) != 0, + }, + total_dwords: 1 + count, + } + } + _ => unreachable!(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn type2_is_one_dword() { + // 0x80000000 == type 2 header (bits 31:30 = 10) + let hdr = decode(0x8000_0000); + assert_eq!(hdr.kind, PacketKind::Type2); + assert_eq!(hdr.total_dwords, 1); + } + + #[test] + fn type0_count_is_inclusive() { + // count field (bits 29:16) = 5 → 6 data dwords. base_index = 0x100. + // write_one = 0. 
+ let hdr = decode((5 << 16) | 0x100); + match hdr.kind { + PacketKind::Type0 { + base_index, + count, + write_one, + } => { + assert_eq!(base_index, 0x100); + assert_eq!(count, 6); + assert!(!write_one); + } + _ => panic!("expected Type0"), + } + assert_eq!(hdr.total_dwords, 7); + } + + #[test] + fn type3_swap_packet() { + // Build the exact header canary's VdSwap emits: + // MakePacketType3(PM4_XE_SWAP, 4) → ((3<<30) | ((4-1)<<16) | (0x64<<8)) + let hdr_word = (3u32 << 30) | ((4u32 - 1) << 16) | ((PM4_XE_SWAP as u32) << 8); + let hdr = decode(hdr_word); + match hdr.kind { + PacketKind::Type3 { + opcode, + count, + predicated, + } => { + assert_eq!(opcode, PM4_XE_SWAP); + assert_eq!(count, 4); + assert!(!predicated); + } + _ => panic!("expected Type3"), + } + assert_eq!(hdr.total_dwords, 5); + } + + #[test] + fn opcode_names_are_present_for_common_ops() { + assert_eq!(type3_opcode_name(PM4_NOP), "NOP"); + assert_eq!(type3_opcode_name(PM4_DRAW_INDX), "DRAW_INDX"); + assert_eq!(type3_opcode_name(PM4_XE_SWAP), "XE_SWAP"); + assert_eq!(type3_opcode_name(PM4_WAIT_REG_MEM), "WAIT_REG_MEM"); + assert_eq!(type3_opcode_name(0xFE), "UNKNOWN"); + } +} diff --git a/crates/xenia-gpu/src/primitive.rs b/crates/xenia-gpu/src/primitive.rs new file mode 100644 index 0000000..6bbafed --- /dev/null +++ b/crates/xenia-gpu/src/primitive.rs @@ -0,0 +1,229 @@ +//! Primitive processor — normalize Xenos primitives into host-GPU forms. +//! +//! wgpu only exposes `PrimitiveTopology::{PointList, LineList, LineStrip, +//! TriangleList, TriangleStrip}`. For everything else (fans, quads, +//! rectangles) we rewrite indices on the CPU side so the host just sees a +//! triangle list. Ground truth: `xenia-canary/src/xenia/gpu/primitive_processor.h/cc`. +//! +//! P3 scope: only the shapes Sylpheed's UI + early gameplay paths need +//! (list, strip, fan). Rectangle + quad expansions are stubs logged via +//! `tracing::warn!` for later. 
+ +use crate::draw_state::{IndexSize, PrimitiveType}; + +/// Host primitive topology — a subset of wgpu's that we commit to. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum HostTopology { + PointList, + LineList, + LineStrip, + TriangleList, + TriangleStrip, +} + +/// Result of primitive processing. +#[derive(Debug, Clone)] +pub struct ProcessedPrimitive { + pub topology: HostTopology, + /// When the Xenos primitive needed client-side rewriting (fans, quads), + /// this buffer holds the rewritten 16-bit or 32-bit index sequence. + /// `None` means the input index buffer is usable as-is. + pub rewritten_indices: Option>, + /// Post-processing vertex count — equals the input count when indices + /// pass through unchanged. + pub host_vertex_count: u32, + /// `true` if we rejected the primitive (unsupported shape) and the + /// caller should skip this draw. Logged via `tracing::warn!`. + pub rejected: bool, +} + +/// Normalize a draw. +/// +/// `indices` is `None` for `AutoIndex` draws; otherwise it's the decoded +/// index stream (already endian-converted / widened to u32 by the caller). 
+pub fn process( + primitive: PrimitiveType, + vertex_count: u32, + indices: Option<&[u32]>, +) -> ProcessedPrimitive { + match primitive { + PrimitiveType::PointList => pass_through(HostTopology::PointList, vertex_count), + PrimitiveType::LineList => pass_through(HostTopology::LineList, vertex_count), + PrimitiveType::LineStrip => pass_through(HostTopology::LineStrip, vertex_count), + PrimitiveType::TriangleList => pass_through(HostTopology::TriangleList, vertex_count), + PrimitiveType::TriangleStrip => pass_through(HostTopology::TriangleStrip, vertex_count), + PrimitiveType::TriangleFan => expand_fan(indices, vertex_count), + PrimitiveType::RectangleList => expand_rectangles(indices, vertex_count), + PrimitiveType::QuadList => expand_quads(indices, vertex_count), + PrimitiveType::None | PrimitiveType::Unknown(_) => { + tracing::warn!(?primitive, "gpu: rejecting unsupported primitive"); + metrics::counter!("gpu.primitive.rejected").increment(1); + ProcessedPrimitive { + topology: HostTopology::TriangleList, + rewritten_indices: None, + host_vertex_count: 0, + rejected: true, + } + } + } +} + +fn pass_through(topology: HostTopology, vertex_count: u32) -> ProcessedPrimitive { + ProcessedPrimitive { + topology, + rewritten_indices: None, + host_vertex_count: vertex_count, + rejected: false, + } +} + +/// Convert a triangle fan to a triangle list. Fan indices `[0, 1, 2, 3, 4]` +/// expand to triangles `(0,1,2), (0,2,3), (0,3,4)` — 3 × (n-2) host indices. 
+fn expand_fan(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive { + if vertex_count < 3 { + return ProcessedPrimitive { + topology: HostTopology::TriangleList, + rewritten_indices: Some(Vec::new()), + host_vertex_count: 0, + rejected: false, + }; + } + let mut out = Vec::with_capacity(3 * (vertex_count as usize - 2)); + let get = |i: u32| -> u32 { + match indices { + Some(buf) => buf[i as usize], + None => i, + } + }; + let apex = get(0); + for i in 1..vertex_count.saturating_sub(1) { + out.push(apex); + out.push(get(i)); + out.push(get(i + 1)); + } + let host_vertex_count = out.len() as u32; + ProcessedPrimitive { + topology: HostTopology::TriangleList, + rewritten_indices: Some(out), + host_vertex_count, + rejected: false, + } +} + +/// Convert a quad list (groups of 4) to a triangle list (groups of 6). +fn expand_quads(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive { + let quad_count = vertex_count / 4; + let mut out = Vec::with_capacity(6 * quad_count as usize); + let get = |i: u32| -> u32 { + match indices { + Some(buf) => buf[i as usize], + None => i, + } + }; + for q in 0..quad_count { + let base = q * 4; + let a = get(base); + let b = get(base + 1); + let c = get(base + 2); + let d = get(base + 3); + out.extend_from_slice(&[a, b, c, a, c, d]); + } + let host_vertex_count = out.len() as u32; + ProcessedPrimitive { + topology: HostTopology::TriangleList, + rewritten_indices: Some(out), + host_vertex_count, + rejected: false, + } +} + +/// Rectangle lists: a Xenos-specific primitive where each group of 3 +/// vertices defines a right-angle rectangle by its three non-repeated +/// corners (the 4th is derived). The uber-shader doesn't support this yet; +/// the ucode translator will emulate it as a geometry-stage fake. For P3 +/// we emit an empty draw. 
+fn expand_rectangles(_indices: Option<&[u32]>, _vertex_count: u32) -> ProcessedPrimitive { + tracing::warn!("gpu: rectangle list primitive not yet implemented (P3 stub)"); + metrics::counter!("gpu.primitive.rejected", "reason" => "rectangle_list").increment(1); + ProcessedPrimitive { + topology: HostTopology::TriangleList, + rewritten_indices: Some(Vec::new()), + host_vertex_count: 0, + rejected: true, + } +} + +/// Widen a u16 index buffer to u32. The primitive processor normalizes to +/// u32 so downstream wgpu pipeline descriptors stay simple. +pub fn widen_indices(raw: &[u8], size: IndexSize, count: u32) -> Vec { + let mut out = Vec::with_capacity(count as usize); + match size { + IndexSize::Sixteen => { + for i in 0..count as usize { + let off = i * 2; + if off + 2 > raw.len() { + break; + } + // Xenos indices are big-endian on the wire. + let be = u16::from_be_bytes([raw[off], raw[off + 1]]); + out.push(be as u32); + } + } + IndexSize::ThirtyTwo => { + for i in 0..count as usize { + let off = i * 4; + if off + 4 > raw.len() { + break; + } + let be = u32::from_be_bytes([raw[off], raw[off + 1], raw[off + 2], raw[off + 3]]); + out.push(be); + } + } + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn triangle_list_passes_through() { + let p = process(PrimitiveType::TriangleList, 6, None); + assert_eq!(p.topology, HostTopology::TriangleList); + assert!(p.rewritten_indices.is_none()); + assert_eq!(p.host_vertex_count, 6); + assert!(!p.rejected); + } + + #[test] + fn fan_to_list_expands_correctly() { + // Fan of 5 vertices → triangles (0,1,2), (0,2,3), (0,3,4) + let p = process(PrimitiveType::TriangleFan, 5, None); + let idx = p.rewritten_indices.unwrap(); + assert_eq!(idx, vec![0, 1, 2, 0, 2, 3, 0, 3, 4]); + assert_eq!(p.topology, HostTopology::TriangleList); + assert_eq!(p.host_vertex_count, 9); + } + + #[test] + fn quad_list_expansion() { + let p = process(PrimitiveType::QuadList, 8, None); + let idx = p.rewritten_indices.unwrap(); + 
assert_eq!(idx, vec![0, 1, 2, 0, 2, 3, 4, 5, 6, 4, 6, 7]); + } + + #[test] + fn widen_u16_indices_big_endian() { + // 3 indices [1, 2, 0x1234] in BE u16. + let raw = [0, 1, 0, 2, 0x12, 0x34]; + let out = widen_indices(&raw, IndexSize::Sixteen, 3); + assert_eq!(out, vec![1, 2, 0x1234]); + } + + #[test] + fn rejects_unknown_primitive() { + let p = process(PrimitiveType::Unknown(0x2A), 3, None); + assert!(p.rejected); + } +} diff --git a/crates/xenia-gpu/src/render_target_cache.rs b/crates/xenia-gpu/src/render_target_cache.rs new file mode 100644 index 0000000..43ee25d --- /dev/null +++ b/crates/xenia-gpu/src/render_target_cache.rs @@ -0,0 +1,384 @@ +//! EDRAM tile book + render-target key bookkeeping. +//! +//! Mirrors `xenia-canary/src/xenia/gpu/render_target_cache.h` at the data- +//! structure level. Xenos's 10 MiB EDRAM is divided into 2048 "tiles" of +//! 80×16 samples each; render targets claim a contiguous range of those +//! tiles based on `(base_tiles, pitch_tiles_at_32bpp, msaa_samples, format, +//! is_depth)`. Two render targets with overlapping tile ranges share the +//! underlying EDRAM — canary tracks this with per-tile "Host vs Shared" +//! ownership, which is what this module's `TileOwner` captures. +//! +//! P4 ships the **bookkeeping**. Actual host texture allocation per key (so +//! the host can draw into a wgpu texture matching the guest's RT) is left to +//! a future host-side cache built on top of this module; the same for +//! format-conversion compute shaders (the plan's P5 territory). + +use std::collections::HashMap; + +/// Number of EDRAM tiles on Xenos. Matches canary's `xenos::kEdramTileCount`. +pub const EDRAM_TILE_COUNT: usize = 2048; + +/// MSAA sample count encoded into [`RenderTargetKey`]. Canary uses this as +/// `xenos::MsaaSamples` (1×/2×/4×). 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum MsaaSamples { + X1 = 0, + X2 = 1, + X4 = 2, +} + +impl MsaaSamples { + pub fn from_raw(raw: u32) -> Self { + match raw & 0x3 { + 1 => MsaaSamples::X2, + 2 => MsaaSamples::X4, + _ => MsaaSamples::X1, + } + } + pub fn count(self) -> u32 { + 1u32 << (self as u32) + } +} + +/// The packed EDRAM render-target identity. Bit layout matches +/// `render_target_cache.h:251-321`'s `RenderTargetKey` union (26 bits used, +/// stored as a single `u32` so it hashes cheaply). `pitch_tiles_at_32bpp` +/// is always the 32bpp-equivalent pitch — 64bpp targets halve their tile +/// pitch from the nominal tile grid (canary's `GetPitchTiles()` handles +/// that). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct RenderTargetKey { + pub base_tiles: u16, // [0..2048) + pub pitch_tiles_at_32bpp: u16, // 0..=256 in practice + pub msaa_samples: MsaaSamples, + pub is_depth: bool, + /// Color format: `xenos::ColorRenderTargetFormat` when !is_depth. + /// Depth format: `xenos::DepthRenderTargetFormat` when is_depth. + pub resource_format: u8, // 4 bits +} + +impl RenderTargetKey { + /// Pack into canary's 26-bit layout. Useful for compact storage / + /// hashing when we add a LRU cache later on. + pub fn pack(&self) -> u32 { + (self.base_tiles as u32 & 0x7FF) + | (((self.pitch_tiles_at_32bpp as u32) & 0xFF) << 11) + | (((self.msaa_samples as u32) & 0x3) << 19) + | ((self.is_depth as u32) << 21) + | (((self.resource_format as u32) & 0xF) << 22) + } + + pub fn unpack(raw: u32) -> Self { + Self { + base_tiles: (raw & 0x7FF) as u16, + pitch_tiles_at_32bpp: ((raw >> 11) & 0xFF) as u16, + msaa_samples: MsaaSamples::from_raw((raw >> 19) & 0x3), + is_depth: ((raw >> 21) & 1) != 0, + resource_format: ((raw >> 22) & 0xF) as u8, + } + } + + /// How many EDRAM tiles the whole surface occupies (rough estimate; a + /// real height-aware calc needs viewport info). 
We conservatively use + /// `pitch_tiles_at_32bpp * 1` until a draw tells us otherwise; callers + /// that know the height can call [`tile_footprint_with_height`]. + pub fn tile_pitch(&self) -> u16 { + // 64bpp formats pack two 32bpp tiles into one 64bpp tile. + if self.is_64bpp() { + self.pitch_tiles_at_32bpp / 2 + } else { + self.pitch_tiles_at_32bpp + } + } + + pub fn is_64bpp(&self) -> bool { + if self.is_depth { + false + } else { + // Canary: `ColorRenderTargetFormat::{k_16_16_16_16, + // k_16_16_16_16_FLOAT, k_32_32_FLOAT}` are 64bpp; indices 4, 5, 7 + // in the enum. (Kept narrow because the enum is 4 bits wide.) + matches!(self.resource_format, 4 | 5 | 7) + } + } + + /// Tiles claimed by this RT if its surface height is `rows_of_tiles` + /// (i.e. `ceil(height_in_samples / 16)`). + pub fn tile_footprint_with_height(&self, rows_of_tiles: u16) -> u16 { + self.tile_pitch().saturating_mul(rows_of_tiles) + } +} + +/// Who currently owns a tile of EDRAM. +/// +/// `None`: untouched; free to claim. +/// `Host(idx)`: a single RT has exclusive ownership. +/// `Shared(idx)`: two+ RT keys map to the same tile (usually after a +/// format change without an intervening clear); the named RT is the most +/// recent owner whose format should be honored for readback. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Default)] +pub enum TileOwner { + #[default] + None, + Host(u32), + Shared(u32), +} + + +/// Bookkeeping across the 2048 EDRAM tiles. Not a GPU resource by itself — +/// tracks which render target (by index) currently owns each tile. 
+pub struct EdramTileBook { + tiles: Vec, +} + +impl Default for EdramTileBook { + fn default() -> Self { + Self::new() + } +} + +impl EdramTileBook { + pub fn new() -> Self { + Self { + tiles: vec![TileOwner::None; EDRAM_TILE_COUNT], + } + } + + pub fn who_owns(&self, tile: u16) -> TileOwner { + self.tiles + .get(tile as usize) + .copied() + .unwrap_or(TileOwner::None) + } + + /// Mark `[base, base+count)` as owned by `rt_idx`. Pre-existing owners + /// in the range are demoted to `Shared` (format reinterpretation). + /// Returns the number of tiles newly claimed (not previously the same + /// owner). + pub fn claim(&mut self, base: u16, count: u16, rt_idx: u32) -> u32 { + let mut newly_claimed = 0u32; + for i in 0..(count as usize) { + let t = base as usize + i; + if t >= self.tiles.len() { + break; + } + let prev = self.tiles[t]; + let already_ours = matches!( + prev, + TileOwner::Host(idx) | TileOwner::Shared(idx) if idx == rt_idx + ); + match prev { + TileOwner::None => { + self.tiles[t] = TileOwner::Host(rt_idx); + } + TileOwner::Host(idx) if idx == rt_idx => { + // re-claim of same RT — no-op + } + _ => { + // Format change / shared range. + self.tiles[t] = TileOwner::Shared(rt_idx); + } + } + if !already_ours { + newly_claimed += 1; + } + } + newly_claimed + } + + /// Drop `rt_idx` from any tile it owns; tiles revert to `None` unless + /// they were `Shared(rt_idx)` (in which case they also revert to + /// `None`; the other sharer's ownership is lost — `release` is a + /// coarse "this RT is gone" operation). + pub fn release(&mut self, rt_idx: u32) { + for t in self.tiles.iter_mut() { + if matches!( + *t, + TileOwner::Host(idx) | TileOwner::Shared(idx) if idx == rt_idx + ) { + *t = TileOwner::None; + } + } + } + + /// Count tiles currently assigned to any RT (Host or Shared). 
+ pub fn occupied_count(&self) -> u32 { + self.tiles + .iter() + .filter(|o| !matches!(o, TileOwner::None)) + .count() as u32 + } +} + +/// Minimal per-RT descriptor stored alongside the tile book. P5's texture +/// cache will expand this with the actual wgpu texture handle. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RtDescriptor { + pub key: RenderTargetKey, + /// Number of times this key has been bound since creation. Rough + /// proxy for activity / hot-RT identification. + pub bind_count: u32, + /// Draw index on first bind — handy for debugging divergence. + pub first_draw_index: u32, +} + +/// Top-level cache: maps packed keys to small descriptors + the tile book. +pub struct RenderTargetCache { + next_idx: u32, + by_key: HashMap, + descriptors: HashMap, + pub tiles: EdramTileBook, +} + +impl Default for RenderTargetCache { + fn default() -> Self { + Self::new() + } +} + +impl RenderTargetCache { + pub fn new() -> Self { + Self { + next_idx: 0, + by_key: HashMap::new(), + descriptors: HashMap::new(), + tiles: EdramTileBook::new(), + } + } + + /// Look up or allocate an RT descriptor for `key`. `draw_index` is the + /// current monotonic draw counter — recorded on first insert for + /// provenance. + pub fn bind(&mut self, key: RenderTargetKey, draw_index: u32) -> u32 { + let packed = key.pack(); + if let Some(&idx) = self.by_key.get(&packed) { + if let Some(d) = self.descriptors.get_mut(&idx) { + d.bind_count += 1; + } + return idx; + } + let idx = self.next_idx; + self.next_idx += 1; + self.by_key.insert(packed, idx); + self.descriptors.insert( + idx, + RtDescriptor { + key, + bind_count: 1, + first_draw_index: draw_index, + }, + ); + idx + } + + pub fn descriptor(&self, idx: u32) -> Option<&RtDescriptor> { + self.descriptors.get(&idx) + } + + pub fn len(&self) -> usize { + self.descriptors.len() + } + + pub fn is_empty(&self) -> bool { + self.descriptors.is_empty() + } + + /// Claim tiles for the descriptor at `rt_idx`. 
`height_tiles` is + /// `ceil(viewport_height_samples / 16)` — callers supply it because + /// the key itself doesn't carry height. + pub fn claim_tiles(&mut self, rt_idx: u32, height_tiles: u16) -> u32 { + if let Some(d) = self.descriptors.get(&rt_idx) { + let footprint = d.key.tile_footprint_with_height(height_tiles); + self.tiles.claim(d.key.base_tiles, footprint, rt_idx) + } else { + 0 + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn render_target_key_pack_roundtrip() { + let k = RenderTargetKey { + base_tiles: 1600, + pitch_tiles_at_32bpp: 80, + msaa_samples: MsaaSamples::X4, + is_depth: true, + resource_format: 0b1010, + }; + let packed = k.pack(); + let round = RenderTargetKey::unpack(packed); + assert_eq!(round, k); + } + + #[test] + fn tile_book_claim_marks_owners() { + let mut book = EdramTileBook::new(); + assert_eq!(book.occupied_count(), 0); + let new_count = book.claim(100, 10, 42); + assert_eq!(new_count, 10); + assert_eq!(book.who_owns(100), TileOwner::Host(42)); + assert_eq!(book.who_owns(109), TileOwner::Host(42)); + assert_eq!(book.who_owns(110), TileOwner::None); + } + + #[test] + fn tile_book_claim_demotes_to_shared() { + let mut book = EdramTileBook::new(); + book.claim(100, 10, 1); + book.claim(105, 10, 2); + // Overlap: tiles 105..110 should be Shared(2); 100..105 stay Host(1); + // tiles 110..115 are fresh Host(2). 
+ assert_eq!(book.who_owns(104), TileOwner::Host(1)); + assert_eq!(book.who_owns(105), TileOwner::Shared(2)); + assert_eq!(book.who_owns(110), TileOwner::Host(2)); + } + + #[test] + fn tile_book_release_frees_all() { + let mut book = EdramTileBook::new(); + book.claim(0, 50, 7); + book.release(7); + assert_eq!(book.occupied_count(), 0); + } + + #[test] + fn rt_cache_bind_is_idempotent_by_key() { + let mut cache = RenderTargetCache::new(); + let k = RenderTargetKey { + base_tiles: 0, + pitch_tiles_at_32bpp: 80, + msaa_samples: MsaaSamples::X1, + is_depth: false, + resource_format: 0, + }; + let a = cache.bind(k, 0); + let b = cache.bind(k, 1); + assert_eq!(a, b); + let d = cache.descriptor(a).unwrap(); + assert_eq!(d.bind_count, 2); + assert_eq!(d.first_draw_index, 0); + } + + #[test] + fn rt_cache_claim_tiles_tracks_footprint() { + let mut cache = RenderTargetCache::new(); + let k = RenderTargetKey { + base_tiles: 0, + pitch_tiles_at_32bpp: 80, // 32bpp 1280-wide target + msaa_samples: MsaaSamples::X1, + is_depth: false, + resource_format: 0, + }; + let idx = cache.bind(k, 0); + // 720 samples tall / 16 per tile = 45 rows → 80 * 45 = 3600 tiles; + // caps out at 2048. Verify clamping. + let newly = cache.claim_tiles(idx, 45); + assert_eq!(newly, 2048); + assert_eq!(cache.tiles.occupied_count(), 2048); + } +} diff --git a/crates/xenia-gpu/src/resolve.rs b/crates/xenia-gpu/src/resolve.rs new file mode 100644 index 0000000..e81bfe0 --- /dev/null +++ b/crates/xenia-gpu/src/resolve.rs @@ -0,0 +1,1260 @@ +//! EDRAM→guest-memory resolve byte copy. +//! +//! Fires from [`crate::gpu_system::GpuSystem::handle_event_initiator`] on +//! `TILE_FLUSH` (event 15). Reads samples out of the shadow EDRAM at the +//! source tile range, applies the `Endian128` byte swap, and writes tiled +//! u32 samples into guest memory via a 32bpp bitwise-equivalent fast path +//! (Canary `IsColorResolveFormatBitwiseEquivalent` — `xenos.h:614-639`). +//! +//! 
Ground truth: `xenia-canary/src/xenia/gpu/draw_util.cc:1102-1370` and +//! `xenos.h:1077-1114` (`GpuSwapInline`), `1039-1052` (`CopySampleSelect`). +//! +//! ## Endian ordering +//! +//! [`xenia_memory::access::MemoryAccess::write_u32`] stores big-endian +//! bytes (it calls `val.to_be_bytes()`). The Xenon CPU sees memory as big- +//! endian u32s, so `write_u32(addr, 0x11223344)` lands `[0x11, 0x22, 0x33, +//! 0x44]` in memory — which is the `kNone` (no swap) byte order from the +//! host's view of the sample. +//! +//! The resolve has an `Endian128` mode controlled by +//! `RB_COPY_DEST_INFO.copy_dest_endian`: games typically set `k8in32` so +//! that later texture fetches see little-endian bytes. We therefore +//! pre-swap the sample *before* `write_u32` so the big-endian store yields +//! the desired byte order in memory. + +use crate::draw_state::{ResolveInfo, ResolveSource}; +use crate::edram::ShadowEdram; +use crate::render_target_cache::MsaaSamples; +use crate::tiled_address::{align_pitch_to_macro_tile, tiled_2d_offset}; + +use xenia_memory::access::MemoryAccess; + +/// Stats returned from one resolve copy. Aggregated by the caller into +/// `GpuStats` counters so the HUD can surface them. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct ResolveCopyStats { + /// Number of 32bpp samples actually written to guest memory. + pub samples_written: u32, + /// Was the format path supported? `false` means we skipped. + pub supported: bool, +} + +/// `xenos::CopyCommand::kNull` = 3 — resolve emits no copy (clear-only). +pub const COPY_COMMAND_NULL: u8 = 3; + +/// Sanitized sample selector (`xenos::CopySampleSelect`, `xenos.h:1039`). +/// We keep the *raw* enum value in `ResolveInfo` and pass a sanitized one +/// here so callers can match on the effective mode rather than re-applying +/// the MSAA/depth sanitation rules from Canary `draw_util.cc:839-876`. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CopySampleSelect { + K0 = 0, + K1 = 1, + K2 = 2, + K3 = 3, + K01 = 4, + K23 = 5, + K0123 = 6, +} + +impl CopySampleSelect { + pub fn from_raw(raw: u8) -> Self { + match raw & 0x7 { + 1 => Self::K1, + 2 => Self::K2, + 3 => Self::K3, + 4 => Self::K01, + 5 => Self::K23, + 6 | 7 => Self::K0123, + _ => Self::K0, + } + } + + /// Single-sample picks return `Some(index 0..=3)`; averaging picks + /// return `None` (caller must synthesize via per-sample reads). + pub fn single_sample_index(self) -> Option { + match self { + Self::K0 => Some(0), + Self::K1 => Some(1), + Self::K2 => Some(2), + Self::K3 => Some(3), + _ => None, + } + } + + /// `IsSingleCopySampleSelected` from `xenos.h:1049`. + pub fn is_single_sample(self) -> bool { + self.single_sample_index().is_some() + } +} + +/// `SanitizeCopySampleSelect` (Canary `draw_util.cc:839-876`). MSAA +/// modes + depth limit which sample selectors are valid; invalid ones +/// are silently remapped. Returning the sanitized enum lets the resolve +/// loop assume a single-sample pick for 1x MSAA, etc. +pub fn sanitize_sample_select( + raw: u8, + msaa: MsaaSamples, + is_depth: bool, +) -> CopySampleSelect { + let select = CopySampleSelect::from_raw(raw); + match msaa { + MsaaSamples::X1 => { + // Only sample 0 exists. Averaging modes → k0; >k0123 clamp. + match select { + CopySampleSelect::K0 => CopySampleSelect::K0, + _ => CopySampleSelect::K0, + } + } + MsaaSamples::X2 => { + // Samples 0 and 1 exist (stacked vertically). k2 → k0, k3 → k1; + // k23 → k01. Depth cannot average. 
+ match select { + CopySampleSelect::K0 => CopySampleSelect::K0, + CopySampleSelect::K1 => CopySampleSelect::K1, + CopySampleSelect::K2 => CopySampleSelect::K0, + CopySampleSelect::K3 => CopySampleSelect::K1, + CopySampleSelect::K01 | CopySampleSelect::K23 | CopySampleSelect::K0123 => { + if is_depth { + CopySampleSelect::K0 + } else { + CopySampleSelect::K01 + } + } + } + } + MsaaSamples::X4 => { + // All single-samples valid. Depth cannot average → pick + // representative single sample (k01→k0, k23→k2, k0123→k0). + if is_depth { + match select { + CopySampleSelect::K01 => CopySampleSelect::K0, + CopySampleSelect::K23 => CopySampleSelect::K2, + CopySampleSelect::K0123 => CopySampleSelect::K0, + other => other, + } + } else { + select + } + } + } +} + +/// Sample-index to in-pixel (dx, dy) offset for the current MSAA mode. +/// Matches the standard Xbox 360 MSAA sample layout (Canary +/// `texture_util::GetMsaaSampleLocation` / the shader constants). For 1x, +/// always `(0, 0)`. +/// +/// * 2x MSAA: sample 0 = top line, sample 1 = bottom line. +/// * 4x MSAA: 2×2 grid `{(0,0),(1,0),(0,1),(1,1)}`. +#[inline] +fn sample_offset_in_pixel(sample_idx: u8, msaa: MsaaSamples) -> (u32, u32) { + match msaa { + MsaaSamples::X1 => (0, 0), + MsaaSamples::X2 => (0, (sample_idx & 1) as u32), + MsaaSamples::X4 => ((sample_idx & 1) as u32, ((sample_idx >> 1) & 1) as u32), + } +} + +/// Apply the `Endian128` byte swap to one 32-bit sample. Matches the cases +/// inside `GpuSwapInline` plus the 64/128-bit variants from +/// `xenos::Endian128`. The 64/128 modes cannot be expressed in a single u32 +/// so they fall through to `k8in32` and log at the call site. +#[inline] +pub fn apply_endian_128(value: u32, endian: u8) -> u32 { + match endian { + 0 => value, + // k8in16: swap bytes within each 16-bit word. + 1 => ((value & 0xFF00FF00) >> 8) | ((value & 0x00FF00FF) << 8), + // k8in32: full byte reversal. + 2 => value.swap_bytes(), + // k16in32: swap 16-bit halves. 
+ 3 => value.rotate_left(16), + // k8in64 / k8in128: require cross-dword context. Approximate with + // k8in32 (byte-reverse each dword) so the bytes land in a sensible + // order; the caller logs the approximation. + 4 | 5 => value.swap_bytes(), + _ => value, + } +} + +/// `xenos::ColorFormat` values we use as destination formats for 32bpp +/// resolves. Canary `xenos.h:582-609`. +mod color_format { + pub const K_8_8_8_8: u8 = 6; + pub const K_2_10_10_10: u8 = 7; + pub const K_8_8_8_8_A: u8 = 14; + pub const K_16_16_FLOAT: u8 = 31; + pub const K_32_FLOAT: u8 = 36; + pub const K_8_8_8_8_AS_16_16_16_16: u8 = 50; + pub const K_2_10_10_10_AS_16_16_16_16: u8 = 54; + // ── 64bpp dest formats (Canary `xenos.h:582-609`) ────────────────── + /// `k_16_16_16_16` (4 channels × 16 bits, signed/unsigned variants + /// resolve identically — same bit layout). + pub const K_16_16_16_16: u8 = 26; + /// `k_16_16_16_16_FLOAT` (4 channels × half-float). + pub const K_16_16_16_16_FLOAT: u8 = 32; + /// `k_32_32_FLOAT` (R32 + G32, 64bpp). `xenos::TextureFormat = 37`. + pub const K_32_32_FLOAT: u8 = 37; + /// Depth textures (Canary `xenos::TextureFormat`). + pub const K_24_8: u8 = 22; + pub const K_24_8_FLOAT: u8 = 23; +} + +/// 32-bit bitwise-equivalence check covering 32bpp color and depth resolves. +/// Color side mirrors `xenos::IsColorResolveFormatBitwiseEquivalent` +/// (`xenos.h:614-639`). Depth side maps `DepthRenderTargetFormat` to +/// its textural form (`kD24S8 → k_24_8`, `kD24FS8 → k_24_8_FLOAT`). +pub fn is_32bpp_bitwise_equivalent( + source: ResolveSource, + source_is_64bpp: bool, + source_format: u8, + dest_format: u8, +) -> bool { + if source_is_64bpp { + return false; + } + match source { + ResolveSource::Color(_) => { + use color_format as cf; + match source_format { + // k_8_8_8_8 (0) and k_8_8_8_8_GAMMA (1). Gamma decode is + // applied by the sampler at texture-fetch time (TextureSign:: + // kGamma); the bits are identical, so the copy path is the + // same. 
+ 0 | 1 => matches!( + dest_format, + cf::K_8_8_8_8 | cf::K_8_8_8_8_A | cf::K_8_8_8_8_AS_16_16_16_16 + ), + // k_2_10_10_10 (2) and k_2_10_10_10_AS_10_10_10_10 (10). + 2 | 10 => matches!( + dest_format, + cf::K_2_10_10_10 | cf::K_2_10_10_10_AS_16_16_16_16 + ), + // k_16_16_FLOAT (6). + 6 => dest_format == cf::K_16_16_FLOAT, + // k_32_FLOAT (14). + 14 => dest_format == cf::K_32_FLOAT, + _ => false, + } + } + ResolveSource::Depth => match source_format { + // kD24S8 (0) → k_24_8 (22). + 0 => dest_format == color_format::K_24_8, + // kD24FS8 (1) → k_24_8_FLOAT (23). + 1 => dest_format == color_format::K_24_8_FLOAT, + _ => false, + }, + } +} + +/// 64-bit bitwise-equivalence check (Canary `xenos.h:614-639` 64bpp arms). +/// Used when `info.source_is_64bpp == true`. Only color resolves go here — +/// depth is always 32bpp. +pub fn is_64bpp_bitwise_equivalent(source_format: u8, dest_format: u8) -> bool { + use color_format as cf; + match source_format { + // k_16_16_16_16 (5) — signed and unsigned variants resolve to the + // same bits because the resolve is a raw byte copy. + 5 => dest_format == cf::K_16_16_16_16, + // k_16_16_16_16_FLOAT (7). + 7 => dest_format == cf::K_16_16_16_16_FLOAT, + // k_32_32_FLOAT (15). + 15 => dest_format == cf::K_32_32_FLOAT, + _ => false, + } +} + +/// Run one resolve copy. Returns the number of samples successfully +/// written and whether the dest format was supported; the caller updates +/// `GpuStats::resolves_copied_total` / `resolves_skipped_total` accordingly. 
+pub fn copy_to_memory( + info: &ResolveInfo, + edram: &ShadowEdram, + mem: &dyn MemoryAccess, +) -> ResolveCopyStats { + // --- No-op paths (not a failure) --- + if info.coords.width == 0 || info.coords.height == 0 { + return ResolveCopyStats { + samples_written: 0, + supported: true, + }; + } + if info.copy_command == COPY_COMMAND_NULL { + return ResolveCopyStats { + samples_written: 0, + supported: true, + }; + } + + // --- Supported-shape gates --- + if info.copy_dest_array { + tracing::warn!( + src = info.copy_src_select, + fmt = info.dest_format, + "gpu: resolve skipped — copy_dest_array (3D/stacked) not implemented" + ); + return ResolveCopyStats::default(); + } + if info.dest_exp_bias != 0 { + tracing::warn!( + bias = info.dest_exp_bias, + "gpu: resolve skipped — dest_exp_bias != 0 not implemented" + ); + return ResolveCopyStats::default(); + } + let supported = if info.source_is_64bpp { + // 64bpp color resolve. Depth is always 32bpp so this only fires + // for `ResolveSource::Color(_)`. + matches!(info.source, ResolveSource::Color(_)) + && is_64bpp_bitwise_equivalent(info.source_format, info.dest_format) + } else { + is_32bpp_bitwise_equivalent( + info.source, + info.source_is_64bpp, + info.source_format, + info.dest_format, + ) + }; + if !supported { + tracing::warn!( + source = ?info.source, + source_format = info.source_format, + source_is_64bpp = info.source_is_64bpp, + dest_format = info.dest_format, + "gpu: resolve skipped — not a bitwise-equivalent pair" + ); + return ResolveCopyStats::default(); + } + + if info.dest_endian >= 4 { + tracing::warn!( + endian = info.dest_endian, + "gpu: resolve endian k8in64/k8in128 approximated as k8in32" + ); + } + + // Destination pitch must be aligned to 32 texels per + // `kStoragePitchHeightAlignmentBlocks`. `align_pitch_to_macro_tile` + // rounds to 32 (it's `MACRO_TILE_WIDTH_LOG2 = 5`). 
+ let pitch_aligned = align_pitch_to_macro_tile(info.dest_pitch_pixels); + if pitch_aligned == 0 { + return ResolveCopyStats { + samples_written: 0, + supported: true, + }; + } + // bpp_log2: 2 for 32bpp, 3 for 64bpp. Drives the `tiled_2d_offset` + // stride calculation per Canary `texture_address.h:120-180`. + let bpp_log2: u32 = if info.source_is_64bpp { 3 } else { 2 }; + + let is_depth = matches!(info.source, ResolveSource::Depth); + let sanitized = sanitize_sample_select(info.copy_sample_select, info.msaa, is_depth); + // For averaging modes we'd previously fall back to sample 0 + warn. + // 3A wires real averaging via `read_pixel_averaged`; single-sample + // picks still take the fast path. + let single_sample_idx = sanitized.single_sample_index(); + + let mut samples_written: u32 = 0; + for dy in 0..info.coords.height { + let pixel_y = info.coords.y0 + dy; + for dx in 0..info.coords.width { + let pixel_x = info.coords.x0 + dx; + // Destination coordinates are 0-based against `dest_base` — the + // base already points at the top-left of the copy rectangle. 
+ let dst_off = tiled_2d_offset(dx, dy, pitch_aligned, bpp_log2); + let dst_addr = info.dest_base.wrapping_add(dst_off); + + if info.source_is_64bpp { + let (lo, hi) = match single_sample_idx { + Some(idx) => { + let (sx, sy) = sample_xy(pixel_x, pixel_y, idx, info.msaa, &info.coords); + edram.read_sample_64bpp( + info.source_base_tiles, + info.surface_pitch_tiles, + sx, + sy, + ) + } + None => read_pixel_averaged_64bpp(edram, info, sanitized, pixel_x, pixel_y), + }; + let lo_swapped = apply_endian_128(lo, info.dest_endian); + let hi_swapped = apply_endian_128(hi, info.dest_endian); + mem.write_u32(dst_addr, lo_swapped); + mem.write_u32(dst_addr.wrapping_add(4), hi_swapped); + samples_written += 1; + } else { + let sample = match single_sample_idx { + Some(idx) => { + let (sx, sy) = sample_xy(pixel_x, pixel_y, idx, info.msaa, &info.coords); + edram.read_sample_32bpp( + info.source_base_tiles, + info.surface_pitch_tiles, + sx, + sy, + ) + } + None => read_pixel_averaged_32bpp( + edram, + info, + sanitized, + pixel_x, + pixel_y, + ), + }; + let swapped = apply_endian_128(sample, info.dest_endian); + mem.write_u32(dst_addr, swapped); + samples_written += 1; + } + } + } + + ResolveCopyStats { + samples_written, + supported: true, + } +} + +/// Compute the EDRAM sample-space (x, y) for `(pixel_x, pixel_y)` and a +/// single MSAA sample index. +#[inline] +fn sample_xy( + pixel_x: u32, + pixel_y: u32, + sample_idx: u8, + msaa: MsaaSamples, + coords: &crate::draw_state::ResolveCoordinates, +) -> (u32, u32) { + let (sample_dx, sample_dy) = sample_offset_in_pixel(sample_idx, msaa); + let sx = (pixel_x << coords.sample_count_log2_x) + sample_dx; + let sy = (pixel_y << coords.sample_count_log2_y) + sample_dy; + (sx, sy) +} + +/// Sample indices selected by an averaging `CopySampleSelect`. +/// `K01 → [0, 1]`, `K23 → [2, 3]`, `K0123 → [0, 1, 2, 3]`. Single-sample +/// picks should never reach this helper (caller checks `single_sample_index`). 
+fn averaging_sample_set(select: CopySampleSelect) -> &'static [u8] { + match select { + CopySampleSelect::K01 => &[0, 1], + CopySampleSelect::K23 => &[2, 3], + CopySampleSelect::K0123 => &[0, 1, 2, 3], + // Single-sample picks: caller must never invoke this — fall back + // to sample 0 just to keep the function total. + _ => &[0], + } +} + +/// Average N samples of a 32bpp pixel format. Each sample is read, decoded +/// by `source_format`, averaged in the appropriate numeric space, then +/// re-encoded back into the same 32bpp word. Mirrors Canary's resolve +/// shader paths in `resolve.xesli:595-629` (per-format averaging) — we +/// implement them on the CPU because the resolve runs on the host. +fn read_pixel_averaged_32bpp( + edram: &ShadowEdram, + info: &ResolveInfo, + select: CopySampleSelect, + pixel_x: u32, + pixel_y: u32, +) -> u32 { + let indices = averaging_sample_set(select); + let n = indices.len() as u32; + if n == 0 { + return 0; + } + // Pull every selected sample. + let mut raw = [0u32; 4]; + for (i, &idx) in indices.iter().enumerate() { + let (sx, sy) = sample_xy(pixel_x, pixel_y, idx, info.msaa, &info.coords); + raw[i] = edram.read_sample_32bpp(info.source_base_tiles, info.surface_pitch_tiles, sx, sy); + } + let raw_slice = &raw[..indices.len()]; + average_samples_32bpp(raw_slice, info.source_format, info.source) +} + +/// Average N samples of a 64bpp pixel format, returning `(lo, hi)`. 
+fn read_pixel_averaged_64bpp( + edram: &ShadowEdram, + info: &ResolveInfo, + select: CopySampleSelect, + pixel_x: u32, + pixel_y: u32, +) -> (u32, u32) { + let indices = averaging_sample_set(select); + let n = indices.len(); + if n == 0 { + return (0, 0); + } + let mut raw = [(0u32, 0u32); 4]; + for (i, &idx) in indices.iter().enumerate() { + let (sx, sy) = sample_xy(pixel_x, pixel_y, idx, info.msaa, &info.coords); + raw[i] = edram.read_sample_64bpp(info.source_base_tiles, info.surface_pitch_tiles, sx, sy); + } + let raw_slice = &raw[..n]; + average_samples_64bpp(raw_slice, info.source_format) +} + +/// Per-format averaging for 32bpp color/depth resolves. +fn average_samples_32bpp(samples: &[u32], source_format: u8, source: ResolveSource) -> u32 { + let n = samples.len() as u32; + debug_assert!(n > 0); + match source { + ResolveSource::Color(_) => match source_format { + // k_8_8_8_8 / k_8_8_8_8_GAMMA (0/1): per-channel rounded + // unsigned-int mean. Matches Canary's `resolve.xesli` per-component + // average for u8 — gamma is a sampler-time post-decode, the + // bits are identical for resolve purposes. + 0 | 1 => average_8_8_8_8(samples, n), + // k_2_10_10_10 / k_2_10_10_10_AS_10_10_10_10: per-field rounded + // unsigned-int mean. Field widths 2/10/10/10 from low to high. + 2 | 10 => average_2_10_10_10(samples, n), + // k_16_16_FLOAT (6): two half-floats packed in one u32. + 6 => average_2_half_floats(samples, n), + // k_32_FLOAT (14): one f32 per sample. + 14 => average_1_f32(samples, n), + // For any unsupported format, fall back to first sample — + // upstream gating already filtered to bitwise-equivalent pairs + // so this branch should be unreachable in practice. + _ => samples[0], + }, + // Depth resolves never carry MSAA averaging (sanitize collapses to + // single-sample); reaching this branch is a degenerate caller. + ResolveSource::Depth => samples[0], + } +} + +/// Per-format averaging for 64bpp color resolves. Returns `(lo, hi)`. 
fn average_samples_64bpp(samples: &[(u32, u32)], source_format: u8) -> (u32, u32) {
    let n = samples.len() as u32;
    debug_assert!(n > 0);
    match source_format {
        // k_16_16_16_16 (5): four 16-bit channels across (lo, hi). A raw
        // single-sample copy is sign-agnostic, but a mean is not: dividing
        // the unsigned sum gives the wrong bits as soon as lanes of mixed
        // sign are combined (0xFFFF and 0x0001 would average to 0x8000
        // instead of 0), so the helper averages the lanes as signed values.
        5 => average_4_u16(samples, n),
        // k_16_16_16_16_FLOAT (7): four half-floats.
        7 => average_4_half_floats(samples, n),
        // k_32_32_FLOAT (15): two f32 (R32 = lo, G32 = hi).
        15 => average_2_f32(samples, n),
        // Unknown format: keep the first sample rather than guess a layout.
        _ => samples[0],
    }
}

/// Per-byte rounded unsigned mean of `n` packed 8:8:8:8 words.
///
/// `n` must equal `samples.len()` and be non-zero.
#[inline]
fn average_8_8_8_8(samples: &[u32], n: u32) -> u32 {
    let mut sums = [0u32; 4];
    for &s in samples {
        sums[0] += s & 0xFF;
        sums[1] += (s >> 8) & 0xFF;
        sums[2] += (s >> 16) & 0xFF;
        sums[3] += (s >> 24) & 0xFF;
    }
    // Adding n/2 before the division rounds to nearest instead of flooring.
    let half = n / 2;
    let avg = |sum: u32| ((sum + half) / n) & 0xFF;
    avg(sums[0])
        | (avg(sums[1]) << 8)
        | (avg(sums[2]) << 16)
        | (avg(sums[3]) << 24)
}

/// Per-field rounded unsigned mean of `n` packed 2:10:10:10 words.
///
/// `n` must equal `samples.len()` and be non-zero.
#[inline]
fn average_2_10_10_10(samples: &[u32], n: u32) -> u32 {
    // Field widths 2/10/10/10 (low to high).
    // NOTE(review): Canary's unpack helpers appear to place the three 10-bit
    // fields in bits 0..30 and the 2-bit field in bits 30..32 — confirm. If
    // so, these field boundaries are shifted by two bits and carries leak
    // between channels; the in-file 2x averaging test encodes this same
    // layout and would have to change together with this function.
    let mut sum_a = 0u32; // 2 bits
    let mut sum_b = 0u32; // 10 bits
    let mut sum_g = 0u32; // 10 bits
    let mut sum_r = 0u32; // 10 bits
    for &s in samples {
        sum_a += s & 0x3;
        sum_b += (s >> 2) & 0x3FF;
        sum_g += (s >> 12) & 0x3FF;
        sum_r += (s >> 22) & 0x3FF;
    }
    let half = n / 2;
    let avg = |sum: u32, width: u32| ((sum + half) / n) & ((1u32 << width) - 1);
    avg(sum_a, 2) | (avg(sum_b, 10) << 2) | (avg(sum_g, 10) << 12) | (avg(sum_r, 10) << 22)
}

/// Widen an IEEE 754 binary16 bit pattern to `f32`. Zeros, subnormals,
/// infinities and NaNs are all handled; the conversion is exact.
#[inline]
fn half_to_f32(half: u16) -> f32 {
    let sign = ((half >> 15) & 0x1) as u32;
    let exp = ((half >> 10) & 0x1F) as i32;
    let mant = (half & 0x3FF) as u32;
    if exp == 0 {
        if mant == 0 {
            // Signed zero.
            return f32::from_bits(sign << 31);
        }
        // Subnormal half → normalized f32: shift the mantissa up until its
        // implicit bit (0x400) appears, lowering the exponent per shift.
        let mut e = -14;
        let mut m = mant;
        while (m & 0x400) == 0 {
            m <<= 1;
            e -= 1;
        }
        m &= 0x3FF;
        let f_exp = (e + 127) as u32;
        return f32::from_bits((sign << 31) | (f_exp << 23) | (m << 13));
    }
    if exp == 31 {
        // Inf (mant == 0) or NaN (payload carried over).
        let f_exp = 0xFFu32;
        let f_mant = mant << 13;
        return f32::from_bits((sign << 31) | (f_exp << 23) | f_mant);
    }
    // Normal number: rebias the exponent from 15 to 127.
    let f_exp = (exp - 15 + 127) as u32;
    f32::from_bits((sign << 31) | (f_exp << 23) | (mant << 13))
}

/// Narrow an `f32` to an IEEE 754 binary16 bit pattern.
///
/// The mantissa is truncated (round toward zero), values too large for a
/// half become infinity, values too small become signed zero, and every NaN
/// maps to one quiet-NaN pattern. `f32` subnormals flush to signed zero.
#[inline]
fn f32_to_half(f: f32) -> u16 {
    let bits = f.to_bits();
    let sign = ((bits >> 31) & 0x1) as u16;
    let exp = ((bits >> 23) & 0xFF) as i32;
    let mant = bits & 0x7FFFFF;
    if exp == 0xFF {
        // Inf or NaN.
        let h_mant = if mant != 0 { 0x200 } else { 0 };
        return (sign << 15) | (0x1F << 10) | h_mant;
    }
    if exp == 0 {
        return sign << 15;
    }
    let e = exp - 127 + 15;
    if e >= 31 {
        // Overflow → infinity.
        return (sign << 15) | (0x1F << 10);
    }
    if e <= 0 {
        // Subnormal half. Round-to-nearest-even is overkill; truncate
        // toward zero — averaging 4 floats then converting once is the
        // dominant precision path anyway.
        if e < -10 {
            return sign << 15;
        }
        // Restore the implicit leading one, then shift it into the 10-bit
        // subnormal mantissa.
        let m = (mant | 0x800000) >> ((1 - e) as u32 + 13);
        return (sign << 15) | (m as u16);
    }
    let h_mant = (mant >> 13) as u16;
    (sign << 15) | ((e as u16) << 10) | h_mant
}

/// Mean of `n` words that each pack two half-floats (low and high 16 bits).
/// The mean is taken in `f32` and converted back once per lane.
#[inline]
fn average_2_half_floats(samples: &[u32], n: u32) -> u32 {
    let mut sum_lo = 0.0f32;
    let mut sum_hi = 0.0f32;
    for &s in samples {
        sum_lo += half_to_f32((s & 0xFFFF) as u16);
        sum_hi += half_to_f32(((s >> 16) & 0xFFFF) as u16);
    }
    let inv = 1.0f32 / n as f32;
    let lo = f32_to_half(sum_lo * inv) as u32;
    let hi = f32_to_half(sum_hi * inv) as u32;
    lo | (hi << 16)
}

/// Mean of `n` words that each hold one `f32` bit pattern.
#[inline]
fn average_1_f32(samples: &[u32], n: u32) -> u32 {
    let mut sum = 0.0f32;
    for &s in samples {
        sum += f32::from_bits(s);
    }
    (sum / n as f32).to_bits()
}

/// Per-lane rounded mean of `n` `(lo, hi)` pairs carrying four 16-bit
/// integer channels (two per word).
///
/// Lanes are sign-extended and averaged as signed values, rounding halves
/// upward. For inputs whose lanes are all below `0x8000` this yields exactly
/// the unsigned rounded mean. NOTE(review): assumes Xenos `k_16_16_16_16`
/// render targets hold signed lanes, as Canary's `xenos.h` documents —
/// confirm. The name is kept for existing callers.
///
/// `n` must equal `samples.len()` and be non-zero.
#[inline]
fn average_4_u16(samples: &[(u32, u32)], n: u32) -> (u32, u32) {
    // (lo, hi) carry 4 × 16-bit channels; averaging is per-16-bit-field
    // regardless of which channel maps to which lane.
    let lane = |w: u32, shift: u32| i32::from((w >> shift) as u16 as i16);
    let mut sums = [0i32; 4];
    for &(lo, hi) in samples {
        sums[0] += lane(lo, 0);
        sums[1] += lane(lo, 16);
        sums[2] += lane(hi, 0);
        sums[3] += lane(hi, 16);
    }
    // At most four samples per pixel, so the cast cannot truncate.
    let count = n as i32;
    let half = count / 2;
    // `div_euclid` floors for negative sums, so adding `half` first rounds
    // to nearest on both sides of zero; the mask re-packs two's complement.
    let avg = |sum: i32| ((sum + half).div_euclid(count) as u32) & 0xFFFF;
    let lo = avg(sums[0]) | (avg(sums[1]) << 16);
    let hi = avg(sums[2]) | (avg(sums[3]) << 16);
    (lo, hi)
}

/// Mean of `n` `(lo, hi)` pairs carrying four half-floats (two per word).
/// The mean is taken in `f32` and converted back once per lane.
#[inline]
fn average_4_half_floats(samples: &[(u32, u32)], n: u32) -> (u32, u32) {
    let mut sums = [0.0f32; 4];
    for &(lo, hi) in samples {
        sums[0] += half_to_f32((lo & 0xFFFF) as u16);
        sums[1] += half_to_f32(((lo >> 16) & 0xFFFF) as u16);
        sums[2] += half_to_f32((hi & 0xFFFF) as u16);
        sums[3] += half_to_f32(((hi >> 16) & 0xFFFF) as u16);
    }
    let inv = 1.0f32 / n as f32;
    let h0 = f32_to_half(sums[0] * inv) as u32;
    let h1 = f32_to_half(sums[1] * inv) as u32;
    let h2 = f32_to_half(sums[2] * inv) as u32;
    let h3 = f32_to_half(sums[3] * inv) as u32;
    (h0 | (h1 << 16), h2 | (h3 << 16))
}

/// Mean of `n` `(lo, hi)` pairs where each word is one `f32` bit pattern.
#[inline]
fn average_2_f32(samples: &[(u32, u32)], n: u32) -> (u32, u32) {
    let mut sum_lo = 0.0f32;
    let mut sum_hi = 0.0f32;
    for &(lo, hi) in samples {
        sum_lo += f32::from_bits(lo);
        sum_hi += f32::from_bits(hi);
    }
    let inv = 1.0f32 / n as f32;
    ((sum_lo * inv).to_bits(), (sum_hi * inv).to_bits())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::draw_state::{ResolveCoordinates, ResolveInfo};
    use crate::edram::ShadowEdram;
    use crate::render_target_cache::MsaaSamples;
    use crate::tiled_address::{align_pitch_to_macro_tile, tiled_2d_offset};
    use xenia_memory::GuestMemory;

    /// Build a minimally-populated [`ResolveInfo`] for tests.
+ fn minimal_info(dest_base: u32, pitch: u32, height: u32) -> ResolveInfo { + ResolveInfo { + copy_src_select: 0, + copy_sample_select: 0, + color_clear_enable: false, + depth_clear_enable: false, + copy_command: 0, + dest_base, + dest_pitch_pixels: pitch, + dest_height_pixels: height, + dest_format: color_format::K_8_8_8_8, + dest_endian: 0, + dest_exp_bias: 0, + source: ResolveSource::Color(0), + coords: ResolveCoordinates { + x0: 0, + y0: 0, + width: pitch, + height, + sample_count_log2_x: 0, + sample_count_log2_y: 0, + }, + source_format: 0, + source_base_tiles: 0, + surface_pitch_tiles: pitch.div_ceil(80), + msaa: MsaaSamples::X1, + source_is_64bpp: false, + color_clear_value: 0, + color_clear_value_lo: 0, + depth_clear_value: 0, + copy_dest_array: false, + } + } + + fn fresh_mem() -> GuestMemory { + use xenia_memory::page_table::MemoryProtect; + let mut mem = GuestMemory::new().expect("guest memory"); + mem.alloc( + 0x4000_0000, + 0x0010_0000, + MemoryProtect::READ | MemoryProtect::WRITE, + ) + .expect("alloc"); + mem + } + + #[test] + fn endian_k_none_is_identity() { + assert_eq!(apply_endian_128(0x11223344, 0), 0x11223344); + } + + #[test] + fn endian_k8in16_swaps_byte_pairs() { + assert_eq!(apply_endian_128(0x11223344, 1), 0x22114433); + } + + #[test] + fn endian_k8in32_is_full_byte_reverse() { + assert_eq!(apply_endian_128(0x11223344, 2), 0x44332211); + } + + #[test] + fn endian_k16in32_swaps_halves() { + assert_eq!(apply_endian_128(0x11223344, 3), 0x33441122); + } + + #[test] + fn color_clear_resolve_writes_le_bytes_with_k8in32() { + // Clear-resolve a 32x8 rectangle of k_8_8_8_8 samples to pattern + // 0x11223344 with endian k8in32. Memory should contain LE bytes + // [0x44, 0x33, 0x22, 0x11] at every tiled sample offset. 
+ let mut mem = fresh_mem(); + let mut edram = ShadowEdram::new(); + edram.fill_rect_32bpp(0, 1, 0, 0, 32, 8, 0x11223344); + + let mut info = minimal_info(0x4000_0000, 32, 8); + info.dest_endian = 2; // k8in32 + info.source_base_tiles = 0; + info.surface_pitch_tiles = 1; + + let stats = copy_to_memory(&info, &edram, &mut mem); + assert!(stats.supported); + assert_eq!(stats.samples_written, 32 * 8); + + let pitch_aligned = align_pitch_to_macro_tile(32); + for y in 0..8u32 { + for x in 0..32u32 { + let off = tiled_2d_offset(x, y, pitch_aligned, 2); + let addr = 0x4000_0000u32.wrapping_add(off); + let bytes = [ + mem.read_u8(addr), + mem.read_u8(addr.wrapping_add(1)), + mem.read_u8(addr.wrapping_add(2)), + mem.read_u8(addr.wrapping_add(3)), + ]; + assert_eq!( + bytes, + [0x44, 0x33, 0x22, 0x11], + "mismatch at ({x}, {y})" + ); + } + } + } + + #[test] + fn k_none_endian_keeps_big_endian_bytes() { + let mut mem = fresh_mem(); + let mut edram = ShadowEdram::new(); + edram.fill_rect_32bpp(0, 1, 0, 0, 16, 8, 0xAABBCCDD); + + let mut info = minimal_info(0x4000_0000, 16, 8); + info.dest_endian = 0; + info.source_base_tiles = 0; + info.surface_pitch_tiles = 1; + + let stats = copy_to_memory(&info, &edram, &mut mem); + assert!(stats.supported); + + let pitch_aligned = align_pitch_to_macro_tile(16); + let off = tiled_2d_offset(0, 0, pitch_aligned, 2); + let addr = 0x4000_0000u32.wrapping_add(off); + assert_eq!( + [ + mem.read_u8(addr), + mem.read_u8(addr.wrapping_add(1)), + mem.read_u8(addr.wrapping_add(2)), + mem.read_u8(addr.wrapping_add(3)), + ], + [0xAA, 0xBB, 0xCC, 0xDD] + ); + } + + #[test] + fn empty_rect_is_noop_and_no_page_version_bump() { + let mut mem = fresh_mem(); + let edram = ShadowEdram::new(); + let before = mem.page_version(0x4000_0000); + + let mut info = minimal_info(0x4000_0000, 0, 0); + info.coords.width = 0; + info.coords.height = 0; + let stats = copy_to_memory(&info, &edram, &mut mem); + assert!(stats.supported); + assert_eq!(stats.samples_written, 0); 
+ assert_eq!(mem.page_version(0x4000_0000), before); + } + + #[test] + fn unsupported_dest_format_is_graceful() { + let mut mem = fresh_mem(); + let edram = ShadowEdram::new(); + let mut info = minimal_info(0x4000_0000, 16, 16); + // k_16_16_16_16 is 64bpp — not bitwise-equivalent to any 32bpp dest. + info.dest_format = 26; + let stats = copy_to_memory(&info, &edram, &mut mem); + assert!(!stats.supported); + assert_eq!(stats.samples_written, 0); + } + + #[test] + fn resolve_bumps_page_version_for_texture_cache_invalidation() { + let mut mem = fresh_mem(); + let mut edram = ShadowEdram::new(); + edram.fill_rect_32bpp(0, 1, 0, 0, 16, 8, 0xDEADBEEF); + + let before = mem.page_version(0x4000_0000); + let mut info = minimal_info(0x4000_0000, 16, 8); + info.source_base_tiles = 0; + info.surface_pitch_tiles = 1; + let stats = copy_to_memory(&info, &edram, &mut mem); + assert!(stats.supported); + assert!(mem.page_version(0x4000_0000) > before); + } + + /// k_2_10_10_10 source ↔ k_2_10_10_10 dest is bitwise-equivalent per + /// Canary `xenos.h:624-627`. Same path, just different format bytes. + #[test] + fn k_2_10_10_10_is_bitwise_equivalent() { + assert!(is_32bpp_bitwise_equivalent( + ResolveSource::Color(0), false, /* source */ 2, /* dest */ 7, + )); + assert!(is_32bpp_bitwise_equivalent( + ResolveSource::Color(0), + false, + /* source k_2_10_10_10_AS_10_10_10_10 */ 10, + /* dest k_2_10_10_10_AS_16_16_16_16 */ 54, + )); + } + + /// k_8_8_8_8_GAMMA source resolves identically to k_8_8_8_8 (gamma is + /// applied at sample time, not on store). + #[test] + fn k_8_8_8_8_gamma_source_is_bitwise_equivalent() { + assert!(is_32bpp_bitwise_equivalent( + ResolveSource::Color(0), + false, + /* source k_8_8_8_8_GAMMA */ 1, + /* dest k_8_8_8_8 */ 6, + )); + } + + /// Depth resolve: kD24S8 → k_24_8, kD24FS8 → k_24_8_FLOAT. 
+ #[test] + fn depth_resolve_format_equivalence() { + assert!(is_32bpp_bitwise_equivalent( + ResolveSource::Depth, + false, + /* kD24S8 */ 0, + /* k_24_8 */ 22, + )); + assert!(is_32bpp_bitwise_equivalent( + ResolveSource::Depth, + false, + /* kD24FS8 */ 1, + /* k_24_8_FLOAT */ 23, + )); + // Mismatched depth → texture format = not equivalent. + assert!(!is_32bpp_bitwise_equivalent( + ResolveSource::Depth, + false, + 0, + 23, + )); + } + + /// 64bpp source is never equivalent to a 32bpp dest, even when the + /// source/dest format numbers might look compatible. + #[test] + fn sixty_four_bpp_source_is_never_equivalent() { + assert!(!is_32bpp_bitwise_equivalent( + ResolveSource::Color(0), + true, + 5, // k_16_16_16_16 + 6, + )); + } + + /// 64bpp bitwise-equivalent pairs per Canary `xenos.h:614-639`. + #[test] + fn sixty_four_bpp_equivalence_pairs() { + // k_16_16_16_16 (5) → k_16_16_16_16 (26) + assert!(is_64bpp_bitwise_equivalent(5, 26)); + // k_16_16_16_16_FLOAT (7) → k_16_16_16_16_FLOAT (32) + assert!(is_64bpp_bitwise_equivalent(7, 32)); + // k_32_32_FLOAT (15) → k_32_32_FLOAT (37) + assert!(is_64bpp_bitwise_equivalent(15, 37)); + // Cross-format must reject. + assert!(!is_64bpp_bitwise_equivalent(5, 32)); + assert!(!is_64bpp_bitwise_equivalent(0, 26)); + } + + /// End-to-end 64bpp resolve: paint a `k_16_16_16_16` pattern into EDRAM + /// and confirm `copy_to_memory` lands two u32s per pixel into guest mem. + #[test] + fn sixty_four_bpp_resolve_writes_two_words_per_pixel() { + let mut mem = fresh_mem(); + let mut edram = ShadowEdram::new(); + // 16x4 logical 64bpp samples; pitch = 1 32bpp tile. 
+ edram.fill_rect_64bpp(0, 1, 0, 0, 16, 4, 0xAABB_CCDD, 0x1122_3344); + + let mut info = minimal_info(0x4000_0000, 16, 4); + info.source = ResolveSource::Color(0); + info.source_format = 5; // k_16_16_16_16 + info.dest_format = color_format::K_16_16_16_16; + info.source_is_64bpp = true; + info.dest_endian = 0; // kNone + info.source_base_tiles = 0; + info.surface_pitch_tiles = 1; + info.coords.width = 16; + info.coords.height = 4; + + let stats = copy_to_memory(&info, &edram, &mut mem); + assert!(stats.supported); + assert_eq!(stats.samples_written, 16 * 4); + + // First pixel: lo word at dst_off, hi word at dst_off + 4. With + // bpp_log2=3, pitch_aligned=32 (rounded from 16), tiled offset + // for (0,0) is 0. + let pitch_aligned = align_pitch_to_macro_tile(16); + let off = tiled_2d_offset(0, 0, pitch_aligned, 3); + let addr = 0x4000_0000u32.wrapping_add(off); + // BE store of 0xAABBCCDD = bytes [0xAA, 0xBB, 0xCC, 0xDD] + assert_eq!(mem.read_u8(addr), 0xAA); + assert_eq!(mem.read_u8(addr.wrapping_add(1)), 0xBB); + assert_eq!(mem.read_u8(addr.wrapping_add(2)), 0xCC); + assert_eq!(mem.read_u8(addr.wrapping_add(3)), 0xDD); + assert_eq!(mem.read_u8(addr.wrapping_add(4)), 0x11); + assert_eq!(mem.read_u8(addr.wrapping_add(7)), 0x44); + } + + /// MSAA averaging — `k_8_8_8_8` per-channel rounded mean of 4 samples. + /// Build a 4x MSAA RT where the 4 samples per pixel hold (0, 64, 128, + /// 192) in the red channel and check the resolve produces the rounded + /// mean (96). + #[test] + fn msaa_4x_averaging_k_8_8_8_8() { + let mut mem = fresh_mem(); + let mut edram = ShadowEdram::new(); + // 4x MSAA: each pixel occupies a 2×2 sample grid. + // Pixel (0,0) sample positions (0..4) at sample-coords: + // s0: (0, 0) + // s1: (1, 0) + // s2: (0, 1) + // s3: (1, 1) + // Stuff R=[0, 64, 128, 192], G=B=A=0. 
+ edram.write_sample_32bpp(0, 1, 0, 0, 0x00_00_00_00); // R=0 + edram.write_sample_32bpp(0, 1, 1, 0, 0x00_00_00_40); // R=64 + edram.write_sample_32bpp(0, 1, 0, 1, 0x00_00_00_80); // R=128 + edram.write_sample_32bpp(0, 1, 1, 1, 0x00_00_00_C0); // R=192 + + let mut info = minimal_info(0x4000_0000, 1, 1); + info.source = ResolveSource::Color(0); + info.source_format = 0; // k_8_8_8_8 + info.dest_format = color_format::K_8_8_8_8; + info.copy_sample_select = 6; // K0123 + info.msaa = MsaaSamples::X4; + info.coords.sample_count_log2_x = 1; + info.coords.sample_count_log2_y = 1; + info.coords.width = 1; + info.coords.height = 1; + info.dest_endian = 0; + info.source_base_tiles = 0; + info.surface_pitch_tiles = 1; + + let stats = copy_to_memory(&info, &edram, &mut mem); + assert!(stats.supported); + assert_eq!(stats.samples_written, 1); + // R = (0+64+128+192 + 2)/4 = 96 = 0x60. Big-endian store. + let addr = 0x4000_0000u32; + // The byte order in u32 is [byte0, byte1, byte2, byte3] where + // byte0 = R. After BE store of pixel 0x000000_60 (R=0x60), the + // bytes at the resolve-tile offset are [0x00, 0x00, 0x00, 0x60]. + let bytes = [ + mem.read_u8(addr), + mem.read_u8(addr.wrapping_add(1)), + mem.read_u8(addr.wrapping_add(2)), + mem.read_u8(addr.wrapping_add(3)), + ]; + assert_eq!(bytes, [0x00, 0x00, 0x00, 0x60], "averaged R should be 0x60"); + } + + /// MSAA averaging — `k_32_FLOAT` averages 4 f32 samples linearly. 
    #[test]
    fn msaa_4x_averaging_k_32_float() {
        let mut mem = fresh_mem();
        let mut edram = ShadowEdram::new();
        // Store each sample as the raw bit pattern of an f32.
        let f = |v: f32| v.to_bits();
        edram.write_sample_32bpp(0, 1, 0, 0, f(1.0));
        edram.write_sample_32bpp(0, 1, 1, 0, f(2.0));
        edram.write_sample_32bpp(0, 1, 0, 1, f(3.0));
        edram.write_sample_32bpp(0, 1, 1, 1, f(4.0));

        let mut info = minimal_info(0x4000_0000, 1, 1);
        info.source = ResolveSource::Color(0);
        info.source_format = 14; // k_32_FLOAT
        info.dest_format = color_format::K_32_FLOAT;
        info.copy_sample_select = 6; // K0123
        info.msaa = MsaaSamples::X4;
        info.coords.sample_count_log2_x = 1;
        info.coords.sample_count_log2_y = 1;
        info.coords.width = 1;
        info.coords.height = 1;
        info.dest_endian = 2; // k8in32 — game-typical for float sampling
        info.source_base_tiles = 0;
        info.surface_pitch_tiles = 1;

        let stats = copy_to_memory(&info, &edram, &mut mem);
        assert!(stats.supported);
        // (1+2+3+4)/4 = 2.5
        let expected = 2.5f32.to_bits();
        // Read the four destination bytes back in address order.
        let bytes = [
            mem.read_u8(0x4000_0000),
            mem.read_u8(0x4000_0001),
            mem.read_u8(0x4000_0002),
            mem.read_u8(0x4000_0003),
        ];
        // k8in32 byte-reverses the dword before the big-endian store, so the
        // two reversals cancel and guest memory holds the value's bytes
        // least-significant first — i.e. `u32::to_le_bytes(expected)`.
        assert_eq!(bytes, expected.to_le_bytes());
    }

    /// MSAA averaging — `k_2_10_10_10` per-field rounded mean.
    #[test]
    fn msaa_2x_averaging_k_2_10_10_10() {
        // 2x MSAA samples are stacked vertically (s0 at y=0, s1 at y=1).
        let mut mem = fresh_mem();
        let mut edram = ShadowEdram::new();
        // Field widths 2/10/10/10. `pack` assembles one 32-bit sample from its
        // four fields (a in bits 0-1, b in 2-11, g in 12-21, r in 22-31).
        let pack = |a: u32, b: u32, g: u32, r: u32| {
            (a & 0x3) | ((b & 0x3FF) << 2) | ((g & 0x3FF) << 12) | ((r & 0x3FF) << 22)
        };
        edram.write_sample_32bpp(0, 1, 0, 0, pack(0, 100, 200, 300));
        edram.write_sample_32bpp(0, 1, 0, 1, pack(2, 200, 300, 400));

        let mut info = minimal_info(0x4000_0000, 1, 1);
        info.source = ResolveSource::Color(0);
        info.source_format = 2; // k_2_10_10_10
        info.dest_format = color_format::K_2_10_10_10;
        info.copy_sample_select = 4; // K01
        info.msaa = MsaaSamples::X2;
        info.coords.sample_count_log2_x = 0;
        info.coords.sample_count_log2_y = 1;
        info.coords.width = 1;
        info.coords.height = 1;
        info.dest_endian = 0;
        info.source_base_tiles = 0;
        info.surface_pitch_tiles = 1;

        let stats = copy_to_memory(&info, &edram, &mut mem);
        assert!(stats.supported);
        // Expected per-field: a=(0+2+1)/2=1, b=(100+200+1)/2=150, g=(200+300+1)/2=250, r=(300+400+1)/2=350
        let expected = pack(1, 150, 250, 350);
        // Read back as BE u32 (big-endian byte ordering).
        let bytes = [
            mem.read_u8(0x4000_0000),
            mem.read_u8(0x4000_0001),
            mem.read_u8(0x4000_0002),
            mem.read_u8(0x4000_0003),
        ];
        assert_eq!(bytes, expected.to_be_bytes());
    }

    /// End-to-end depth resolve: set up a depth RT at tile base 8, paint
    /// it via clear value, and verify the copy emerges in guest memory
    /// with the right bytes.
    #[test]
    fn depth_clear_resolve_end_to_end() {
        let mut mem = fresh_mem();
        let mut edram = ShadowEdram::new();
        // Paint the depth tiles directly with a known pattern.
        edram.fill_rect_32bpp(8, 1, 0, 0, 16, 8, 0x3FFF_FF00);

        let mut info = minimal_info(0x4000_0000, 16, 8);
        info.source = ResolveSource::Depth;
        info.source_format = 0; // kD24S8
        info.dest_format = color_format::K_24_8;
        info.dest_endian = 2; // k8in32
        info.source_base_tiles = 8;
        info.surface_pitch_tiles = 1;

        let stats = copy_to_memory(&info, &edram, &mut mem);
        assert!(stats.supported);
        assert_eq!(stats.samples_written, 16 * 8);

        // First pixel should be the endian-swapped pattern: BE-store of
        // 0x3FFF_FF00.swap_bytes() = 0x00FF_FF3F → bytes [0x00, 0xFF, 0xFF, 0x3F].
        let pitch_aligned = align_pitch_to_macro_tile(16);
        let off = tiled_2d_offset(0, 0, pitch_aligned, 2);
        let addr = 0x4000_0000u32.wrapping_add(off);
        assert_eq!(
            [
                mem.read_u8(addr),
                mem.read_u8(addr.wrapping_add(1)),
                mem.read_u8(addr.wrapping_add(2)),
                mem.read_u8(addr.wrapping_add(3)),
            ],
            [0x00, 0xFF, 0xFF, 0x3F]
        );
    }

    /// `sanitize_sample_select` for 1x MSAA collapses every select to K0.
    #[test]
    fn sanitize_1x_msaa_collapses_to_k0() {
        // Exhaustively cover all eight raw 3-bit select values.
        for raw in 0..=7u8 {
            let s = sanitize_sample_select(raw, MsaaSamples::X1, false);
            assert_eq!(s, CopySampleSelect::K0, "raw={raw}");
        }
    }

    /// 2x MSAA: k2→k0, k3→k1, k23→k01; depth averages sanitize to k0.
    #[test]
    fn sanitize_2x_msaa_obeys_canary_rules() {
        assert_eq!(
            sanitize_sample_select(2, MsaaSamples::X2, false),
            CopySampleSelect::K0
        );
        assert_eq!(
            sanitize_sample_select(3, MsaaSamples::X2, false),
            CopySampleSelect::K1
        );
        assert_eq!(
            sanitize_sample_select(5, MsaaSamples::X2, false),
            CopySampleSelect::K01
        );
        // Depth — no averaging.
        assert_eq!(
            sanitize_sample_select(4, MsaaSamples::X2, true),
            CopySampleSelect::K0
        );
        assert_eq!(
            sanitize_sample_select(6, MsaaSamples::X2, true),
            CopySampleSelect::K0
        );
    }

    /// 4x MSAA: single-samples untouched for color; depth averages
    /// collapse to a representative single sample (k0123 → k0).
+ #[test] + fn sanitize_4x_msaa_depth_collapses_averages() { + assert_eq!( + sanitize_sample_select(6, MsaaSamples::X4, true), + CopySampleSelect::K0 + ); + assert_eq!( + sanitize_sample_select(5, MsaaSamples::X4, true), + CopySampleSelect::K2 + ); + assert_eq!( + sanitize_sample_select(4, MsaaSamples::X4, true), + CopySampleSelect::K0 + ); + // Color keeps averages. + assert_eq!( + sanitize_sample_select(6, MsaaSamples::X4, false), + CopySampleSelect::K0123 + ); + } + + /// Sample offsets follow the standard Xbox 360 MSAA layout. + #[test] + fn sample_offset_layout() { + // 1x + assert_eq!(sample_offset_in_pixel(0, MsaaSamples::X1), (0, 0)); + // 2x + assert_eq!(sample_offset_in_pixel(0, MsaaSamples::X2), (0, 0)); + assert_eq!(sample_offset_in_pixel(1, MsaaSamples::X2), (0, 1)); + // 4x + assert_eq!(sample_offset_in_pixel(0, MsaaSamples::X4), (0, 0)); + assert_eq!(sample_offset_in_pixel(1, MsaaSamples::X4), (1, 0)); + assert_eq!(sample_offset_in_pixel(2, MsaaSamples::X4), (0, 1)); + assert_eq!(sample_offset_in_pixel(3, MsaaSamples::X4), (1, 1)); + } +} diff --git a/crates/xenia-gpu/src/ring_drain.rs b/crates/xenia-gpu/src/ring_drain.rs new file mode 100644 index 0000000..2bf4a67 --- /dev/null +++ b/crates/xenia-gpu/src/ring_drain.rs @@ -0,0 +1,169 @@ +//! Ring-buffer drainer. +//! +//! Walks a guest PM4 ring buffer from `start_offset` forward, classifying each +//! packet via [`crate::pm4`] and stopping when it either reaches the end of +//! the window it was asked to scan, walks off a NOP-fill region, or hits a +//! malformed header. +//! +//! It does **not** execute draws — that's deferred to a later phase. Its job +//! is to (a) advance the read pointer far enough that games keep making +//! progress, and (b) surface `PM4_XE_SWAP` packets so `VdSwap` can forward +//! them to the host UI. + +use xenia_memory::MemoryAccess; + +use crate::pm4::{self, PacketKind}; + +/// Outcome of a [`drain`] call. 
+#[derive(Default, Debug, Clone, Copy)] +pub struct DrainResult { + /// Dword offset reached, relative to the start of the ring buffer. + pub new_offset: u32, + /// How many packets were walked in this call. + pub packets_walked: u32, + /// True if we saw `PM4_XE_SWAP` during the walk. + pub swap_seen: bool, + /// If `swap_seen`, the guest frontbuffer *physical* address written next + /// to `PM4_XE_SWAP` (dword 2 of the 4-payload packet). + pub swap_frontbuffer_phys: u32, + /// If `swap_seen`, the width written at dword 3. + pub swap_width: u32, + /// If `swap_seen`, the height written at dword 4. + pub swap_height: u32, +} + +/// Walk `max_packets` packets starting at dword offset `start_offset` in the +/// ring buffer at guest address `ring_base` of size `ring_size_dwords`. +/// +/// The offset is treated modulo `ring_size_dwords`. Walking stops when: +/// - `max_packets` have been walked, +/// - a `PM4_XE_SWAP` has been consumed (the swap is reported and we stop so +/// the UI sees the frame boundary before further drain), +/// - a header's declared total size would exceed the remaining budget, +/// - the ring size is zero (drainer is a no-op). +pub fn drain( + mem: &M, + ring_base: u32, + ring_size_dwords: u32, + start_offset: u32, + max_packets: u32, +) -> DrainResult { + if ring_size_dwords == 0 || ring_base == 0 { + return DrainResult::default(); + } + let mut result = DrainResult { + new_offset: start_offset % ring_size_dwords, + ..DrainResult::default() + }; + let mut offset = result.new_offset; + for _ in 0..max_packets { + let header_addr = ring_base.wrapping_add(offset.wrapping_mul(4)); + let header = mem.read_u32(header_addr); + let packet = pm4::decode(header); + // Refuse to walk past the ring in a single packet. + if packet.total_dwords > ring_size_dwords { + break; + } + // Type-3 PM4_XE_SWAP → record payload and stop. + if let PacketKind::Type3 { opcode, .. 
} = packet.kind + && opcode == pm4::PM4_XE_SWAP { + // Payload layout (from canary VdSwap_entry): + // [0] XE_SWAP header + // [1] kSwapSignature ("XNEX" = 0x584E4558) + // [2] frontbuffer physical address + // [3] width + // [4] height + let payload = |i: u32| { + let addr = + ring_base.wrapping_add(((offset + i) % ring_size_dwords).wrapping_mul(4)); + mem.read_u32(addr) + }; + result.swap_seen = true; + result.swap_frontbuffer_phys = payload(2); + result.swap_width = payload(3); + result.swap_height = payload(4); + offset = (offset + packet.total_dwords) % ring_size_dwords; + result.new_offset = offset; + result.packets_walked += 1; + return result; + } + offset = (offset + packet.total_dwords) % ring_size_dwords; + result.new_offset = offset; + result.packets_walked += 1; + } + result +} + +#[cfg(test)] +mod tests { + use super::*; + use xenia_memory::GuestMemory; + use xenia_memory::page_table::MemoryProtect; + + fn build_mem() -> GuestMemory { + let mut mem = GuestMemory::new().unwrap(); + let rw = MemoryProtect::READ | MemoryProtect::WRITE; + mem.alloc(0x4000_0000, 0x1000, rw).unwrap(); + mem + } + + fn write_dword(mem: &GuestMemory, addr: u32, val: u32) { + mem.write_u32(addr, val); + } + + #[test] + fn walks_nops_until_budget_exhausted() { + let mut mem = build_mem(); + // Fill 10 dwords with Type-2 NOPs. + for i in 0..10 { + write_dword(&mut mem, 0x4000_0000 + i * 4, 0x8000_0000); + } + let r = drain(&mem, 0x4000_0000, 0x400, 0, 5); + assert_eq!(r.packets_walked, 5); + assert_eq!(r.new_offset, 5); + assert!(!r.swap_seen); + } + + #[test] + fn stops_at_swap_and_reports_payload() { + let mut mem = build_mem(); + // Two NOPs, then a PM4_XE_SWAP packet. 
+ write_dword(&mut mem, 0x4000_0000, 0x8000_0000); + write_dword(&mut mem, 0x4000_0004, 0x8000_0000); + // MakePacketType3(PM4_XE_SWAP, 4) → (3<<30) | (3<<16) | (0x64<<8) + let swap_hdr = (3u32 << 30) | (3u32 << 16) | ((pm4::PM4_XE_SWAP as u32) << 8); + write_dword(&mut mem, 0x4000_0008, swap_hdr); + write_dword(&mut mem, 0x4000_000C, pm4::SWAP_SIGNATURE); + write_dword(&mut mem, 0x4000_0010, 0xDEAD_F000); // frontbuffer phys + write_dword(&mut mem, 0x4000_0014, 1280); + write_dword(&mut mem, 0x4000_0018, 720); + let r = drain(&mem, 0x4000_0000, 0x400, 0, 16); + assert!(r.swap_seen); + assert_eq!(r.swap_frontbuffer_phys, 0xDEAD_F000); + assert_eq!(r.swap_width, 1280); + assert_eq!(r.swap_height, 720); + assert_eq!(r.packets_walked, 3); + assert_eq!(r.new_offset, 7); // 2 NOPs (1 dword each) + 5-dword swap = 7 + } + + #[test] + fn wraps_around_ring() { + let mut mem = build_mem(); + // Ring size = 4 dwords. Start at offset 3 (last dword). Write a NOP + // there, then the walker should wrap to offset 0. + write_dword(&mut mem, 0x4000_000C, 0x8000_0000); + write_dword(&mut mem, 0x4000_0000, 0x8000_0000); + let r = drain(&mem, 0x4000_0000, 4, 3, 2); + assert_eq!(r.packets_walked, 2); + assert_eq!(r.new_offset, 1); + } + + #[test] + fn zero_ring_size_is_noop() { + let mem = build_mem(); + let r = drain(&mem, 0x4000_0000, 0, 0, 10); + assert_eq!(r.packets_walked, 0); + assert_eq!(r.new_offset, 0); + assert!(!r.swap_seen); + } +} diff --git a/crates/xenia-gpu/src/ring_view.rs b/crates/xenia-gpu/src/ring_view.rs new file mode 100644 index 0000000..a164d73 --- /dev/null +++ b/crates/xenia-gpu/src/ring_view.rs @@ -0,0 +1,123 @@ +//! Primary ring buffer view. +//! +//! Games allocate a ring buffer in physical memory (via +//! `MmAllocatePhysicalMemoryEx` with WRITE_COMBINE), then hand the base +//! address + log2(size) to `VdInitializeRingBuffer`. They subsequently push +//! PM4 packets into it, advancing the write-pointer by writing to a GPU +//! 
/// Tracks the primary ring buffer as set up by the guest.
///
/// The GPU consumes packets from `read_offset_dwords` up to (but not past)
/// the write pointer. After consuming data it mirrors the read offset into
/// the guest-memory address registered by `VdEnableRingBufferRPtrWriteBack`
/// so the game can tell how much of the ring has been consumed.
#[derive(Debug, Clone, Copy, Default)]
pub struct RingBufferView {
    /// Guest physical/virtual base address. `0` means uninitialized.
    pub base: u32,
    /// Size of the ring in dwords. `0` means uninitialized.
    pub size_dwords: u32,
    /// Dword offset the GPU has consumed up to (relative to `base`).
    pub read_offset_dwords: u32,
    /// Dword offset the guest has last written into (relative to `base`).
    /// Updated either by an MMIO write to `CP_RB_WPTR` or by the kernel
    /// (`VdSwap` is a hint — the game reserves a 64-dword slot in the ring
    /// for it).
    pub write_offset_dwords: u32,
    /// Guest address where we mirror `read_offset_dwords` each time we make
    /// progress. `0` if the game never called `VdEnableRingBufferRPtrWriteBack`.
    pub rptr_writeback_addr: u32,
    /// Write-back block granularity in dwords (from the `log2` arg to
    /// `VdEnableRingBufferRPtrWriteBack`). We always write back eagerly, so
    /// we don't actually use this for scheduling — kept for observability.
    pub rptr_writeback_block_dwords: u32,
}

impl RingBufferView {
    /// Creates an uninitialized view (every field zero).
    pub fn new() -> Self {
        Self::default()
    }

    /// True if the guest has provided a base + size.
    pub fn is_initialized(&self) -> bool {
        self.base != 0 && self.size_dwords != 0
    }

    /// True if there is pending unread data to consume.
    pub fn has_pending(&self) -> bool {
        self.is_initialized() && self.read_offset_dwords != self.write_offset_dwords
    }

    /// Number of dwords we can consume in one contiguous chunk: up to the
    /// write pointer, or up to the end of the ring when the write pointer has
    /// wrapped behind the read pointer. Returns `0` when uninitialized.
    pub fn pending_dwords(&self) -> u32 {
        if !self.is_initialized() {
            return 0;
        }
        if self.write_offset_dwords >= self.read_offset_dwords {
            self.write_offset_dwords - self.read_offset_dwords
        } else {
            // Write has wrapped — we can read up to the end of the ring.
            // The fields are public, so a read offset beyond the ring size is
            // representable; saturate instead of underflowing in that case.
            self.size_dwords.saturating_sub(self.read_offset_dwords)
        }
    }

    /// Advance the read pointer by `dwords`, wrapping at `size_dwords`.
    /// Does nothing while the ring size is zero.
    pub fn advance_read(&mut self, dwords: u32) {
        if self.size_dwords == 0 {
            return;
        }
        self.read_offset_dwords = self.offset_past_read(dwords);
    }

    /// Guest address of the dword `offset_dwords` past the current read
    /// pointer, wrapping at the end of the ring. `None` if uninitialized.
    pub fn addr_at_offset(&self, offset_dwords: u32) -> Option<u32> {
        if !self.is_initialized() {
            return None;
        }
        let off = self.offset_past_read(offset_dwords);
        Some(self.base.wrapping_add(off.wrapping_mul(4)))
    }

    /// Ring-relative offset `dwords` past the read pointer, modulo the ring
    /// size. Callers must have checked `size_dwords != 0`.
    fn offset_past_read(&self, dwords: u32) -> u32 {
        // Sum in u64: two u32 values can exceed u32::MAX before the modulo.
        let sum = u64::from(self.read_offset_dwords) + u64::from(dwords);
        // The remainder is < size_dwords <= u32::MAX, so the cast is lossless.
        (sum % u64::from(self.size_dwords)) as u32
    }
}
+ assert_eq!(v.pending_dwords(), 2); + } + + #[test] + fn addr_at_offset_wraps() { + let mut v = RingBufferView::new(); + v.base = 0x4000_0000; + v.size_dwords = 4; + v.read_offset_dwords = 3; + assert_eq!(v.addr_at_offset(0), Some(0x4000_000C)); + assert_eq!(v.addr_at_offset(1), Some(0x4000_0000)); + assert_eq!(v.addr_at_offset(2), Some(0x4000_0004)); + } +} diff --git a/crates/xenia-gpu/src/shader_metrics.rs b/crates/xenia-gpu/src/shader_metrics.rs new file mode 100644 index 0000000..7aabe5e --- /dev/null +++ b/crates/xenia-gpu/src/shader_metrics.rs @@ -0,0 +1,350 @@ +//! Host-side static analysis over a [`ParsedShader`], emitted once per unique +//! shader blob. Produces the observability the plan's P3b/P3c sections call +//! for (`gpu.shader.interpret{stage,kind}` + `gpu.shader.reject{reason}`), so +//! the HUD can show when a game is reaching ops the WGSL interpreter falls +//! back on. +//! +//! Analysis is intentionally cheap: it scans each exec clause's instruction +//! triples, classifies them as ALU / vertex-fetch / texture-fetch using the +//! owning clause's sequence bitmap, and bumps counters accordingly. No GPU +//! readback is required — `reject` reasons are inferred from opcode values +//! alone. + +use metrics::counter; + +use crate::ucode::alu::{decode_alu, sop, vop}; +use crate::ucode::control_flow::ControlFlowInstruction; +use crate::ucode::fetch::{FetchInstruction, decode_fetch}; +use crate::ucode::ParsedShader; + +/// Walk `parsed` once and emit `gpu.shader.interpret` + `gpu.shader.reject` +/// counters. `stage` should be `"vs"` or `"ps"`. +pub fn emit_for(parsed: &ParsedShader, stage: &'static str) { + let mut alu_count: u64 = 0; + let mut vfetch_count: u64 = 0; + let mut tfetch_count: u64 = 0; + let mut rejects: Vec<(&'static str, u64)> = Vec::new(); + + let mut features: Vec<&'static str> = Vec::new(); + for clause in &parsed.cf { + match clause { + ControlFlowInstruction::Exec { + address, + count, + sequence, + .. 
+ } => { + for i in 0..(*count as usize) { + let triple_idx = *address as usize + i; + let base = triple_idx * 3; + if base + 2 >= parsed.instructions.len() { + break; + } + let words = [ + parsed.instructions[base], + parsed.instructions[base + 1], + parsed.instructions[base + 2], + ]; + // sequence bit layout: 2 bits per triple, hi bit = is-fetch. + let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0; + if is_fetch { + match decode_fetch(words) { + FetchInstruction::Vertex(_) => vfetch_count += 1, + FetchInstruction::Texture(tf) => { + tfetch_count += 1; + match tf.dimension { + 0 => mark_feature(&mut features, "tfetch_1d"), + 2 => mark_feature(&mut features, "tfetch_3d"), + 3 => mark_feature(&mut features, "tfetch_cube"), + _ => {} + } + if tf.dimension != 1 { + bump(&mut rejects, "texfetch_dimension"); + } + } + FetchInstruction::Unknown { .. } => { + bump(&mut rejects, "fetch_unknown"); + } + } + } else { + alu_count += 1; + let alu = decode_alu(words); + if !vec_op_supported(alu.vector_opcode) { + bump(&mut rejects, "alu_vec_unsupported"); + } + if !scl_op_supported(alu.scalar_opcode) { + bump(&mut rejects, "alu_scl_unsupported"); + } + // Feature-of-interest detection for future phases. + // Transcendentals + kill + setp + cube/max4 are the + // high-value signals: they tell us which of the + // deferred capabilities Sylpheed actually exercises. 
+ match alu.vector_opcode { + v if v == vop::CUBE => mark_feature(&mut features, "vec_cube"), + v if v == vop::MAX4 => mark_feature(&mut features, "vec_max4"), + v if v == vop::KILL_EQ + || v == vop::KILL_GT + || v == vop::KILL_GE + || v == vop::KILL_NE => + { + mark_feature(&mut features, "vec_kill"); + } + v if v == vop::CND_EQ + || v == vop::CND_GE + || v == vop::CND_GT => + { + mark_feature(&mut features, "vec_cnd"); + } + _ => {} + } + match alu.scalar_opcode { + s if s == sop::EXP + || s == sop::LOG + || s == sop::LOGC + || s == sop::SIN + || s == sop::COS => + { + mark_feature(&mut features, "scl_transcendental"); + } + s if s == sop::RSQ + || s == sop::RSQC + || s == sop::RSQF + || s == sop::SQRT => + { + mark_feature(&mut features, "scl_sqrt_family"); + } + s if s == sop::SETP_EQ + || s == sop::SETP_NE + || s == sop::SETP_GT + || s == sop::SETP_GE + || s == sop::SETP_INV + || s == sop::SETP_POP + || s == sop::SETP_CLR + || s == sop::SETP_RSTR => + { + mark_feature(&mut features, "scl_setp"); + } + s if s == sop::KILLS_EQ + || s == sop::KILLS_GT + || s == sop::KILLS_GE + || s == sop::KILLS_NE + || s == sop::KILLS_ONE => + { + mark_feature(&mut features, "scl_kills"); + } + _ => {} + } + if alu.predicated { + mark_feature(&mut features, "alu_predicated"); + } + } + } + } + ControlFlowInstruction::LoopStart { .. } + | ControlFlowInstruction::LoopEnd { .. } => { + mark_feature(&mut features, "cf_loop"); + bump(&mut rejects, "cf_loop"); + } + ControlFlowInstruction::CondJmp { .. } => { + mark_feature(&mut features, "cf_cond_jmp"); + bump(&mut rejects, "cf_cond_jmp"); + } + ControlFlowInstruction::CondCall { .. } | ControlFlowInstruction::Return => { + mark_feature(&mut features, "cf_call_return"); + bump(&mut rejects, "cf_call_return"); + } + ControlFlowInstruction::Unknown { .. 
} => { + bump(&mut rejects, "cf_unknown"); + } + _ => {} + } + } + + counter!("gpu.shader.interpret", "stage" => stage, "kind" => "alu") + .increment(alu_count); + counter!("gpu.shader.interpret", "stage" => stage, "kind" => "vfetch") + .increment(vfetch_count); + counter!("gpu.shader.interpret", "stage" => stage, "kind" => "tfetch") + .increment(tfetch_count); + for (reason, n) in rejects { + counter!("gpu.shader.reject", "stage" => stage, "reason" => reason).increment(n); + } + for name in features { + counter!("gpu.feature.used", "stage" => stage, "name" => name).increment(1); + } +} + +fn mark_feature(buf: &mut Vec<&'static str>, name: &'static str) { + if !buf.contains(&name) { + buf.push(name); + } +} + +fn bump(buf: &mut Vec<(&'static str, u64)>, reason: &'static str) { + for entry in buf.iter_mut() { + if entry.0 == reason { + entry.1 += 1; + return; + } + } + buf.push((reason, 1)); +} + +fn vec_op_supported(op: u8) -> bool { + matches!( + op, + vop::ADD + | vop::MUL + | vop::MAX + | vop::MIN + | vop::SEQ + | vop::SGT + | vop::SGE + | vop::SNE + | vop::FRC + | vop::TRUNC + | vop::FLOOR + | vop::MAD + | vop::CND_EQ + | vop::CND_GE + | vop::CND_GT + | vop::DOT4 + | vop::DOT3 + | vop::DOT2_ADD + | vop::MAX4 + | vop::KILL_EQ + | vop::KILL_GT + | vop::KILL_GE + | vop::KILL_NE + | vop::DST + ) +} + +fn scl_op_supported(op: u8) -> bool { + matches!( + op, + sop::ADDS + | sop::ADDS_PREV + | sop::MULS + | sop::MULS_PREV + | sop::MAXS + | sop::MINS + | sop::SEQS + | sop::SGTS + | sop::SGES + | sop::SNES + | sop::FRCS + | sop::TRUNCS + | sop::FLOORS + | sop::EXP + | sop::LOG + | sop::LOGC + | sop::RCP + | sop::RCPC + | sop::RCPF + | sop::RSQ + | sop::RSQC + | sop::RSQF + | sop::SQRT + | sop::SUBS + | sop::SUBS_PREV + | sop::SETP_EQ + | sop::SETP_NE + | sop::SETP_GT + | sop::SETP_GE + | sop::SETP_INV + | sop::SETP_POP + | sop::SETP_CLR + | sop::SETP_RSTR + | sop::KILLS_EQ + | sop::KILLS_GT + | sop::KILLS_GE + | sop::KILLS_NE + | sop::KILLS_ONE + | sop::SIN + | sop::COS 
+ | sop::RETAIN_PREV + ) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ucode::alu::{sop, vop}; + use crate::ucode::control_flow::ControlFlowInstruction; + + /// Build a minimal `ParsedShader` with one `Exec` clause containing + /// `count` ALU triples and assert the `alu` counter path works. + #[test] + fn emit_for_runs_on_synthetic_shader() { + let alu_w2 = (vop::ADD as u32) | ((sop::ADDS as u32) << 6) | (0xF << 12); + let shader = ParsedShader { + cf: vec![ + ControlFlowInstruction::Exec { + address: 0, + count: 2, + sequence: 0, // all ALU (no is-fetch bits) + is_end: false, + predicated: false, + predicate_condition: false, + }, + ControlFlowInstruction::Exit, + ], + instructions: vec![0, 0, alu_w2, 0, 0, alu_w2], + }; + // Just smoke: doesn't panic. Counters are validated via metrics + // exporters elsewhere; we only assert this doesn't throw on a + // well-formed ParsedShader. + emit_for(&shader, "vs"); + } + + /// P8: a shader containing `LoopStart` should mark `cf_loop` as used + /// so the HUD can surface which deferred feature a game triggers. + #[test] + fn feature_detection_flags_loops_and_kills() { + let kill_alu_w2 = + (vop::KILL_EQ as u32) | ((sop::RETAIN_PREV as u32) << 6) | (0xF << 12); + let shader = ParsedShader { + cf: vec![ + ControlFlowInstruction::LoopStart { + address: 0, + loop_id: 0, + }, + ControlFlowInstruction::Exec { + address: 0, + count: 1, + sequence: 0, + is_end: true, + predicated: false, + predicate_condition: false, + }, + ], + instructions: vec![0, 0, kill_alu_w2], + }; + // Smoke: emits cleanly. + emit_for(&shader, "ps"); + } + + #[test] + fn unsupported_ops_classified_as_rejects() { + // Opcode 63 is outside our supported sets for both pipes. 
+ let alu_w2 = 63u32 | (63u32 << 6) | (0xF << 12); + let shader = ParsedShader { + cf: vec![ + ControlFlowInstruction::Exec { + address: 0, + count: 1, + sequence: 0, + is_end: true, + predicated: false, + predicate_condition: false, + }, + ], + instructions: vec![0, 0, alu_w2], + }; + // Again: smoke — but also confirm our static tables reject op 63. + assert!(!vec_op_supported(63)); + assert!(!scl_op_supported(63)); + emit_for(&shader, "ps"); + } +} diff --git a/crates/xenia-gpu/src/shaders/mod.rs b/crates/xenia-gpu/src/shaders/mod.rs new file mode 100644 index 0000000..fcd5bef --- /dev/null +++ b/crates/xenia-gpu/src/shaders/mod.rs @@ -0,0 +1,36 @@ +//! Embedded WGSL shader sources used by the host pipeline. + +/// Xenos uber-shader scaffold (P3). See the comment at the top of +/// [`XENOS_INTERP_WGSL`] for scope/limits and the plan's P3b target state. +pub const XENOS_INTERP_WGSL: &str = include_str!("xenos_interp.wgsl"); + +#[cfg(test)] +mod tests { + use super::*; + + /// Parsing through naga validates the shader against WGSL spec + wgpu's + /// type system. We don't need a full pipeline to catch typos and layout + /// mistakes — this test is fast and catches regressions at `cargo test` + /// time. + #[test] + fn xenos_interp_wgsl_parses() { + let module = naga::front::wgsl::parse_str(XENOS_INTERP_WGSL) + .expect("xenos_interp.wgsl must parse cleanly"); + // Sanity: we declared two entry points. 
+ assert!(!module.entry_points.is_empty()); + assert!( + module + .entry_points + .iter() + .any(|e| e.name == "vs_main" && e.stage == naga::ShaderStage::Vertex), + "missing vs_main entry" + ); + assert!( + module + .entry_points + .iter() + .any(|e| e.name == "fs_main" && e.stage == naga::ShaderStage::Fragment), + "missing fs_main entry" + ); + } +} diff --git a/crates/xenia-gpu/src/shaders/xenos_interp.wgsl b/crates/xenia-gpu/src/shaders/xenos_interp.wgsl new file mode 100644 index 0000000..8fcab3e --- /dev/null +++ b/crates/xenia-gpu/src/shaders/xenos_interp.wgsl @@ -0,0 +1,974 @@ +// xenia-rs Xenos runtime microcode interpreter — P3b WGSL. +// +// Bindings (stable across P3b milestones): +// @group(0) @binding(0) draw_ctx (uniform, 16 B — XenosDrawConstants) +// @group(0) @binding(1) xenos_consts (uniform, ~9.2 KB — XenosConstants) +// @group(0) @binding(2) vs_ucode (storage, packed VS shader) +// @group(0) @binding(3) ps_ucode (storage, packed PS shader) +// @group(0) @binding(4) vertex_buffer (storage, raw guest VB dwords) +// +// Packed shader layout (both vs_ucode & ps_ucode): +// [0] = cf_count +// [1 .. 1 + cf_count*3] = CF table: (kind, primary, aux) × cf_count +// [1 + cf_count*3 ..] = instruction triples (3 dwords each) +// +// M3 state (this file): CF walker + operand decode helpers + register file +// scaffold are complete. ALU / fetch bodies are still stubs that fall back +// to the procedural-circle visualisation; M4-M7 fill them in. 
+ +struct XenosDrawConstants { + draw_index: u32, + vertex_count: u32, + prim_kind: u32, + _pad: u32, +}; + +struct XenosConstants { + alu: array, 512>, + fetch: array, + bool_consts: array, + loop_consts: array, +}; + +@group(0) @binding(0) var draw_ctx : XenosDrawConstants; +// `xenos_consts` is a read-only storage buffer (not uniform) because the +// block contains tightly-packed `array` fields — WGSL's uniform +// address space requires 16-byte element stride, which would triple the +// allocation; storage accepts the natural 4-byte stride. +@group(0) @binding(1) var xenos_consts : XenosConstants; +@group(0) @binding(2) var vs_ucode : array; +@group(0) @binding(3) var ps_ucode : array; +@group(0) @binding(4) var vertex_buffer : array; + +// M6 — texture fetch stub. Group 1 is a single 1×1 magenta placeholder for +// all texture slots; the P5 texture cache will replace this with per-slot +// bindings. +@group(1) @binding(0) var xenos_tex : texture_2d; +@group(1) @binding(1) var xenos_samp : sampler; + +// ── CF kind codes; must match `xenia_gpu::ucode::cf_kind`. ───────────── +const CF_KIND_EXEC: u32 = 0u; +const CF_KIND_EXEC_END: u32 = 1u; +const CF_KIND_ALLOC: u32 = 2u; +const CF_KIND_EXIT: u32 = 3u; +const CF_KIND_LOOP_START: u32 = 4u; +const CF_KIND_LOOP_END: u32 = 5u; +const CF_KIND_COND_JMP: u32 = 6u; +const CF_KIND_COND_CALL: u32 = 7u; +const CF_KIND_RETURN: u32 = 8u; +const CF_KIND_UNKNOWN: u32 = 15u; + +// ── Alloc-kind codes (mirrors `xenia_gpu::ucode::cf_alloc_kind`). ────── +const ALLOC_KIND_POSITION: u32 = 0u; +const ALLOC_KIND_INTERPOLATORS: u32 = 1u; +const ALLOC_KIND_COLORS: u32 = 2u; + +// Per-invocation Xenos register file + scalar `ps` + predicate. +var registers: array, 128>; +var ps: f32; +var predicate: bool; + +// Currently-active export alloc kind; set by Alloc clauses. +var current_alloc: u32; + +// P3c additions: +// `kill_flag` — set by vector/scalar kill ops; PS calls `discard` after the +// interpreter exits. 
(`discard` inside a helper function is +// allowed in WGSL, but keeping it at the entry level makes +// control flow easier to read.) +// `loop_depth`/`loop_counters` — bookkeeping for LoopStart/LoopEnd CF +// clauses. Xenos supports up to 4 nested loops. +// `reject_mask` — bitfield of op categories we failed to interpret, so the +// PS fallback color + host-side diagnostics can surface it. +var kill_flag: bool; +var loop_depth: u32; +var loop_counters: array; +var loop_starts: array; +var reject_mask: u32; + +const REJECT_ALU_VEC: u32 = 1u; +const REJECT_ALU_SCL: u32 = 2u; +const REJECT_TEX_NON2D: u32 = 4u; +const REJECT_VFETCH_FMT: u32 = 8u; +const REJECT_CF_JUMP: u32 = 16u; +const REJECT_CF_CALL: u32 = 32u; +const REJECT_LOOP_OVERFLOW:u32 = 64u; + +struct VsOut { + @builtin(position) position: vec4, + @location(0) color: vec4, +}; + +struct FsOut { + @location(0) color0: vec4, +}; + +// ── CF helpers: read (kind,primary,aux) from a packed ucode storage buffer. + +fn vs_cf_count() -> u32 { return vs_ucode[0]; } +fn ps_cf_count() -> u32 { return ps_ucode[0]; } + +fn vs_cf_kind(i: u32) -> u32 { return vs_ucode[1u + i * 3u + 0u] & 0xFFu; } +fn vs_cf_primary(i: u32) -> u32 { return vs_ucode[1u + i * 3u + 1u]; } +fn vs_cf_aux(i: u32) -> u32 { return vs_ucode[1u + i * 3u + 2u]; } +fn ps_cf_kind(i: u32) -> u32 { return ps_ucode[1u + i * 3u + 0u] & 0xFFu; } +fn ps_cf_primary(i: u32) -> u32 { return ps_ucode[1u + i * 3u + 1u]; } +fn ps_cf_aux(i: u32) -> u32 { return ps_ucode[1u + i * 3u + 2u]; } + +fn vs_instr_base() -> u32 { return 1u + vs_cf_count() * 3u; } +fn ps_instr_base() -> u32 { return 1u + ps_cf_count() * 3u; } + +// Fetch one of the 3 dwords of an ALU/fetch triple at triple index `t`. +fn vs_instr_dword(t: u32, which: u32) -> u32 { + return vs_ucode[vs_instr_base() + t * 3u + which]; +} +fn ps_instr_dword(t: u32, which: u32) -> u32 { + return ps_ucode[ps_instr_base() + t * 3u + which]; +} + +// ── Operand helpers: M3 scaffold; M4 wires them into interpret_alu. 

// Reorder the lanes of `v`. `swizzle` is 8 bits: 2 bits per output lane in
// xyzw order, each selecting the source lane (0 = x … 3 = w).
fn apply_swizzle(v: vec4<f32>, swizzle: u32) -> vec4<f32> {
    let sx = (swizzle >> 0u) & 3u;
    let sy = (swizzle >> 2u) & 3u;
    let sz = (swizzle >> 4u) & 3u;
    let sw = (swizzle >> 6u) & 3u;
    return vec4<f32>(v[sx], v[sy], v[sz], v[sw]);
}

// Apply the source-operand modifiers. The absolute value is taken first, so
// `negate` together with `take_abs` yields -|v|.
fn apply_modifiers(v: vec4<f32>, negate: bool, take_abs: bool) -> vec4<f32> {
    var r = v;
    if take_abs {
        r = abs(r);
    }
    if negate {
        r = -r;
    }
    return r;
}

// Write `value` into register `idx`, touching only the lanes whose bit is set
// in `mask` (bit 0 = x, 1 = y, 2 = z, 3 = w). Writes to indices outside the
// 128-entry register file are ignored.
fn write_reg_masked(idx: u32, mask: u32, value: vec4<f32>) {
    if idx >= 128u {
        return;
    }
    let prev = registers[idx];
    var out = prev;
    if (mask & 1u) != 0u { out.x = value.x; }
    if (mask & 2u) != 0u { out.y = value.y; }
    if (mask & 4u) != 0u { out.z = value.z; }
    if (mask & 8u) != 0u { out.w = value.w; }
    registers[idx] = out;
}

// ── Xenos ALU opcodes. Values match canary's `AluVectorOpcode` and
// `AluScalarOpcode` enums in `ucode.h:1001,1354` (also mirrored in
// `xenia_gpu::ucode::alu::{vop,sop}`).
const VOP_ADD: u32 = 0u;
const VOP_MUL: u32 = 1u;
const VOP_MAX: u32 = 2u;
const VOP_MIN: u32 = 3u;
const VOP_SEQ: u32 = 4u;
const VOP_SGT: u32 = 5u;
const VOP_SGE: u32 = 6u;
const VOP_SNE: u32 = 7u;
const VOP_FRC: u32 = 8u;
const VOP_TRUNC: u32 = 9u;
const VOP_FLOOR: u32 = 10u;
const VOP_MAD: u32 = 11u;
const VOP_CND_EQ: u32 = 12u;
const VOP_CND_GE: u32 = 13u;
const VOP_CND_GT: u32 = 14u;
const VOP_DP4: u32 = 15u;
const VOP_DP3: u32 = 16u;
const VOP_DP2_ADD: u32 = 17u;
const VOP_CUBE: u32 = 18u;
const VOP_MAX4: u32 = 19u;
const VOP_KILL_EQ: u32 = 24u;
const VOP_KILL_GT: u32 = 25u;
const VOP_KILL_GE: u32 = 26u;
const VOP_KILL_NE: u32 = 27u;
const VOP_DST: u32 = 28u;
const VOP_MAX_A: u32 = 29u;

const SOP_ADDS: u32 = 0u;
const SOP_ADDS_PREV: u32 = 1u;
const SOP_MULS: u32 = 2u;
const SOP_MULS_PREV: u32 = 3u;
const SOP_MAXS: u32 = 5u;
const SOP_MINS: u32 = 6u;
const SOP_SEQS: u32 = 7u;
const SOP_SGTS: u32 = 8u;
const SOP_SGES: u32 = 9u;
const SOP_SNES: u32 = 10u;
const SOP_FRCS: u32 = 11u;
const SOP_TRUNCS: u32 = 12u;
const SOP_FLOORS: u32 = 13u;
const SOP_EXP: u32 = 14u;
const SOP_LOGC: u32 = 15u;
const SOP_LOG: u32 = 16u;
const SOP_RCPC: u32 = 17u;
const SOP_RCPF: u32 = 18u;
const SOP_RCP: u32 = 19u;
const SOP_RSQC: u32 = 20u;
const SOP_RSQF: u32 = 21u;
const SOP_RSQ: u32 = 22u;
const SOP_SUBS: u32 = 25u;
const SOP_SUBS_PREV: u32 = 26u;
const SOP_SETP_EQ: u32 = 27u;
const SOP_SETP_NE: u32 = 28u;
const SOP_SETP_GT: u32 = 29u;
const SOP_SETP_GE: u32 = 30u;
const SOP_SETP_INV: u32 = 31u;
const SOP_SETP_POP: u32 = 32u;
const SOP_SETP_CLR: u32 = 33u;
const SOP_SETP_RSTR: u32 = 34u;
const SOP_KILLS_EQ: u32 = 35u;
const SOP_KILLS_GT: u32 = 36u;
const SOP_KILLS_GE: u32 = 37u;
const SOP_KILLS_NE: u32 = 38u;
const SOP_KILLS_ONE: u32 = 39u;
const SOP_SQRT: u32 = 40u;
const SOP_SIN: u32 = 48u;
const SOP_COS: u32 = 49u;
const SOP_RETAIN_PREV: u32 = 50u;

// Read a vec4 source from the register file. Treats the src index as a
// direct r# reference (ignores c# selector + swizzle/modifiers for MVP).
// M4+ will extend this to decode the full operand header.
fn read_src(idx: u32) -> vec4<f32> {
    // Only the low 7 bits are used, so the index always lands in 0..127.
    return registers[idx & 0x7Fu];
}

// Result of a vector kill op: raises `kill_flag` when any lane matched and
// returns all-ones in that case, all-zeros otherwise.
fn vector_kill_result(hit: bool) -> vec4<f32> {
    if hit {
        kill_flag = true;
    }
    return vec4<f32>(select(0.0, 1.0, hit));
}

// Vector op executor: evaluates opcode `op` on sources `a`, `b`, `c` and
// returns the value for the vector destination. Kill ops also set
// `kill_flag`; unsupported ops set `REJECT_ALU_VEC` in `reject_mask`.
fn exec_vector_op(op: u32, a: vec4<f32>, b: vec4<f32>, c: vec4<f32>) -> vec4<f32> {
    let zero = vec4<f32>(0.0);
    let one = vec4<f32>(1.0);
    switch op {
        case VOP_ADD: { return a + b; }
        case VOP_MUL: { return a * b; }
        case VOP_MAX: { return max(a, b); }
        case VOP_MIN: { return min(a, b); }
        // Per-lane comparisons produce 1.0 / 0.0.
        case VOP_SEQ: { return select(zero, one, a == b); }
        case VOP_SGT: { return select(zero, one, a > b); }
        case VOP_SGE: { return select(zero, one, a >= b); }
        case VOP_SNE: { return select(zero, one, a != b); }
        case VOP_FRC: { return fract(a); }
        case VOP_TRUNC: { return trunc(a); }
        case VOP_FLOOR: { return floor(a); }
        case VOP_MAD: { return a * b + c; }
        // dst = (src0 <cmp> 0) ? src1 : src2, per lane.
        case VOP_CND_EQ: { return select(c, b, a == zero); }
        case VOP_CND_GE: { return select(c, b, a >= zero); }
        case VOP_CND_GT: { return select(c, b, a > zero); }
        case VOP_DP4: { return vec4<f32>(dot(a, b)); }
        case VOP_DP3: { return vec4<f32>(dot(a.xyz, b.xyz)); }
        case VOP_DP2_ADD: { return vec4<f32>(a.x * b.x + a.y * b.y + c.x); }
        case VOP_MAX4: { return vec4<f32>(max(max(a.x, a.y), max(a.z, a.w))); }
        // Kill when the comparison holds on any lane.
        case VOP_KILL_EQ: { return vector_kill_result(any(a == b)); }
        case VOP_KILL_GT: { return vector_kill_result(any(a > b)); }
        case VOP_KILL_GE: { return vector_kill_result(any(a >= b)); }
        case VOP_KILL_NE: { return vector_kill_result(any(a != b)); }
        case VOP_DST: {
            // dest = (1, src0.y * src1.y, src0.z, src1.w)
            return vec4<f32>(1.0, a.y * b.y, a.z, b.w);
        }
        case VOP_CUBE, VOP_MAX_A: {
            // Cube face projection + MAX+AR are rare in P3c's target set;
            // forward to max() fallback so MAX_A degrades gracefully and
            // CUBE does *something* useful (max-axis selection).
            reject_mask |= REJECT_ALU_VEC;
            return max(a, b);
        }
        default: {
            // Unsupported — all-zero result + diagnostic flag.
            reject_mask |= REJECT_ALU_VEC;
            return zero;
        }
    }
}

// Store `cond` into `predicate` and return the matching setp destination
// value: 0.0 when the condition holds, 1.0 otherwise.
fn set_predicate(cond: bool) -> f32 {
    predicate = cond;
    return select(1.0, 0.0, cond);
}

// Result of a scalar kill op: raises `kill_flag` on a hit and returns 1.0,
// otherwise returns 0.0.
fn scalar_kill_result(hit: bool) -> f32 {
    if hit {
        kill_flag = true;
    }
    return select(0.0, 1.0, hit);
}

// Scalar op executor. Takes:
//   `src_a` / `src_b` — the two scalar source components (most ops use
//                       only `src_a`; binary ops — ADDS/MULS/MAXS/MINS/SUBS — use both).
//   `prev`            — current `ps` chain value.
// Kill/setp ops with side-effects also touch `kill_flag`/`predicate`.
// Unsupported ops set `REJECT_ALU_SCL` and return 0.0.
fn exec_scalar_op(op: u32, src_a: f32, src_b: f32, prev: f32) -> f32 {
    switch op {
        case SOP_ADDS: { return src_a + src_b; }
        case SOP_ADDS_PREV: { return src_a + prev; }
        case SOP_MULS: { return src_a * src_b; }
        case SOP_MULS_PREV: { return src_a * prev; }
        case SOP_SUBS: { return src_a - src_b; }
        case SOP_SUBS_PREV: { return src_a - prev; }
        case SOP_MAXS: { return max(src_a, src_b); }
        case SOP_MINS: { return min(src_a, src_b); }
        // Scalar set-on-compare ops test `src_a` against zero.
        case SOP_SEQS: { return select(0.0, 1.0, src_a == 0.0); }
        case SOP_SGTS: { return select(0.0, 1.0, src_a > 0.0); }
        case SOP_SGES: { return select(0.0, 1.0, src_a >= 0.0); }
        case SOP_SNES: { return select(0.0, 1.0, src_a != 0.0); }
        case SOP_FRCS: { return fract(src_a); }
        case SOP_TRUNCS: { return trunc(src_a); }
        case SOP_FLOORS: { return floor(src_a); }
        case SOP_EXP: {
            // exp/EXP_IEEE: pow(2, src). Canary says src==0 -> 1.0, which
            // WGSL's exp2 already produces for 0.
            return exp2(src_a);
        }
        case SOP_LOG, SOP_LOGC: {
            // Both variants compute log2; an input of exactly 1.0 is pinned
            // to 0.0. LOGC's clamp of -INF to -FLT_MAX (per canary) is not
            // implemented, and src <= 0 is passed straight to log2.
            return select(log2(src_a), 0.0, src_a == 1.0);
        }
        case SOP_RCP, SOP_RCPC, SOP_RCPF: {
            // Reciprocal with a src==0 guard (yields 0.0). The clamp
            // variants' INF/NaN handling is not distinguished here.
            return select(0.0, 1.0 / src_a, src_a != 0.0);
        }
        case SOP_RSQ, SOP_RSQC, SOP_RSQF: {
            // Non-positive inputs yield 0.0.
            return select(0.0, inverseSqrt(src_a), src_a > 0.0);
        }
        case SOP_SQRT: { return select(0.0, sqrt(src_a), src_a >= 0.0); }
        case SOP_SIN: { return sin(src_a); }
        case SOP_COS: { return cos(src_a); }
        // Predicate writes — update `predicate` and produce a result that
        // the surrounding ALU slot can still consume via `ps`.
        case SOP_SETP_EQ: { return set_predicate(src_a == 0.0); }
        case SOP_SETP_NE: { return set_predicate(src_a != 0.0); }
        case SOP_SETP_GT: { return set_predicate(src_a > 0.0); }
        case SOP_SETP_GE: { return set_predicate(src_a >= 0.0); }
        case SOP_SETP_INV: {
            if src_a == 1.0 {
                predicate = true;
                return 0.0;
            }
            predicate = false;
            return select(src_a, 1.0, src_a == 0.0);
        }
        case SOP_SETP_POP: {
            let popped = src_a - 1.0;
            if popped <= 0.0 {
                predicate = true;
                return 0.0;
            }
            predicate = false;
            return popped;
        }
        case SOP_SETP_CLR: {
            predicate = false;
            // FLT_MAX sentinel, spelled as its bit pattern.
            return bitcast<f32>(0x7F7FFFFFu);
        }
        case SOP_SETP_RSTR: {
            if src_a == 0.0 {
                predicate = true;
                return 0.0;
            }
            predicate = false;
            return src_a;
        }
        // Pixel kill — set kill_flag; the dest value is 1.0 / 0.0, and
        // `discard` runs at fragment exit when the flag is set.
        case SOP_KILLS_EQ: { return scalar_kill_result(src_a == 0.0); }
        case SOP_KILLS_GT: { return scalar_kill_result(src_a > 0.0); }
        case SOP_KILLS_GE: { return scalar_kill_result(src_a >= 0.0); }
        case SOP_KILLS_NE: { return scalar_kill_result(src_a != 0.0); }
        case SOP_KILLS_ONE: { return scalar_kill_result(src_a == 1.0); }
        case SOP_RETAIN_PREV: { return prev; }
        default: {
            reject_mask |= REJECT_ALU_SCL;
            return 0.0;
        }
    }
}

// Read dword `which` of instruction triple `t` from the stage's ucode buffer.
fn alu_instr_dword(t: u32, which: u32, is_vertex: bool) -> u32 {
    if is_vertex {
        return vs_instr_dword(t, which);
    }
    return ps_instr_dword(t, which);
}

// Execute one ALU triple: the vector op and the scalar op are both evaluated
// from the same pre-instruction source values, then written to their
// destinations under their write masks.
fn interpret_alu(t: u32, is_vertex: bool) {
    // `w1` holds per-operand swizzle + negate/abs/c-vs-r flags. The MVP
    // treats every source as a full r#, no modifiers — M4+ decodes it — so
    // only dwords 0 and 2 are read.
    let w0 = alu_instr_dword(t, 0u, is_vertex);
    let w2 = alu_instr_dword(t, 2u, is_vertex);

    // Field extraction matches `xenia_gpu::ucode::alu::decode_alu`.
    let vec_op = w2 & 0x3Fu;
    let scl_op = (w2 >> 6u) & 0x3Fu;
    let scl_wm = (w2 >> 8u) & 0xFu;
    let vec_wm = (w2 >> 12u) & 0xFu;
    let vec_dst = (w2 >> 16u) & 0x7Fu;
    let scl_dst = (w2 >> 24u) & 0x7Fu;
    let src_a = w0 & 0xFFu;
    let src_b = (w0 >> 8u) & 0xFFu;
    let src_c = (w0 >> 16u) & 0xFFu;
    let scalar_src_is_ps = ((w0 >> 26u) & 1u) != 0u;
    let predicated = ((w0 >> 27u) & 1u) != 0u;
    let predicate_condition = ((w0 >> 28u) & 1u) != 0u;

    // Honor per-instruction predicate: skip when predicated and the
    // predicate doesn't match the required condition.
    if predicated && (predicate != predicate_condition) {
        return;
    }

    // Vector pipe.
    let a = read_src(src_a);
    let b = read_src(src_b);
    let c = read_src(src_c);
    let vec_result = exec_vector_op(vec_op, a, b, c);
    if vec_wm != 0u {
        write_reg_masked(vec_dst, vec_wm, vec_result);
    }

    // Scalar pipe. Binary scalar ops read (src_a.x, src_b.x); ps-variants
    // read src_a.x and chain through the running `ps`. When `scalar_src_is_ps`
    // is set the operand selector chooses `ps` as the primary source.
    let scl_src_a = select(a.x, ps, scalar_src_is_ps);
    let new_ps = exec_scalar_op(scl_op, scl_src_a, b.x, ps);
    ps = new_ps;
    if scl_wm != 0u {
        write_reg_masked(scl_dst, scl_wm, vec4<f32>(new_ps));
    }
}

// Xenos VertexFormat values from `xenos.h:641`.
const VFMT_8_8_8_8: u32 = 6u;
const VFMT_2_10_10_10: u32 = 7u;
const VFMT_10_11_11: u32 = 16u;
const VFMT_11_11_10: u32 = 17u;
const VFMT_16_16: u32 = 25u;
const VFMT_16_16_16_16: u32 = 26u;
const VFMT_16_16_FLOAT: u32 = 31u;
const VFMT_16_16_16_16_FLOAT: u32 = 32u;
const VFMT_32: u32 = 33u;
const VFMT_32_32: u32 = 34u;
const VFMT_32_32_32_32: u32 = 35u;
const VFMT_32_FLOAT: u32 = 36u;
const VFMT_32_32_FLOAT: u32 = 37u;
const VFMT_32_32_32_32_FLOAT: u32 = 38u;
const VFMT_32_32_32_FLOAT: u32 = 57u;

// Decode vertex fetch instruction fields (canary's VertexFetchInstruction
// layout in `ucode.h:690`):
//   w0 [4:0]   opcode
//   w0 [10:5]  src_reg[5:0]
//   w0 [17:11] dst_reg[6:0] + must-be-one
//   w0 [21:17] const_index[4:0], [23:22] const_index_sel[1:0]
//   w1 [21:16] format[5:0]
//   w2 [7:0]   stride (in dwords)
//   w2 [30:8]  offset (signed, in dwords)
//
// NOTE(review): the w0 ranges above overlap at bit 17 and do not match what
// `interpret_vertex_fetch` actually extracts (fetch constant from w0[9:5],
// dst from w0[16:10], src from w0[23:17]) — confirm which one reflects the
// packed encoding the host emits.
//
// Pragmatic subset: we read format/stride; offset, exp_adjust, swizzle,
// sign/normalization flags are used for the most-common normalized-unsigned
// path. Rejects set `REJECT_VFETCH_FMT`.
// True when `dwords` consecutive elements starting at `addr` lie inside a
// buffer of `n` elements. Written without `addr + k` so that an address near
// the top of the u32 range cannot wrap around and pass the check.
fn vb_span_in_bounds(addr: u32, dwords: u32, n: u32) -> bool {
    return dwords <= n && addr <= n - dwords;
}

// Execute the vertex fetch in instruction triple `t` of the VS ucode: read
// one element of the selected format from `vertex_buffer` and store the
// decoded vec4 in the destination register. Out-of-bounds reads leave the
// default (0, 0, 0, 1); unhandled formats set `REJECT_VFETCH_FMT`.
fn interpret_vertex_fetch(t: u32) {
    let w0 = vs_instr_dword(t, 0u);
    let w1 = vs_instr_dword(t, 1u);
    let w2 = vs_instr_dword(t, 2u);
    let fetch_const = (w0 >> 5u) & 0x1Fu;
    let dst_reg = (w0 >> 10u) & 0x7Fu;
    let src_reg = (w0 >> 17u) & 0x7Fu;
    let format = (w1 >> 16u) & 0x3Fu;
    let stride = w2 & 0xFFu;

    // Vertex-fetch constant: dword 0 carries (type[1:0], address[31:2]);
    // dword 1 carries (endian[1:0], size[25:2]). Only dword 0 is used here.
    let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
    let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;

    // The vertex index comes from the source register's .x lane.
    let vidx = u32(registers[src_reg & 0x7Fu].x);
    // Element address in dwords (`vertex_buffer` is indexed per dword). A
    // zero stride falls back to 4 dwords per vertex.
    let effective_stride = select(stride, 4u, stride == 0u);
    let addr = base_dwords + vidx * effective_stride;

    let n = arrayLength(&vertex_buffer);
    var result: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);

    switch format {
        case VFMT_32_32_32_32_FLOAT: {
            if vb_span_in_bounds(addr, 4u, n) {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    bitcast<f32>(vertex_buffer[addr + 2u]),
                    bitcast<f32>(vertex_buffer[addr + 3u]),
                );
            }
        }
        case VFMT_32_32_32_FLOAT: {
            if vb_span_in_bounds(addr, 3u, n) {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    bitcast<f32>(vertex_buffer[addr + 2u]),
                    1.0,
                );
            }
        }
        case VFMT_32_32_FLOAT: {
            if vb_span_in_bounds(addr, 2u, n) {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    0.0,
                    1.0,
                );
            }
        }
        case VFMT_32_FLOAT: {
            if vb_span_in_bounds(addr, 1u, n) {
                result = vec4<f32>(bitcast<f32>(vertex_buffer[addr]), 0.0, 0.0, 1.0);
            }
        }
        case VFMT_8_8_8_8: {
            if vb_span_in_bounds(addr, 1u, n) {
                result = unpack4x8unorm(vertex_buffer[addr]);
            }
        }
        case VFMT_16_16_FLOAT: {
            if vb_span_in_bounds(addr, 1u, n) {
                let h = unpack2x16float(vertex_buffer[addr]);
                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
            }
        }
        case VFMT_16_16_16_16_FLOAT: {
            if vb_span_in_bounds(addr, 2u, n) {
                let h0 = unpack2x16float(vertex_buffer[addr]);
                let h1 = unpack2x16float(vertex_buffer[addr + 1u]);
                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
            }
        }
        case VFMT_16_16: {
            if vb_span_in_bounds(addr, 1u, n) {
                // Always decoded as signed normalized; the instruction's
                // sign/normalization flags are not consulted.
                let h = unpack2x16snorm(vertex_buffer[addr]);
                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
            }
        }
        case VFMT_16_16_16_16: {
            if vb_span_in_bounds(addr, 2u, n) {
                let h0 = unpack2x16snorm(vertex_buffer[addr]);
                let h1 = unpack2x16snorm(vertex_buffer[addr + 1u]);
                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
            }
        }
        case VFMT_2_10_10_10: {
            // Decoded as unsigned normalized: three 10-bit channels in the
            // low 30 bits plus a 2-bit alpha in the top bits.
            if vb_span_in_bounds(addr, 1u, n) {
                let packed = vertex_buffer[addr];
                let r = f32(packed & 0x3FFu) / 1023.0;
                let g = f32((packed >> 10u) & 0x3FFu) / 1023.0;
                let b = f32((packed >> 20u) & 0x3FFu) / 1023.0;
                let a = f32((packed >> 30u) & 0x3u) / 3.0;
                result = vec4<f32>(r, g, b, a);
            }
        }
        default: {
            reject_mask |= REJECT_VFETCH_FMT;
            // The buffer is still read, but the value is multiplied by 0.0,
            // so the fallback result is (0, 0, 0, 1).
            if vb_span_in_bounds(addr, 1u, n) {
                result = vec4<f32>(f32(vertex_buffer[addr]) * 0.0, 0.0, 0.0, 1.0);
            }
        }
    }

    registers[dst_reg & 0x7Fu] = result;
}

// M6 — minimum-viable texture fetch. Every fetch samples the 1×1 magenta
// dummy bound at group(1); the real per-slot texture cache lands with P5.
// Reads (u, v) from the source register's .xy and writes the sample into
// the destination register. `textureSampleLevel` works in both VS and PS
// (no implicit derivatives), so no per-stage specialisation needed.
fn interpret_texture_fetch(t: u32, is_vertex: bool) {
    var w0: u32 = 0u;
    if is_vertex {
        w0 = vs_instr_dword(t, 0u);
    } else {
        w0 = ps_instr_dword(t, 0u);
    }
    // Same register field positions as the vertex-fetch path.
    let dst_reg = (w0 >> 10u) & 0x7Fu;
    let src_reg = (w0 >> 17u) & 0x7Fu;
    let uv = registers[src_reg & 0x7Fu].xy;
    registers[dst_reg & 0x7Fu] = textureSampleLevel(xenos_tex, xenos_samp, uv, 0.0);
}

// Walk an Exec clause's instruction triples.
// sequence: 2-bit-per-triple bitmap. Bit 0 of a pair = serialize flag
// (we ignore in MVP); bit 1 = is-fetch.

// Is-fetch bit of triple `i` in `sequence`. A 32-bit bitmap only describes
// triples 0..15; WGSL takes a dynamic shift amount modulo 32, so without
// this guard triple 16 and up would be classified from bits that belong to
// earlier triples. Triples beyond the bitmap are treated as ALU.
fn sequence_is_fetch(sequence: u32, i: u32) -> bool {
    let bit = i * 2u + 1u;
    if bit >= 32u {
        return false;
    }
    return ((sequence >> bit) & 1u) != 0u;
}

// Run `count` VS instruction triples starting at triple index `address`.
fn exec_vs(address: u32, count: u32, sequence: u32) {
    for (var i: u32 = 0u; i < count; i = i + 1u) {
        let t = address + i;
        if !sequence_is_fetch(sequence, i) {
            interpret_alu(t, true);
            continue;
        }
        // 0x00 = vertex fetch, 0x01 = texture fetch; other fetch opcodes
        // are skipped.
        let opcode = vs_instr_dword(t, 0u) & 0x1Fu;
        if opcode == 0u {
            interpret_vertex_fetch(t);
        } else if opcode == 1u {
            interpret_texture_fetch(t, true);
        }
    }
}

// Run `count` PS instruction triples starting at triple index `address`.
// Every fetch in a pixel shader is handled as a texture fetch.
fn exec_ps(address: u32, count: u32, sequence: u32) {
    for (var i: u32 = 0u; i < count; i = i + 1u) {
        let t = address + i;
        if sequence_is_fetch(sequence, i) {
            interpret_texture_fetch(t, false);
        } else {
            interpret_alu(t, false);
        }
    }
}

// Reset the per-invocation register state to a known baseline.
fn reset_state() {
    for (var r: u32 = 0u; r < 128u; r = r + 1u) {
        registers[r] = vec4<f32>(0.0);
    }
    for (var level: u32 = 0u; level < 4u; level = level + 1u) {
        loop_counters[level] = 0u;
        loop_starts[level] = 0u;
    }
    ps = 0.0;
    predicate = false;
    current_alloc = 0u;
    kill_flag = false;
    loop_depth = 0u;
    reject_mask = 0u;
}

// ── Stage entry points.

// M7 register slots for exports. VS writes position at oPos (convention:
// Xenos export 0 within an `Alloc(Position)` range lands in registers[32])
// and a set of interpolators. We track both via `current_alloc`: writes
// inside each alloc range are tagged and copied out at Exit.
const OPOS_REG: u32 = 32u;   // synthetic slot used by the interpreter
const OCOLOR_REG: u32 = 33u; // color0 scratch slot

@vertex
fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {
    reset_state();

    // Seed r0 with the vertex index so simple shaders (or the procedural
    // fallback) have access without a real vertex fetch.
    registers[0] = vec4<f32>(f32(vidx), 0.0, 0.0, 1.0);

    // Seed the export slots with a procedural fallback: if the shader
    // never writes oPos / oColor, this keeps the output visible rather
    // than collapsing to (0,0) which would skip rasterization. Vertices are
    // spread around a circle of radius 0.35; the colour varies per draw.
    let total = max(draw_ctx.vertex_count, 1u);
    let angle = (f32(vidx) / f32(total)) * 6.2831853;
    let radius = 0.35;
    registers[OPOS_REG] = vec4<f32>(cos(angle) * radius, sin(angle) * radius, 0.0, 1.0);
    let d = f32(draw_ctx.draw_index);
    registers[OCOLOR_REG] = vec4<f32>(
        0.5 + 0.5 * sin(d * 0.37),
        0.5 + 0.5 * sin(d * 0.51 + 2.0),
        0.5 + 0.5 * sin(d * 0.73 + 4.0),
        1.0,
    );

    // Dead-binding guard for VERTEX-stage-only vertex_buffer access: the
    // read contributes 0.0 to the colour below.
    let vb_live = f32(vertex_buffer[0]) * 0.0;

    // Walk the VS CF table.
    walk_cf_vs();

    // Use registers[OPOS_REG] as position; the procedural fallback above
    // seeded it so an un-interpreted shader still draws a recognisable
    // circle.
    var out: VsOut;
    out.position = registers[OPOS_REG];
    out.color = vec4<f32>(registers[OCOLOR_REG].rgb + vec3<f32>(vb_live), registers[OCOLOR_REG].a);
    return out;
}

@fragment
fn fs_main(in: VsOut) -> FsOut {
    reset_state();

    walk_cf_ps();

    // Kill: pixel shader `kill*` / `kills_*` opcodes set `kill_flag`, and
    // the fragment is discarded here at the entry level once the CF walk
    // has finished.
    if kill_flag {
        discard;
    }

    // The output is the interpolated VS colour; PS register results are not
    // exported here.
    var out: FsOut;
    out.color0 = in.color;
    return out;
}

// ── CF walker per stage. Shared logic: LOOP_START/LOOP_END iterate up to 16×
// from `xenos_consts.loop_consts[loop_id]`, COND_JMP honours the instruction's
// predicate flags, COND_CALL / RETURN set the reject mask (needs a call stack
// we don't have). A hard iteration cap keeps the GPU from hanging on
// malformed or extreme shaders.
+const CF_WALKER_MAX_ITER: u32 = 4096u; + +fn walk_cf_vs() { + let cf_n = vs_cf_count(); + var cf_i: u32 = 0u; + var iter: u32 = 0u; + loop { + if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; } + iter = iter + 1u; + let kind = vs_cf_kind(cf_i); + let primary = vs_cf_primary(cf_i); + let aux = vs_cf_aux(cf_i); + var advance: bool = true; + var stop: bool = false; + switch kind { + case CF_KIND_EXEC, CF_KIND_EXEC_END: { + let count = aux & 0xFFu; + let sequence = aux >> 8u; + exec_vs(primary, count, sequence); + if kind == CF_KIND_EXEC_END { stop = true; } + } + case CF_KIND_ALLOC: { current_alloc = primary; } + case CF_KIND_EXIT: { stop = true; } + case CF_KIND_LOOP_START: { + let loop_id = aux & 0x1Fu; + var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu; + if loop_count > 16u { + loop_count = 16u; + reject_mask |= REJECT_LOOP_OVERFLOW; + } + if loop_count > 0u && loop_depth < 4u { + loop_starts[loop_depth] = cf_i; + loop_counters[loop_depth] = loop_count; + loop_depth = loop_depth + 1u; + } + // count==0 → fall through; matching LOOP_END will pop. + } + case CF_KIND_LOOP_END: { + if loop_depth > 0u { + let d = loop_depth - 1u; + if loop_counters[d] > 1u { + loop_counters[d] = loop_counters[d] - 1u; + cf_i = loop_starts[d] + 1u; + advance = false; + } else { + loop_counters[d] = 0u; + loop_depth = d; + } + } + } + case CF_KIND_COND_JMP: { + let pred_bits = aux; + let is_pred = (pred_bits & 1u) != 0u; + let pred_cnd = (pred_bits & 2u) != 0u; + if !is_pred || predicate == pred_cnd { + cf_i = primary; + advance = false; + } + } + case CF_KIND_COND_CALL, CF_KIND_RETURN: { + // No call stack — mark and continue. 
+ reject_mask |= REJECT_CF_CALL; + } + default: { reject_mask |= REJECT_CF_JUMP; } + } + if stop { break; } + if advance { cf_i = cf_i + 1u; } + } +} + +fn walk_cf_ps() { + let cf_n = ps_cf_count(); + var cf_i: u32 = 0u; + var iter: u32 = 0u; + loop { + if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; } + iter = iter + 1u; + let kind = ps_cf_kind(cf_i); + let primary = ps_cf_primary(cf_i); + let aux = ps_cf_aux(cf_i); + var advance: bool = true; + var stop: bool = false; + switch kind { + case CF_KIND_EXEC, CF_KIND_EXEC_END: { + let count = aux & 0xFFu; + let sequence = aux >> 8u; + exec_ps(primary, count, sequence); + if kind == CF_KIND_EXEC_END { stop = true; } + } + case CF_KIND_ALLOC: { current_alloc = primary; } + case CF_KIND_EXIT: { stop = true; } + case CF_KIND_LOOP_START: { + let loop_id = aux & 0x1Fu; + var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu; + if loop_count > 16u { + loop_count = 16u; + reject_mask |= REJECT_LOOP_OVERFLOW; + } + if loop_count > 0u && loop_depth < 4u { + loop_starts[loop_depth] = cf_i; + loop_counters[loop_depth] = loop_count; + loop_depth = loop_depth + 1u; + } + } + case CF_KIND_LOOP_END: { + if loop_depth > 0u { + let d = loop_depth - 1u; + if loop_counters[d] > 1u { + loop_counters[d] = loop_counters[d] - 1u; + cf_i = loop_starts[d] + 1u; + advance = false; + } else { + loop_counters[d] = 0u; + loop_depth = d; + } + } + } + case CF_KIND_COND_JMP: { + let pred_bits = aux; + let is_pred = (pred_bits & 1u) != 0u; + let pred_cnd = (pred_bits & 2u) != 0u; + if !is_pred || predicate == pred_cnd { + cf_i = primary; + advance = false; + } + } + case CF_KIND_COND_CALL, CF_KIND_RETURN: { + reject_mask |= REJECT_CF_CALL; + } + default: { reject_mask |= REJECT_CF_JUMP; } + } + if stop { break; } + if advance { cf_i = cf_i + 1u; } + } +} diff --git a/crates/xenia-gpu/src/texture_cache.rs b/crates/xenia-gpu/src/texture_cache.rs new file mode 100644 index 0000000..cc343bc --- /dev/null +++ 
b/crates/xenia-gpu/src/texture_cache.rs @@ -0,0 +1,970 @@ +//! Texture cache — P5. +//! +//! Two-layer design mirroring canary's `TextureCache`: +//! +//! * **CPU layer** (this module): owns decoded, linear, host-endian texel +//! byte buffers keyed by [`TextureKey`]. `ensure_cached` consults the +//! guest memory's page-version counter to decide whether the cached +//! bytes are still fresh and re-decodes on miss or staleness. +//! * **GPU layer** (xenia-ui `texture_cache_host`): owns the +//! `wgpu::Texture` + `TextureView` for each cached key; pulls decoded +//! bytes from this CPU layer on upload. +//! +//! Canary references: `texture_cache.h/.cc`, `texture_info.cc`, and +//! `texture_info_formats.inl` for the format table. + +use std::collections::HashMap; + +use crate::tiled_address; + +/// Xenos texture formats — `xenos::TextureFormat` at `xenos.h:489-579`. +/// Values are the raw enum numbers the guest writes into +/// `xe_gpu_texture_fetch_t.format`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[repr(u8)] +pub enum TextureFormat { + K1Reverse = 0, + K1 = 1, + K8 = 2, + K1555 = 3, + K565 = 4, + K6_5_5 = 5, + K8888 = 6, + K1010102 = 7, + K8_8 = 10, + K4_4_4_4 = 15, + K10_11_11 = 16, + K11_11_10 = 17, + Dxt1 = 18, + Dxt2_3 = 19, + Dxt4_5 = 20, + K24_8 = 22, + K24_8Float = 23, + K16 = 24, + K16_16 = 25, + K16_16_16_16 = 26, + K16Float = 30, + K16_16Float = 31, + K16_16_16_16Float = 32, + K32 = 33, + K32_32 = 34, + K32_32_32_32 = 35, + K32Float = 36, + K32_32Float = 37, + K32_32_32_32Float = 38, + Unknown(u8), +} + +impl TextureFormat { + pub fn from_raw(v: u8) -> Self { + use TextureFormat::*; + match v & 0x3F { + 0 => K1Reverse, + 1 => K1, + 2 => K8, + 3 => K1555, + 4 => K565, + 5 => K6_5_5, + 6 => K8888, + 7 => K1010102, + 10 => K8_8, + 15 => K4_4_4_4, + 16 => K10_11_11, + 17 => K11_11_10, + 18 => Dxt1, + 19 => Dxt2_3, + 20 => Dxt4_5, + 22 => K24_8, + 23 => K24_8Float, + 24 => K16, + 25 => K16_16, + 26 => K16_16_16_16, + 30 => K16Float, + 31 => 
K16_16Float, + 32 => K16_16_16_16Float, + 33 => K32, + 34 => K32_32, + 35 => K32_32_32_32, + 36 => K32Float, + 37 => K32_32Float, + 38 => K32_32_32_32Float, + other => Unknown(other), + } + } + + /// Block width/height in texels + bytes-per-block. For uncompressed + /// formats block_w = block_h = 1. For DXT formats block_w = block_h = + /// 4 (one 4×4 compressed block). + pub fn block_info(self) -> BlockInfo { + use TextureFormat::*; + match self { + K1Reverse | K1 => BlockInfo::new(1, 1, 1), // round up to 1 byte + K8 => BlockInfo::new(1, 1, 1), + K1555 | K565 | K6_5_5 | K4_4_4_4 | K16 | K16Float | K8_8 => BlockInfo::new(1, 1, 2), + K8888 | K1010102 | K10_11_11 | K11_11_10 | K24_8 | K24_8Float | K16_16 + | K16_16Float | K32 | K32Float => BlockInfo::new(1, 1, 4), + K16_16_16_16 | K16_16_16_16Float | K32_32 | K32_32Float => BlockInfo::new(1, 1, 8), + K32_32_32_32 | K32_32_32_32Float => BlockInfo::new(1, 1, 16), + Dxt1 => BlockInfo::new(4, 4, 8), + Dxt2_3 | Dxt4_5 => BlockInfo::new(4, 4, 16), + Unknown(_) => BlockInfo::new(1, 1, 4), // safe-ish fallback + } + } + + /// True iff this format lands on a wgpu texture format we can + /// natively bind — no CPU-side conversion per frame required. M5 + /// adds `k_5_6_5` (CPU-expanded to `Rgba8Unorm` on decode; still + /// counts as supported for the host-cache wiring), `k_DXT2_3` + /// (BC2), and `k_DXT4_5` (BC3). 
+ pub fn is_host_supported(self) -> bool { + matches!( + self, + TextureFormat::K8888 + | TextureFormat::K565 + | TextureFormat::Dxt1 + | TextureFormat::Dxt2_3 + | TextureFormat::Dxt4_5 + ) + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct BlockInfo { + pub block_w: u8, + pub block_h: u8, + pub bytes_per_block: u8, +} + +impl BlockInfo { + pub const fn new(block_w: u8, block_h: u8, bytes_per_block: u8) -> Self { + Self { + block_w, + block_h, + bytes_per_block, + } + } + pub fn log2_bpb(self) -> u32 { + match self.bytes_per_block { + 1 => 0, + 2 => 1, + 4 => 2, + 8 => 3, + 16 => 4, + _ => 0, + } + } +} + +/// Xenos `Endian` enum from `xenos.h:198-204`. 2-bit field in fetch dword 1. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum Endian { + None = 0, + Swap8In16 = 1, + Swap8In32 = 2, + Swap16In32 = 3, +} + +impl Endian { + pub fn from_raw(v: u8) -> Self { + match v & 0x3 { + 1 => Endian::Swap8In16, + 2 => Endian::Swap8In32, + 3 => Endian::Swap16In32, + _ => Endian::None, + } + } + + /// Apply this endian's byte swap to one 32-bit unit. Matches canary's + /// `shaders/endian.xesli:25-55` semantics; the WGSL translator pulls + /// the same mask-shift pattern. + pub fn swap32(self, v: u32) -> u32 { + match self { + Endian::None => v, + Endian::Swap8In16 => ((v & 0x00FF_00FF) << 8) | ((v & 0xFF00_FF00) >> 8), + Endian::Swap8In32 => v.swap_bytes(), + Endian::Swap16In32 => v.rotate_right(16), + } + } +} + +/// Texture dimensionality (`xenos::DataDimension`). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum Dimension { + D1 = 0, + D2 = 1, + D3Stacked = 2, + Cube = 3, +} + +impl Dimension { + pub fn from_raw(v: u8) -> Self { + match v & 0x3 { + 1 => Dimension::D2, + 2 => Dimension::D3Stacked, + 3 => Dimension::Cube, + _ => Dimension::D1, + } + } +} + +/// Identity of a cached texture. 
Matches canary's `TextureCache::TextureKey` +/// at the semantic level — we exclude mip/border state for P5 since neither +/// is populated yet. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct TextureKey { + /// Guest physical base (byte address — already shifted left by 12 from + /// the fetch-constant `base_address` field). + pub base_address: u32, + pub width: u16, + pub height: u16, + pub depth_or_slices: u16, + pub format: TextureFormat, + pub endian: Endian, + pub dimension: Dimension, + pub tiled: bool, + /// Row pitch in texels, already aligned to 32. Canary stores pitch/32 + /// in the fetch constant; we keep the raw texel count to avoid + /// callers remembering to shift. + pub pitch_texels: u16, +} + +/// Decode a 6-dword texture fetch constant (layout at `xenos.h:1229-1329`). +/// Returns `None` if the constant is obviously unset (all zeros) or if +/// `type` is not the texture-constant marker. +pub fn decode_fetch_constant(dwords: [u32; 6]) -> Option { + let d0 = dwords[0]; + let d1 = dwords[1]; + let d2 = dwords[2]; + let d5 = dwords[5]; + + // type: low 2 bits of dword 0 should be 2 (texture) per canary — + // 0 = vertex, 2 = texture. An all-zero constant reads as type 0 so + // `None` filters it out here. + let ty = d0 & 0x3; + if d0 == 0 && d1 == 0 { + return None; + } + // Not a texture constant (e.g. 0 = vertex fetch constant reused). + if ty != 2 { + return None; + } + + let pitch_5 = (d0 >> 22) & 0x1FF; // pitch/32 in texels + let tiled = ((d0 >> 31) & 1) != 0; + let format = TextureFormat::from_raw((d1 & 0x3F) as u8); + let endian = Endian::from_raw(((d1 >> 6) & 0x3) as u8); + let base_address = (d1 >> 12) << 12; // base >> 12, re-shifted. + let dim = Dimension::from_raw(((d5 >> 9) & 0x3) as u8); + + // Size decode depends on dimension. 
+ let (width, height, depth) = match dim { + Dimension::D1 => ((d2 & 0x00FF_FFFF) as u16 + 1, 1u16, 1u16), + Dimension::D2 => ( + (d2 & 0x1FFF) as u16 + 1, + ((d2 >> 13) & 0x1FFF) as u16 + 1, + ((d2 >> 26) & 0x3F) as u16 + 1, + ), + Dimension::D3Stacked | Dimension::Cube => ( + (d2 & 0x7FF) as u16 + 1, + ((d2 >> 11) & 0x7FF) as u16 + 1, + ((d2 >> 22) & 0x3FF) as u16 + 1, + ), + }; + + Some(TextureKey { + base_address, + width, + height, + depth_or_slices: depth, + format, + endian, + dimension: dim, + tiled, + pitch_texels: ((pitch_5 as u16) * 32).max(width), + }) +} + +/// Decoded, linear, host-endian texture bytes ready for wgpu upload. +#[derive(Debug, Clone)] +pub struct CachedTexture { + pub key: TextureKey, + pub version_when_uploaded: u64, + /// Tightly packed. Layout depends on `key.format`: + /// - `K8888` → `width*height*4` bytes in Rgba8Unorm order. + /// - `Dxt1` → `ceil(w/4)*ceil(h/4)*8` bytes of raw BC1 blocks, after + /// block-level detile + dword-endian swap. + pub bytes: Vec, +} + +impl CachedTexture { + pub fn byte_size(&self) -> usize { + self.bytes.len() + } +} + +/// Errors that can happen during decode. The `ensure_cached` caller maps +/// these to `gpu.texture.reject{reason}` metrics so the HUD surfaces when +/// a texture fell back. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DecodeError { + UnsupportedFormat, + OutOfBounds, + ZeroSize, +} + +/// Read `len` bytes from guest memory starting at `addr`. Returns `None` +/// if the span would exceed the memory's reported end; otherwise returns +/// a freshly-allocated buffer with the bytes. +/// +/// The `MemoryAccess` trait exposes per-byte reads only; we batch them in +/// a single pass to avoid the per-byte virtual dispatch overhead for large +/// textures (1 MiB frontbuffer = 1M dispatch calls). 
+pub fn read_guest_bytes( + mem: &dyn xenia_memory::MemoryAccess, + addr: u32, + len: usize, +) -> Vec { + let mut out = Vec::with_capacity(len); + for i in 0..len { + let a = addr.wrapping_add(i as u32); + out.push(mem.read_u8(a)); + if a < addr { + // 32-bit overflow; unmap the tail. + break; + } + } + out +} + +/// Byte-swap the 32-bit dwords of `buf` in place according to `endian`. +/// `buf.len()` should be a multiple of 4; tail bytes are left untouched. +pub fn apply_endian_32(buf: &mut [u8], endian: Endian) { + if matches!(endian, Endian::None) { + return; + } + let mut i = 0; + while i + 4 <= buf.len() { + let v = u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]); + let swapped = endian.swap32(v); + buf[i..i + 4].copy_from_slice(&swapped.to_le_bytes()); + i += 4; + } +} + +/// Decode a k_8_8_8_8 texture out of guest memory into `Rgba8Unorm` bytes. +/// Applies Xenos→host channel swizzle (Xbox 360 stores BGRA in memory → +/// we emit RGBA for wgpu) and the declared endian swap, then detiles via +/// the Xenos Tiled2D formula. +pub fn decode_k8888_tiled( + key: &TextureKey, + mem: &dyn xenia_memory::MemoryAccess, +) -> Result, DecodeError> { + if key.width == 0 || key.height == 0 { + return Err(DecodeError::ZeroSize); + } + let w = key.width as u32; + let h = key.height as u32; + let pitch_aligned = tiled_address::align_pitch_to_macro_tile(key.pitch_texels as u32); + let total_bytes = (pitch_aligned * h * 4) as usize; + let mut raw = read_guest_bytes(mem, key.base_address, total_bytes); + if raw.len() < total_bytes { + return Err(DecodeError::OutOfBounds); + } + apply_endian_32(&mut raw, key.endian); + let mut linear = vec![0u8; (w * h * 4) as usize]; + if key.tiled { + if tiled_address::detile_2d(&raw, &mut linear, w, h, pitch_aligned, 4).is_err() { + return Err(DecodeError::OutOfBounds); + } + } else { + // Non-tiled copy row-by-row honoring pitch. 
+ for y in 0..h as usize { + let src = y * (pitch_aligned as usize) * 4; + let dst = y * (w as usize) * 4; + linear[dst..dst + (w as usize) * 4] + .copy_from_slice(&raw[src..src + (w as usize) * 4]); + } + } + // Xenos stores `k_8_8_8_8` in ARGB byte order (high nibble = A). After + // endian.Swap8In32 guests' typical per-dword byte order becomes BGRA + // in little-endian host bytes. Swap B↔R so we hand Rgba8Unorm to wgpu. + for px in linear.chunks_exact_mut(4) { + px.swap(0, 2); + } + Ok(linear) +} + +/// Decode a DXT-compressed texture to raw block bytes (no format +/// conversion — wgpu understands `Bc{1,2,3}RgbaUnorm` natively so the +/// GPU does the actual decompression on upload). +/// +/// Xenos stores DXT blocks in 4×4 block-tiled order using the Tiled2D +/// formula, with stride counted in blocks. `bytes_per_block` is 8 for +/// BC1 (DXT1), 16 for BC2 (DXT2_3) and BC3 (DXT4_5). +pub fn decode_dxt_tiled( + key: &TextureKey, + mem: &dyn xenia_memory::MemoryAccess, + bytes_per_block: u32, +) -> Result, DecodeError> { + if key.width == 0 || key.height == 0 { + return Err(DecodeError::ZeroSize); + } + let block_w = 4u32; + let block_h = 4u32; + let w_blocks = (key.width as u32).div_ceil(block_w); + let h_blocks = (key.height as u32).div_ceil(block_h); + let pitch_blocks = tiled_address::align_pitch_to_macro_tile( + (key.pitch_texels as u32).div_ceil(block_w), + ); + let total_bytes = (pitch_blocks * h_blocks * bytes_per_block) as usize; + let mut raw = read_guest_bytes(mem, key.base_address, total_bytes); + if raw.len() < total_bytes { + return Err(DecodeError::OutOfBounds); + } + // DXT blocks are stored as 4×u16 + 4×u8-indices (BC1) or similar + // u16/u32-width fields for BC2/BC3; the Xbox 360's big-endian word + // order requires an endian swap at the u16/u32 level regardless of + // which BC-family format. 
+ apply_endian_32(&mut raw, key.endian); + + let mut out = vec![0u8; (w_blocks * h_blocks * bytes_per_block) as usize]; + if key.tiled { + if tiled_address::detile_2d( + &raw, + &mut out, + w_blocks, + h_blocks, + pitch_blocks, + bytes_per_block, + ) + .is_err() + { + return Err(DecodeError::OutOfBounds); + } + } else { + for y in 0..h_blocks as usize { + let src = y * (pitch_blocks as usize) * (bytes_per_block as usize); + let dst = y * (w_blocks as usize) * (bytes_per_block as usize); + out[dst..dst + (w_blocks as usize) * (bytes_per_block as usize)] + .copy_from_slice(&raw[src..src + (w_blocks as usize) * (bytes_per_block as usize)]); + } + } + Ok(out) +} + +/// BC1 / DXT1 — 8-byte blocks. +pub fn decode_dxt1_tiled( + key: &TextureKey, + mem: &dyn xenia_memory::MemoryAccess, +) -> Result, DecodeError> { + decode_dxt_tiled(key, mem, 8) +} + +/// BC2 / DXT2_3 — 16-byte blocks. +pub fn decode_dxt23_tiled( + key: &TextureKey, + mem: &dyn xenia_memory::MemoryAccess, +) -> Result, DecodeError> { + decode_dxt_tiled(key, mem, 16) +} + +/// BC3 / DXT4_5 — 16-byte blocks. +pub fn decode_dxt45_tiled( + key: &TextureKey, + mem: &dyn xenia_memory::MemoryAccess, +) -> Result, DecodeError> { + decode_dxt_tiled(key, mem, 16) +} + +/// **k_5_6_5** — 16-bit R:5 G:6 B:5 per texel (Xbox stores R in the high +/// 5 bits of the 16-bit word). We unpack each texel into 4 bytes of +/// `Rgba8Unorm` (A = 0xFF). wgpu doesn't ship `R5G6B5Unorm` as a +/// sampled texture format on every backend, so CPU-side conversion is +/// the safe path even if it's 2× the texture memory. +/// +/// Tiling: Tiled2D at the **texel** level (block = 1 texel = 2 bytes), +/// then we expand each 2-byte u16 into the 4-byte Rgba8 in the linear +/// output buffer. 
+pub fn decode_k565_tiled( + key: &TextureKey, + mem: &dyn xenia_memory::MemoryAccess, +) -> Result, DecodeError> { + if key.width == 0 || key.height == 0 { + return Err(DecodeError::ZeroSize); + } + let w = key.width as u32; + let h = key.height as u32; + // Pitch/block counts — block = 1 texel here, 2 bytes. + let pitch_aligned = tiled_address::align_pitch_to_macro_tile(key.pitch_texels as u32); + let total_bytes = (pitch_aligned * h * 2) as usize; + let mut raw = read_guest_bytes(mem, key.base_address, total_bytes); + if raw.len() < total_bytes { + return Err(DecodeError::OutOfBounds); + } + // 16-bit word order is endian-swap-sensitive. + apply_endian_32(&mut raw, key.endian); + // Step 1: detile (bytes_per_block=2, tile in blocks=texels). + let mut linear_u16 = vec![0u8; (w * h * 2) as usize]; + if key.tiled { + if tiled_address::detile_2d(&raw, &mut linear_u16, w, h, pitch_aligned, 2).is_err() { + return Err(DecodeError::OutOfBounds); + } + } else { + for y in 0..h as usize { + let src = y * (pitch_aligned as usize) * 2; + let dst = y * (w as usize) * 2; + linear_u16[dst..dst + (w as usize) * 2] + .copy_from_slice(&raw[src..src + (w as usize) * 2]); + } + } + // Step 2: expand each 16-bit RGB565 to Rgba8Unorm. The in-memory u16 + // is little-endian after `apply_endian_32` has normalized the word + // order (we keep host-native byte ordering post-swap). + let mut rgba = vec![0u8; (w * h * 4) as usize]; + for y in 0..h as usize { + for x in 0..w as usize { + let off = (y * w as usize + x) * 2; + let lo = linear_u16[off]; + let hi = linear_u16[off + 1]; + let word = u16::from_le_bytes([lo, hi]); + // 5 bits R (bits 11-15), 6 bits G (5-10), 5 bits B (0-4). + // Expand to full-range u8: replicate high bits into low + // (so 0b11111 → 0xFF, matching the standard 565→888 convention). 
+ let r5 = ((word >> 11) & 0x1F) as u8; + let g6 = ((word >> 5) & 0x3F) as u8; + let b5 = (word & 0x1F) as u8; + let r = (r5 << 3) | (r5 >> 2); + let g = (g6 << 2) | (g6 >> 4); + let b = (b5 << 3) | (b5 >> 2); + let o = (y * w as usize + x) * 4; + rgba[o] = r; + rgba[o + 1] = g; + rgba[o + 2] = b; + rgba[o + 3] = 0xFF; + } + } + Ok(rgba) +} + +/// Version-aware CPU-side texture cache. Entries are keyed on +/// `TextureKey.hash` and carry a `version_when_uploaded` watermark against +/// the guest memory's page-version counter. `ensure_cached` queries +/// `GuestMemory::max_page_version` over the texture's byte span; if the +/// span has been written since cache time, the entry is re-decoded. +pub struct TextureCache { + entries: HashMap, + /// Monotonic counter of decodes performed — HUD surface. + pub decodes_total: u64, + /// Count of stale-miss re-decodes. + pub restale_total: u64, +} + +impl Default for TextureCache { + fn default() -> Self { + Self::new() + } +} + +impl TextureCache { + pub fn new() -> Self { + Self { + entries: HashMap::new(), + decodes_total: 0, + restale_total: 0, + } + } + + pub fn len(&self) -> usize { + self.entries.len() + } + + pub fn is_empty(&self) -> bool { + self.entries.is_empty() + } + + pub fn get(&self, key: &TextureKey) -> Option<&CachedTexture> { + self.entries.get(key) + } + + /// Return a cached (or freshly-decoded) texture. The caller supplies + /// the current guest-memory page version covering the texture span; + /// see [`max_page_version_for`]. + pub fn ensure_cached( + &mut self, + key: TextureKey, + current_version: u64, + mem: &dyn xenia_memory::MemoryAccess, + ) -> Result<&CachedTexture, DecodeError> { + // Fast path: fresh entry exists. 
+ if let Some(e) = self.entries.get(&key) { + if e.version_when_uploaded >= current_version { + return Ok(self.entries.get(&key).unwrap()); + } + self.restale_total += 1; + } + let bytes = match key.format { + TextureFormat::K8888 => decode_k8888_tiled(&key, mem)?, + TextureFormat::K565 => decode_k565_tiled(&key, mem)?, + TextureFormat::Dxt1 => decode_dxt1_tiled(&key, mem)?, + TextureFormat::Dxt2_3 => decode_dxt23_tiled(&key, mem)?, + TextureFormat::Dxt4_5 => decode_dxt45_tiled(&key, mem)?, + _ => return Err(DecodeError::UnsupportedFormat), + }; + self.decodes_total += 1; + let entry = CachedTexture { + key, + version_when_uploaded: current_version, + bytes, + }; + self.entries.insert(key, entry); + Ok(self.entries.get(&key).unwrap()) + } + + pub fn byte_budget(&self) -> usize { + self.entries.values().map(|e| e.byte_size()).sum() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::cell::Cell; + + struct FakeMem(Box<[Cell]>); + impl FakeMem { + fn from_vec(v: Vec) -> Self { + FakeMem(v.into_iter().map(Cell::new).collect()) + } + } + impl xenia_memory::MemoryAccess for FakeMem { + fn read_u8(&self, a: u32) -> u8 { + self.0.get(a as usize).map(|c| c.get()).unwrap_or(0) + } + fn read_u16(&self, a: u32) -> u16 { + u16::from_be_bytes([self.read_u8(a), self.read_u8(a + 1)]) + } + fn read_u32(&self, a: u32) -> u32 { + u32::from_be_bytes([ + self.read_u8(a), + self.read_u8(a + 1), + self.read_u8(a + 2), + self.read_u8(a + 3), + ]) + } + fn read_u64(&self, a: u32) -> u64 { + u64::from_be_bytes([ + self.read_u8(a), + self.read_u8(a + 1), + self.read_u8(a + 2), + self.read_u8(a + 3), + self.read_u8(a + 4), + self.read_u8(a + 5), + self.read_u8(a + 6), + self.read_u8(a + 7), + ]) + } + fn write_u8(&self, a: u32, v: u8) { + if let Some(slot) = self.0.get(a as usize) { + slot.set(v); + } + } + fn write_u16(&self, a: u32, v: u16) { + let b = v.to_be_bytes(); + self.write_u8(a, b[0]); + self.write_u8(a + 1, b[1]); + } + fn write_u32(&self, a: u32, v: u32) { + let b = 
v.to_be_bytes(); + for i in 0..4 { + self.write_u8(a + i as u32, b[i]); + } + } + fn write_u64(&self, a: u32, v: u64) { + let b = v.to_be_bytes(); + for i in 0..8 { + self.write_u8(a + i as u32, b[i]); + } + } + fn translate(&self, _: u32) -> Option<*const u8> { + None + } + fn translate_mut(&self, _: u32) -> Option<*mut u8> { + None + } + } + + #[test] + fn format_block_info_matches_canary_expectations() { + assert_eq!( + TextureFormat::K8888.block_info(), + BlockInfo::new(1, 1, 4) + ); + assert_eq!(TextureFormat::Dxt1.block_info(), BlockInfo::new(4, 4, 8)); + assert_eq!( + TextureFormat::Dxt4_5.block_info(), + BlockInfo::new(4, 4, 16) + ); + } + + #[test] + fn endian_swap_variants() { + assert_eq!(Endian::None.swap32(0x11223344), 0x11223344); + assert_eq!(Endian::Swap8In16.swap32(0x11223344), 0x22114433); + assert_eq!(Endian::Swap8In32.swap32(0x11223344), 0x44332211); + assert_eq!(Endian::Swap16In32.swap32(0x11223344), 0x33441122); + } + + #[test] + fn decode_fetch_constant_rejects_empty() { + let z = [0u32; 6]; + assert!(decode_fetch_constant(z).is_none()); + } + + #[test] + fn decode_fetch_constant_parses_2d_k8888() { + // Build a synthetic k_8_8_8_8 2D texture fetch constant: + // dword0: pitch_5=40 (1280/32), tiled=1, type=2 + // dword1: format=6 (K8888), endian=2 (Swap8In32), base=0xAB000>>12 + // dword2: width-1=1279, height-1=719 + // dword5: dimension=1 (2D) + let d0 = 0x8000_0000 | (40u32 << 22) | 2; + let d1 = (0xAB000u32 >> 12 << 12) | (2u32 << 6) | 6u32; + let d2 = 1279u32 | ((719u32) << 13); + let d5 = 1u32 << 9; + let k = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("parsed"); + assert_eq!(k.format, TextureFormat::K8888); + assert_eq!(k.endian, Endian::Swap8In32); + assert_eq!(k.width, 1280); + assert_eq!(k.height, 720); + assert_eq!(k.dimension, Dimension::D2); + assert!(k.tiled); + assert_eq!(k.pitch_texels, 1280); + } + + #[test] + fn decode_k8888_roundtrip_linear() { + // Build a 4×4 non-tiled image with pitch=32 (one macro-tile row). 
+ // Each pixel at (x, y) stores ARGB = (0xFF, x, y, y*4+x) as a + // big-endian dword. After Swap8In32 + B↔R swizzle, out[off..] must + // be (x, y, y*4+x, 0xFF) in RGBA order. + let w = 4u32; + let h = 4u32; + let pitch = 32u32; + let mut bytes = vec![0u8; (pitch * h * 4) as usize]; + for y in 0..h { + for x in 0..w { + let off = ((y * pitch + x) * 4) as usize; + let argb = (0xFFu32 << 24) + | ((x as u32) << 16) + | ((y as u32) << 8) + | ((y * 4 + x) as u32); + bytes[off..off + 4].copy_from_slice(&argb.to_be_bytes()); + } + } + let mem = FakeMem::from_vec(bytes); + let key = TextureKey { + base_address: 0, + width: 4, + height: 4, + depth_or_slices: 1, + format: TextureFormat::K8888, + endian: Endian::Swap8In32, + dimension: Dimension::D2, + tiled: false, + pitch_texels: pitch as u16, + }; + let out = decode_k8888_tiled(&key, &mem).expect("decode"); + assert_eq!(out.len(), 16 * 4); + assert_eq!(&out[0..4], &[0, 0, 0, 0xFF]); + let off = ((3 * 4 + 3) * 4) as usize; + assert_eq!(&out[off..off + 4], &[3, 3, 15, 0xFF]); + } + + // ── First-Pixels M5 format tests ────────────────────────────── + + /// BC2 (DXT2_3) roundtrip: 16-byte blocks, 4×4 image = 1 block. + /// Synthetic source of 0xDEADBEEF... bytes; assert the decoder + /// returns the same bytes (passthrough after endian swap). + #[test] + fn decode_dxt23_small_roundtrip() { + // 4×4 texture = 1 BC2 block (16 bytes). With pitch_texels=32 + // (macro-tile-aligned) the block pitch is 8 (=32/4), and we + // allocate 8*1*16 = 128 bytes of source. 
+ let mut bytes = vec![0u8; 128]; + for (i, b) in bytes.iter_mut().enumerate().take(16) { + *b = i as u8; + } + let mem = FakeMem::from_vec(bytes); + let key = TextureKey { + base_address: 0, + width: 4, + height: 4, + depth_or_slices: 1, + format: TextureFormat::Dxt2_3, + endian: Endian::None, // no swap — we can eyeball passthrough + dimension: Dimension::D2, + tiled: false, + pitch_texels: 32, + }; + let out = decode_dxt23_tiled(&key, &mem).expect("decode"); + assert_eq!(out.len(), 16); // 1 block × 16 bytes + for i in 0..16 { + assert_eq!(out[i], i as u8); + } + } + + /// BC3 (DXT4_5) uses the same 16-byte block infra as BC2; a + /// parallel test prevents a regression that sneaks up via the + /// generic `decode_dxt_tiled`. + #[test] + fn decode_dxt45_uses_16byte_blocks() { + let mem = FakeMem::from_vec(vec![0xAAu8; 256]); + let key = TextureKey { + base_address: 0, + width: 8, + height: 4, // 2×1 blocks + depth_or_slices: 1, + format: TextureFormat::Dxt4_5, + endian: Endian::None, + dimension: Dimension::D2, + tiled: false, + pitch_texels: 32, + }; + let out = decode_dxt45_tiled(&key, &mem).expect("decode"); + assert_eq!(out.len(), 2 * 16); + } + + /// k_5_6_5: a single white texel (all bits set, 0xFFFF) should + /// expand to RGBA8 white (0xFF, 0xFF, 0xFF, 0xFF). A single pure-red + /// texel (R=31, G=0, B=0 → word 0xF800) should expand to R=255 G=0 + /// B=0 via the high-bit-replicate convention. + #[test] + fn decode_k565_texel_expansion() { + // Memory layout for a 2×1 non-tiled k_5_6_5 image (pitch=32 texels + // → 32 × 1 × 2 = 64 bytes). We store texel[0] = 0xFFFF (white), + // texel[1] = 0xF800 (pure red). + let mut bytes = vec![0u8; 64]; + // 0xFFFF + bytes[0] = 0xFF; + bytes[1] = 0xFF; + // 0xF800 (big-endian memory): high byte 0xF8, low 0x00. + // But after apply_endian_32(Endian::None) we use little-endian + // word decoding — so memory must carry the bytes in LE order. 
+ bytes[2] = 0x00; + bytes[3] = 0xF8; + let mem = FakeMem::from_vec(bytes); + let key = TextureKey { + base_address: 0, + width: 2, + height: 1, + depth_or_slices: 1, + format: TextureFormat::K565, + endian: Endian::None, + dimension: Dimension::D2, + tiled: false, + pitch_texels: 32, + }; + let out = decode_k565_tiled(&key, &mem).expect("decode"); + assert_eq!(out.len(), 2 * 4); + // Texel 0: white. + assert_eq!(&out[0..4], &[0xFF, 0xFF, 0xFF, 0xFF]); + // Texel 1: pure red via 5-bit-expand (0b11111 → 0xFF). + assert_eq!(&out[4..8], &[0xFF, 0x00, 0x00, 0xFF]); + } + + #[test] + fn is_host_supported_covers_m5_formats() { + assert!(TextureFormat::K8888.is_host_supported()); + assert!(TextureFormat::K565.is_host_supported()); + assert!(TextureFormat::Dxt1.is_host_supported()); + assert!(TextureFormat::Dxt2_3.is_host_supported()); + assert!(TextureFormat::Dxt4_5.is_host_supported()); + // Unsupported formats should still report false. + assert!(!TextureFormat::K16.is_host_supported()); + assert!(!TextureFormat::K32Float.is_host_supported()); + } + + #[test] + fn texture_cache_caches_and_reuses() { + let mut cache = TextureCache::new(); + let mem = FakeMem::from_vec(vec![0u8; 8 * 1024]); + let key = TextureKey { + base_address: 0, + width: 4, + height: 4, + depth_or_slices: 1, + format: TextureFormat::K8888, + endian: Endian::None, + dimension: Dimension::D2, + tiled: false, + pitch_texels: 32, + }; + cache.ensure_cached(key, 0, &mem).unwrap(); + assert_eq!(cache.decodes_total, 1); + // Same version: should hit cache. + cache.ensure_cached(key, 0, &mem).unwrap(); + assert_eq!(cache.decodes_total, 1); + // Higher version: stale → re-decode. + cache.ensure_cached(key, 1, &mem).unwrap(); + assert_eq!(cache.decodes_total, 2); + assert_eq!(cache.restale_total, 1); + } + + /// End-to-end P5 test: a 6-dword fetch constant → decoded `TextureKey` + /// → `ensure_cached` on fresh/version-bumped memory → stale re-decode. + /// Mirrors what `vd_swap` does per frame. 
+ #[test] + fn e2e_fetch_const_to_cache_with_versioning() { + // 4×4 k_8_8_8_8 2D tiled texture at base 0x100, pitch=32 aligned. + let d0 = 0x8000_0000u32 | (1u32 << 22) | 2; // pitch_5=1, tiled, type=2 + let d1 = (0x100u32 >> 12 << 12) | (0u32 << 6) | 6; // K8888, endian=none + let d2 = 3u32 | (3u32 << 13); // width-1=3, height-1=3 + let d5 = 1u32 << 9; // 2D + let key = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("decoded"); + assert_eq!(key.format, TextureFormat::K8888); + assert_eq!(key.width, 4); + + let mut mem = FakeMem::from_vec(vec![0xAAu8; 4 * 1024]); + let mut cache = TextureCache::new(); + // v0 decode. + let first = cache + .ensure_cached(key, 0, &mem) + .expect("initial decode") + .clone(); + // Same version → cache hit. + cache.ensure_cached(key, 0, &mem).expect("hit"); + assert_eq!(cache.decodes_total, 1); + // Simulate the guest writing to the texture's pages: version bumps. + for b in &mem.0[..16] { + b.set(0xFF); + } + cache.ensure_cached(key, 1, &mem).expect("re-decode"); + assert_eq!(cache.decodes_total, 2); + assert_eq!(cache.restale_total, 1); + // Bytes differ from v0 (proof the re-decode happened). + let second = cache.get(&key).unwrap(); + assert_ne!(first.bytes, second.bytes); + } + + #[test] + fn texture_cache_rejects_unsupported_format() { + let mut cache = TextureCache::new(); + let mem = FakeMem::from_vec(vec![0u8; 1024]); + let key = TextureKey { + base_address: 0, + width: 4, + height: 4, + depth_or_slices: 1, + format: TextureFormat::K16, + endian: Endian::None, + dimension: Dimension::D2, + tiled: false, + pitch_texels: 32, + }; + assert!(matches!( + cache.ensure_cached(key, 0, &mem), + Err(DecodeError::UnsupportedFormat) + )); + } +} diff --git a/crates/xenia-gpu/src/tiled_address.rs b/crates/xenia-gpu/src/tiled_address.rs new file mode 100644 index 0000000..10dabfb --- /dev/null +++ b/crates/xenia-gpu/src/tiled_address.rs @@ -0,0 +1,178 @@ +//! Xenos tiled-texture address formula (2D, Tiled2D layout). +//! +//! 
Port of `xenia-canary/src/xenia/gpu/texture_address.h:190-208` Tiled2D / +//! `TiledCombine` helpers. The Xbox 360 GPU stores textures in a 32×32-block +//! macro-tile pattern with bank+pipe interleave for its internal DRAM +//! banks; this formula inverts that so we can read pixels out in linear +//! order, given the tiled source buffer. +//! +//! We use this in two places during P4: +//! - `vd_swap` frontbuffer detile (1280×720 k_8_8_8_8 usually). +//! - Any place we need to read tiled guest memory into a host-linear +//! buffer for CPU-side conversion before upload. + +/// Tile size constants from canary. +pub const MACRO_TILE_WIDTH_LOG2: u32 = 5; // 32 px +pub const MACRO_TILE_HEIGHT_2D_LOG2: u32 = 5; // 32 px + +/// Canary's `TiledCombine` helper — reassembles the DRAM address from the +/// outer-tile byte offset plus the bank/pipe/y-LSB interleave bits. +#[inline] +fn tiled_combine(outer_inner_bytes: u32, bank: u32, pipe: u32, y_lsb: u32) -> u32 { + (y_lsb << 4) + | (pipe << 6) + | (bank << 11) + | (outer_inner_bytes & 0b1111) + | (((outer_inner_bytes >> 4) & 0b1) << 5) + | (((outer_inner_bytes >> 5) & 0b111) << 8) + | ((outer_inner_bytes >> 8) << 12) +} + +/// 2D tiled offset in bytes from (x, y) into a tiled surface with +/// `pitch_aligned` pixel pitch (rounded to the macro-tile width) and +/// `bytes_per_block_log2` bytes-per-element (e.g. 2 for RGBA8888 → 4-byte +/// blocks → log2 = 2). Always non-negative for in-bounds inputs; returns +/// `u32` rather than canary's signed `int` since our callers stay in +/// unsigned arithmetic. +/// +/// This is the canonical formula — do not simplify without re-reading +/// `texture_address.h:190-208`; the bit-interleave cannot be expressed +/// as a linear function. +pub fn tiled_2d_offset(x: u32, y: u32, pitch_aligned: u32, bytes_per_block_log2: u32) -> u32 { + let macro_tile_cols = pitch_aligned >> MACRO_TILE_WIDTH_LOG2; + // Outer: which 32×32 macro tile we're in. 
+ let outer_blocks = (((y >> MACRO_TILE_HEIGHT_2D_LOG2) * macro_tile_cols) + + (x >> MACRO_TILE_WIDTH_LOG2)) + << 6; + // Inner: where we are within the 32×32 macro tile (Y dropped by 1 bit + // because that bit becomes the `y_lsb` interleave bit below). + let inner_blocks = (((y >> 1) & 0b111) << 3) | (x & 0b111); + let outer_inner_bytes = (outer_blocks | inner_blocks) << bytes_per_block_log2; + + let bank = (y >> 4) & 0b1; + let pipe = ((x >> 3) & 0b11) ^ (((y >> 3) & 0b1) << 1); + let y_lsb = y & 1; + + tiled_combine(outer_inner_bytes, bank, pipe, y_lsb) +} + +/// Round `pitch_pixels` up to the nearest multiple of the macro-tile width +/// (32 px). Xenos requires tile-aligned pitches; non-aligned values pad. +#[inline] +pub fn align_pitch_to_macro_tile(pitch_pixels: u32) -> u32 { + let mask = (1u32 << MACRO_TILE_WIDTH_LOG2) - 1; + (pitch_pixels + mask) & !mask +} + +/// Detile a 2D tiled surface into a linear destination buffer. The +/// closure `block_bytes(src_offset) -> &[u8]` returns a slice pointing at +/// one block in the tiled source, and the detiler writes it into `dst` +/// at the linear (x, y) position. +/// +/// `bpp` is the bytes-per-block (4 for RGBA8888, 1 for a1r5g5b5 stored as +/// a single 16-bit block, etc.). `dst` must be at least +/// `width * height * bpp` bytes long. +/// +/// Returns `Err(())` if the source doesn't contain enough bytes for the +/// largest offset the formula would produce (defensive — callers can +/// downgrade silently). 
+pub fn detile_2d( + src: &[u8], + dst: &mut [u8], + width: u32, + height: u32, + pitch_pixels: u32, + bpp: u32, +) -> Result<(), ()> { + let bpp_log2 = bpp.trailing_zeros(); + let pitch_aligned = align_pitch_to_macro_tile(pitch_pixels); + let dst_pitch_bytes = (width * bpp) as usize; + let bpp_u = bpp as usize; + + for y in 0..height { + for x in 0..width { + let src_off = tiled_2d_offset(x, y, pitch_aligned, bpp_log2) as usize; + if src_off + bpp_u > src.len() { + return Err(()); + } + let dst_off = (y as usize) * dst_pitch_bytes + (x as usize) * bpp_u; + if dst_off + bpp_u > dst.len() { + return Err(()); + } + dst[dst_off..dst_off + bpp_u].copy_from_slice(&src[src_off..src_off + bpp_u]); + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// The (0, 0) pixel is always at byte offset 0 regardless of pitch. + #[test] + fn origin_is_zero() { + assert_eq!(tiled_2d_offset(0, 0, 64, 2), 0); + } + + /// Round-trip: detiling a tiled buffer that was filled using the same + /// formula produces the identity linear image. + #[test] + fn roundtrip_small_pattern() { + let w = 32u32; + let h = 16u32; + let bpp = 4u32; + let pitch = align_pitch_to_macro_tile(w); + // Allocate a tiled buffer large enough for the largest offset. + let max_off = (0..h) + .flat_map(|y| (0..w).map(move |x| tiled_2d_offset(x, y, pitch, 2) as usize + 4)) + .max() + .unwrap(); + let mut tiled = vec![0u8; max_off]; + // Write a recognisable 4-byte pattern = (x, y, x^y, 0xFF) into + // each logical (x, y) position in the tiled buffer. + for y in 0..h { + for x in 0..w { + let off = tiled_2d_offset(x, y, pitch, 2) as usize; + tiled[off + 0] = x as u8; + tiled[off + 1] = y as u8; + tiled[off + 2] = (x ^ y) as u8; + tiled[off + 3] = 0xFF; + } + } + let mut linear = vec![0u8; (w * h * bpp) as usize]; + detile_2d(&tiled, &mut linear, w, h, pitch, bpp).expect("detile ok"); + // Verify every logical pixel landed at the right linear offset. 
+ for y in 0..h { + for x in 0..w { + let lin = ((y * w + x) * bpp) as usize; + assert_eq!(linear[lin + 0], x as u8); + assert_eq!(linear[lin + 1], y as u8); + assert_eq!(linear[lin + 2], (x ^ y) as u8); + assert_eq!(linear[lin + 3], 0xFF); + } + } + } + + /// Within a single macro-tile row, stepping `x` by 1 changes the low + /// 3 bits of `x` which feed the `inner_blocks` field — different + /// offsets are expected (no aliasing). + #[test] + fn neighbouring_pixels_have_distinct_offsets() { + let mut seen = std::collections::HashSet::new(); + for y in 0..16 { + for x in 0..32 { + assert!(seen.insert(tiled_2d_offset(x, y, 32, 2))); + } + } + } + + /// Pitch alignment is a power-of-two rounding: 1280 stays 1280, 1281 + /// rounds to 1312. + #[test] + fn align_pitch_rounds_up_to_32() { + assert_eq!(align_pitch_to_macro_tile(1280), 1280); + assert_eq!(align_pitch_to_macro_tile(1281), 1312); + assert_eq!(align_pitch_to_macro_tile(31), 32); + } +} diff --git a/crates/xenia-gpu/src/translator.rs b/crates/xenia-gpu/src/translator.rs new file mode 100644 index 0000000..9a8d8c1 --- /dev/null +++ b/crates/xenia-gpu/src/translator.rs @@ -0,0 +1,557 @@ +//! Xenos → WGSL direct translator (P7). +//! +//! Replaces the runtime uber-shader interpreter (P3b/P3c) for shaders whose +//! feature set we cover. Emits a *standalone* WGSL module per shader +//! instead of walking a ucode buffer at draw time — pipeline compilation +//! happens once, then every subsequent dispatch is a direct `draw()`. +//! +//! The translator is deliberately narrow: when it encounters an opcode / +//! fetch format / CF shape it doesn't know, it returns [`None`] and the +//! caller falls back to the interpreter. This keeps the op-coverage work +//! incremental — future commits can add one opcode at a time without +//! invalidating the scaffolding. +//! +//! Current coverage (v1): +//! * Linear CF: `Exec`/`ExecEnd`, `Alloc`, `Exit`. No loops / branches / +//! calls / predicate-gated clauses. +//! 
* ALU vector: `ADD`, `MUL`, `MAX`, `MIN`, `MAD`, `DP4`, `DP3`, +//! `DP2_ADD`, `SEQ`, `SGT`, `SGE`, `SNE`, `FRC`, `FLOOR`. +//! * ALU scalar: `ADDS`, `MULS`, `MAXS`, `MINS`, `RCP`, `RETAIN_PREV`. +//! * Vertex fetch: `R32G32B32A32_FLOAT` only. +//! * Texture fetch: 2D via the single `@group(1)` slot (same one P5/M6 +//! binds). +//! * Exports: VS writes position + interpolator 0 (color); PS writes +//! color0. +//! +//! When a shader exceeds this subset, [`translate`] returns `None` and +//! `gpu.shader.translate_reject{reason}` is bumped by the caller. + +use crate::ucode::alu::{decode_alu, sop, vop, AluInstruction}; +use crate::ucode::control_flow::{AllocKind, ControlFlowInstruction}; +use crate::ucode::fetch::{decode_fetch, FetchInstruction}; +use crate::ucode::ParsedShader; + +/// Shader stage we're emitting for. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Stage { + Vertex, + Pixel, +} + +/// Success or refusal from the translator. On refusal, the caller falls +/// back to the runtime uber-shader interpreter. +#[derive(Debug)] +pub enum Translation { + /// The emitted WGSL body for *this stage only*. Both VS + PS get + /// wrapped into one module via [`combine_stages`]. + Ok(String), + /// Translator saw an op/pattern it doesn't handle; fallback. + Reject(&'static str), +} + +/// Full WGSL module for a (VS, PS) pair ready to hand to +/// `wgpu::Device::create_shader_module`. Shares the header across the two +/// bodies so bindings, struct declarations, and helpers aren't duplicated. +pub fn combine_stages(vs_body: &str, ps_body: &str) -> String { + let mut out = String::with_capacity(4096 + vs_body.len() + ps_body.len()); + out.push_str(MODULE_HEADER); + out.push_str(vs_body); + out.push_str(ps_body); + out +} + +/// Translate a single shader stage. Returns `None` on any unsupported +/// feature with a short reason string that the caller plumbs into the +/// `gpu.shader.translate_reject{reason}` metric. 
+pub fn translate(parsed: &ParsedShader, stage: Stage) -> Translation { + let mut ctx = EmitCtx::new(stage); + // Emit the stage entry function body. + if let Err(reason) = ctx.emit_stage_body(parsed) { + return Translation::Reject(reason); + } + Translation::Ok(ctx.finish()) +} + +/// Reject reasons; kept as static &'str for zero-alloc metrics. +pub mod reject { + pub const VEC_OP_UNSUPPORTED: &str = "vec_op_unsupported"; + pub const SCL_OP_UNSUPPORTED: &str = "scl_op_unsupported"; + pub const CF_LOOP: &str = "cf_loop"; + pub const CF_COND: &str = "cf_cond"; + pub const CF_CALL: &str = "cf_call"; + pub const CF_UNKNOWN: &str = "cf_unknown"; + pub const VFETCH_FMT: &str = "vfetch_fmt"; + pub const TFETCH_NON2D: &str = "tfetch_non2d"; + pub const INSTR_OOB: &str = "instr_oob"; +} + +/// Shader-module preamble (bindings, helpers, struct defs). The bindings +/// mirror the xenos pipeline's `@group(0)` + `@group(1)` layout from P5/M6 +/// so we can use **the same bind-group slots** — only the pipeline object +/// differs between interpreter mode and translator mode. +const MODULE_HEADER: &str = r#" +struct XenosDrawConstants { + draw_index: u32, + vertex_count: u32, + prim_kind: u32, + _pad: u32, +}; + +struct XenosConstants { + alu: array, 512>, + fetch: array, + bool_consts: array, + loop_consts: array, +}; + +@group(0) @binding(0) var draw_ctx : XenosDrawConstants; +@group(0) @binding(1) var xenos_consts : XenosConstants; +@group(0) @binding(2) var vs_ucode : array; +@group(0) @binding(3) var ps_ucode : array; +@group(0) @binding(4) var vertex_buffer : array; + +@group(1) @binding(0) var xenos_tex : texture_2d; +@group(1) @binding(1) var xenos_samp : sampler; + +struct VsOut { + @builtin(position) position: vec4, + @location(0) color: vec4, +}; + +struct FsOut { + @location(0) color0: vec4, +}; + +// Helper: reciprocal guarded against divide-by-zero. 
+fn xe_rcp(x: f32) -> f32 { + return select(0.0, 1.0 / x, x != 0.0); +} +"#; + +struct EmitCtx { + stage: Stage, + out: String, + indent: usize, +} + +impl EmitCtx { + fn new(stage: Stage) -> Self { + Self { + stage, + out: String::with_capacity(2048), + indent: 0, + } + } + + fn finish(self) -> String { + self.out + } + + fn push(&mut self, s: &str) { + for _ in 0..self.indent { + self.out.push_str(" "); + } + self.out.push_str(s); + self.out.push('\n'); + } + + fn emit_stage_body(&mut self, parsed: &ParsedShader) -> Result<(), &'static str> { + // Entry function + struct header. + match self.stage { + Stage::Vertex => { + self.push("@vertex"); + self.push("fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {"); + } + Stage::Pixel => { + self.push("@fragment"); + self.push("fn fs_main(in: VsOut) -> FsOut {"); + } + } + self.indent = 1; + // Register file + ps chain + export slots. All local `var`s so each + // invocation gets its own state; translator-emitted code doesn't + // need `var` because we don't share across function calls. + self.push("var r: array, 128>;"); + self.push("for (var i = 0u; i < 128u; i = i + 1u) { r[i] = vec4(0.0); }"); + self.push("var ps: f32 = 0.0;"); + match self.stage { + Stage::Vertex => { + // Seed r0 with vertex index for simple shaders that read it. + self.push("r[0] = vec4(f32(vidx), 0.0, 0.0, 1.0);"); + // Synthetic export slots — match the interpreter's layout so + // the fallback path and translator path produce the same + // visual output on shaders both support. + self.push("var opos: vec4 = vec4(0.0, 0.0, 0.0, 1.0);"); + self.push("var ocolor: vec4 = vec4(1.0, 1.0, 1.0, 1.0);"); + } + Stage::Pixel => { + // Seed r0.xy with interpolated color lane so trivial shaders + // that read r0 still produce something. 
+ self.push("r[0] = in.color;"); + self.push("var ocolor0: vec4 = in.color;"); + } + } + + let mut current_alloc = AllocKind::Other; + for clause in &parsed.cf { + match clause { + ControlFlowInstruction::Exec { + address, + count, + sequence, + is_end, + predicated, + .. + } => { + if *predicated { + return Err(reject::CF_COND); + } + self.emit_exec(parsed, *address, *count, *sequence, current_alloc)?; + if *is_end { + break; + } + } + ControlFlowInstruction::Alloc { kind, .. } => { + current_alloc = *kind; + } + ControlFlowInstruction::Exit => break, + ControlFlowInstruction::LoopStart { .. } + | ControlFlowInstruction::LoopEnd { .. } => return Err(reject::CF_LOOP), + ControlFlowInstruction::CondJmp { .. } => return Err(reject::CF_COND), + ControlFlowInstruction::CondCall { .. } | ControlFlowInstruction::Return => { + return Err(reject::CF_CALL); + } + ControlFlowInstruction::Unknown { .. } => return Err(reject::CF_UNKNOWN), + } + } + + match self.stage { + Stage::Vertex => { + self.push("var out: VsOut;"); + self.push("out.position = opos;"); + self.push("out.color = ocolor;"); + self.push("return out;"); + } + Stage::Pixel => { + self.push("var out: FsOut;"); + self.push("out.color0 = ocolor0;"); + self.push("return out;"); + } + } + self.indent = 0; + self.push("}"); + Ok(()) + } + + fn emit_exec( + &mut self, + parsed: &ParsedShader, + address: u32, + count: u32, + sequence: u32, + current_alloc: AllocKind, + ) -> Result<(), &'static str> { + for i in 0..(count as usize) { + let triple_idx = address as usize + i; + let base = triple_idx * 3; + if base + 2 >= parsed.instructions.len() { + return Err(reject::INSTR_OOB); + } + let words = [ + parsed.instructions[base], + parsed.instructions[base + 1], + parsed.instructions[base + 2], + ]; + let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0; + if is_fetch { + match decode_fetch(words) { + FetchInstruction::Vertex(vf) => self.emit_vfetch(&vf)?, + FetchInstruction::Texture(tf) => { + if tf.dimension != 1 { + 
return Err(reject::TFETCH_NON2D); + } + self.emit_tfetch(&tf); + } + FetchInstruction::Unknown { .. } => return Err(reject::VFETCH_FMT), + } + } else { + let alu = decode_alu(words); + self.emit_alu(&alu, current_alloc)?; + } + } + Ok(()) + } + + fn emit_alu( + &mut self, + alu: &AluInstruction, + current_alloc: AllocKind, + ) -> Result<(), &'static str> { + let a = format!("r[{}u]", alu.src_a & 0x7F); + let b = format!("r[{}u]", alu.src_b & 0x7F); + let c = format!("r[{}u]", alu.src_c & 0x7F); + + // Vector pipe. + if alu.vector_write_mask != 0 { + let expr = vector_expr(alu.vector_opcode, &a, &b, &c) + .ok_or(reject::VEC_OP_UNSUPPORTED)?; + let dst_reg = alu.vector_dest & 0x7F; + if alu.vector_dest_is_export { + self.emit_export(dst_reg, current_alloc, &expr, alu.vector_write_mask); + } else { + self.emit_masked_write(&format!("r[{dst_reg}u]"), &expr, alu.vector_write_mask); + } + } + + // Scalar pipe. Binary ops use (src_a.x, src_b.x); ps-variants use + // src_a.x + running ps. `scl_src_a` mirrors the interpreter's + // `scalar_src_is_ps` selector. 
+ let scl_src_a = if alu.scalar_src_is_ps { + "ps".to_string() + } else { + format!("{}.x", a) + }; + let scl_src_b = format!("{}.x", b); + let expr = scalar_expr(alu.scalar_opcode, &scl_src_a, &scl_src_b, "ps") + .ok_or(reject::SCL_OP_UNSUPPORTED)?; + self.push(&format!("ps = {expr};")); + if alu.scalar_write_mask != 0 { + let v = "vec4(ps, ps, ps, ps)"; + let dst_reg = alu.scalar_dest & 0x7F; + self.emit_masked_write(&format!("r[{dst_reg}u]"), v, alu.scalar_write_mask); + } + Ok(()) + } + + fn emit_masked_write(&mut self, lhs: &str, rhs: &str, mask: u8) { + if mask == 0xF { + self.push(&format!("{lhs} = {rhs};")); + return; + } + self.push(&"{".to_string()); + self.indent += 1; + self.push(&format!("let _prev = {lhs};")); + self.push(&format!("let _new = {rhs};")); + let mut components = Vec::new(); + let letters = ['x', 'y', 'z', 'w']; + for (i, c) in letters.iter().enumerate() { + if (mask >> i) & 1 == 1 { + components.push(format!("_new.{c}")); + } else { + components.push(format!("_prev.{c}")); + } + } + self.push(&format!( + "{lhs} = vec4({}, {}, {}, {});", + components[0], components[1], components[2], components[3] + )); + self.indent -= 1; + self.push("}"); + } + + fn emit_export(&mut self, dst_reg: u8, alloc: AllocKind, expr: &str, mask: u8) { + // Xenos's export "register" indexing within an alloc range is + // normally (alloc_base + offset). Since our CF stream doesn't + // carry per-export slot offsets cleanly, use `alloc` to pick the + // target. 
+ let lhs = match (self.stage, alloc) { + (Stage::Vertex, AllocKind::Position) => "opos", + (Stage::Vertex, AllocKind::Interpolators) => "ocolor", + (Stage::Vertex, AllocKind::Colors) => "ocolor", + (Stage::Vertex, _) => "ocolor", // fall through — any other alloc + (Stage::Pixel, AllocKind::Colors) => "ocolor0", + (Stage::Pixel, _) => "ocolor0", + }; + let _ = dst_reg; // per-slot export indexing reserved for a richer v2 + self.emit_masked_write(lhs, expr, mask); + } + + fn emit_vfetch(&mut self, vf: &crate::ucode::fetch::VertexFetch) -> Result<(), &'static str> { + // v1: treat all vertex fetches as R32G32B32A32_FLOAT, stride = 4 + // dwords. Matches the interpreter's MVP semantics; unlocks more + // formats alongside the CPU texture cache's format expansion. + let fetch_const = (vf.raw[0] >> 5) & 0x1F; + let src_reg = vf.src_register & 0x7F; + let dst_reg = vf.dest_register & 0x7F; + self.push(&format!( + "{{ let fc0 = xenos_consts.fetch[{}u]; \ + let base = (fc0 & 0xFFFFFFFCu) >> 2u; \ + let vidx = u32(r[{src_reg}u].x); \ + let addr = base + vidx * 4u; \ + let n = arrayLength(&vertex_buffer); \ + if (addr + 3u < n) {{ \ + r[{dst_reg}u] = vec4( \ + bitcast(vertex_buffer[addr + 0u]), \ + bitcast(vertex_buffer[addr + 1u]), \ + bitcast(vertex_buffer[addr + 2u]), \ + bitcast(vertex_buffer[addr + 3u])); \ + }} }}", + fetch_const * 2, + )); + Ok(()) + } + + fn emit_tfetch(&mut self, tf: &crate::ucode::fetch::TextureFetch) { + // v1: sample the single bound texture; UV = r[src].xy. P5's cache + // publishes the `fetch_const=0` texture into `@group(1)`; slot + // mismatch is a silent magenta for now. 
+ let src_reg = tf.src_register & 0x7F; + let dst_reg = tf.dest_register & 0x7F; + self.push(&format!( + "r[{dst_reg}u] = textureSampleLevel(xenos_tex, xenos_samp, r[{src_reg}u].xy, 0.0);" + )); + } +} + +fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option { + let s = match op { + vop::ADD => format!("({a} + {b})"), + vop::MUL => format!("({a} * {b})"), + vop::MAX => format!("max({a}, {b})"), + vop::MIN => format!("min({a}, {b})"), + vop::MAD => format!("({a} * {b} + {c})"), + vop::DOT4 => format!("vec4(dot({a}, {b}))"), + vop::DOT3 => format!("vec4(dot({a}.xyz, {b}.xyz))"), + vop::DOT2_ADD => format!( + "vec4({a}.x * {b}.x + {a}.y * {b}.y + {c}.x)" + ), + vop::SEQ => format!( + "vec4(select(0.0,1.0,{a}.x=={b}.x), select(0.0,1.0,{a}.y=={b}.y), select(0.0,1.0,{a}.z=={b}.z), select(0.0,1.0,{a}.w=={b}.w))" + ), + vop::SGT => format!( + "vec4(select(0.0,1.0,{a}.x>{b}.x), select(0.0,1.0,{a}.y>{b}.y), select(0.0,1.0,{a}.z>{b}.z), select(0.0,1.0,{a}.w>{b}.w))" + ), + vop::SGE => format!( + "vec4(select(0.0,1.0,{a}.x>={b}.x), select(0.0,1.0,{a}.y>={b}.y), select(0.0,1.0,{a}.z>={b}.z), select(0.0,1.0,{a}.w>={b}.w))" + ), + vop::SNE => format!( + "vec4(select(0.0,1.0,{a}.x!={b}.x), select(0.0,1.0,{a}.y!={b}.y), select(0.0,1.0,{a}.z!={b}.z), select(0.0,1.0,{a}.w!={b}.w))" + ), + vop::FRC => format!("fract({a})"), + vop::FLOOR => format!("floor({a})"), + _ => return None, + }; + Some(s) +} + +fn scalar_expr(op: u8, a: &str, b: &str, prev: &str) -> Option { + let s = match op { + sop::ADDS => format!("({a} + {b})"), + sop::ADDS_PREV => format!("({a} + {prev})"), + sop::MULS => format!("({a} * {b})"), + sop::MULS_PREV => format!("({a} * {prev})"), + sop::MAXS => format!("max({a}, {b})"), + sop::MINS => format!("min({a}, {b})"), + sop::RCP => format!("xe_rcp({a})"), + sop::RETAIN_PREV => prev.to_string(), + _ => return None, + }; + Some(s) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ucode::alu::{sop, vop}; + use 
crate::ucode::control_flow::ControlFlowInstruction; + + fn synthetic_trivial_shader() -> ParsedShader { + // Single Exec clause: ALU add r0 = r0 + r0; scalar_op = RETAIN_PREV + // with full write-mask on vector, zero on scalar. Alloc(Position) + // precedes so the ALU's export (if it were one) would target oPos. + let w2 = (vop::ADD as u32) + | ((sop::RETAIN_PREV as u32) << 6) + | (0xF << 12) // vector_write_mask + | (0u32 << 16); // vector_dest = 0 + ParsedShader { + cf: vec![ + ControlFlowInstruction::Alloc { + size: 1, + kind: AllocKind::Position, + }, + ControlFlowInstruction::Exec { + address: 0, + count: 1, + sequence: 0, + is_end: true, + predicated: false, + predicate_condition: false, + }, + ], + instructions: vec![0, 0, w2], + } + } + + #[test] + fn trivial_shader_translates() { + let shader = synthetic_trivial_shader(); + match translate(&shader, Stage::Vertex) { + Translation::Ok(body) => { + assert!(body.contains("fn vs_main")); + assert!(body.contains("r[0u] = (r[0u] + r[0u]);")); + assert!(body.contains("return out;")); + } + Translation::Reject(r) => panic!("rejected: {r}"), + } + } + + #[test] + fn combined_module_parses_as_wgsl() { + let shader = synthetic_trivial_shader(); + let vs = match translate(&shader, Stage::Vertex) { + Translation::Ok(body) => body, + Translation::Reject(r) => panic!("VS rejected: {r}"), + }; + let ps = match translate(&shader, Stage::Pixel) { + Translation::Ok(body) => body, + Translation::Reject(r) => panic!("PS rejected: {r}"), + }; + let module = combine_stages(&vs, &ps); + // naga is pinned as a dev-dep in this crate; if this fails the + // translator is emitting invalid WGSL. 
+ match naga::front::wgsl::parse_str(&module) { + Ok(_) => {} + Err(e) => panic!( + "emitted WGSL failed to parse:\n{}\n--- module ---\n{}", + e, module + ), + } + } + + #[test] + fn loop_clause_rejected() { + let shader = ParsedShader { + cf: vec![ControlFlowInstruction::LoopStart { + address: 0, + loop_id: 0, + }], + instructions: vec![], + }; + assert!(matches!( + translate(&shader, Stage::Vertex), + Translation::Reject(reject::CF_LOOP) + )); + } + + #[test] + fn unsupported_op_rejected() { + let w2 = (29u32) // VOP_MAX_A, not in v1 subset + | ((sop::RETAIN_PREV as u32) << 6) + | (0xF << 12); + let shader = ParsedShader { + cf: vec![ControlFlowInstruction::Exec { + address: 0, + count: 1, + sequence: 0, + is_end: true, + predicated: false, + predicate_condition: false, + }], + instructions: vec![0, 0, w2], + }; + assert!(matches!( + translate(&shader, Stage::Vertex), + Translation::Reject(reject::VEC_OP_UNSUPPORTED) + )); + } +} diff --git a/crates/xenia-gpu/src/ucode/alu.rs b/crates/xenia-gpu/src/ucode/alu.rs new file mode 100644 index 0000000..4130a29 --- /dev/null +++ b/crates/xenia-gpu/src/ucode/alu.rs @@ -0,0 +1,206 @@ +//! Xenos ALU (vector + scalar) instruction decoder. +//! +//! An ALU instruction is 96 bits = 3 dwords. The three dwords encode: +//! - word0: operand modifier flags + destination info +//! - word1: source register / swizzle fields +//! - word2: opcode + write mask + export target +//! +//! See `ucode.h:900-1400` for the full field map. This decoder captures the +//! minimal shape the uber-shader needs; flags we don't interpret yet are +//! retained as raw bits in `raw` for downstream inspection. + +/// Decoded ALU instruction. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct AluInstruction { + /// Vector ALU opcode (bits 0..6 of word2 in canary's layout). + pub vector_opcode: u8, + /// Scalar ALU opcode (bits 7..13 of word2). + pub scalar_opcode: u8, + /// Destination register index for vector result (7 bits). 
+ pub vector_dest: u8, + /// Destination register index for scalar result (7 bits). + pub scalar_dest: u8, + /// 4-bit write mask for the vector result (x/y/z/w). + pub vector_write_mask: u8, + /// 4-bit write mask for the scalar result. + pub scalar_write_mask: u8, + /// Set when the instruction should write to the export bank (position, + /// interpolators, color, etc.) instead of the general register file. + pub vector_dest_is_export: bool, + /// Selects `ps` (previous scalar result) as the scalar operand when set. + pub scalar_src_is_ps: bool, + /// Source register indices (at most 3 for vector ops). + pub src_a: u8, + pub src_b: u8, + pub src_c: u8, + /// Set when the instruction is predicated; skipped if the predicate + /// doesn't match `predicate_condition`. + pub predicated: bool, + pub predicate_condition: bool, + /// Raw dwords — preserved verbatim so the translator / interpreter can + /// reach into fields we haven't parsed explicitly yet. + pub raw: [u32; 3], +} + +/// Decode a 3-dword ALU triple. +pub fn decode_alu(words: [u32; 3]) -> AluInstruction { + let w0 = words[0]; + let _w1 = words[1]; + let w2 = words[2]; + AluInstruction { + vector_opcode: (w2 & 0x3F) as u8, + scalar_opcode: ((w2 >> 6) & 0x3F) as u8, + vector_dest: ((w2 >> 16) & 0x7F) as u8, + scalar_dest: ((w2 >> 24) & 0x7F) as u8, + vector_write_mask: ((w2 >> 12) & 0xF) as u8, + scalar_write_mask: ((w2 >> 8) & 0xF) as u8, + vector_dest_is_export: ((w2 >> 23) & 1) != 0, + scalar_src_is_ps: ((w0 >> 26) & 1) != 0, + src_a: (w0 & 0xFF) as u8, + src_b: ((w0 >> 8) & 0xFF) as u8, + src_c: ((w0 >> 16) & 0xFF) as u8, + predicated: ((w0 >> 27) & 1) != 0, + predicate_condition: ((w0 >> 28) & 1) != 0, + raw: words, + } +} + +/// Vector ALU opcodes we reference by name. Values match canary's +/// `AluVectorOpcode` enum in `ucode.h:1354`. 
+pub mod vop { + pub const ADD: u8 = 0; + pub const MUL: u8 = 1; + pub const MAX: u8 = 2; + pub const MIN: u8 = 3; + pub const SEQ: u8 = 4; + pub const SGT: u8 = 5; + pub const SGE: u8 = 6; + pub const SNE: u8 = 7; + pub const FRC: u8 = 8; + pub const TRUNC: u8 = 9; + pub const FLOOR: u8 = 10; + pub const MAD: u8 = 11; + pub const CND_EQ: u8 = 12; + pub const CND_GE: u8 = 13; + pub const CND_GT: u8 = 14; + pub const DOT4: u8 = 15; + pub const DOT3: u8 = 16; + pub const DOT2_ADD: u8 = 17; + pub const CUBE: u8 = 18; + pub const MAX4: u8 = 19; + pub const SETP_EQ_PUSH: u8 = 20; + pub const SETP_NE_PUSH: u8 = 21; + pub const SETP_GT_PUSH: u8 = 22; + pub const SETP_GE_PUSH: u8 = 23; + pub const KILL_EQ: u8 = 24; + pub const KILL_GT: u8 = 25; + pub const KILL_GE: u8 = 26; + pub const KILL_NE: u8 = 27; + pub const DST: u8 = 28; + pub const MAX_A: u8 = 29; +} + +/// Scalar ALU opcodes. Values match canary's `AluScalarOpcode` enum in +/// `ucode.h:1001`. +pub mod sop { + pub const ADDS: u8 = 0; + pub const ADDS_PREV: u8 = 1; + pub const MULS: u8 = 2; + pub const MULS_PREV: u8 = 3; + pub const MULS_PREV2: u8 = 4; + pub const MAXS: u8 = 5; + pub const MINS: u8 = 6; + pub const SEQS: u8 = 7; + pub const SGTS: u8 = 8; + pub const SGES: u8 = 9; + pub const SNES: u8 = 10; + pub const FRCS: u8 = 11; + pub const TRUNCS: u8 = 12; + pub const FLOORS: u8 = 13; + pub const EXP: u8 = 14; + pub const LOGC: u8 = 15; + pub const LOG: u8 = 16; + pub const RCPC: u8 = 17; + pub const RCPF: u8 = 18; + pub const RCP: u8 = 19; + pub const RSQC: u8 = 20; + pub const RSQF: u8 = 21; + pub const RSQ: u8 = 22; + pub const MAXAS: u8 = 23; + pub const MAXASF: u8 = 24; + pub const SUBS: u8 = 25; + pub const SUBS_PREV: u8 = 26; + pub const SETP_EQ: u8 = 27; + pub const SETP_NE: u8 = 28; + pub const SETP_GT: u8 = 29; + pub const SETP_GE: u8 = 30; + pub const SETP_INV: u8 = 31; + pub const SETP_POP: u8 = 32; + pub const SETP_CLR: u8 = 33; + pub const SETP_RSTR: u8 = 34; + pub const KILLS_EQ: u8 = 35; + pub 
const KILLS_GT: u8 = 36; + pub const KILLS_GE: u8 = 37; + pub const KILLS_NE: u8 = 38; + pub const KILLS_ONE: u8 = 39; + pub const SQRT: u8 = 40; + pub const MULSC0: u8 = 42; + pub const MULSC1: u8 = 43; + pub const ADDSC0: u8 = 44; + pub const ADDSC1: u8 = 45; + pub const SUBSC0: u8 = 46; + pub const SUBSC1: u8 = 47; + pub const SIN: u8 = 48; + pub const COS: u8 = 49; + pub const RETAIN_PREV: u8 = 50; +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Regression: our table previously drifted from canary's values (e.g. + /// `MAXS=6` when canary says 5, shifting everything through SQRT). Pin + /// the most-often-used scalar + vector opcodes here. + #[test] + fn opcodes_match_canary_values() { + // Scalar. + assert_eq!(sop::MAXS, 5); + assert_eq!(sop::MINS, 6); + assert_eq!(sop::SEQS, 7); + assert_eq!(sop::EXP, 14); + assert_eq!(sop::LOG, 16); + assert_eq!(sop::RCP, 19); + assert_eq!(sop::RSQ, 22); + assert_eq!(sop::SUBS, 25); + assert_eq!(sop::SETP_EQ, 27); + assert_eq!(sop::KILLS_EQ, 35); + assert_eq!(sop::SQRT, 40); + assert_eq!(sop::SIN, 48); + assert_eq!(sop::RETAIN_PREV, 50); + // Vector. 
+ assert_eq!(vop::SNE, 7); + assert_eq!(vop::CND_EQ, 12); + assert_eq!(vop::MAX4, 19); + assert_eq!(vop::KILL_EQ, 24); + assert_eq!(vop::DST, 28); + } + + #[test] + fn decode_extracts_opcodes_and_dests() { + // Build a minimal ALU word: + // vector_opcode = ADD (0), scalar_opcode = RCP (22), + // vector_dest = 3, scalar_dest = 7, vector_write_mask = 0xF + let w2 = (vop::ADD as u32) + | ((sop::RCP as u32) << 6) + | (0xF << 12) // vector_write_mask + | (3u32 << 16) // vector_dest + | (7u32 << 24); // scalar_dest + let alu = decode_alu([0, 0, w2]); + assert_eq!(alu.vector_opcode, vop::ADD); + assert_eq!(alu.scalar_opcode, sop::RCP); + assert_eq!(alu.vector_dest, 3); + assert_eq!(alu.scalar_dest, 7); + assert_eq!(alu.vector_write_mask, 0xF); + } +} diff --git a/crates/xenia-gpu/src/ucode/control_flow.rs b/crates/xenia-gpu/src/ucode/control_flow.rs new file mode 100644 index 0000000..941a49d --- /dev/null +++ b/crates/xenia-gpu/src/ucode/control_flow.rs @@ -0,0 +1,173 @@ +//! Xenos control-flow clause decoder. +//! +//! A shader's CF block is a sequence of 48-bit clauses packed two-per- +//! three-dword row. Each clause encodes an opcode and type-specific fields +//! (exec addr/count, loop start/end, branch target, etc.). +//! +//! Spec at `xenia-canary/src/xenia/gpu/ucode.h:87-256`. We cover the subset +//! the uber-shader needs: `Exec*`, `Loop*`, `Alloc`, `Jmp`, `Call/Ret`, +//! `Exit`. Unknown opcodes are classified as `Unknown { opcode }` so the +//! translator can log + degrade. + +/// Parsed representation of one CF clause. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ControlFlowInstruction { + /// `kExec` / `kExecEnd` — execute a range of ALU/fetch instructions. + Exec { + /// Instruction-block dword index where this clause's instructions start, + /// expressed in **triple units** (each inst = 3 dwords). + address: u32, + /// Number of triples to execute. + count: u32, + /// The ALU-vs-fetch sequence bitmap (2 bits per instruction). 
+ sequence: u32, + /// True when this clause ends the shader. + is_end: bool, + /// True if predicated; skip when predicate != predicate_condition. + predicated: bool, + predicate_condition: bool, + }, + /// `kLoopStart` — begin a `aL` loop referencing a loop constant. + LoopStart { address: u32, loop_id: u32 }, + /// `kLoopEnd` — close the loop; `address` points at the matching start. + LoopEnd { address: u32, loop_id: u32 }, + /// `kCondJmp` — conditional jump to another CF index. + CondJmp { + target: u32, + predicated: bool, + predicate_condition: bool, + }, + /// `kCondCall` — call into another CF subroutine. + CondCall { target: u32 }, + /// `kReturn` — return from subroutine. + Return, + /// `kAlloc` — pre-allocate export registers (position, interpolators, colors). + Alloc { size: u32, kind: AllocKind }, + /// Exit the shader (terminal). + Exit, + /// Unknown / unhandled opcode. + Unknown { opcode: u8 }, +} + +/// Export target types for `kAlloc` clauses. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum AllocKind { + Position, + Interpolators, + Colors, + Memexport, + Other, +} + +impl AllocKind { + fn from_bits(b: u32) -> Self { + match b & 0x7 { + 0 => AllocKind::Position, + 1 => AllocKind::Interpolators, + 2 => AllocKind::Colors, + 3 => AllocKind::Memexport, + _ => AllocKind::Other, + } + } +} + +/// Decode one row (three consecutive CF dwords) into two CF clauses. +/// +/// Word layout per canary (`ucode.h:218-256`): +/// - word0 + lo16(word1) → CF_A's 48-bit payload +/// - hi16(word1) + word2 → CF_B's 48-bit payload +/// +/// The opcode lives in the top 4 bits of the 48-bit payload (= bits 44..47). +pub fn decode_cf_pair(word0: u32, word1: u32, word2: u32) -> (ControlFlowInstruction, ControlFlowInstruction) { + // Build each 48-bit value as u64; LE within the clause. 
+ let a = (word0 as u64) | ((word1 as u64 & 0xFFFF) << 32); + let b = ((word1 as u64 >> 16) & 0xFFFF) | ((word2 as u64) << 16); + (decode_single(a), decode_single(b)) +} + +fn decode_single(payload: u64) -> ControlFlowInstruction { + // Top 4 bits of the 48-bit payload. + let opcode = ((payload >> 44) & 0xF) as u8; + // Predicate bit + condition live at the 28..30 range for exec/jmp. Rough + // extraction — good enough for the interpreter, which logs unknowns. + let predicated = ((payload >> 28) & 1) != 0; + let predicate_condition = ((payload >> 29) & 1) != 0; + + match opcode { + 0 => ControlFlowInstruction::Exec { + address: (payload & 0xFFF) as u32, + count: ((payload >> 12) & 0x7) as u32, + sequence: ((payload >> 16) & 0xFFF) as u32, + is_end: false, + predicated, + predicate_condition, + }, + 1 => ControlFlowInstruction::Exit, + 2 => ControlFlowInstruction::Exec { + address: (payload & 0xFFF) as u32, + count: ((payload >> 12) & 0x7) as u32, + sequence: ((payload >> 16) & 0xFFF) as u32, + is_end: true, + predicated, + predicate_condition, + }, + 6 => ControlFlowInstruction::LoopStart { + address: (payload & 0x3FF) as u32, + loop_id: ((payload >> 16) & 0x1F) as u32, + }, + 7 => ControlFlowInstruction::LoopEnd { + address: (payload & 0x3FF) as u32, + loop_id: ((payload >> 16) & 0x1F) as u32, + }, + 8 => ControlFlowInstruction::CondCall { + target: (payload & 0x3FF) as u32, + }, + 9 => ControlFlowInstruction::Return, + 10 => ControlFlowInstruction::CondJmp { + target: (payload & 0x3FF) as u32, + predicated, + predicate_condition, + }, + 12 => ControlFlowInstruction::Alloc { + size: (payload & 0x7) as u32, + kind: AllocKind::from_bits(((payload >> 4) & 0x7) as u32), + }, + other => ControlFlowInstruction::Unknown { opcode: other }, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn opcode_exit_decodes() { + // opcode 1 (Exit) in bits 44..47 of A's 48-bit payload. 
+ let payload: u64 = 1u64 << 44; + let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32); + let cf = decode_cf_pair(hi, lo, 0).0; + assert_eq!(cf, ControlFlowInstruction::Exit); + } + + #[test] + fn opcode_exec_end_carries_address_count() { + // opcode 2 (ExecEnd), address=4, count=2, sequence=0. + let payload: u64 = (2u64 << 44) | (2u64 << 12) | 4; + let hi = (payload & 0xFFFF_FFFF) as u32; + let lo = ((payload >> 32) & 0xFFFF) as u32; + let cf = decode_cf_pair(hi, lo, 0).0; + match cf { + ControlFlowInstruction::Exec { + address, + count, + is_end, + .. + } => { + assert_eq!(address, 4); + assert_eq!(count, 2); + assert!(is_end); + } + other => panic!("expected Exec, got {other:?}"), + } + } +} diff --git a/crates/xenia-gpu/src/ucode/fetch.rs b/crates/xenia-gpu/src/ucode/fetch.rs new file mode 100644 index 0000000..85d1bba --- /dev/null +++ b/crates/xenia-gpu/src/ucode/fetch.rs @@ -0,0 +1,117 @@ +//! Xenos fetch (vertex + texture) instruction decoder. +//! +//! Like ALU instructions, fetches are 96 bits (3 dwords). The opcode lives +//! in the low 5 bits of word0. We split them into `VertexFetch` and +//! `TextureFetch` structurally because their operand layouts differ. +//! +//! Reference: `xenia-canary/src/xenia/gpu/ucode.h:690-877`. + +/// Decoded fetch instruction. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FetchInstruction { + Vertex(VertexFetch), + Texture(TextureFetch), + /// Unknown / minor variants we don't model yet. + Unknown { opcode: u8, raw: [u32; 3] }, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct VertexFetch { + /// Vertex fetch constant index (0..=95). + pub fetch_const: u8, + /// Source register index (vertex index in r#). + pub src_register: u8, + /// Destination register for the fetched value. + pub dest_register: u8, + /// 4-bit write mask. 
+ pub dest_write_mask: u8, + pub raw: [u32; 3], +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct TextureFetch { + /// Texture fetch constant index (0..=31). + pub fetch_const: u8, + pub src_register: u8, + pub dest_register: u8, + pub dest_write_mask: u8, + /// Dimension: 0=1D, 1=2D, 2=3D/stacked, 3=cube. + pub dimension: u8, + pub raw: [u32; 3], +} + +/// Opcodes (low 5 bits of word0). From `ucode.h`. +pub mod op { + pub const VERTEX_FETCH: u8 = 0x00; + pub const TEXTURE_FETCH: u8 = 0x01; + pub const GET_TEXTURE_BORDER_COLOR_FRAC: u8 = 0x16; + pub const GET_TEXTURE_COMPUTED_LOD: u8 = 0x17; + pub const GET_TEXTURE_WEIGHTS: u8 = 0x18; + pub const GET_TEXTURE_GRADIENTS: u8 = 0x19; + pub const SET_TEXTURE_LOD: u8 = 0x1A; + pub const SET_TEXTURE_GRADIENTS_HORZ: u8 = 0x1B; + pub const SET_TEXTURE_GRADIENTS_VERT: u8 = 0x1C; +} + +pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction { + let w0 = words[0]; + let w1 = words[1]; + let opcode = (w0 & 0x1F) as u8; + match opcode { + op::VERTEX_FETCH => FetchInstruction::Vertex(VertexFetch { + fetch_const: ((w0 >> 5) & 0x1F) as u8, + src_register: ((w0 >> 17) & 0x7F) as u8, + dest_register: ((w0 >> 10) & 0x7F) as u8, + dest_write_mask: ((w1 >> 23) & 0xF) as u8, + raw: words, + }), + op::TEXTURE_FETCH => FetchInstruction::Texture(TextureFetch { + fetch_const: ((w0 >> 5) & 0x1F) as u8, + src_register: ((w0 >> 17) & 0x7F) as u8, + dest_register: ((w0 >> 10) & 0x7F) as u8, + dest_write_mask: ((w1 >> 23) & 0xF) as u8, + dimension: ((w1 >> 29) & 0x3) as u8, + raw: words, + }), + _ => FetchInstruction::Unknown { opcode, raw: words }, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn decode_vertex_fetch() { + // opcode=0 (vertex), fetch_const=5, src=2, dest=7. 
+ let w0 = 0u32 | (5 << 5) | (7 << 10) | (2 << 17); + let v = decode_fetch([w0, 0, 0]); + match v { + FetchInstruction::Vertex(vf) => { + assert_eq!(vf.fetch_const, 5); + assert_eq!(vf.src_register, 2); + assert_eq!(vf.dest_register, 7); + } + other => panic!("expected Vertex, got {other:?}"), + } + } + + #[test] + fn decode_texture_fetch() { + let w0 = 1u32 | (3 << 5) | (4 << 10) | (1 << 17); + let t = decode_fetch([w0, (2u32 << 29), 0]); + match t { + FetchInstruction::Texture(tf) => { + assert_eq!(tf.fetch_const, 3); + assert_eq!(tf.dimension, 2); + } + other => panic!("expected Texture, got {other:?}"), + } + } + + #[test] + fn unknown_opcode_is_classified() { + let v = decode_fetch([0x16, 0, 0]); // GET_TEXTURE_BORDER_COLOR_FRAC + assert!(matches!(v, FetchInstruction::Unknown { opcode: 0x16, .. })); + } +} diff --git a/crates/xenia-gpu/src/ucode/mod.rs b/crates/xenia-gpu/src/ucode/mod.rs new file mode 100644 index 0000000..70d2349 --- /dev/null +++ b/crates/xenia-gpu/src/ucode/mod.rs @@ -0,0 +1,249 @@ +//! Xenos (ATI R500-family) shader microcode decoder. +//! +//! Ground truth: `xenia-canary/src/xenia/gpu/ucode.h`. We parse only what a +//! shader *interpreter* (P3 uber-shader) needs: control-flow clauses, ALU +//! instructions (vector + scalar pipes), and fetch instructions (vertex + +//! texture). The uber-shader consumes this IR directly; when a WGSL-emitting +//! translator comes online in P7, it reuses the same parser. +//! +//! ## Binary layout +//! +//! A compiled shader has two sections back-to-back: +//! +//! 1. **Control-flow block** — `cf_count` 64-bit clause pairs. Canary packs +//! two clauses into three 32-bit words: +//! ```text +//! word0 word1 word2 +//! [-CF_A (48)-][-CF_B (48)-] +//! ``` +//! Word 0 is the low 32 of CF_A; word 1's low 16 bits finish CF_A and +//! its high 16 bits start CF_B; word 2 holds CF_B's remaining 32 bits. +//! +//! 2. **Instruction block** — variable-size array of 96-bit ALU / fetch +//! instructions. 
Each control-flow clause of kind `Exec*` references a +//! contiguous range of these by `(address, count)` in dwords * 3. +//! +//! We read big-endian dwords straight out of guest memory (the `raw` +//! `&[u32]` slice is already host-endian-corrected by the PM4 executor that +//! cached the shader blob). See `ucode.h:218-256` for the exec clause bit +//! layout and `:700-877` for the fetch/ALU mix. + +pub mod alu; +pub mod control_flow; +pub mod fetch; + +use self::alu::AluInstruction; +use self::control_flow::{AllocKind, ControlFlowInstruction, decode_cf_pair}; +use self::fetch::FetchInstruction; + +/// CF-clause kind codes encoded into the WGSL-facing packed shader. Kept +/// in sync with `shaders/xenos_interp.wgsl`'s `CF_KIND_*` constants. +pub mod cf_kind { + pub const EXEC: u32 = 0; + pub const EXEC_END: u32 = 1; + pub const ALLOC: u32 = 2; + pub const EXIT: u32 = 3; + pub const LOOP_START: u32 = 4; + pub const LOOP_END: u32 = 5; + pub const COND_JMP: u32 = 6; + pub const COND_CALL: u32 = 7; + pub const RETURN: u32 = 8; + pub const UNKNOWN: u32 = 15; +} + +/// Alloc-kind codes, packed into the aux dword of an `Alloc` clause. +pub mod cf_alloc_kind { + pub const POSITION: u32 = 0; + pub const INTERPOLATORS: u32 = 1; + pub const COLORS: u32 = 2; + pub const MEMEXPORT: u32 = 3; + pub const OTHER: u32 = 4; +} + +/// Pack a [`ParsedShader`] into the dense dword layout the WGSL runtime +/// interpreter expects: +/// +/// ```text +/// [0] cf_count +/// [1 .. 1 + cf_count*3] CF table: (kind, primary, aux) triples per clause +/// [1 + cf_count*3 ..] raw 3-dword instruction stream (ALU/fetch) +/// ``` +/// +/// The CF table lets WGSL walk clauses without reconstructing bit-packed +/// layouts on the GPU. 
Semantics per `kind`: +/// +/// | kind | primary | aux | +/// |-------------|----------------------------|------------------------------| +/// | EXEC/EXEC_END | address (in triples) | (sequence<<8) \| count | +/// | ALLOC | alloc_kind (see cf_alloc_kind) | size | +/// | EXIT | 0 | 0 | +/// | LOOP_START | address | loop_id | +/// | LOOP_END | address | loop_id | +/// | COND_JMP | target | predicate flags | +/// | COND_CALL | target | 0 | +/// | RETURN | 0 | 0 | +/// | UNKNOWN | opcode | 0 | +pub fn pack_for_wgsl(parsed: &ParsedShader) -> Vec { + let cf_count = parsed.cf.len() as u32; + let mut out = Vec::with_capacity(1 + (cf_count as usize) * 3 + parsed.instructions.len()); + out.push(cf_count); + for clause in &parsed.cf { + let (kind, primary, aux) = encode_cf(*clause); + out.push(kind); + out.push(primary); + out.push(aux); + } + out.extend_from_slice(&parsed.instructions); + out +} + +fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) { + use ControlFlowInstruction::*; + match c { + Exec { + address, + count, + sequence, + is_end, + predicated, + predicate_condition, + } => { + let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1); + let kind = if is_end { cf_kind::EXEC_END } else { cf_kind::EXEC } + | (pred_bits << 8); + (kind, address, (sequence << 8) | count) + } + Alloc { size, kind } => { + let akind = match kind { + AllocKind::Position => cf_alloc_kind::POSITION, + AllocKind::Interpolators => cf_alloc_kind::INTERPOLATORS, + AllocKind::Colors => cf_alloc_kind::COLORS, + AllocKind::Memexport => cf_alloc_kind::MEMEXPORT, + AllocKind::Other => cf_alloc_kind::OTHER, + }; + (cf_kind::ALLOC, akind, size) + } + Exit => (cf_kind::EXIT, 0, 0), + LoopStart { address, loop_id } => (cf_kind::LOOP_START, address, loop_id), + LoopEnd { address, loop_id } => (cf_kind::LOOP_END, address, loop_id), + CondJmp { + target, + predicated, + predicate_condition, + } => { + let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1); + 
(cf_kind::COND_JMP, target, pred_bits) + } + CondCall { target } => (cf_kind::COND_CALL, target, 0), + Return => (cf_kind::RETURN, 0, 0), + Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0), + } +} + +/// One instruction word set from the instruction-block section. Xenos packs +/// ALU and fetch instructions identically (96 bits each); the owning exec +/// clause's "sequence" bitmap decides which is which. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DecodedInstruction { + /// ALU pipe (vector ALU + optional co-issued scalar ALU). + Alu(AluInstruction), + /// Vertex or texture fetch. + Fetch(FetchInstruction), +} + +/// Parsed shader: the control-flow clause list + the raw 32-bit instruction +/// words. The uber-shader / translator is expected to index into +/// `instructions` based on `(clause.address * 3, clause.count * 3)`. +#[derive(Debug, Clone, Default)] +pub struct ParsedShader { + pub cf: Vec, + /// Raw instruction dwords. Each 3-dword triple is one ALU or fetch + /// instruction; the owning `Exec` clause's `sequence` bitmap picks the + /// kind. + pub instructions: Vec, +} + +/// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire +/// microcode buffer (control flow + instructions). Heuristic: CF dword count +/// is encoded in the first word's low 12 bits of the last exec clause — +/// canary iterates until it hits a clause of kind `Exit`. We do the same. +pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader { + let mut cf = Vec::new(); + // CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's + // layout). Walk pairs of 3 dwords per pair of clauses. + let mut i = 0usize; + while i + 2 < raw_dwords.len() { + let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]); + let (first, second) = a; + let seen_exit = matches!( + first, + ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. 
} + ) || matches!( + second, + ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. } + ); + cf.push(first); + cf.push(second); + i += 3; + if seen_exit { + break; + } + } + // Everything after `i` dwords is the instruction block. + let instructions = raw_dwords[i..].to_vec(); + ParsedShader { cf, instructions } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn empty_blob_parses_empty() { + let p = parse_shader(&[]); + assert!(p.cf.is_empty()); + assert!(p.instructions.is_empty()); + } + + #[test] + fn pack_for_wgsl_layout_is_correct() { + // Build a tiny ParsedShader by hand and verify the packed form. + let parsed = ParsedShader { + cf: vec![ + ControlFlowInstruction::Exec { + address: 0x10, + count: 3, + sequence: 0b1010, + is_end: false, + predicated: false, + predicate_condition: false, + }, + ControlFlowInstruction::Exit, + ], + instructions: vec![0x1111, 0x2222, 0x3333], + }; + let packed = pack_for_wgsl(&parsed); + assert_eq!(packed[0], 2, "cf_count"); + // First clause: EXEC, address=0x10, aux = (sequence<<8)|count = 0x0A03 + assert_eq!(packed[1] & 0xFF, cf_kind::EXEC); + assert_eq!(packed[2], 0x10); + assert_eq!(packed[3], (0b1010 << 8) | 3); + // Second clause: EXIT + assert_eq!(packed[4] & 0xFF, cf_kind::EXIT); + // Instruction block starts at 1 + 2*3 = 7 + assert_eq!(packed[7..], [0x1111, 0x2222, 0x3333]); + } + + #[test] + fn trivial_exit_clause_stops_parsing() { + // Two clauses: [NOP (kind=0), EXIT (kind=1)] encoded per canary. + // Exit clause is opcode 1 in the top 4 bits of the upper 16 bits. + let w0 = 0u32; // clause A body + let w1 = (1u32 << 12) << 16; // upper 16 bits = 0x1000 → opcode=1 (EXIT) for clause A + let w2 = 0u32; + let p = parse_shader(&[w0, w1, w2, 0xDEAD_BEEF]); + assert!(!p.cf.is_empty()); + // Exit detected → remaining dword is instruction data. 
+ assert_eq!(p.instructions, vec![0xDEAD_BEEF]); + } +} diff --git a/crates/xenia-gpu/src/xenos_constants.rs b/crates/xenia-gpu/src/xenos_constants.rs new file mode 100644 index 0000000..9ed2a17 --- /dev/null +++ b/crates/xenia-gpu/src/xenos_constants.rs @@ -0,0 +1,124 @@ +//! The "Xenos constants" block the WGSL interpreter consumes per draw. +//! +//! Mirrors the Xenos register-file regions that carry the per-draw constant +//! values shaders reference at runtime: +//! +//! | Region | Base | Count | Size | +//! |--------|------|-------|------| +//! | ALU | 0x4000 | 512 × vec4 | 8 KB | +//! | Fetch | 0x4800 | 256 × u32 | 1 KB | +//! | Bool | 0x4900 | 8 × u32 | 32 B | +//! | Loop | 0x4908 | 32 × u32 | 128 B | +//! +//! Total: ~9.2 KB, well under the 64 KB min uniform buffer size on all wgpu +//! backends. The `XenosConstantsBlock` is declared `#[repr(C)]` + bytemuck +//! `Pod` so it can be `bytemuck::bytes_of()`'d directly into a wgpu uniform +//! buffer. The matching WGSL `struct XenosConstants` lives in +//! `shaders/xenos_interp.wgsl`. + +use bytemuck::{Pod, Zeroable}; + +use crate::register_file::RegisterFile; + +pub const ALU_CONSTANT_COUNT: usize = 512; +pub const FETCH_CONSTANT_COUNT: usize = 256; +pub const BOOL_CONSTANT_COUNT: usize = 8; +pub const LOOP_CONSTANT_COUNT: usize = 32; + +pub const CONST_BASE_ALU: u32 = 0x4000; +pub const CONST_BASE_FETCH: u32 = 0x4800; +pub const CONST_BASE_BOOL: u32 = 0x4900; +pub const CONST_BASE_LOOP: u32 = 0x4908; + +/// Per-draw constants block uploaded once to the uniform buffer at +/// `@group(0) @binding(1)`. +#[repr(C)] +#[derive(Clone, Copy)] +pub struct XenosConstantsBlock { + pub alu: [[f32; 4]; ALU_CONSTANT_COUNT], + pub fetch: [u32; FETCH_CONSTANT_COUNT], + pub bool_consts: [u32; BOOL_CONSTANT_COUNT], + pub loop_consts: [u32; LOOP_CONSTANT_COUNT], +} + +// SAFETY: all fields are Pod arrays of Pod primitives; `#[repr(C)]` fixes +// the layout. 
`bytemuck` derives `Pod` only when alignment + padding line +// up, so manual `unsafe impl` is the right tool here. +unsafe impl Zeroable for XenosConstantsBlock {} +unsafe impl Pod for XenosConstantsBlock {} + +impl Default for XenosConstantsBlock { + fn default() -> Self { + Self { + alu: [[0.0; 4]; ALU_CONSTANT_COUNT], + fetch: [0; FETCH_CONSTANT_COUNT], + bool_consts: [0; BOOL_CONSTANT_COUNT], + loop_consts: [0; LOOP_CONSTANT_COUNT], + } + } +} + +impl XenosConstantsBlock { + /// Size in bytes — exposed for tests + wgpu buffer sizing. + pub const SIZE: usize = std::mem::size_of::(); + + /// Snapshot the constants from a Xenos `RegisterFile` into a dense, + /// host-friendly layout the WGSL interpreter expects. ALU constants + /// (vec4 each) are 4 consecutive registers; fetch constants are u32. + pub fn snapshot(rf: &RegisterFile) -> Self { + let mut out = Self::default(); + for i in 0..ALU_CONSTANT_COUNT { + let base = CONST_BASE_ALU + (i as u32) * 4; + out.alu[i] = [ + f32::from_bits(rf.read(base)), + f32::from_bits(rf.read(base + 1)), + f32::from_bits(rf.read(base + 2)), + f32::from_bits(rf.read(base + 3)), + ]; + } + for i in 0..FETCH_CONSTANT_COUNT { + out.fetch[i] = rf.read(CONST_BASE_FETCH + i as u32); + } + for i in 0..BOOL_CONSTANT_COUNT { + out.bool_consts[i] = rf.read(CONST_BASE_BOOL + i as u32); + } + for i in 0..LOOP_CONSTANT_COUNT { + out.loop_consts[i] = rf.read(CONST_BASE_LOOP + i as u32); + } + out + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Layout-sanity: total size is (512·16) + (256·4) + (8·4) + (32·4) = + /// 8192 + 1024 + 32 + 128 = 9376 bytes. If this number drifts, either + /// the constant counts changed or the compiler added padding; either + /// way we want to know at test time because the WGSL struct layout in + /// `xenos_interp.wgsl` depends on it. 
+ #[test] + fn xenos_constants_block_size_is_stable() { + assert_eq!(XenosConstantsBlock::SIZE, 9376); + } + + #[test] + fn snapshot_roundtrip_from_register_file() { + let mut rf = RegisterFile::new(); + // Write a recognisable pattern to alu[0] = (1.0, 2.0, 3.0, 4.0) + rf.write(CONST_BASE_ALU + 0, f32::to_bits(1.0)); + rf.write(CONST_BASE_ALU + 1, f32::to_bits(2.0)); + rf.write(CONST_BASE_ALU + 2, f32::to_bits(3.0)); + rf.write(CONST_BASE_ALU + 3, f32::to_bits(4.0)); + rf.write(CONST_BASE_FETCH + 5, 0xDEAD_BEEF); + rf.write(CONST_BASE_BOOL, 0x1234); + rf.write(CONST_BASE_LOOP + 3, 0x5678); + + let snap = XenosConstantsBlock::snapshot(&rf); + assert_eq!(snap.alu[0], [1.0, 2.0, 3.0, 4.0]); + assert_eq!(snap.fetch[5], 0xDEAD_BEEF); + assert_eq!(snap.bool_consts[0], 0x1234); + assert_eq!(snap.loop_consts[3], 0x5678); + } +}