xenia-rs/crates/xenia-gpu/src/resolve.rs

//! EDRAM→guest-memory resolve byte copy.
//!
//! Fires from [`crate::gpu_system::GpuSystem::handle_event_initiator`] on
//! `TILE_FLUSH` (event 15). Reads samples out of the shadow EDRAM at the
//! source tile range, applies the `Endian128` byte swap, and writes tiled
//! 32bpp or 64bpp pixels into guest memory. Only bitwise-equivalent
//! source/destination format pairs are copied (Canary
//! `IsColorResolveFormatBitwiseEquivalent` — `xenos.h:614-639`).
//!
//! Ground truth: `xenia-canary/src/xenia/gpu/draw_util.cc:1102-1370` and
//! `xenos.h:1077-1114` (`GpuSwapInline`), `1039-1052` (`CopySampleSelect`).
//!
//! ## Endian ordering
//!
//! [`xenia_memory::access::MemoryAccess::write_u32`] stores big-endian
//! bytes (it calls `val.to_be_bytes()`). The Xenon CPU sees memory as big-
//! endian u32s, so `write_u32(addr, 0x11223344)` lands `[0x11, 0x22, 0x33,
//! 0x44]` in memory — which is the `kNone` (no swap) byte order from the
//! host's view of the sample.
//!
//! The resolve has an `Endian128` mode controlled by
//! `RB_COPY_DEST_INFO.copy_dest_endian`: games typically set `k8in32` so
//! that later texture fetches see little-endian bytes. We therefore
//! pre-swap the sample *before* `write_u32` so the big-endian store yields
//! the desired byte order in memory.

use crate::draw_state::{ResolveInfo, ResolveSource};
use crate::edram::ShadowEdram;
use crate::render_target_cache::MsaaSamples;
use crate::tiled_address::{align_pitch_to_macro_tile, tiled_2d_offset};

use xenia_memory::access::MemoryAccess;

/// Stats returned from one resolve copy. Aggregated by the caller into
/// `GpuStats` counters so the HUD can surface them.
///
/// The `Default` value (`samples_written: 0`, `supported: false`) is what
/// `copy_to_memory` returns when it skips a resolve it cannot handle.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ResolveCopyStats {
    /// Number of destination pixels written to guest memory. The counter is
    /// bumped once per pixel, so a 64bpp pixel (two `u32` stores) counts once.
    pub samples_written: u32,
    /// Was the format path supported? `false` means we skipped.
    pub supported: bool,
}

/// `xenos::CopyCommand::kNull` = 3 — resolve emits no copy (clear-only).
/// `copy_to_memory` compares `ResolveInfo::copy_command` against this value
/// and returns a supported no-op (nothing written) when it matches.
pub const COPY_COMMAND_NULL: u8 = 3;

/// Effective sample selector for a resolve (`xenos::CopySampleSelect`,
/// `xenos.h:1039`).
///
/// `ResolveInfo` carries the *raw* register value; this enum is the decoded
/// form that the resolve loop matches on, so the MSAA/depth sanitation rules
/// (Canary `draw_util.cc:839-876`) only have to be applied in one place.
/// Discriminants equal the raw hardware encoding.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CopySampleSelect {
    K0 = 0,
    K1 = 1,
    K2 = 2,
    K3 = 3,
    K01 = 4,
    K23 = 5,
    K0123 = 6,
}

impl CopySampleSelect {
    /// Decode the low three bits of `raw`. Higher bits are ignored and the
    /// out-of-range encoding `7` clamps to [`CopySampleSelect::K0123`].
    pub fn from_raw(raw: u8) -> Self {
        // Indexed by the 3-bit encoding; slot 7 repeats `K0123` (clamp).
        const DECODE: [CopySampleSelect; 8] = [
            CopySampleSelect::K0,
            CopySampleSelect::K1,
            CopySampleSelect::K2,
            CopySampleSelect::K3,
            CopySampleSelect::K01,
            CopySampleSelect::K23,
            CopySampleSelect::K0123,
            CopySampleSelect::K0123,
        ];
        DECODE[usize::from(raw & 0x7)]
    }

    /// `Some(0..=3)` for the four single-sample selectors, `None` for the
    /// averaging selectors (the caller then has to read and combine several
    /// samples itself).
    pub fn single_sample_index(self) -> Option<u8> {
        match self {
            // Single-sample discriminants are exactly the sample indices.
            Self::K0 | Self::K1 | Self::K2 | Self::K3 => Some(self as u8),
            Self::K01 | Self::K23 | Self::K0123 => None,
        }
    }

    /// `IsSingleCopySampleSelected` from `xenos.h:1049`: true when exactly
    /// one sample is picked (no averaging).
    pub fn is_single_sample(self) -> bool {
        matches!(self, Self::K0 | Self::K1 | Self::K2 | Self::K3)
    }
}

/// `SanitizeCopySampleSelect` (Canary `draw_util.cc:839-876`). MSAA
/// modes + depth limit which sample selectors are valid; invalid ones
/// are silently remapped. Returning the sanitized enum lets the resolve
/// loop assume a single-sample pick for 1x MSAA, etc.
pub fn sanitize_sample_select(
    raw: u8,
    msaa: MsaaSamples,
    is_depth: bool,
) -> CopySampleSelect {
    let select = CopySampleSelect::from_raw(raw);
    match msaa {
        MsaaSamples::X1 => {
            // Only sample 0 exists. Averaging modes → k0; >k0123 clamp.
            match select {
                CopySampleSelect::K0 => CopySampleSelect::K0,
                _ => CopySampleSelect::K0,
            }
        }
        MsaaSamples::X2 => {
            // Samples 0 and 1 exist (stacked vertically). k2 → k0, k3 → k1;
            // k23 → k01. Depth cannot average.
            match select {
                CopySampleSelect::K0 => CopySampleSelect::K0,
                CopySampleSelect::K1 => CopySampleSelect::K1,
                CopySampleSelect::K2 => CopySampleSelect::K0,
                CopySampleSelect::K3 => CopySampleSelect::K1,
                CopySampleSelect::K01 | CopySampleSelect::K23 | CopySampleSelect::K0123 => {
                    if is_depth {
                        CopySampleSelect::K0
                    } else {
                        CopySampleSelect::K01
                    }
                }
            }
        }
        MsaaSamples::X4 => {
            // All single-samples valid. Depth cannot average → pick
            // representative single sample (k01→k0, k23→k2, k0123→k0).
            if is_depth {
                match select {
                    CopySampleSelect::K01 => CopySampleSelect::K0,
                    CopySampleSelect::K23 => CopySampleSelect::K2,
                    CopySampleSelect::K0123 => CopySampleSelect::K0,
                    other => other,
                }
            } else {
                select
            }
        }
    }
}

/// Offset `(dx, dy)` of MSAA sample `sample_idx` inside its pixel, in
/// sample units, for the given MSAA mode.
///
/// * 1x: always `(0, 0)`.
/// * 2x: samples are stacked vertically — bit 0 of the index selects the
///   line (`0` = top, `1` = bottom).
/// * 4x: 2×2 grid `{(0,0),(1,0),(0,1),(1,1)}` — bit 0 selects the column,
///   bit 1 the row.
///
/// Only the low two bits of `sample_idx` are looked at.
///
/// NOTE(review): `sanitize_sample_select` remaps `k2 → k0` / `k3 → k1` for
/// 2x, which reads as if sample 2 shared a row with sample 0. Confirm that
/// the 4x bit assignment used here matches both the guest layout (Canary
/// `texture_util::GetMsaaSampleLocation`) and whatever code fills the
/// shadow EDRAM.
#[inline]
fn sample_offset_in_pixel(sample_idx: u8, msaa: MsaaSamples) -> (u32, u32) {
    let bit0 = u32::from(sample_idx & 1);
    let bit1 = u32::from((sample_idx >> 1) & 1);
    match msaa {
        MsaaSamples::X1 => (0, 0),
        MsaaSamples::X2 => (0, bit0),
        MsaaSamples::X4 => (bit0, bit1),
    }
}

/// Apply an `xenos::Endian128` mode to a single 32-bit sample.
///
/// | `endian` | mode      | effect on `value`                         |
/// |----------|-----------|-------------------------------------------|
/// | 0        | `kNone`   | unchanged                                 |
/// | 1        | `k8in16`  | bytes swapped inside each 16-bit half     |
/// | 2        | `k8in32`  | all four bytes reversed                   |
/// | 3        | `k16in32` | the two 16-bit halves swapped             |
/// | 4, 5     | `k8in64` / `k8in128` | approximated as `k8in32`       |
/// | other    | —         | unchanged                                 |
///
/// The 64/128-bit modes need neighbouring dwords, which a one-dword helper
/// cannot see, so they byte-reverse the dword as the closest approximation;
/// the call site is expected to log when that happens.
#[inline]
pub fn apply_endian_128(value: u32, endian: u8) -> u32 {
    match endian {
        // k8in16.
        1 => ((value >> 8) & 0x00FF_00FF) | ((value << 8) & 0xFF00_FF00),
        // k8in32, plus the k8in64 / k8in128 approximation.
        2 | 4 | 5 => u32::from_be_bytes(value.to_le_bytes()),
        // k16in32.
        3 => (value << 16) | (value >> 16),
        // kNone and anything unknown.
        _ => value,
    }
}

/// `xenos::ColorFormat` values we use as destination formats for resolves
/// (Canary `xenos.h:582-609`). The table holds the 32bpp color codes, the
/// 64bpp color codes and the two depth texture codes that the
/// bitwise-equivalence checks in this file compare `dest_format` against.
mod color_format {
    // ── 32bpp dest formats ─────────────────────────────────────────────
    pub const K_8_8_8_8: u8 = 6;
    pub const K_2_10_10_10: u8 = 7;
    pub const K_8_8_8_8_A: u8 = 14;
    pub const K_16_16_FLOAT: u8 = 31;
    pub const K_32_FLOAT: u8 = 36;
    pub const K_8_8_8_8_AS_16_16_16_16: u8 = 50;
    pub const K_2_10_10_10_AS_16_16_16_16: u8 = 54;
    // ── 64bpp dest formats (Canary `xenos.h:582-609`) ──────────────────
    /// `k_16_16_16_16` (4 channels × 16 bits, signed/unsigned variants
    /// resolve identically — same bit layout).
    pub const K_16_16_16_16: u8 = 26;
    /// `k_16_16_16_16_FLOAT` (4 channels × half-float).
    pub const K_16_16_16_16_FLOAT: u8 = 32;
    /// `k_32_32_FLOAT` (R32 + G32, 64bpp). `xenos::TextureFormat = 37`.
    pub const K_32_32_FLOAT: u8 = 37;
    /// `k_24_8` depth texture (Canary `xenos::TextureFormat`); the resolve
    /// destination accepted for a `kD24S8` depth source.
    pub const K_24_8: u8 = 22;
    /// `k_24_8_FLOAT` depth texture; the resolve destination accepted for a
    /// `kD24FS8` depth source.
    pub const K_24_8_FLOAT: u8 = 23;
}

/// Can a 32bpp source be resolved to `dest_format` as a plain bit copy?
///
/// The color side mirrors `xenos::IsColorResolveFormatBitwiseEquivalent`
/// (`xenos.h:614-639`); the depth side maps each `DepthRenderTargetFormat`
/// to its texture form (`kD24S8 → k_24_8`, `kD24FS8 → k_24_8_FLOAT`).
///
/// * `source` — color or depth; selects which table `source_format` is
///   looked up in.
/// * `source_is_64bpp` — when set the answer is always `false` (64bpp pairs
///   are checked elsewhere).
/// * `source_format` — raw render-target format code.
/// * `dest_format` — raw destination texture format code.
pub fn is_32bpp_bitwise_equivalent(
    source: ResolveSource,
    source_is_64bpp: bool,
    source_format: u8,
    dest_format: u8,
) -> bool {
    use color_format as cf;

    if source_is_64bpp {
        return false;
    }

    // Destination formats that share the source's exact bit layout.
    let accepted: &[u8] = match (source, source_format) {
        // k_8_8_8_8 (0) and k_8_8_8_8_GAMMA (1): gamma is decoded by the
        // sampler at fetch time (TextureSign::kGamma), the stored bits are
        // the same, so both take the same copy path.
        (ResolveSource::Color(_), 0 | 1) => {
            &[cf::K_8_8_8_8, cf::K_8_8_8_8_A, cf::K_8_8_8_8_AS_16_16_16_16]
        }
        // k_2_10_10_10 (2) and k_2_10_10_10_AS_10_10_10_10 (10).
        (ResolveSource::Color(_), 2 | 10) => {
            &[cf::K_2_10_10_10, cf::K_2_10_10_10_AS_16_16_16_16]
        }
        // k_16_16_FLOAT (6).
        (ResolveSource::Color(_), 6) => &[cf::K_16_16_FLOAT],
        // k_32_FLOAT (14).
        (ResolveSource::Color(_), 14) => &[cf::K_32_FLOAT],
        // kD24S8 (0) → k_24_8 (22).
        (ResolveSource::Depth, 0) => &[cf::K_24_8],
        // kD24FS8 (1) → k_24_8_FLOAT (23).
        (ResolveSource::Depth, 1) => &[cf::K_24_8_FLOAT],
        _ => &[],
    };
    accepted.contains(&dest_format)
}

/// Can a 64bpp color source be resolved to `dest_format` as a plain bit
/// copy? (Canary `xenos.h:614-639`, 64bpp arms.)
///
/// Consulted when `info.source_is_64bpp == true`. Depth never takes this
/// path because depth is always 32bpp. Each supported source format has
/// exactly one matching destination format:
///
/// * `k_16_16_16_16` (5) → `k_16_16_16_16`
/// * `k_16_16_16_16_FLOAT` (7) → `k_16_16_16_16_FLOAT`
/// * `k_32_32_FLOAT` (15) → `k_32_32_FLOAT`
pub fn is_64bpp_bitwise_equivalent(source_format: u8, dest_format: u8) -> bool {
    use color_format as cf;

    let required_dest = match source_format {
        5 => Some(cf::K_16_16_16_16),
        7 => Some(cf::K_16_16_16_16_FLOAT),
        15 => Some(cf::K_32_32_FLOAT),
        _ => None,
    };
    required_dest == Some(dest_format)
}

/// Run one resolve copy from the shadow EDRAM into tiled guest memory.
///
/// Returns how many destination pixels were written and whether the
/// resolve shape was supported; the caller updates
/// `GpuStats::resolves_copied_total` / `resolves_skipped_total` accordingly.
///
/// * An empty rectangle, `COPY_COMMAND_NULL`, or a destination pitch that
///   aligns to zero are supported no-ops (`samples_written: 0`,
///   `supported: true`).
/// * `copy_dest_array`, a non-zero `dest_exp_bias`, or a source/destination
///   format pair that is not bitwise-equivalent is skipped with a warning
///   and reported as `ResolveCopyStats::default()` (`supported: false`).
///
/// ## Endian handling
///
/// The 32-bit modes are applied per dword through [`apply_endian_128`].
/// For 64bpp pixels `k8in64` (4) is applied exactly: all eight bytes of the
/// pixel are reversed, i.e. the two dwords trade places and each one is
/// byte-reversed. `k8in128`, and `k8in64` on 32bpp pixels, span more than
/// one destination pixel and are still approximated as `k8in32`; that
/// approximation is logged once per resolve.
pub fn copy_to_memory(
    info: &ResolveInfo,
    edram: &ShadowEdram,
    mem: &dyn MemoryAccess,
) -> ResolveCopyStats {
    /// `xenos::Endian128::k8in64`.
    const ENDIAN_8_IN_64: u8 = 4;
    /// Result for resolves that legitimately have nothing to write.
    const NOTHING_TO_COPY: ResolveCopyStats = ResolveCopyStats {
        samples_written: 0,
        supported: true,
    };

    // --- No-op paths (not a failure) ---
    if info.coords.width == 0
        || info.coords.height == 0
        || info.copy_command == COPY_COMMAND_NULL
    {
        return NOTHING_TO_COPY;
    }

    // --- Supported-shape gates ---
    if info.copy_dest_array {
        tracing::warn!(
            src = info.copy_src_select,
            fmt = info.dest_format,
            "gpu: resolve skipped — copy_dest_array (3D/stacked) not implemented"
        );
        return ResolveCopyStats::default();
    }
    if info.dest_exp_bias != 0 {
        tracing::warn!(
            bias = info.dest_exp_bias,
            "gpu: resolve skipped — dest_exp_bias != 0 not implemented"
        );
        return ResolveCopyStats::default();
    }
    let supported = if info.source_is_64bpp {
        // 64bpp color resolve. Depth is always 32bpp so this only fires
        // for `ResolveSource::Color(_)`.
        matches!(info.source, ResolveSource::Color(_))
            && is_64bpp_bitwise_equivalent(info.source_format, info.dest_format)
    } else {
        is_32bpp_bitwise_equivalent(
            info.source,
            info.source_is_64bpp,
            info.source_format,
            info.dest_format,
        )
    };
    if !supported {
        tracing::warn!(
            source = ?info.source,
            source_format = info.source_format,
            source_is_64bpp = info.source_is_64bpp,
            dest_format = info.dest_format,
            "gpu: resolve skipped — not a bitwise-equivalent pair"
        );
        return ResolveCopyStats::default();
    }

    // A 64bpp pixel carries the whole 64-bit group, so k8in64 can be done
    // exactly. Every other >= 4 combination needs bytes from neighbouring
    // pixels and stays an approximation.
    let reverse_whole_pixel = info.source_is_64bpp && info.dest_endian == ENDIAN_8_IN_64;
    if info.dest_endian >= 4 && !reverse_whole_pixel {
        tracing::warn!(
            endian = info.dest_endian,
            "gpu: resolve endian k8in64/k8in128 approximated as k8in32"
        );
    }

    // Destination pitch must be aligned to 32 texels per
    // `kStoragePitchHeightAlignmentBlocks`. `align_pitch_to_macro_tile`
    // rounds to 32 (it's `MACRO_TILE_WIDTH_LOG2 = 5`).
    let pitch_aligned = align_pitch_to_macro_tile(info.dest_pitch_pixels);
    if pitch_aligned == 0 {
        return NOTHING_TO_COPY;
    }
    // bpp_log2: 2 for 32bpp, 3 for 64bpp. Drives the `tiled_2d_offset`
    // stride calculation per Canary `texture_address.h:120-180`.
    let bpp_log2: u32 = if info.source_is_64bpp { 3 } else { 2 };

    let is_depth = matches!(info.source, ResolveSource::Depth);
    let sanitized = sanitize_sample_select(info.copy_sample_select, info.msaa, is_depth);
    // `Some(idx)`: read exactly that sample (fast path).
    // `None`: an averaging selector — combine samples per source format.
    let single_sample_idx = sanitized.single_sample_index();

    let mut samples_written: u32 = 0;
    for dy in 0..info.coords.height {
        let pixel_y = info.coords.y0 + dy;
        for dx in 0..info.coords.width {
            let pixel_x = info.coords.x0 + dx;
            // Destination coordinates are 0-based against `dest_base` — the
            // base already points at the top-left of the copy rectangle.
            let dst_off = tiled_2d_offset(dx, dy, pitch_aligned, bpp_log2);
            let dst_addr = info.dest_base.wrapping_add(dst_off);

            if info.source_is_64bpp {
                let (lo, hi) = match single_sample_idx {
                    Some(idx) => {
                        let (sx, sy) = sample_xy(pixel_x, pixel_y, idx, info.msaa, &info.coords);
                        edram.read_sample_64bpp(
                            info.source_base_tiles,
                            info.surface_pitch_tiles,
                            sx,
                            sy,
                        )
                    }
                    None => read_pixel_averaged_64bpp(edram, info, sanitized, pixel_x, pixel_y),
                };
                let (first, second) = if reverse_whole_pixel {
                    // Reversing 8 bytes == swap the dwords, byte-reverse each.
                    (hi.swap_bytes(), lo.swap_bytes())
                } else {
                    (
                        apply_endian_128(lo, info.dest_endian),
                        apply_endian_128(hi, info.dest_endian),
                    )
                };
                mem.write_u32(dst_addr, first);
                mem.write_u32(dst_addr.wrapping_add(4), second);
            } else {
                let sample = match single_sample_idx {
                    Some(idx) => {
                        let (sx, sy) = sample_xy(pixel_x, pixel_y, idx, info.msaa, &info.coords);
                        edram.read_sample_32bpp(
                            info.source_base_tiles,
                            info.surface_pitch_tiles,
                            sx,
                            sy,
                        )
                    }
                    None => read_pixel_averaged_32bpp(
                        edram,
                        info,
                        sanitized,
                        pixel_x,
                        pixel_y,
                    ),
                };
                mem.write_u32(dst_addr, apply_endian_128(sample, info.dest_endian));
            }
            // One destination pixel, regardless of its width.
            samples_written += 1;
        }
    }

    ResolveCopyStats {
        samples_written,
        supported: true,
    }
}

/// Map a pixel position plus one MSAA sample index to EDRAM sample-space
/// coordinates.
///
/// Each pixel coordinate is scaled by the per-axis sample count
/// (`coords.sample_count_log2_x` / `_y`, applied as a left shift) and the
/// in-pixel offset of `sample_idx` for `msaa` is added on top.
#[inline]
fn sample_xy(
    pixel_x: u32,
    pixel_y: u32,
    sample_idx: u8,
    msaa: MsaaSamples,
    coords: &crate::draw_state::ResolveCoordinates,
) -> (u32, u32) {
    let (offset_x, offset_y) = sample_offset_in_pixel(sample_idx, msaa);
    (
        (pixel_x << coords.sample_count_log2_x) + offset_x,
        (pixel_y << coords.sample_count_log2_y) + offset_y,
    )
}

/// Sample indices that an averaging `CopySampleSelect` combines:
/// `K01 → [0, 1]`, `K23 → [2, 3]`, `K0123 → [0, 1, 2, 3]`.
///
/// Single-sample selectors are not expected here (callers branch on
/// `single_sample_index` first); they yield `[0]` purely so the function is
/// total.
fn averaging_sample_set(select: CopySampleSelect) -> &'static [u8] {
    static SAMPLE_INDICES: [u8; 4] = [0, 1, 2, 3];
    match select {
        CopySampleSelect::K01 => &SAMPLE_INDICES[..2],
        CopySampleSelect::K23 => &SAMPLE_INDICES[2..],
        CopySampleSelect::K0123 => &SAMPLE_INDICES[..],
        // Defensive fallback for single-sample selectors.
        _ => &SAMPLE_INDICES[..1],
    }
}

/// Read every sample that `select` names for the pixel at
/// `(pixel_x, pixel_y)` and return their per-format average as one 32bpp
/// word.
///
/// The samples are fetched from the shadow EDRAM and handed to
/// `average_samples_32bpp`, which decodes by `info.source_format`, averages
/// and re-encodes. This is the CPU counterpart of the per-format averaging
/// in Canary's resolve shaders (`resolve.xesli:595-629`). Returns `0` if the
/// selector yields no samples.
fn read_pixel_averaged_32bpp(
    edram: &ShadowEdram,
    info: &ResolveInfo,
    select: CopySampleSelect,
    pixel_x: u32,
    pixel_y: u32,
) -> u32 {
    let indices = averaging_sample_set(select);
    if indices.is_empty() {
        return 0;
    }

    // At most four samples per pixel; only the first `indices.len()` slots
    // are filled and passed on.
    let mut gathered = [0u32; 4];
    for (slot, &sample_idx) in gathered.iter_mut().zip(indices) {
        let (sx, sy) = sample_xy(pixel_x, pixel_y, sample_idx, info.msaa, &info.coords);
        *slot = edram.read_sample_32bpp(info.source_base_tiles, info.surface_pitch_tiles, sx, sy);
    }
    average_samples_32bpp(&gathered[..indices.len()], info.source_format, info.source)
}

/// Read every sample that `select` names for the pixel at
/// `(pixel_x, pixel_y)` and return their per-format average as a 64bpp
/// `(lo, hi)` dword pair.
///
/// Samples come from the shadow EDRAM and are combined by
/// `average_samples_64bpp` according to `info.source_format`. Returns
/// `(0, 0)` if the selector yields no samples.
fn read_pixel_averaged_64bpp(
    edram: &ShadowEdram,
    info: &ResolveInfo,
    select: CopySampleSelect,
    pixel_x: u32,
    pixel_y: u32,
) -> (u32, u32) {
    let indices = averaging_sample_set(select);
    if indices.is_empty() {
        return (0, 0);
    }

    // At most four samples per pixel; only the first `indices.len()` slots
    // are filled and passed on.
    let mut gathered = [(0u32, 0u32); 4];
    for (slot, &sample_idx) in gathered.iter_mut().zip(indices) {
        let (sx, sy) = sample_xy(pixel_x, pixel_y, sample_idx, info.msaa, &info.coords);
        *slot = edram.read_sample_64bpp(info.source_base_tiles, info.surface_pitch_tiles, sx, sy);
    }
    average_samples_64bpp(&gathered[..indices.len()], info.source_format)
}

/// Average `samples` of one 32bpp pixel according to its source format.
///
/// Color formats dispatch to a per-format helper:
///
/// * `0` / `1` (`k_8_8_8_8`, `k_8_8_8_8_GAMMA`) — per-byte rounded integer
///   mean; gamma is a sampler-time decode, the stored bits are treated alike.
/// * `2` / `10` (`k_2_10_10_10`, `..._AS_10_10_10_10`) — per-field rounded
///   integer mean.
/// * `6` (`k_16_16_FLOAT`) — two half-floats per word.
/// * `14` (`k_32_FLOAT`) — one `f32` per word.
///
/// Any other color format, and every depth source, returns the first sample
/// unchanged: the format gate upstream should keep other color formats out,
/// and depth selectors are sanitized down to a single sample before
/// averaging is ever requested.
///
/// `samples` must not be empty.
fn average_samples_32bpp(samples: &[u32], source_format: u8, source: ResolveSource) -> u32 {
    let count = samples.len() as u32;
    debug_assert!(count > 0);

    match (source, source_format) {
        (ResolveSource::Color(_), 0 | 1) => average_8_8_8_8(samples, count),
        (ResolveSource::Color(_), 2 | 10) => average_2_10_10_10(samples, count),
        (ResolveSource::Color(_), 6) => average_2_half_floats(samples, count),
        (ResolveSource::Color(_), 14) => average_1_f32(samples, count),
        // Unlisted color format or depth: no averaging, keep sample 0.
        (ResolveSource::Color(_), _) | (ResolveSource::Depth, _) => samples[0],
    }
}

/// Per-format averaging for 64bpp color resolves. Returns `(lo, hi)`.
fn average_samples_64bpp(samples: &[(u32, u32)], source_format: u8) -> (u32, u32) {
    let n = samples.len() as u32;
    debug_assert!(n > 0);
    match source_format {
        // k_16_16_16_16 (5): four 16-bit channels across (lo, hi). Per-
        // channel rounded unsigned-int mean. Signed/unsigned variants
        // resolve identically because the resolve is a raw byte copy —
        // averaging signed values as unsigned still gives the correct
        // bits because two's-complement addition of `n` values divided
        // by `n` lands on the same bit pattern after truncation.
        5 => average_4_u16(samples, n),
        // k_16_16_16_16_FLOAT (7): four half-floats.
        7 => average_4_half_floats(samples, n),
        // k_32_32_FLOAT (15): two f32 (R32 = lo, G32 = hi).
        15 => average_2_f32(samples, n),
        _ => samples[0],
    }
}

/// Per-byte rounded unsigned mean of packed 4×8-bit samples.
///
/// Each of the four byte lanes is summed over `samples`, divided by `n`
/// with round-half-up (`(sum + n / 2) / n`), masked to 8 bits and put back
/// in its lane. `n` is the divisor (the callers pass `samples.len()`) and
/// must be non-zero.
#[inline]
fn average_8_8_8_8(samples: &[u32], n: u32) -> u32 {
    let rounding = n / 2;
    (0..4u32).fold(0u32, |packed, lane| {
        let shift = lane * 8;
        let lane_sum: u32 = samples.iter().map(|&word| (word >> shift) & 0xFF).sum();
        let lane_mean = ((lane_sum + rounding) / n) & 0xFF;
        packed | (lane_mean << shift)
    })
}

/// Per-field rounded unsigned mean of packed 2:10:10:10 samples.
///
/// Bit layout of one sample (same as the host `R10G10B10A2` layout):
///
/// | bits  | width | field |
/// |-------|-------|-------|
/// | 0-9   | 10    | R     |
/// | 10-19 | 10    | G     |
/// | 20-29 | 10    | B     |
/// | 30-31 | 2     | A     |
///
/// Each field is summed over `samples`, divided by `n` with round-half-up
/// (`(sum + n / 2) / n`), masked to its width and put back in place. The
/// field boundaries matter: averaging across misaligned boundaries drops
/// the carries between the low and high parts of a channel and produces
/// values that are not the mean of the inputs.
///
/// `n` is the divisor (the callers pass `samples.len()`) and must be
/// non-zero.
#[inline]
fn average_2_10_10_10(samples: &[u32], n: u32) -> u32 {
    // (shift, mask) per field: R, G, B (10 bits each), then A (2 bits).
    const FIELDS: [(u32, u32); 4] = [(0, 0x3FF), (10, 0x3FF), (20, 0x3FF), (30, 0x3)];

    let rounding = n / 2;
    FIELDS.iter().fold(0u32, |packed, &(shift, mask)| {
        let field_sum: u32 = samples.iter().map(|&word| (word >> shift) & mask).sum();
        let field_mean = ((field_sum + rounding) / n) & mask;
        packed | (field_mean << shift)
    })
}

/// Widen an IEEE-754 binary16 bit pattern to `f32`.
///
/// * Zero keeps its sign.
/// * Subnormal halves are renormalized into a normal `f32`.
/// * Exponent 31 maps to the `f32` Inf/NaN exponent with the mantissa
///   shifted into place.
/// * Everything else is a rebias of the exponent (`+112`) and a 13-bit
///   mantissa shift.
#[inline]
fn half_to_f32(half: u16) -> f32 {
    let sign_bit = u32::from(half >> 15) << 31;
    let exp_field = u32::from((half >> 10) & 0x1F);
    let fraction = u32::from(half & 0x3FF);

    let bits = match (exp_field, fraction) {
        (0, 0) => sign_bit,
        (0, _) => {
            // Shift the leading 1 of the fraction up to bit 10 (the implicit
            // bit position). `fraction` is in 1..=0x3FF, so it has 22..=31
            // leading zeros and the shift is 1..=10.
            let shift = fraction.leading_zeros() - 21;
            let normalized = (fraction << shift) & 0x3FF;
            // Unbiased exponent is -14 - shift; f32 bias is 127.
            sign_bit | ((113 - shift) << 23) | (normalized << 13)
        }
        (31, _) => sign_bit | (0xFF << 23) | (fraction << 13),
        _ => sign_bit | ((exp_field + 112) << 23) | (fraction << 13),
    };
    f32::from_bits(bits)
}

/// Narrow an `f32` to an IEEE-754 binary16 bit pattern.
///
/// * Inf stays Inf; any NaN becomes the quiet NaN `0x7E00` (plus sign).
/// * `f32` zeros and subnormals become signed zero.
/// * Values too large for a half become signed infinity.
/// * Values below the half normal range become half subnormals, or signed
///   zero once they are too small for that as well.
///
/// The mantissa is truncated toward zero in both the normal and the
/// subnormal path — there is no round-to-nearest step.
#[inline]
fn f32_to_half(f: f32) -> u16 {
    const HALF_INF: u16 = 0x1F << 10;
    const HALF_QUIET_BIT: u16 = 0x200;

    let bits = f.to_bits();
    let sign = ((bits >> 31) as u16) << 15;
    let biased_exp = ((bits >> 23) & 0xFF) as i32;
    let fraction = bits & 0x007F_FFFF;

    match biased_exp {
        0xFF if fraction != 0 => sign | HALF_INF | HALF_QUIET_BIT,
        0xFF => sign | HALF_INF,
        0 => sign,
        // Rebias from the f32 bias (127) to the half bias (15).
        _ => match biased_exp - 127 + 15 {
            e if e >= 31 => sign | HALF_INF,
            e if e > 0 => sign | ((e as u16) << 10) | ((fraction >> 13) as u16),
            e if e < -10 => sign,
            e => {
                // Half subnormal: restore the implicit 1 and shift it down
                // by the exponent deficit on top of the 13-bit narrowing.
                let shift = (1 - e) as u32 + 13;
                sign | (((fraction | 0x0080_0000) >> shift) as u16)
            }
        },
    }
}

/// Average packed pairs of half-floats.
///
/// Every sample word holds two halves (low 16 bits, high 16 bits). Each
/// lane is widened to `f32`, summed over `samples`, scaled by `1 / n` and
/// narrowed back to a half; the two results are repacked in the same lanes.
/// `n` is the divisor (the callers pass `samples.len()`).
#[inline]
fn average_2_half_floats(samples: &[u32], n: u32) -> u32 {
    let (total_lo, total_hi) = samples
        .iter()
        .fold((0.0f32, 0.0f32), |(acc_lo, acc_hi), &word| {
            (
                acc_lo + half_to_f32(word as u16),
                acc_hi + half_to_f32((word >> 16) as u16),
            )
        });

    let scale = 1.0f32 / n as f32;
    let mean_lo = u32::from(f32_to_half(total_lo * scale));
    let mean_hi = u32::from(f32_to_half(total_hi * scale));
    mean_lo | (mean_hi << 16)
}

/// Average samples that each hold one `f32` bit pattern and return the bit
/// pattern of the mean.
///
/// The sum starts at `+0.0`, accumulates in slice order and is divided by
/// `n` (the callers pass `samples.len()`).
#[inline]
fn average_1_f32(samples: &[u32], n: u32) -> u32 {
    let total = samples
        .iter()
        .fold(0.0f32, |acc, &bits| acc + f32::from_bits(bits));
    (total / n as f32).to_bits()
}

/// Per-field rounded unsigned mean of four packed 16-bit channels.
///
/// A sample is a `(lo, hi)` dword pair; each dword holds two 16-bit fields
/// (low half, high half). Fields are treated as unsigned, summed over
/// `samples`, divided by `n` with round-half-up (`(sum + n / 2) / n`),
/// masked to 16 bits and repacked in their original positions. Which color
/// channel sits in which field does not matter for the arithmetic.
///
/// `n` is the divisor (the callers pass `samples.len()`) and must be
/// non-zero.
#[inline]
fn average_4_u16(samples: &[(u32, u32)], n: u32) -> (u32, u32) {
    let mut totals = [0u32; 4];
    for &(lo, hi) in samples {
        let fields = [lo & 0xFFFF, lo >> 16, hi & 0xFFFF, hi >> 16];
        for (total, field) in totals.iter_mut().zip(fields) {
            *total += field;
        }
    }

    let rounding = n / 2;
    let [f0, f1, f2, f3] = totals.map(|total| ((total + rounding) / n) & 0xFFFF);
    (f0 | (f1 << 16), f2 | (f3 << 16))
}

/// Average samples made of four packed half-floats.
///
/// A sample is a `(lo, hi)` dword pair with two halves per dword (low 16
/// bits, high 16 bits). Each of the four lanes is widened to `f32`, summed
/// over `samples`, scaled by `1 / n`, narrowed back to a half and repacked
/// in its original position. `n` is the divisor (the callers pass
/// `samples.len()`).
#[inline]
fn average_4_half_floats(samples: &[(u32, u32)], n: u32) -> (u32, u32) {
    let mut totals = [0.0f32; 4];
    for &(lo, hi) in samples {
        let lanes = [lo as u16, (lo >> 16) as u16, hi as u16, (hi >> 16) as u16];
        for (total, lane) in totals.iter_mut().zip(lanes) {
            *total += half_to_f32(lane);
        }
    }

    let scale = 1.0f32 / n as f32;
    let [h0, h1, h2, h3] = totals.map(|total| u32::from(f32_to_half(total * scale)));
    (h0 | (h1 << 16), h2 | (h3 << 16))
}

/// Average samples made of two `f32` bit patterns (`lo`, `hi`) and return
/// the bit patterns of the two means.
///
/// Both sums start at `+0.0`, accumulate in slice order and are multiplied
/// by `1 / n` (the callers pass `samples.len()` as `n`).
#[inline]
fn average_2_f32(samples: &[(u32, u32)], n: u32) -> (u32, u32) {
    let (total_lo, total_hi) = samples
        .iter()
        .fold((0.0f32, 0.0f32), |(acc_lo, acc_hi), &(lo, hi)| {
            (acc_lo + f32::from_bits(lo), acc_hi + f32::from_bits(hi))
        });

    let scale = 1.0f32 / n as f32;
    ((total_lo * scale).to_bits(), (total_hi * scale).to_bits())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::draw_state::{ResolveCoordinates, ResolveInfo};
    use crate::edram::ShadowEdram;
    use crate::render_target_cache::MsaaSamples;
    use crate::tiled_address::{align_pitch_to_macro_tile, tiled_2d_offset};
    use xenia_memory::GuestMemory;

    /// Produce a baseline [`ResolveInfo`] that individual tests then tweak.
    ///
    /// The copy rectangle starts at the origin and spans `pitch` × `height`
    /// pixels; the destination is `k_8_8_8_8` at `dest_base` with no endian
    /// swap; the source is color RT 0 at tile base 0 with 1x MSAA. Every
    /// clear flag and clear value is zeroed.
    fn minimal_info(dest_base: u32, pitch: u32, height: u32) -> ResolveInfo {
        let coords = ResolveCoordinates {
            x0: 0,
            y0: 0,
            width: pitch,
            height,
            sample_count_log2_x: 0,
            sample_count_log2_y: 0,
        };
        ResolveInfo {
            // Destination surface.
            dest_base,
            dest_pitch_pixels: pitch,
            dest_height_pixels: height,
            dest_format: color_format::K_8_8_8_8,
            dest_endian: 0,
            dest_exp_bias: 0,
            copy_dest_array: false,
            // Copy / clear control.
            copy_src_select: 0,
            copy_sample_select: 0,
            copy_command: 0,
            color_clear_enable: false,
            depth_clear_enable: false,
            color_clear_value: 0,
            color_clear_value_lo: 0,
            depth_clear_value: 0,
            // Source render target.
            source: ResolveSource::Color(0),
            source_format: 0,
            source_base_tiles: 0,
            // Pitch in tiles, rounded up; 80 is presumably the EDRAM tile
            // width in 32bpp samples — confirm against `ShadowEdram`.
            surface_pitch_tiles: pitch.div_ceil(80),
            msaa: MsaaSamples::X1,
            source_is_64bpp: false,
            coords,
        }
    }

    /// Create a [`GuestMemory`] with a readable + writable allocation of
    /// `0x0010_0000` bytes at `0x4000_0000`, the destination base used by the
    /// resolve tests in this module.
    fn fresh_mem() -> GuestMemory {
        use xenia_memory::page_table::MemoryProtect;

        let read_write = MemoryProtect::READ | MemoryProtect::WRITE;
        let mut mem = GuestMemory::new().expect("guest memory");
        mem.alloc(0x4000_0000, 0x0010_0000, read_write)
            .expect("alloc");
        mem
    }

    /// Endian mode 0 (kNone) must return the sample unchanged.
    #[test]
    fn endian_k_none_is_identity() {
        let sample = 0x11223344;
        let swapped = apply_endian_128(sample, 0);
        assert_eq!(swapped, sample);
    }

    /// Endian mode 1 (k8in16) swaps the two bytes inside each 16-bit half.
    #[test]
    fn endian_k8in16_swaps_byte_pairs() {
        let swapped = apply_endian_128(0x11223344, 1);
        assert_eq!(swapped, 0x22114433);
    }

    /// Endian mode 2 (k8in32) reverses all four bytes of the word.
    #[test]
    fn endian_k8in32_is_full_byte_reverse() {
        let swapped = apply_endian_128(0x11223344, 2);
        assert_eq!(swapped, 0x44332211);
    }

    /// Endian mode 3 (k16in32) exchanges the two 16-bit halves of the word.
    #[test]
    fn endian_k16in32_swaps_halves() {
        let swapped = apply_endian_128(0x11223344, 3);
        assert_eq!(swapped, 0x33441122);
    }

    /// Fill a 32x8 region of shadow EDRAM with `0x11223344`, resolve it as
    /// `k_8_8_8_8` with endian k8in32, and check that every tiled sample
    /// offset in guest memory holds the little-endian bytes
    /// `[0x44, 0x33, 0x22, 0x11]`.
    #[test]
    fn color_clear_resolve_writes_le_bytes_with_k8in32() {
        let mut edram = ShadowEdram::new();
        edram.fill_rect_32bpp(0, 1, 0, 0, 32, 8, 0x11223344);
        let mut mem = fresh_mem();

        let mut info = minimal_info(0x4000_0000, 32, 8);
        info.dest_endian = 2; // k8in32
        info.source_base_tiles = 0;
        info.surface_pitch_tiles = 1;

        let stats = copy_to_memory(&info, &edram, &mut mem);
        assert!(stats.supported);
        assert_eq!(stats.samples_written, 32 * 8);

        // Walk the rectangle row by row and read each sample back through
        // the same tiled addressing the resolve is expected to use.
        let pitch_aligned = align_pitch_to_macro_tile(32);
        let pixels = (0..8u32).flat_map(|y| (0..32u32).map(move |x| (x, y)));
        for (x, y) in pixels {
            let addr = 0x4000_0000u32.wrapping_add(tiled_2d_offset(x, y, pitch_aligned, 2));
            let bytes = [0u32, 1, 2, 3].map(|i| mem.read_u8(addr.wrapping_add(i)));
            assert_eq!(bytes, [0x44, 0x33, 0x22, 0x11], "mismatch at ({x}, {y})");
        }
    }

    /// With endian kNone the resolved sample is stored without a swap, so
    /// `0xAABBCCDD` must appear in memory as `[0xAA, 0xBB, 0xCC, 0xDD]`.
    #[test]
    fn k_none_endian_keeps_big_endian_bytes() {
        let mut edram = ShadowEdram::new();
        edram.fill_rect_32bpp(0, 1, 0, 0, 16, 8, 0xAABBCCDD);
        let mut mem = fresh_mem();

        let mut info = minimal_info(0x4000_0000, 16, 8);
        info.dest_endian = 0;
        info.source_base_tiles = 0;
        info.surface_pitch_tiles = 1;

        let stats = copy_to_memory(&info, &edram, &mut mem);
        assert!(stats.supported);

        // Only the first pixel is inspected.
        let pitch_aligned = align_pitch_to_macro_tile(16);
        let addr = 0x4000_0000u32.wrapping_add(tiled_2d_offset(0, 0, pitch_aligned, 2));
        let bytes = [0u32, 1, 2, 3].map(|i| mem.read_u8(addr.wrapping_add(i)));
        assert_eq!(bytes, [0xAA, 0xBB, 0xCC, 0xDD]);
    }

    /// A zero-sized copy rectangle is reported as supported, writes no
    /// samples, and leaves the destination page version untouched.
    #[test]
    fn empty_rect_is_noop_and_no_page_version_bump() {
        let edram = ShadowEdram::new();
        let mut mem = fresh_mem();
        let version_before = mem.page_version(0x4000_0000);

        let mut info = minimal_info(0x4000_0000, 0, 0);
        info.coords.width = 0;
        info.coords.height = 0;

        let stats = copy_to_memory(&info, &edram, &mut mem);
        assert!(stats.supported);
        assert_eq!(stats.samples_written, 0);

        let version_after = mem.page_version(0x4000_0000);
        assert_eq!(version_after, version_before);
    }

    /// A source/dest format pair with no supported copy path must be
    /// reported as unsupported and write nothing, rather than panic.
    #[test]
    fn unsupported_dest_format_is_graceful() {
        let edram = ShadowEdram::new();
        let mut mem = fresh_mem();

        let mut info = minimal_info(0x4000_0000, 16, 16);
        // Dest 26 is k_16_16_16_16 (64bpp); the default source from
        // `minimal_info` is 32bpp, so the pair is not bitwise-equivalent.
        info.dest_format = 26;

        let stats = copy_to_memory(&info, &edram, &mut mem);
        assert!(!stats.supported);
        assert_eq!(stats.samples_written, 0);
    }

    /// A resolve that writes samples must advance the page version of the
    /// destination page (per the test name, this is what lets a texture
    /// cache notice the change).
    #[test]
    fn resolve_bumps_page_version_for_texture_cache_invalidation() {
        let mut edram = ShadowEdram::new();
        edram.fill_rect_32bpp(0, 1, 0, 0, 16, 8, 0xDEADBEEF);
        let mut mem = fresh_mem();
        let version_before = mem.page_version(0x4000_0000);

        let mut info = minimal_info(0x4000_0000, 16, 8);
        info.source_base_tiles = 0;
        info.surface_pitch_tiles = 1;

        let stats = copy_to_memory(&info, &edram, &mut mem);
        assert!(stats.supported);

        let version_after = mem.page_version(0x4000_0000);
        assert!(version_after > version_before);
    }

    /// The k_2_10_10_10 family resolves through the 32bpp bitwise-equivalent
    /// path (Canary `xenos.h:624-627`): only the format numbers differ.
    #[test]
    fn k_2_10_10_10_is_bitwise_equivalent() {
        // Source k_2_10_10_10 (2) → dest k_2_10_10_10 (7).
        let plain = is_32bpp_bitwise_equivalent(ResolveSource::Color(0), false, 2, 7);
        assert!(plain);

        // Source k_2_10_10_10_AS_10_10_10_10 (10) → dest
        // k_2_10_10_10_AS_16_16_16_16 (54).
        let widened = is_32bpp_bitwise_equivalent(ResolveSource::Color(0), false, 10, 54);
        assert!(widened);
    }

    /// A k_8_8_8_8_GAMMA source (1) is accepted as bitwise-equivalent to a
    /// k_8_8_8_8 dest (6): gamma is applied at sample time, not on store.
    #[test]
    fn k_8_8_8_8_gamma_source_is_bitwise_equivalent() {
        let source_gamma = 1; // k_8_8_8_8_GAMMA
        let dest_plain = 6; // k_8_8_8_8
        let equivalent =
            is_32bpp_bitwise_equivalent(ResolveSource::Color(0), false, source_gamma, dest_plain);
        assert!(equivalent);
    }

    /// Depth sources pair with exactly one texture format each:
    /// kD24S8 (0) ↔ k_24_8 (22) and kD24FS8 (1) ↔ k_24_8_FLOAT (23).
    #[test]
    fn depth_resolve_format_equivalence() {
        let fixed_to_fixed = is_32bpp_bitwise_equivalent(ResolveSource::Depth, false, 0, 22);
        assert!(fixed_to_fixed);

        let float_to_float = is_32bpp_bitwise_equivalent(ResolveSource::Depth, false, 1, 23);
        assert!(float_to_float);

        // Crossing the pairs (kD24S8 → k_24_8_FLOAT) must be rejected.
        let fixed_to_float = is_32bpp_bitwise_equivalent(ResolveSource::Depth, false, 0, 23);
        assert!(!fixed_to_float);
    }

    /// When the source is flagged 64bpp the 32bpp equivalence check must
    /// fail, whatever the source/dest format numbers are.
    #[test]
    fn sixty_four_bpp_source_is_never_equivalent() {
        let source_is_64bpp = true;
        let equivalent = is_32bpp_bitwise_equivalent(
            ResolveSource::Color(0),
            source_is_64bpp,
            5, // k_16_16_16_16
            6,
        );
        assert!(!equivalent);
    }

    /// 64bpp source → dest pairs accepted as bitwise-equivalent (Canary
    /// `xenos.h:614-639`), plus two mismatches that must be rejected.
    #[test]
    fn sixty_four_bpp_equivalence_pairs() {
        // (source RT format, dest texture format):
        //   k_16_16_16_16       (5)  → k_16_16_16_16       (26)
        //   k_16_16_16_16_FLOAT (7)  → k_16_16_16_16_FLOAT (32)
        //   k_32_32_FLOAT       (15) → k_32_32_FLOAT       (37)
        for (source, dest) in [(5, 26), (7, 32), (15, 37)] {
            assert!(is_64bpp_bitwise_equivalent(source, dest));
        }
        // Cross-format and non-64bpp sources must be rejected.
        for (source, dest) in [(5, 32), (0, 26)] {
            assert!(!is_64bpp_bitwise_equivalent(source, dest));
        }
    }

    /// End-to-end 64bpp resolve: paint a `k_16_16_16_16` pattern into EDRAM
    /// and confirm `copy_to_memory` lands two u32s per pixel into guest mem.
    ///
    /// All eight bytes of the first pixel are checked, so a swapped or
    /// partially written high word cannot slip through.
    #[test]
    fn sixty_four_bpp_resolve_writes_two_words_per_pixel() {
        let mut mem = fresh_mem();
        let mut edram = ShadowEdram::new();
        // 16x4 logical 64bpp samples; pitch = 1 32bpp tile.
        edram.fill_rect_64bpp(0, 1, 0, 0, 16, 4, 0xAABB_CCDD, 0x1122_3344);

        let mut info = minimal_info(0x4000_0000, 16, 4);
        info.source = ResolveSource::Color(0);
        info.source_format = 5; // k_16_16_16_16
        info.dest_format = color_format::K_16_16_16_16;
        info.source_is_64bpp = true;
        info.dest_endian = 0; // kNone
        info.source_base_tiles = 0;
        info.surface_pitch_tiles = 1;
        info.coords.width = 16;
        info.coords.height = 4;

        let stats = copy_to_memory(&info, &edram, &mut mem);
        assert!(stats.supported);
        assert_eq!(stats.samples_written, 16 * 4);

        // First pixel: lo word at dst_off, hi word at dst_off + 4, addressed
        // with bpp_log2 = 3 (8 bytes per pixel).
        let pitch_aligned = align_pitch_to_macro_tile(16);
        let off = tiled_2d_offset(0, 0, pitch_aligned, 3);
        let addr = 0x4000_0000u32.wrapping_add(off);
        let bytes = [0u32, 1, 2, 3, 4, 5, 6, 7].map(|i| mem.read_u8(addr.wrapping_add(i)));
        // kNone keeps the big-endian store order for both words:
        // 0xAABBCCDD followed by 0x11223344.
        assert_eq!(bytes, [0xAA, 0xBB, 0xCC, 0xDD, 0x11, 0x22, 0x33, 0x44]);
    }

    /// MSAA averaging — `k_8_8_8_8` per-channel rounded mean of 4 samples.
    /// The four samples of pixel (0,0) carry 0, 64, 128 and 192 in the low
    /// byte (R in this test's packing); the resolve must produce the rounded
    /// mean 96.
    #[test]
    fn msaa_4x_averaging_k_8_8_8_8() {
        let mut mem = fresh_mem();
        let mut edram = ShadowEdram::new();
        // 4x MSAA: each pixel occupies a 2×2 sample grid. Samples s0..s3 of
        // pixel (0,0) sit at sample coords (0,0), (1,0), (0,1), (1,1).
        // Only R varies; G = B = A = 0.
        let red_samples = [
            (0, 0, 0x00_00_00_00), // R=0
            (1, 0, 0x00_00_00_40), // R=64
            (0, 1, 0x00_00_00_80), // R=128
            (1, 1, 0x00_00_00_C0), // R=192
        ];
        for (sx, sy, value) in red_samples {
            edram.write_sample_32bpp(0, 1, sx, sy, value);
        }

        let mut info = minimal_info(0x4000_0000, 1, 1);
        info.source = ResolveSource::Color(0);
        info.source_format = 0; // k_8_8_8_8
        info.dest_format = color_format::K_8_8_8_8;
        info.copy_sample_select = 6; // K0123
        info.msaa = MsaaSamples::X4;
        info.coords.sample_count_log2_x = 1;
        info.coords.sample_count_log2_y = 1;
        info.coords.width = 1;
        info.coords.height = 1;
        info.dest_endian = 0;
        info.source_base_tiles = 0;
        info.surface_pitch_tiles = 1;

        let stats = copy_to_memory(&info, &edram, &mut mem);
        assert!(stats.supported);
        assert_eq!(stats.samples_written, 1);

        // R = (0 + 64 + 128 + 192 + 2) / 4 = 96 = 0x60, giving the pixel word
        // 0x0000_0060. With endian kNone the word is stored big-endian, so R
        // (the low byte) lands in the last of the four bytes.
        let addr = 0x4000_0000u32;
        let bytes = [0u32, 1, 2, 3].map(|i| mem.read_u8(addr.wrapping_add(i)));
        assert_eq!(bytes, [0x00, 0x00, 0x00, 0x60], "averaged R should be 0x60");
    }

    /// MSAA averaging — `k_32_FLOAT` averages 4 f32 samples linearly.
    #[test]
    fn msaa_4x_averaging_k_32_float() {
        let mut mem = fresh_mem();
        let mut edram = ShadowEdram::new();
        // Samples s0..s3 of pixel (0,0) hold 1.0, 2.0, 3.0 and 4.0.
        for (sx, sy, value) in [(0, 0, 1.0f32), (1, 0, 2.0), (0, 1, 3.0), (1, 1, 4.0)] {
            edram.write_sample_32bpp(0, 1, sx, sy, value.to_bits());
        }

        let mut info = minimal_info(0x4000_0000, 1, 1);
        info.source = ResolveSource::Color(0);
        info.source_format = 14; // k_32_FLOAT
        info.dest_format = color_format::K_32_FLOAT;
        info.copy_sample_select = 6; // K0123
        info.msaa = MsaaSamples::X4;
        info.coords.sample_count_log2_x = 1;
        info.coords.sample_count_log2_y = 1;
        info.coords.width = 1;
        info.coords.height = 1;
        info.dest_endian = 2; // k8in32 — game-typical for float sampling
        info.source_base_tiles = 0;
        info.surface_pitch_tiles = 1;

        let stats = copy_to_memory(&info, &edram, &mut mem);
        assert!(stats.supported);

        // (1 + 2 + 3 + 4) / 4 = 2.5
        let expected = 2.5f32.to_bits();
        // k8in32 byte-reverses the word before the big-endian store, so the
        // bytes that end up in memory are the little-endian encoding of the
        // averaged value.
        let bytes = [0u32, 1, 2, 3].map(|i| mem.read_u8(0x4000_0000u32.wrapping_add(i)));
        assert_eq!(bytes, expected.to_le_bytes());
    }

    /// MSAA averaging — `k_2_10_10_10` per-field rounded mean of 2 samples.
    #[test]
    fn msaa_2x_averaging_k_2_10_10_10() {
        // Pack one 2-bit field and three 10-bit fields into a word: `a` in
        // bits 0..2, `b` in 2..12, `g` in 12..22, `r` in 22..32.
        let pack = |a: u32, b: u32, g: u32, r: u32| {
            ((r & 0x3FF) << 22) | ((g & 0x3FF) << 12) | ((b & 0x3FF) << 2) | (a & 0x3)
        };
        let sample0 = pack(0, 100, 200, 300);
        let sample1 = pack(2, 200, 300, 400);
        // Per-field rounded mean:
        //   a = (0 + 2 + 1) / 2 = 1
        //   b = (100 + 200 + 1) / 2 = 150
        //   g = (200 + 300 + 1) / 2 = 250
        //   r = (300 + 400 + 1) / 2 = 350
        let expected = pack(1, 150, 250, 350);

        let mut mem = fresh_mem();
        let mut edram = ShadowEdram::new();
        // 2x MSAA samples are stacked vertically (s0 at y=0, s1 at y=1).
        edram.write_sample_32bpp(0, 1, 0, 0, sample0);
        edram.write_sample_32bpp(0, 1, 0, 1, sample1);

        let mut info = minimal_info(0x4000_0000, 1, 1);
        info.source = ResolveSource::Color(0);
        info.source_format = 2; // k_2_10_10_10
        info.dest_format = color_format::K_2_10_10_10;
        info.copy_sample_select = 4; // K01
        info.msaa = MsaaSamples::X2;
        info.coords.sample_count_log2_x = 0;
        info.coords.sample_count_log2_y = 1;
        info.coords.width = 1;
        info.coords.height = 1;
        info.dest_endian = 0;
        info.source_base_tiles = 0;
        info.surface_pitch_tiles = 1;

        let stats = copy_to_memory(&info, &edram, &mut mem);
        assert!(stats.supported);

        // Endian kNone: memory holds the big-endian bytes of the word.
        let bytes = [0u32, 1, 2, 3].map(|i| mem.read_u8(0x4000_0000u32.wrapping_add(i)));
        assert_eq!(bytes, expected.to_be_bytes());
    }

    /// End-to-end depth resolve: fill the depth RT at tile base 8 with a
    /// known pattern, resolve it as kD24S8 → k_24_8 with endian k8in32, and
    /// verify the bytes that land in guest memory.
    #[test]
    fn depth_clear_resolve_end_to_end() {
        let mut edram = ShadowEdram::new();
        // Paint the depth tiles directly with a known pattern.
        edram.fill_rect_32bpp(8, 1, 0, 0, 16, 8, 0x3FFF_FF00);
        let mut mem = fresh_mem();

        let mut info = minimal_info(0x4000_0000, 16, 8);
        info.source = ResolveSource::Depth;
        info.source_format = 0; // kD24S8
        info.dest_format = color_format::K_24_8;
        info.dest_endian = 2; // k8in32
        info.source_base_tiles = 8;
        info.surface_pitch_tiles = 1;

        let stats = copy_to_memory(&info, &edram, &mut mem);
        assert!(stats.supported);
        assert_eq!(stats.samples_written, 16 * 8);

        // k8in32 byte-reverses 0x3FFF_FF00 into 0x00FF_FF3F; the big-endian
        // store of that word yields [0x00, 0xFF, 0xFF, 0x3F] at pixel (0, 0).
        let pitch_aligned = align_pitch_to_macro_tile(16);
        let addr = 0x4000_0000u32.wrapping_add(tiled_2d_offset(0, 0, pitch_aligned, 2));
        let bytes = [0u32, 1, 2, 3].map(|i| mem.read_u8(addr.wrapping_add(i)));
        assert_eq!(bytes, [0x00, 0xFF, 0xFF, 0x3F]);
    }

    /// With 1x MSAA every raw select value 0..=7 sanitizes to `K0`.
    #[test]
    fn sanitize_1x_msaa_collapses_to_k0() {
        (0..=7u8).for_each(|raw| {
            let selected = sanitize_sample_select(raw, MsaaSamples::X1, false);
            assert_eq!(selected, CopySampleSelect::K0, "raw={raw}");
        });
    }

    /// 2x MSAA: color selects k2→k0, k3→k1, k23→k01; depth never averages,
    /// so k01 and k0123 both sanitize to k0.
    #[test]
    fn sanitize_2x_msaa_obeys_canary_rules() {
        let sanitize = |raw, is_depth| sanitize_sample_select(raw, MsaaSamples::X2, is_depth);

        // Color.
        assert_eq!(sanitize(2, false), CopySampleSelect::K0);
        assert_eq!(sanitize(3, false), CopySampleSelect::K1);
        assert_eq!(sanitize(5, false), CopySampleSelect::K01);

        // Depth — no averaging.
        assert_eq!(sanitize(4, true), CopySampleSelect::K0);
        assert_eq!(sanitize(6, true), CopySampleSelect::K0);
    }

    /// 4x MSAA: depth averaging selects collapse to one representative
    /// sample (k0123→k0, k23→k2, k01→k0), while color keeps k0123.
    #[test]
    fn sanitize_4x_msaa_depth_collapses_averages() {
        let sanitize = |raw, is_depth| sanitize_sample_select(raw, MsaaSamples::X4, is_depth);

        // Depth.
        assert_eq!(sanitize(6, true), CopySampleSelect::K0);
        assert_eq!(sanitize(5, true), CopySampleSelect::K2);
        assert_eq!(sanitize(4, true), CopySampleSelect::K0);

        // Color keeps averages.
        assert_eq!(sanitize(6, false), CopySampleSelect::K0123);
    }

    /// Per-sample `(x, y)` offsets inside a pixel: 1x has a single sample at
    /// the origin, 2x stacks its samples vertically, 4x fills a 2×2 grid in
    /// row-major order.
    #[test]
    fn sample_offset_layout() {
        // 1x
        assert_eq!(sample_offset_in_pixel(0, MsaaSamples::X1), (0, 0));

        // 2x
        for (index, expected) in [(0, (0, 0)), (1, (0, 1))] {
            assert_eq!(sample_offset_in_pixel(index, MsaaSamples::X2), expected);
        }

        // 4x
        for (index, expected) in [(0, (0, 0)), (1, (1, 0)), (2, (0, 1)), (3, (1, 1))] {
            assert_eq!(sample_offset_in_pixel(index, MsaaSamples::X4), expected);
        }
    }
}