//! Xenos tiled-texture address formula (2D, Tiled2D layout).
//!
//! Port of `xenia-canary/src/xenia/gpu/texture_address.h:190-208` Tiled2D /
//! `TiledCombine` helpers. The Xbox 360 GPU stores textures in a 32×32-block
//! macro-tile pattern with bank+pipe interleave for its internal DRAM
//! banks; this formula inverts that so we can read pixels out in linear
//! order, given the tiled source buffer.
//!
//! We use this in two places during P4:
//!  - `vd_swap` frontbuffer detile (1280×720 k_8_8_8_8 usually).
//!  - Any place we need to read tiled guest memory into a host-linear
//!    buffer for CPU-side conversion before upload.

/// log2 of the macro-tile width: `1 << 5` = 32 blocks per macro-tile row.
/// Tile size constant taken from canary.
pub const MACRO_TILE_WIDTH_LOG2: u32 = 5; // 32 px
/// log2 of the macro-tile height for the 2D layout: `1 << 5` = 32 rows of
/// blocks per macro tile. Tile size constant taken from canary.
pub const MACRO_TILE_HEIGHT_2D_LOG2: u32 = 5; // 32 px

/// Port of canary's `TiledCombine`: weaves the interleave bits into the
/// packed outer/inner byte offset to form the final tiled byte address.
///
/// Resulting bit layout (LSB first):
///
/// | result bits | source                          |
/// |-------------|---------------------------------|
/// | 0..=3       | `outer_inner_bytes` bits 0..=3  |
/// | 4           | `y_lsb`                         |
/// | 5           | `outer_inner_bytes` bit 4       |
/// | 6..         | `pipe` (2 bits from our caller) |
/// | 8..=10      | `outer_inner_bytes` bits 5..=7  |
/// | 11..        | `bank` (1 bit from our caller)  |
/// | 12..        | `outer_inner_bytes` bits 8..    |
///
/// `bank`, `pipe` and `y_lsb` are not masked here; the caller is expected
/// to pass values that already fit their fields.
#[inline]
fn tiled_combine(outer_inner_bytes: u32, bank: u32, pipe: u32, y_lsb: u32) -> u32 {
    // Carve the packed offset into the four slices that end up separated
    // by the interleave bits.
    let low_nibble = outer_inner_bytes & 0xF;
    let bit_4 = (outer_inner_bytes >> 4) & 0x1;
    let bits_5_to_7 = (outer_inner_bytes >> 5) & 0x7;
    let upper = outer_inner_bytes >> 8;

    // OR everything together in ascending bit position.
    low_nibble
        | (y_lsb << 4)
        | (bit_4 << 5)
        | (pipe << 6)
        | (bits_5_to_7 << 8)
        | (bank << 11)
        | (upper << 12)
}

/// Byte offset of block `(x, y)` inside a 2D-tiled surface.
///
/// * `x`, `y` — block coordinates.
/// * `pitch_aligned` — surface pitch, already rounded up to the macro-tile
///   width (see [`align_pitch_to_macro_tile`]).
/// * `bytes_per_block_log2` — log2 of the block size in bytes (e.g. `2`
///   for 4-byte RGBA8888 blocks).
///
/// How the coordinate bits are consumed:
///
/// * `x` bits 0..=2 and `y` bits 1..=3 form the position inside the macro
///   tile;
/// * `x` bits 3..=4, with `y` bit 3 XORed into the upper one, form `pipe`;
/// * `y` bit 0 is the `y_lsb` interleave bit and `y` bit 4 is `bank`;
/// * the remaining upper bits of `x` and `y` select the macro tile.
///
/// Returns `u32` where canary returns a signed `int`; the value is never
/// negative for in-bounds inputs and our callers work in unsigned
/// arithmetic.
///
/// This is the canonical formula — do not simplify without re-reading
/// `texture_address.h:190-208`; the bit interleave is not a linear
/// function of `(x, y)`.
pub fn tiled_2d_offset(x: u32, y: u32, pitch_aligned: u32, bytes_per_block_log2: u32) -> u32 {
    // Macro-tile grid position and the row stride measured in macro tiles.
    let tiles_per_row = pitch_aligned >> MACRO_TILE_WIDTH_LOG2;
    let tile_row = y >> MACRO_TILE_HEIGHT_2D_LOG2;
    let tile_col = x >> MACRO_TILE_WIDTH_LOG2;
    let macro_index = tile_row * tiles_per_row + tile_col;

    // Position inside the macro tile. Bit 0 of `y` is left out here because
    // it is injected separately as the `y_lsb` interleave bit.
    let micro_row = (y >> 1) & 0b111;
    let micro_col = x & 0b111;
    let micro_index = (micro_row << 3) | micro_col;

    // Macro index sits above the 6-bit micro index; scale blocks to bytes.
    let packed_bytes = ((macro_index << 6) | micro_index) << bytes_per_block_log2;

    // Interleave selectors.
    let bank_bit = (y >> 4) & 0b1;
    let pipe_bits = ((x >> 3) & 0b11) ^ (((y >> 3) & 0b1) << 1);
    let row_parity = y & 1;

    tiled_combine(packed_bytes, bank_bit, pipe_bits, row_parity)
}

/// Round `pitch_pixels` up to the nearest multiple of the macro-tile width
/// (32 px). Xenos requires tile-aligned pitches; non-aligned values pad.
#[inline]
pub fn align_pitch_to_macro_tile(pitch_pixels: u32) -> u32 {
    let mask = (1u32 << MACRO_TILE_WIDTH_LOG2) - 1;
    (pitch_pixels + mask) & !mask
}

/// Detile a 2D tiled surface into a tightly packed linear buffer.
///
/// For every block position `(x, y)` with `x < width` and `y < height`,
/// `bpp` bytes are copied from `src` at the offset computed by
/// [`tiled_2d_offset`] to `dst` at `(y * width + x) * bpp`.
///
/// # Parameters
///
/// * `src` — the tiled source bytes.
/// * `dst` — the linear destination; must be at least
///   `width * height * bpp` bytes long.
/// * `width`, `height` — surface size in blocks.
/// * `pitch_pixels` — source pitch in blocks; it is rounded up with
///   [`align_pitch_to_macro_tile`] before use. NOTE(review): a pitch
///   narrower than `width` is not rejected; callers presumably guarantee
///   `pitch_pixels >= width` — confirm.
/// * `bpp` — bytes per block (4 for RGBA8888, 2 for a 16-bit block such as
///   a1r5g5b5, etc.). Must be a power of two because the address formula
///   takes the block size as a log2.
///
/// # Errors
///
/// Returns `Err(())` (defensive — callers can downgrade silently) when:
///
/// * `bpp` is zero or not a power of two;
/// * `width * height * bpp` overflows `usize` or exceeds `dst.len()` — this
///   is checked up front, so `dst` is left untouched;
/// * `src` is too short for an offset the formula produces — `dst` may
///   already have been partially written in this case.
pub fn detile_2d(
    src: &[u8],
    dst: &mut [u8],
    width: u32,
    height: u32,
    pitch_pixels: u32,
    bpp: u32,
) -> Result<(), ()> {
    // `trailing_zeros` only equals log2 for powers of two; for 0 it is 32,
    // which would overflow the shift inside `tiled_2d_offset`.
    if !bpp.is_power_of_two() {
        return Err(());
    }
    let bpp_log2 = bpp.trailing_zeros();
    let block_len = bpp as usize;
    let pitch_aligned = align_pitch_to_macro_tile(pitch_pixels);

    // Size the destination in `usize` with checked arithmetic so a huge
    // surface cannot wrap the row pitch.
    let row_len = (width as usize).checked_mul(block_len).ok_or(())?;
    let required = row_len.checked_mul(height as usize).ok_or(())?;
    if dst.len() < required {
        return Err(());
    }
    // Zero-sized surface: nothing to copy (and `chunks_exact_mut` rejects a
    // chunk size of 0).
    if required == 0 {
        return Ok(());
    }

    let dst_rows = dst[..required].chunks_exact_mut(row_len);
    for (y, dst_row) in (0..height).zip(dst_rows) {
        for (x, dst_block) in (0..width).zip(dst_row.chunks_exact_mut(block_len)) {
            let src_off = tiled_2d_offset(x, y, pitch_aligned, bpp_log2) as usize;
            // `get` turns an out-of-range source offset into `Err` instead
            // of a panic.
            let src_block = src_off
                .checked_add(block_len)
                .and_then(|end| src.get(src_off..end))
                .ok_or(())?;
            dst_block.copy_from_slice(src_block);
        }
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Bytes per block used by the detile tests (RGBA8888-sized).
    const BPP: u32 = 4;
    /// `log2(BPP)`, the form `tiled_2d_offset` expects.
    const BPP_LOG2: u32 = 2;

    /// Recognisable per-position payload: `(x, y, x ^ y, 0xFF)`.
    fn pattern(x: u32, y: u32) -> [u8; 4] {
        [x as u8, y as u8, (x ^ y) as u8, 0xFF]
    }

    /// Tiled byte offset of `(x, y)` for the test block size.
    fn offset_of(x: u32, y: u32, pitch: u32) -> usize {
        tiled_2d_offset(x, y, pitch, BPP_LOG2) as usize
    }

    /// Builds a tiled buffer just large enough for a `w`×`h` surface and
    /// stores `pattern(x, y)` at every logical position.
    fn build_tiled(w: u32, h: u32, pitch: u32) -> Vec<u8> {
        let len = (0..h)
            .flat_map(|y| (0..w).map(move |x| offset_of(x, y, pitch)))
            .max()
            .map_or(0, |max_off| max_off + BPP as usize);
        let mut tiled = vec![0u8; len];
        for y in 0..h {
            for x in 0..w {
                let off = offset_of(x, y, pitch);
                tiled[off..off + BPP as usize].copy_from_slice(&pattern(x, y));
            }
        }
        tiled
    }

    /// The (0, 0) pixel is always at byte offset 0 regardless of pitch.
    #[test]
    fn origin_is_zero() {
        assert_eq!(tiled_2d_offset(0, 0, 64, 2), 0);
    }

    /// Round-trip: detiling a tiled buffer that was filled using the same
    /// formula produces the identity linear image.
    #[test]
    fn roundtrip_small_pattern() {
        let (w, h) = (32u32, 16u32);
        let pitch = align_pitch_to_macro_tile(w);
        let tiled = build_tiled(w, h, pitch);
        let mut linear = vec![0u8; (w * h * BPP) as usize];
        detile_2d(&tiled, &mut linear, w, h, pitch, BPP).expect("detile ok");
        // Verify every logical pixel landed at the right linear offset.
        for y in 0..h {
            for x in 0..w {
                let lin = ((y * w + x) * BPP) as usize;
                assert_eq!(&linear[lin..lin + BPP as usize], &pattern(x, y)[..]);
            }
        }
    }

    /// Every position of a 32×16 region maps to its own offset (no
    /// aliasing between neighbouring pixels).
    #[test]
    fn neighbouring_pixels_have_distinct_offsets() {
        let mut seen = std::collections::HashSet::new();
        for y in 0..16 {
            for x in 0..32 {
                assert!(seen.insert(tiled_2d_offset(x, y, 32, 2)));
            }
        }
    }

    /// Pitch alignment is a power-of-two rounding: 1280 stays 1280, 1281
    /// rounds to 1312.
    #[test]
    fn align_pitch_rounds_up_to_32() {
        assert_eq!(align_pitch_to_macro_tile(0), 0);
        assert_eq!(align_pitch_to_macro_tile(31), 32);
        assert_eq!(align_pitch_to_macro_tile(32), 32);
        assert_eq!(align_pitch_to_macro_tile(1280), 1280);
        assert_eq!(align_pitch_to_macro_tile(1281), 1312);
    }

    /// A source that is one byte short of the largest required offset is
    /// reported as an error rather than read out of bounds.
    #[test]
    fn detile_rejects_truncated_source() {
        let (w, h) = (32u32, 16u32);
        let pitch = align_pitch_to_macro_tile(w);
        let tiled = build_tiled(w, h, pitch);
        let truncated = &tiled[..tiled.len() - 1];
        let mut linear = vec![0u8; (w * h * BPP) as usize];
        assert!(detile_2d(truncated, &mut linear, w, h, pitch, BPP).is_err());
    }

    /// A destination smaller than `width * height * bpp` is an error.
    #[test]
    fn detile_rejects_short_destination() {
        let (w, h) = (32u32, 16u32);
        let pitch = align_pitch_to_macro_tile(w);
        let tiled = build_tiled(w, h, pitch);
        let mut linear = vec![0u8; (w * h * BPP) as usize - 1];
        assert!(detile_2d(&tiled, &mut linear, w, h, pitch, BPP).is_err());
    }

    /// A zero-sized surface needs no source or destination bytes.
    #[test]
    fn detile_empty_surface_is_ok() {
        let src: Vec<u8> = Vec::new();
        let mut dst: Vec<u8> = Vec::new();
        assert!(detile_2d(&src, &mut dst, 0, 0, 0, BPP).is_ok());
    }
}