//! Host pipeline that consumes PM4 `DRAW_INDX*` captures.
//!
//! Drives [`xenia_gpu::shaders::XENOS_INTERP_WGSL`]. This file owns the
//! wgpu bind-group + pipeline + buffer surface the Xenos WGSL interpreter
//! binds to. The WGSL module is expected to declare:
//!
//! ```text
//! @group(0) @binding(0) var<uniform>       xenos_draw   : XenosDrawConstants; // 32 B
//! @group(0) @binding(1) var<storage, read> xenos_consts : XenosConstants;     // ~9.2 KB
//! @group(0) @binding(2) var<storage, read> vs_ucode      : array<u32>;
//! @group(0) @binding(3) var<storage, read> ps_ucode      : array<u32>;
//! @group(0) @binding(4) var<storage, read> vertex_buffer : array<u32>;
//! ```
//!
//! Texture bindings (M6) are a single-slot stub for P3b:
//!
//! ```text
//! @group(1) @binding(0) var xenos_tex  : texture_2d<f32>;
//! @group(1) @binding(1) var xenos_samp : sampler;
//! ```
//!
//! The texture bound by default is a 1×1 transparent-black placeholder. Real
//! per-slot guest texture uploads + format decode land with the texture cache (P5).

use bytemuck::{Pod, Zeroable};
use wgpu::util::DeviceExt;

use xenia_gpu::shaders::XENOS_INTERP_WGSL;
use xenia_gpu::xenos_constants::XenosConstantsBlock;

/// Per-draw constants mirroring the WGSL `XenosDrawConstants` uniform
/// block. Ordering / padding matches `xenos_interp.wgsl` exactly.
///
/// The layout is `#[repr(C)]`: four `u32`s followed by two `[f32; 2]`s, i.e.
/// 32 bytes with no implicit padding. The `Pod` derive is what lets the value
/// be viewed as raw bytes (`bytemuck::bytes_of`) for the uniform upload.
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable)]
struct DrawConstants {
    /// Draw counter, copied from `DrawRequest::draw_index`.
    draw_index: u32,
    /// Vertex count handed to the shader (the translated path clamps it to
    /// at least 3 before uploading).
    vertex_count: u32,
    /// Xenos primitive-type code, copied from `DrawRequest::prim_kind`.
    prim_kind: u32,
    /// iterate-3O: guest dword base of the uploaded `vertex_buffer` window.
    /// The WGSL subtracts this from the absolute vertex-fetch address.
    vertex_base_dwords: u32,
    /// iterate-3S: guest→host NDC XY transform (mirrors canary
    /// `GetHostViewportInfo`). `clip.xy = pos.xy * ndc_scale + ndc_offset*pos.w`.
    /// Y is pre-flipped for wgpu. 16 bytes so the block stays 16-byte aligned.
    ndc_scale: [f32; 2],
    /// Offset half of the NDC transform described on `ndc_scale`.
    ndc_offset: [f32; 2],
}

/// iterate-3Y: the per-draw host color/blend/write-mask render state, decoded
/// from the guest registers (`RB_BLENDCONTROL0` / `RB_COLOR_MASK`). Used both
/// as part of the pipeline-cache key and to build the `wgpu::ColorTargetState`.
/// Mirrors canary's `GetColorBlendStateForRenderTarget` (D3D12
/// `pipeline_cache.cc`): the factors come straight from `RB_BLENDCONTROL`,
/// and (in canary) a zero write-mask forces the no-blend `One,Zero` equation.
///
/// The type is `Copy + Eq + Hash` so it can be embedded directly in
/// `HashMap` keys.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub struct RenderState {
    /// `RB_BLENDCONTROL0` raw value (RT0). `0x00010001` (One,Zero / One,Zero,
    /// Add) is the opaque case.
    ///
    /// Bit layout as decoded by `color_target`: color src `[4:0]`, color op
    /// `[7:5]`, color dst `[12:8]`, alpha src `[20:16]`, alpha op `[23:21]`,
    /// alpha dst `[28:24]`.
    pub blend_control: u32,
    /// RT0 nibble of `RB_COLOR_MASK` (bit0=R … bit3=A). 0 = write nothing.
    /// Only the low four bits are inspected.
    pub color_mask: u8,
}

impl RenderState {
    /// Fully-opaque, all-channels state (the legacy fixed behaviour). Used for
    /// procedural/synthetic draws that have no captured guest state.
    pub const OPAQUE: RenderState = RenderState {
        blend_control: 0x0001_0001,
        color_mask: 0xF,
    };

    /// Map a Xenos `BlendFactor` (5-bit field) to a wgpu `BlendFactor`,
    /// mirroring canary `kBlendFactorMap` (D3D12 `pipeline_cache.cc:1504`).
    ///
    /// The constant-alpha factors (14/15) share the wgpu `Constant` /
    /// `OneMinusConstant` factors with the constant-color ones (12/13).
    fn map_factor(f: u32) -> wgpu::BlendFactor {
        match f {
            0 => wgpu::BlendFactor::Zero,
            1 => wgpu::BlendFactor::One,
            4 => wgpu::BlendFactor::Src,
            5 => wgpu::BlendFactor::OneMinusSrc,
            6 => wgpu::BlendFactor::SrcAlpha,
            7 => wgpu::BlendFactor::OneMinusSrcAlpha,
            8 => wgpu::BlendFactor::Dst,
            9 => wgpu::BlendFactor::OneMinusDst,
            10 => wgpu::BlendFactor::DstAlpha,
            11 => wgpu::BlendFactor::OneMinusDstAlpha,
            12 => wgpu::BlendFactor::Constant,
            13 => wgpu::BlendFactor::OneMinusConstant,
            14 => wgpu::BlendFactor::Constant,
            15 => wgpu::BlendFactor::OneMinusConstant,
            16 => wgpu::BlendFactor::SrcAlphaSaturated,
            // 2/3 and >16 are undefined on Xenos; canary maps to Zero.
            _ => wgpu::BlendFactor::Zero,
        }
    }

    /// Map a Xenos `BlendFactor` for the *alpha* channel, mirroring canary
    /// `kBlendFactorAlphaMap` (color-mode factors collapse to alpha).
    fn map_factor_alpha(f: u32) -> wgpu::BlendFactor {
        match f {
            4 => wgpu::BlendFactor::SrcAlpha,
            5 => wgpu::BlendFactor::OneMinusSrcAlpha,
            8 => wgpu::BlendFactor::DstAlpha,
            9 => wgpu::BlendFactor::OneMinusDstAlpha,
            other => Self::map_factor(other),
        }
    }

    /// Map a Xenos `BlendOp` (3-bit field) to a wgpu `BlendOperation`.
    /// Values 5..=7 are not assigned and fall back to `Add`.
    fn map_op(o: u32) -> wgpu::BlendOperation {
        match o {
            0 => wgpu::BlendOperation::Add,
            1 => wgpu::BlendOperation::Subtract,
            2 => wgpu::BlendOperation::Min,
            3 => wgpu::BlendOperation::Max,
            4 => wgpu::BlendOperation::ReverseSubtract,
            _ => wgpu::BlendOperation::Add,
        }
    }

    /// Build one `wgpu::BlendComponent` from raw Xenos src/dst/op fields,
    /// translating the factors with `map` (color or alpha flavour).
    ///
    /// `Min` / `Max` ignore their factors, and WebGPU validation requires both
    /// factors to be `One` for those operations, so they are forced to `One`
    /// regardless of what the guest register holds.
    fn blend_component(
        src: u32,
        dst: u32,
        op: u32,
        map: fn(u32) -> wgpu::BlendFactor,
    ) -> wgpu::BlendComponent {
        let operation = Self::map_op(op);
        let factors_ignored = matches!(
            operation,
            wgpu::BlendOperation::Min | wgpu::BlendOperation::Max
        );
        if factors_ignored {
            wgpu::BlendComponent {
                src_factor: wgpu::BlendFactor::One,
                dst_factor: wgpu::BlendFactor::One,
                operation,
            }
        } else {
            wgpu::BlendComponent {
                src_factor: map(src),
                dst_factor: map(dst),
                operation,
            }
        }
    }

    /// Build the `wgpu::ColorTargetState` for this draw.
    ///
    /// * `format` — texture format of the render target the pipeline writes.
    ///
    /// Returns a target whose `write_mask` mirrors the low nibble of
    /// `color_mask` and whose `blend` is `None` when either nothing is written
    /// (zero write mask) or the equation is the `One,Zero,Add` identity for
    /// both color and alpha; otherwise the decoded blend equation.
    fn color_target(&self, format: wgpu::TextureFormat) -> wgpu::ColorTargetState {
        // Xenos channel bit → wgpu channel flag (bit0=R … bit3=A).
        const CHANNELS: [(u8, wgpu::ColorWrites); 4] = [
            (0x1, wgpu::ColorWrites::RED),
            (0x2, wgpu::ColorWrites::GREEN),
            (0x4, wgpu::ColorWrites::BLUE),
            (0x8, wgpu::ColorWrites::ALPHA),
        ];

        let bc = self.blend_control;
        let color_src = bc & 0x1F;
        let color_op = (bc >> 5) & 0x7;
        let color_dst = (bc >> 8) & 0x1F;
        let alpha_src = (bc >> 16) & 0x1F;
        let alpha_op = (bc >> 21) & 0x7;
        let alpha_dst = (bc >> 24) & 0x1F;

        let mut write_mask = wgpu::ColorWrites::empty();
        for (bit, channel) in CHANNELS {
            if self.color_mask & bit != 0 {
                write_mask |= channel;
            }
        }

        // The `One,Zero,Add` identity is the opaque case (canary's no-blend),
        // expressed as `blend: None` so it's a plain overwrite.
        let is_opaque = color_src == 1
            && color_dst == 0
            && color_op == 0
            && alpha_src == 1
            && alpha_dst == 0
            && alpha_op == 0;
        // A zero write mask writes nothing, so (as in canary) the blend
        // equation is dropped rather than carried into the pipeline state.
        let blend = if is_opaque || write_mask.is_empty() {
            None
        } else {
            Some(wgpu::BlendState {
                color: Self::blend_component(color_src, color_dst, color_op, Self::map_factor),
                alpha: Self::blend_component(
                    alpha_src,
                    alpha_dst,
                    alpha_op,
                    Self::map_factor_alpha,
                ),
            })
        };

        wgpu::ColorTargetState {
            format,
            blend,
            write_mask,
        }
    }
}

/// Submitted to [`XenosPipeline::render_one`] to render one captured draw.
///
/// A plain `Copy` value: the render methods copy its fields into the
/// per-draw uniform block before recording the draw.
#[derive(Clone, Copy, Debug)]
pub struct DrawRequest {
    /// Monotonic draw counter; shader uses it for per-draw colour rotation.
    pub draw_index: u32,
    /// Host-normalised vertex count (after primitive-processor rewrite).
    pub vertex_count: u32,
    /// Xenos primitive-type code; shader may branch on it in P3b+.
    pub prim_kind: u32,
    /// iterate-3O: guest dword base of the per-draw vertex window uploaded to
    /// `vertex_buffer` (b4). 0 = no real vertex window (procedural fallback).
    pub vertex_base_dwords: u32,
    /// iterate-3S: guest→host NDC XY transform (Y pre-flipped). When all-zero
    /// the shader leaves the position untransformed (procedural fallback).
    pub ndc_scale: [f32; 2],
    /// Offset half of the NDC transform; see `ndc_scale`.
    pub ndc_offset: [f32; 2],
}

/// Reasonable upper bound on a single shader blob (dwords). Most Xbox 360
/// shaders are ≪ 4 KB; 64 KB is orders-of-magnitude slack.
///
/// Sizes both microcode storage buffers (×4 for bytes); uploads longer than
/// this are truncated to the buffer capacity.
const UCODE_BUFFER_MAX_DWORDS: u64 = 16 * 1024; // 64 KB each for VS & PS
/// 16 MB of vertex data — enough for any realistic Xenos draw.
///
/// Sizes the vertex storage buffer; uploads longer than this are truncated.
const VERTEX_BUFFER_MAX_BYTES: u64 = 16 * 1024 * 1024;

/// Owns every wgpu object the Xenos interpreter / translator shaders bind to:
/// the pipelines, the group-0 buffers, the group-1 texture + sampler, and the
/// per-state pipeline caches.
pub struct XenosPipeline {
    /// Interpreter pipeline with the legacy fixed (alpha-blend) state. Kept as
    /// the default; per-state variants are built lazily in `interp_cache`.
    pipeline: wgpu::RenderPipeline,
    /// iterate-3Y: the interpreter WGSL module, retained so per-render-state
    /// interpreter pipelines can be compiled on demand.
    interp_shader: wgpu::ShaderModule,
    /// iterate-3Y: interpreter pipelines keyed on the per-draw `RenderState`
    /// (blend + write mask), so flat/alpha/opaque draws composite correctly
    /// even when their (vs,ps) didn't translate.
    interp_cache: std::collections::HashMap<RenderState, wgpu::RenderPipeline>,
    /// b0: per-draw `DrawConstants` uniform, rewritten before each draw.
    draw_ctx_buffer: wgpu::Buffer,
    /// b1: `XenosConstantsBlock`, bound as read-only storage.
    constants_buffer: wgpu::Buffer,
    /// b2: vertex-shader microcode (read-only storage).
    vs_ucode_buffer: wgpu::Buffer,
    /// b3: pixel-shader microcode (read-only storage).
    ps_ucode_buffer: wgpu::Buffer,
    /// b4: raw guest vertex dwords (read-only storage).
    vertex_buffer: wgpu::Buffer,
    /// Group 0: the five buffers above, bound whole.
    bind_group: wgpu::BindGroup,
    /// P5: swapped per-draw when a new cached texture becomes active.
    tex_bind_group: wgpu::BindGroup,
    /// Layout + sampler retained so `set_texture_view` can rebuild
    /// `tex_bind_group` on the fly without re-reading the pipeline.
    tex_bgl: wgpu::BindGroupLayout,
    sampler: wgpu::Sampler,
    /// Fallback 1×1 texture view (a single transparent-black texel) — bound
    /// initially and whenever `set_texture_view` is given `None`.
    dummy_view: wgpu::TextureView,
    /// P7 — retained pipeline layout (group 0 buffers + group 1 texture),
    /// shared by the interpreter pipelines and every Xenos→WGSL translator
    /// pipeline built after construction. Interpreter pipeline remains the
    /// fallback.
    pipeline_layout: wgpu::PipelineLayout,
    /// iterate-3Y: cached translator pipelines keyed on the shader pair AND the
    /// per-draw render state, so the same (vs,ps) with different blend/mask
    /// composites correctly. The translated WGSL module is itself cached per
    /// (vs,ps) so re-translation only happens once.
    translated_cache: std::collections::HashMap<(u32, u32, RenderState), wgpu::RenderPipeline>,
    /// Translated WGSL modules keyed on `(vs_blob_key, ps_blob_key)`.
    translated_modules: std::collections::HashMap<(u32, u32), wgpu::ShaderModule>,
    /// Format of the render target every pipeline here is built against.
    pub target_format: wgpu::TextureFormat,
}

impl XenosPipeline {
    /// Build the interpreter pipeline and every GPU resource it binds.
    ///
    /// * `device` — used to create the shader module, layouts, buffers,
    ///   texture, sampler, bind groups and the default pipeline.
    /// * `queue` — used once, to upload the 1×1 placeholder texel.
    /// * `target_format` — format of the colour attachment the pipelines
    ///   render into; stored in `target_format`.
    ///
    /// Both pipeline caches and the translated-module cache start empty.
    pub fn new(
        device: &wgpu::Device,
        queue: &wgpu::Queue,
        target_format: wgpu::TextureFormat,
    ) -> Self {
        let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
            label: Some("xenos_interp.wgsl"),
            source: wgpu::ShaderSource::Wgsl(XENOS_INTERP_WGSL.into()),
        });

        // Group 0: per-draw uniform + constants / microcode / vertex storage.
        let bgl = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
            label: Some("xenos bind group layout"),
            entries: &[
                // b0: draw_ctx uniform, sized to `DrawConstants` (32 B)
                wgpu::BindGroupLayoutEntry {
                    binding: 0,
                    visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
                    ty: wgpu::BindingType::Buffer {
                        ty: wgpu::BufferBindingType::Uniform,
                        has_dynamic_offset: false,
                        min_binding_size: std::num::NonZeroU64::new(
                            std::mem::size_of::<DrawConstants>() as u64,
                        ),
                    },
                    count: None,
                },
                // b1: XenosConstants read-only storage (~9.2 KB). Not uniform
                // because the block contains packed `array<u32>` fields and
                // WGSL's uniform address space would require 16-byte stride.
                wgpu::BindGroupLayoutEntry {
                    binding: 1,
                    visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
                    ty: wgpu::BindingType::Buffer {
                        ty: wgpu::BufferBindingType::Storage { read_only: true },
                        has_dynamic_offset: false,
                        min_binding_size: std::num::NonZeroU64::new(
                            XenosConstantsBlock::SIZE as u64,
                        ),
                    },
                    count: None,
                },
                // b2: vs_ucode (read-only storage)
                wgpu::BindGroupLayoutEntry {
                    binding: 2,
                    visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
                    ty: wgpu::BindingType::Buffer {
                        ty: wgpu::BufferBindingType::Storage { read_only: true },
                        has_dynamic_offset: false,
                        min_binding_size: None,
                    },
                    count: None,
                },
                // b3: ps_ucode (read-only storage)
                wgpu::BindGroupLayoutEntry {
                    binding: 3,
                    visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
                    ty: wgpu::BindingType::Buffer {
                        ty: wgpu::BufferBindingType::Storage { read_only: true },
                        has_dynamic_offset: false,
                        min_binding_size: None,
                    },
                    count: None,
                },
                // b4: vertex_buffer (read-only storage, vertex stage only)
                wgpu::BindGroupLayoutEntry {
                    binding: 4,
                    visibility: wgpu::ShaderStages::VERTEX,
                    ty: wgpu::BindingType::Buffer {
                        ty: wgpu::BufferBindingType::Storage { read_only: true },
                        has_dynamic_offset: false,
                        min_binding_size: None,
                    },
                    count: None,
                },
            ],
        });
        // Group 1: one filterable 2D float texture + one filtering sampler.
        let tex_bgl = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
            label: Some("xenos tex bind group layout"),
            entries: &[
                wgpu::BindGroupLayoutEntry {
                    binding: 0,
                    visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
                    ty: wgpu::BindingType::Texture {
                        sample_type: wgpu::TextureSampleType::Float { filterable: true },
                        view_dimension: wgpu::TextureViewDimension::D2,
                        multisampled: false,
                    },
                    count: None,
                },
                wgpu::BindGroupLayoutEntry {
                    binding: 1,
                    visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
                    ty: wgpu::BindingType::Sampler(wgpu::SamplerBindingType::Filtering),
                    count: None,
                },
            ],
        });

        let layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
            label: Some("xenos pipeline layout"),
            bind_group_layouts: &[&bgl, &tex_bgl],
            push_constant_ranges: &[],
        });

        // Buffer allocation. `queue.write_buffer` uses COPY_DST; all
        // interpreter-facing buffers need it.
        let initial_draw = DrawConstants {
            draw_index: 0,
            vertex_count: 3,
            prim_kind: 4,
            vertex_base_dwords: 0,
            ndc_scale: [0.0, 0.0],
            ndc_offset: [0.0, 0.0],
        };
        let draw_ctx_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
            label: Some("xenos draw ctx"),
            contents: bytemuck::bytes_of(&initial_draw),
            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
        });
        let constants_buffer = device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("xenos constants"),
            size: XenosConstantsBlock::SIZE as u64,
            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        });
        let vs_ucode_buffer = device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("xenos vs ucode"),
            size: UCODE_BUFFER_MAX_DWORDS * 4,
            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        });
        let ps_ucode_buffer = device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("xenos ps ucode"),
            size: UCODE_BUFFER_MAX_DWORDS * 4,
            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        });
        let vertex_buffer = device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("xenos vertex buffer"),
            size: VERTEX_BUFFER_MAX_BYTES,
            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        });

        // Dummy 1×1 fallback texture — bound until a real texture view is
        // supplied through `set_texture_view`, so texture-sampling paths
        // always have something valid to read. Its single texel is written
        // just below.
        let dummy_tex = device.create_texture(&wgpu::TextureDescriptor {
            label: Some("xenos dummy texture"),
            size: wgpu::Extent3d {
                width: 1,
                height: 1,
                depth_or_array_layers: 1,
            },
            mip_level_count: 1,
            sample_count: 1,
            dimension: wgpu::TextureDimension::D2,
            format: wgpu::TextureFormat::Rgba8Unorm,
            usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST,
            view_formats: &[],
        });
        // iterate-3Y: transparent black (0,0,0,0). When a textured draw's
        // real texture can't be resolved (e.g. its sampler slot is shadowed by
        // a vertex-fetch constant), sampling a *transparent* texel makes the
        // draw a no-op under its real premultiplied-alpha blend — instead of
        // fabricating an opaque magenta that overpaints everything (the old
        // debug stub). This removes a fake rather than adding one: we never
        // invent visible pixels for an unresolved texture.
        queue.write_texture(
            wgpu::ImageCopyTexture {
                texture: &dummy_tex,
                mip_level: 0,
                origin: wgpu::Origin3d::ZERO,
                aspect: wgpu::TextureAspect::All,
            },
            &[0x00u8, 0x00, 0x00, 0x00],
            wgpu::ImageDataLayout {
                offset: 0,
                bytes_per_row: Some(4),
                rows_per_image: Some(1),
            },
            wgpu::Extent3d {
                width: 1,
                height: 1,
                depth_or_array_layers: 1,
            },
        );
        let dummy_view = dummy_tex.create_view(&wgpu::TextureViewDescriptor::default());
        // Despite the name this sampler is kept for the lifetime of the
        // pipeline (stored in `sampler`) and reused on every texture rebind.
        let dummy_sampler = device.create_sampler(&wgpu::SamplerDescriptor {
            label: Some("xenos dummy sampler"),
            address_mode_u: wgpu::AddressMode::ClampToEdge,
            address_mode_v: wgpu::AddressMode::ClampToEdge,
            address_mode_w: wgpu::AddressMode::ClampToEdge,
            mag_filter: wgpu::FilterMode::Linear,
            min_filter: wgpu::FilterMode::Linear,
            mipmap_filter: wgpu::FilterMode::Nearest,
            ..Default::default()
        });
        let tex_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("xenos tex bind group"),
            layout: &tex_bgl,
            entries: &[
                wgpu::BindGroupEntry {
                    binding: 0,
                    resource: wgpu::BindingResource::TextureView(&dummy_view),
                },
                wgpu::BindGroupEntry {
                    binding: 1,
                    resource: wgpu::BindingResource::Sampler(&dummy_sampler),
                },
            ],
        });

        // Group 0 bind group: each buffer is bound in its entirety.
        let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("xenos bind group"),
            layout: &bgl,
            entries: &[
                wgpu::BindGroupEntry {
                    binding: 0,
                    resource: draw_ctx_buffer.as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 1,
                    resource: constants_buffer.as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 2,
                    resource: vs_ucode_buffer.as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 3,
                    resource: ps_ucode_buffer.as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 4,
                    resource: vertex_buffer.as_entire_binding(),
                },
            ],
        });

        // Default interpreter pipeline: fixed SrcAlpha / OneMinusSrcAlpha
        // colour blend, all channels written, no vertex buffers (vertices are
        // fetched from the b4 storage buffer), no depth/stencil.
        let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
            label: Some("xenos_interp pipeline"),
            layout: Some(&layout),
            vertex: wgpu::VertexState {
                module: &shader,
                entry_point: "vs_main",
                compilation_options: Default::default(),
                buffers: &[],
            },
            fragment: Some(wgpu::FragmentState {
                module: &shader,
                entry_point: "fs_main",
                compilation_options: Default::default(),
                targets: &[Some(wgpu::ColorTargetState {
                    format: target_format,
                    blend: Some(wgpu::BlendState {
                        color: wgpu::BlendComponent {
                            src_factor: wgpu::BlendFactor::SrcAlpha,
                            dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha,
                            operation: wgpu::BlendOperation::Add,
                        },
                        alpha: wgpu::BlendComponent::OVER,
                    }),
                    write_mask: wgpu::ColorWrites::ALL,
                })],
            }),
            primitive: wgpu::PrimitiveState {
                topology: wgpu::PrimitiveTopology::TriangleList,
                strip_index_format: None,
                front_face: wgpu::FrontFace::Ccw,
                cull_mode: None,
                polygon_mode: wgpu::PolygonMode::Fill,
                unclipped_depth: false,
                conservative: false,
            },
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
            cache: None,
        });

        Self {
            pipeline,
            interp_shader: shader,
            interp_cache: std::collections::HashMap::new(),
            draw_ctx_buffer,
            constants_buffer,
            vs_ucode_buffer,
            ps_ucode_buffer,
            vertex_buffer,
            bind_group,
            tex_bind_group,
            tex_bgl,
            sampler: dummy_sampler,
            dummy_view,
            pipeline_layout: layout,
            translated_cache: std::collections::HashMap::new(),
            translated_modules: std::collections::HashMap::new(),
            target_format,
        }
    }

    /// P7 — reports whether a translated WGSL *module* is already cached for
    /// the `(vs_blob_key, ps_blob_key)` pair. A `true` result says nothing
    /// about per-render-state pipelines, which may still need to be built.
    pub fn has_translated(&self, vs_blob_key: u32, ps_blob_key: u32) -> bool {
        let pair = (vs_blob_key, ps_blob_key);
        self.translated_modules.get(&pair).is_some()
    }

    /// P7 — compile a translator-produced WGSL module and cache it keyed on
    /// `(vs_blob_key, ps_blob_key)`. The actual `RenderPipeline` (which also
    /// depends on the per-draw blend/mask state) is built lazily by
    /// [`Self::render_one_translated`].
    ///
    /// * `device` — device the shader module is created on.
    /// * `vs_blob_key` / `ps_blob_key` — identify the guest shader pair.
    /// * `wgsl` — translator output; only borrowed for the duration of the
    ///   call.
    ///
    /// Returns `true` if a module for the pair is cached when the call
    /// returns (either pre-existing or freshly compiled), `false` if module
    /// creation panicked. Increments `gpu.shader.compile_ok` /
    /// `gpu.shader.compile_err{stage="module"}` accordingly.
    pub fn insert_translated(
        &mut self,
        device: &wgpu::Device,
        vs_blob_key: u32,
        ps_blob_key: u32,
        wgsl: &str,
    ) -> bool {
        let key = (vs_blob_key, ps_blob_key);
        if self.translated_modules.contains_key(&key) {
            return true;
        }
        // Module creation is wrapped in `catch_unwind` so that a panic raised
        // while compiling translator output turns into a `false` return
        // instead of unwinding through the caller.
        let compiled = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            device.create_shader_module(wgpu::ShaderModuleDescriptor {
                label: Some("xenos translated module"),
                // Borrow the source: the descriptor only needs it for this
                // call, so there is no reason to copy the whole shader text.
                source: wgpu::ShaderSource::Wgsl(std::borrow::Cow::Borrowed(wgsl)),
            })
        }));
        let Ok(shader) = compiled else {
            metrics::counter!("gpu.shader.compile_err", "stage" => "module")
                .increment(1);
            return false;
        };
        self.translated_modules.insert(key, shader);
        metrics::counter!("gpu.shader.compile_ok").increment(1);
        true
    }

    /// iterate-3Y: ensure a translator pipeline exists for `(vs,ps,rstate)`,
    /// building it from the cached module + the per-draw color/blend target.
    fn ensure_translated_for_state(
        &mut self,
        device: &wgpu::Device,
        vs_key: u32,
        ps_key: u32,
        rstate: RenderState,
    ) -> bool {
        let pkey = (vs_key, ps_key, rstate);
        if self.translated_cache.contains_key(&pkey) {
            return true;
        }
        let Some(module) = self.translated_modules.get(&(vs_key, ps_key)) else {
            return false;
        };
        let target = rstate.color_target(self.target_format);
        let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
            label: Some("xenos translated pipeline"),
            layout: Some(&self.pipeline_layout),
            vertex: wgpu::VertexState {
                module,
                entry_point: "vs_main",
                compilation_options: Default::default(),
                buffers: &[],
            },
            fragment: Some(wgpu::FragmentState {
                module,
                entry_point: "fs_main",
                compilation_options: Default::default(),
                targets: &[Some(target)],
            }),
            primitive: wgpu::PrimitiveState {
                topology: wgpu::PrimitiveTopology::TriangleList,
                strip_index_format: None,
                front_face: wgpu::FrontFace::Ccw,
                cull_mode: None,
                polygon_mode: wgpu::PolygonMode::Fill,
                unclipped_depth: false,
                conservative: false,
            },
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
            cache: None,
        });
        self.translated_cache.insert(pkey, pipeline);
        true
    }

    /// iterate-3Y: make sure `interp_cache` holds an interpreter pipeline for
    /// `rstate`, compiling one from the retained interpreter module and the
    /// shared pipeline layout when it is missing. A cache hit does nothing.
    fn ensure_interp_for_state(&mut self, device: &wgpu::Device, rstate: RenderState) {
        if !self.interp_cache.contains_key(&rstate) {
            let interp = &self.interp_shader;
            let color_targets = [Some(rstate.color_target(self.target_format))];

            let vertex = wgpu::VertexState {
                module: interp,
                entry_point: "vs_main",
                compilation_options: Default::default(),
                buffers: &[],
            };
            let fragment = wgpu::FragmentState {
                module: interp,
                entry_point: "fs_main",
                compilation_options: Default::default(),
                targets: &color_targets,
            };
            // Same fixed-function state as every other pipeline here:
            // unculled, filled CCW triangle lists.
            let primitive = wgpu::PrimitiveState {
                topology: wgpu::PrimitiveTopology::TriangleList,
                strip_index_format: None,
                front_face: wgpu::FrontFace::Ccw,
                cull_mode: None,
                polygon_mode: wgpu::PolygonMode::Fill,
                unclipped_depth: false,
                conservative: false,
            };

            let descriptor = wgpu::RenderPipelineDescriptor {
                label: Some("xenos interp pipeline (per-state)"),
                layout: Some(&self.pipeline_layout),
                vertex,
                fragment: Some(fragment),
                primitive,
                depth_stencil: None,
                multisample: wgpu::MultisampleState::default(),
                multiview: None,
                cache: None,
            };
            let built = device.create_render_pipeline(&descriptor);
            self.interp_cache.insert(rstate, built);
        }
    }

    /// iterate-3Y: render one draw through the translator pipeline built for
    /// this draw's render state. Returns `false` if no module is cached for
    /// `(vs,ps)` (caller should fall back to the interpreter).
    pub fn render_one_translated(
        &mut self,
        device: &wgpu::Device,
        queue: &wgpu::Queue,
        encoder: &mut wgpu::CommandEncoder,
        target_view: &wgpu::TextureView,
        req: DrawRequest,
        vs_key: u32,
        ps_key: u32,
        rstate: RenderState,
    ) -> bool {
        if !self.ensure_translated_for_state(device, vs_key, ps_key, rstate) {
            return false;
        }
        let cb = DrawConstants {
            draw_index: req.draw_index,
            vertex_count: req.vertex_count.max(3),
            prim_kind: req.prim_kind,
            vertex_base_dwords: req.vertex_base_dwords,
            ndc_scale: req.ndc_scale,
            ndc_offset: req.ndc_offset,
        };
        queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb));
        let pipeline = self
            .translated_cache
            .get(&(vs_key, ps_key, rstate))
            .expect("just ensured");
        let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
            label: Some("xenos translated draw"),
            color_attachments: &[Some(wgpu::RenderPassColorAttachment {
                view: target_view,
                resolve_target: None,
                ops: wgpu::Operations {
                    load: wgpu::LoadOp::Load,
                    store: wgpu::StoreOp::Store,
                },
            })],
            depth_stencil_attachment: None,
            timestamp_writes: None,
            occlusion_query_set: None,
        });
        pass.set_pipeline(pipeline);
        pass.set_bind_group(0, &self.bind_group, &[]);
        pass.set_bind_group(1, &self.tex_bind_group, &[]);
        let rounded = req.vertex_count.div_ceil(3) * 3;
        pass.draw(0..rounded.max(3), 0..1);
        true
    }

    /// How many translator pipelines (one per shader pair × render state) are
    /// currently cached. Surfaced to the HUD.
    pub fn translated_pipeline_count(&self) -> usize {
        let cached = &self.translated_cache;
        cached.len()
    }

    /// P5 — rebuild the group-1 bind group so `@group(1) @binding(0)` points
    /// at `view`, keeping the retained sampler at binding 1.
    ///
    /// `view` is typically a wgpu texture view obtained from the
    /// [`TextureCacheHost`]. Passing `None` rebinds the built-in fallback
    /// view (`dummy_view`).
    pub fn set_texture_view(&mut self, device: &wgpu::Device, view: Option<&wgpu::TextureView>) {
        let texture = match view {
            Some(supplied) => supplied,
            None => &self.dummy_view,
        };
        let entries = [
            wgpu::BindGroupEntry {
                binding: 0,
                resource: wgpu::BindingResource::TextureView(texture),
            },
            wgpu::BindGroupEntry {
                binding: 1,
                resource: wgpu::BindingResource::Sampler(&self.sampler),
            },
        ];
        let rebuilt = device.create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("xenos tex bind group (rebind)"),
            layout: &self.tex_bgl,
            entries: &entries,
        });
        self.tex_bind_group = rebuilt;
    }

    /// Record a render pass that clears `target_view` to `color` (RGBA order)
    /// and stores the result. No draws are issued inside the pass.
    pub fn clear(
        &self,
        encoder: &mut wgpu::CommandEncoder,
        target_view: &wgpu::TextureView,
        color: [f64; 4],
    ) {
        let [r, g, b, a] = color;
        let attachment = wgpu::RenderPassColorAttachment {
            view: target_view,
            resolve_target: None,
            ops: wgpu::Operations {
                load: wgpu::LoadOp::Clear(wgpu::Color { r, g, b, a }),
                store: wgpu::StoreOp::Store,
            },
        };
        // The pass exists only for its load op; it is dropped straight away.
        drop(encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
            label: Some("xenos frontbuffer clear"),
            color_attachments: &[Some(attachment)],
            depth_stencil_attachment: None,
            timestamp_writes: None,
            occlusion_query_set: None,
        }));
    }

    /// Push the constants block and both shader microcode blobs to the GPU,
    /// once per batch of draws that share them.
    ///
    /// * `constants` is always written to `constants_buffer` at offset 0.
    /// * `vs_ucode` / `ps_ucode` go to their respective ucode buffers at
    ///   offset 0. An empty blob is skipped, so the buffer is not written.
    /// * A blob longer than `UCODE_BUFFER_MAX_DWORDS` dwords is truncated to
    ///   that limit.
    pub fn upload_shader_and_constants(
        &self,
        queue: &wgpu::Queue,
        vs_ucode: &[u32],
        ps_ucode: &[u32],
        constants: &XenosConstantsBlock,
    ) {
        queue.write_buffer(&self.constants_buffer, 0, bytemuck::bytes_of(constants));

        // Byte capacity shared by both ucode buffers.
        let byte_cap = (UCODE_BUFFER_MAX_DWORDS * 4) as usize;
        let uploads = [
            (&self.vs_ucode_buffer, vs_ucode),
            (&self.ps_ucode_buffer, ps_ucode),
        ];
        for (buffer, ucode) in uploads {
            if ucode.is_empty() {
                continue;
            }
            let raw: &[u8] = bytemuck::cast_slice(ucode);
            let upload_len = raw.len().min(byte_cap);
            queue.write_buffer(buffer, 0, &raw[..upload_len]);
        }
    }

    /// Copy vertex data into `vertex_buffer` at offset 0.
    ///
    /// The dwords are uploaded raw (big-endian, as captured); the WGSL side
    /// will bswap as needed during format unpacking. Empty input is a no-op,
    /// and anything past `VERTEX_BUFFER_MAX_BYTES` bytes is truncated.
    pub fn upload_vertex_data(&self, queue: &wgpu::Queue, data: &[u32]) {
        if !data.is_empty() {
            let raw: &[u8] = bytemuck::cast_slice(data);
            let upload_len = raw.len().min(VERTEX_BUFFER_MAX_BYTES as usize);
            queue.write_buffer(&self.vertex_buffer, 0, &raw[..upload_len]);
        }
    }

    /// Record a single captured draw through the interpreter pipeline.
    ///
    /// The pipeline is selected by the per-draw `rstate` (blend/write-mask),
    /// so flat draws composite correctly even when their (vs,ps) didn't
    /// translate. `RenderState::OPAQUE` reproduces the legacy fixed
    /// behaviour for procedural/synthetic draws.
    ///
    /// Steps: ensure a pipeline exists for `rstate`, write the per-draw
    /// constants to `draw_ctx_buffer`, then open a render pass on
    /// `target_view` that loads the existing contents and stores the result.
    /// The issued vertex count is `req.vertex_count` rounded up to a
    /// multiple of 3, with a minimum of 3.
    pub fn render_one(
        &mut self,
        device: &wgpu::Device,
        queue: &wgpu::Queue,
        encoder: &mut wgpu::CommandEncoder,
        target_view: &wgpu::TextureView,
        req: DrawRequest,
        rstate: RenderState,
    ) {
        // Must run before the cache lookup below, which relies on it.
        self.ensure_interp_for_state(device, rstate);

        let constants = DrawConstants {
            draw_index: req.draw_index,
            vertex_count: req.vertex_count.max(3),
            prim_kind: req.prim_kind,
            vertex_base_dwords: req.vertex_base_dwords,
            ndc_scale: req.ndc_scale,
            ndc_offset: req.ndc_offset,
        };
        queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&constants));

        // Whole triangles only: pad up to a multiple of 3, at least 3.
        let padded_vertices = (req.vertex_count.div_ceil(3) * 3).max(3);

        let pipeline = self.interp_cache.get(&rstate).expect("just ensured");

        // Load rather than clear, so this draw lands on top of what is
        // already in the target.
        let color_target = wgpu::RenderPassColorAttachment {
            view: target_view,
            resolve_target: None,
            ops: wgpu::Operations {
                load: wgpu::LoadOp::Load,
                store: wgpu::StoreOp::Store,
            },
        };
        let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
            label: Some("xenos draw"),
            color_attachments: &[Some(color_target)],
            depth_stencil_attachment: None,
            timestamp_writes: None,
            occlusion_query_set: None,
        });
        pass.set_pipeline(pipeline);
        pass.set_bind_group(0, &self.bind_group, &[]);
        pass.set_bind_group(1, &self.tex_bind_group, &[]);
        pass.draw(0..padded_vertices, 0..1);
    }
}

#[cfg(test)]
mod tests {
    use super::DrawConstants;
    use std::mem::size_of;

    /// `DrawConstants` is four `u32` fields followed by two `[f32; 2]`
    /// fields under `#[repr(C)]`, i.e. 16 + 16 = 32 bytes. This pins the
    /// host-side size so a layout change does not go unnoticed.
    #[test]
    fn draw_constants_layout_matches_wgsl_uniform() {
        let host_size = size_of::<DrawConstants>();
        assert_eq!(host_size, 32);
    }
}