Add xenia-ui crate; switch analysis store to DuckDB

Workspace gains a new xenia-ui member that owns the winit/wgpu window, the Xenos display pipeline (xenos_pipeline + render + texture_cache_host), HUD font/blit shaders, and the input-bridge plumbing the app uses to surface guest framebuffers and overlays. Workspace dependencies grow accordingly: rusqlite is replaced with duckdb (analysis pipeline now writes DuckDB stores), and tracing / metrics / pprof / winit / wgpu / gilrs / pollster / crossbeam / bytemuck are added at workspace level so xenia-ui and xenia-app share versions. Cargo.lock regenerated. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 16:26:48 +02:00
parent f166d061be
commit e2b8860e10
13 changed files with 7534 additions and 42 deletions
--- a/crates/xenia-ui/src/xenos_pipeline.rs
+++ b/crates/xenia-ui/src/xenos_pipeline.rs
@@ -0,0 +1,643 @@
+//! Host pipeline that consumes PM4 `DRAW_INDX*` captures.
+//!
+//! Drives [`xenia_gpu::shaders::XENOS_INTERP_WGSL`]. This file owns the
+//! wgpu bind-group + pipeline + buffer surface the Xenos WGSL interpreter
+//! binds to. The WGSL module is expected to declare:
+//!
+//! ```text
+//! @group(0) @binding(0) var<uniform>  xenos_draw   : XenosDrawConstants;   // 16 B
+//! @group(0) @binding(1) var<storage, read> xenos_consts : XenosConstants; // ~9.2 KB
+//! @group(0) @binding(2) var<storage, read> vs_ucode      : array<u32>;
+//! @group(0) @binding(3) var<storage, read> ps_ucode      : array<u32>;
+//! @group(0) @binding(4) var<storage, read> vertex_buffer : array<u32>;
+//! ```
+//!
+//! Texture bindings (M6) are a single-slot stub for P3b:
+//!
+//! ```text
+//! @group(1) @binding(0) var xenos_tex  : texture_2d<f32>;
+//! @group(1) @binding(1) var xenos_samp : sampler;
+//! ```
+//!
+//! The bound texture defaults to a 1×1 magenta placeholder. Real per-slot
+//! guest texture uploads + format decode belong to the texture cache (P5);
+//! its views are bound through [`XenosPipeline::set_texture_view`].
+
+use bytemuck::{Pod, Zeroable};
+use wgpu::util::DeviceExt;
+
+use xenia_gpu::shaders::XENOS_INTERP_WGSL;
+use xenia_gpu::xenos_constants::XenosConstantsBlock;
+
+/// Per-draw constants mirroring the WGSL `XenosDrawConstants` uniform block
+/// (group 0, binding 0). Ordering / padding matches `xenos_interp.wgsl` exactly.
+#[repr(C)]
+#[derive(Clone, Copy, Pod, Zeroable)]
+struct DrawConstants {
+    draw_index: u32, // copied from `DrawRequest::draw_index`
+    vertex_count: u32, // `DrawRequest::vertex_count`, raised to at least 3 by the render methods
+    prim_kind: u32, // copied from `DrawRequest::prim_kind`
+    _pad: u32, // always written as 0; brings the block to 16 bytes (asserted in `tests`)
+}
+
+/// One captured draw, submitted to [`XenosPipeline::render_one`] (or `render_one_with_pipeline`).
+#[derive(Clone, Copy, Debug)]
+pub struct DrawRequest {
+    /// Monotonic draw counter; shader uses it for per-draw colour rotation.
+    pub draw_index: u32,
+    /// Host-normalised vertex count (after primitive-processor rewrite).
+    pub vertex_count: u32, // the issued draw rounds this up to a multiple of 3 (minimum 3)
+    /// Xenos primitive-type code; shader may branch on it in P3b+.
+    pub prim_kind: u32, // forwarded to the shader untouched; pipelines built in this file use TriangleList
+}
+
+/// Capacity of each ucode storage buffer, in dwords. Most Xbox 360
+/// shaders are ≪ 4 KB; 64 KB is orders-of-magnitude slack.
+const UCODE_BUFFER_MAX_DWORDS: u64 = 16 * 1024; // 64 KB each for VS & PS; longer blobs are truncated on upload
+/// 16 MB of vertex data — enough for any realistic Xenos draw.
+const VERTEX_BUFFER_MAX_BYTES: u64 = 16 * 1024 * 1024; // longer uploads are truncated to this size
+
+pub struct XenosPipeline { // interpreter pipeline plus every buffer / bind group it binds
+    pipeline: wgpu::RenderPipeline, // compiled from `XENOS_INTERP_WGSL`; used by `render_one`
+    draw_ctx_buffer: wgpu::Buffer, // group 0, binding 0: `DrawConstants` uniform
+    constants_buffer: wgpu::Buffer, // group 0, binding 1: `XenosConstantsBlock`, read-only storage
+    vs_ucode_buffer: wgpu::Buffer, // group 0, binding 2: vertex-shader microcode dwords
+    ps_ucode_buffer: wgpu::Buffer, // group 0, binding 3: pixel-shader microcode dwords
+    vertex_buffer: wgpu::Buffer, // group 0, binding 4: raw vertex dwords
+    bind_group: wgpu::BindGroup, // group 0; built once in `new` over the five buffers above
+    /// P5: group 1; replaced by `set_texture_view` when a new cached texture becomes active.
+    tex_bind_group: wgpu::BindGroup,
+    /// Layout + sampler retained so `set_texture_view` can rebuild
+    /// `tex_bind_group` on the fly without re-reading the pipeline.
+    tex_bgl: wgpu::BindGroupLayout,
+    sampler: wgpu::Sampler, // linear, clamp-to-edge; shared by the placeholder and guest textures
+    /// Fallback 1×1 magenta texture — used when no guest texture has been
+    /// uploaded yet or when a draw references an unsupported format.
+    dummy_view: wgpu::TextureView,
+    /// P7 — retained pipeline layout + compiled-pipeline cache for
+    /// Xenos→WGSL translator output. Keyed on `(vs_blob_key, ps_blob_key)`
+    /// so every (vs, ps) pair gets compiled once and re-used for every
+    /// subsequent draw. Interpreter pipeline remains the fallback.
+    pipeline_layout: wgpu::PipelineLayout,
+    translated_cache: std::collections::HashMap<(u32, u32), wgpu::RenderPipeline>,
+    pub target_format: wgpu::TextureFormat, // colour-target format every pipeline here is compiled against
+}
+
+impl XenosPipeline {
+    /// Builds the interpreter pipeline plus every buffer, texture, sampler and
+    /// bind group it needs.
+    ///
+    /// * `device` — creates all GPU objects.
+    /// * `queue` — used once, to upload the placeholder texel.
+    /// * `target_format` — colour-target format; kept in `target_format` so
+    ///   later pipelines can be compiled against the same format.
+    ///
+    /// The returned value starts with the magenta placeholder bound at
+    /// group 1 and an empty translated-pipeline cache.
+    pub fn new(
+        device: &wgpu::Device,
+        queue: &wgpu::Queue,
+        target_format: wgpu::TextureFormat,
+    ) -> Self {
+        let vs_and_fs = wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT;
+        let ro_storage = wgpu::BufferBindingType::Storage { read_only: true };
+        let storage_usage = wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST;
+        let one_texel = wgpu::Extent3d {
+            width: 1,
+            height: 1,
+            depth_or_array_layers: 1,
+        };
+
+        // Layout entry for a buffer binding without dynamic offset. A
+        // `min_bytes` of 0 yields `min_binding_size: None`.
+        let buffer_slot = |binding: u32,
+                           visibility: wgpu::ShaderStages,
+                           ty: wgpu::BufferBindingType,
+                           min_bytes: u64| {
+            wgpu::BindGroupLayoutEntry {
+                binding,
+                visibility,
+                ty: wgpu::BindingType::Buffer {
+                    ty,
+                    has_dynamic_offset: false,
+                    min_binding_size: std::num::NonZeroU64::new(min_bytes),
+                },
+                count: None,
+            }
+        };
+        // Storage buffer the CPU fills through `queue.write_buffer`, hence
+        // COPY_DST next to STORAGE.
+        let new_storage = |label: &'static str, size: u64| {
+            device.create_buffer(&wgpu::BufferDescriptor {
+                label: Some(label),
+                size,
+                usage: storage_usage,
+                mapped_at_creation: false,
+            })
+        };
+
+        let interp_module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: Some("xenos_interp.wgsl"),
+            source: wgpu::ShaderSource::Wgsl(XENOS_INTERP_WGSL.into()),
+        });
+
+        // Group 0:
+        //   b0 draw_ctx      — 16 B uniform
+        //   b1 XenosConstants — read-only storage (~9.2 KB); not uniform
+        //      because the block contains packed `array<u32>` fields and
+        //      WGSL's uniform address space would require 16-byte stride
+        //   b2 vs_ucode, b3 ps_ucode, b4 vertex_buffer — read-only storage
+        let group0_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+            label: Some("xenos bind group layout"),
+            entries: &[
+                buffer_slot(
+                    0,
+                    vs_and_fs,
+                    wgpu::BufferBindingType::Uniform,
+                    std::mem::size_of::<DrawConstants>() as u64,
+                ),
+                buffer_slot(1, vs_and_fs, ro_storage, XenosConstantsBlock::SIZE as u64),
+                buffer_slot(2, vs_and_fs, ro_storage, 0),
+                buffer_slot(3, vs_and_fs, ro_storage, 0),
+                buffer_slot(4, wgpu::ShaderStages::VERTEX, ro_storage, 0),
+            ],
+        });
+
+        // Group 1: one filterable 2D float texture plus a filtering sampler.
+        let tex_bgl = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+            label: Some("xenos tex bind group layout"),
+            entries: &[
+                wgpu::BindGroupLayoutEntry {
+                    binding: 0,
+                    visibility: vs_and_fs,
+                    ty: wgpu::BindingType::Texture {
+                        sample_type: wgpu::TextureSampleType::Float { filterable: true },
+                        view_dimension: wgpu::TextureViewDimension::D2,
+                        multisampled: false,
+                    },
+                    count: None,
+                },
+                wgpu::BindGroupLayoutEntry {
+                    binding: 1,
+                    visibility: vs_and_fs,
+                    ty: wgpu::BindingType::Sampler(wgpu::SamplerBindingType::Filtering),
+                    count: None,
+                },
+            ],
+        });
+
+        let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+            label: Some("xenos pipeline layout"),
+            bind_group_layouts: &[&group0_layout, &tex_bgl],
+            push_constant_ranges: &[],
+        });
+
+        // Seed the per-draw uniform with a single-triangle draw.
+        let seed = DrawConstants {
+            draw_index: 0,
+            vertex_count: 3,
+            prim_kind: 4,
+            _pad: 0,
+        };
+        let draw_ctx_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+            label: Some("xenos draw ctx"),
+            contents: bytemuck::bytes_of(&seed),
+            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+        });
+        let constants_buffer = new_storage("xenos constants", XenosConstantsBlock::SIZE as u64);
+        let vs_ucode_buffer = new_storage("xenos vs ucode", UCODE_BUFFER_MAX_DWORDS * 4);
+        let ps_ucode_buffer = new_storage("xenos ps ucode", UCODE_BUFFER_MAX_DWORDS * 4);
+        let vertex_buffer = new_storage("xenos vertex buffer", VERTEX_BUFFER_MAX_BYTES);
+
+        // 1×1 placeholder texture, bound until `set_texture_view` supplies a
+        // real view.
+        let placeholder = device.create_texture(&wgpu::TextureDescriptor {
+            label: Some("xenos dummy texture"),
+            size: one_texel,
+            mip_level_count: 1,
+            sample_count: 1,
+            dimension: wgpu::TextureDimension::D2,
+            format: wgpu::TextureFormat::Rgba8Unorm,
+            usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST,
+            view_formats: &[],
+        });
+        // Magenta (255, 0, 255, 255): a missing-texture read is obvious
+        // on-screen.
+        let magenta: [u8; 4] = [0xFF, 0x00, 0xFF, 0xFF];
+        queue.write_texture(
+            wgpu::ImageCopyTexture {
+                texture: &placeholder,
+                mip_level: 0,
+                origin: wgpu::Origin3d::ZERO,
+                aspect: wgpu::TextureAspect::All,
+            },
+            &magenta,
+            wgpu::ImageDataLayout {
+                offset: 0,
+                bytes_per_row: Some(4),
+                rows_per_image: Some(1),
+            },
+            one_texel,
+        );
+        let dummy_view = placeholder.create_view(&wgpu::TextureViewDescriptor::default());
+        let sampler = device.create_sampler(&wgpu::SamplerDescriptor {
+            label: Some("xenos dummy sampler"),
+            address_mode_u: wgpu::AddressMode::ClampToEdge,
+            address_mode_v: wgpu::AddressMode::ClampToEdge,
+            address_mode_w: wgpu::AddressMode::ClampToEdge,
+            mag_filter: wgpu::FilterMode::Linear,
+            min_filter: wgpu::FilterMode::Linear,
+            mipmap_filter: wgpu::FilterMode::Nearest,
+            ..Default::default()
+        });
+
+        let tex_entries = [
+            wgpu::BindGroupEntry {
+                binding: 0,
+                resource: wgpu::BindingResource::TextureView(&dummy_view),
+            },
+            wgpu::BindGroupEntry {
+                binding: 1,
+                resource: wgpu::BindingResource::Sampler(&sampler),
+            },
+        ];
+        let tex_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("xenos tex bind group"),
+            layout: &tex_bgl,
+            entries: &tex_entries,
+        });
+
+        // Group 0 entries, in binding order 0..=4.
+        let group0_entries = [
+            wgpu::BindGroupEntry {
+                binding: 0,
+                resource: draw_ctx_buffer.as_entire_binding(),
+            },
+            wgpu::BindGroupEntry {
+                binding: 1,
+                resource: constants_buffer.as_entire_binding(),
+            },
+            wgpu::BindGroupEntry {
+                binding: 2,
+                resource: vs_ucode_buffer.as_entire_binding(),
+            },
+            wgpu::BindGroupEntry {
+                binding: 3,
+                resource: ps_ucode_buffer.as_entire_binding(),
+            },
+            wgpu::BindGroupEntry {
+                binding: 4,
+                resource: vertex_buffer.as_entire_binding(),
+            },
+        ];
+        let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("xenos bind group"),
+            layout: &group0_layout,
+            entries: &group0_entries,
+        });
+
+        // Straight alpha blending into the single colour target; no vertex
+        // buffers (vertices are fetched from storage), no depth/stencil.
+        let color_targets = [Some(wgpu::ColorTargetState {
+            format: target_format,
+            blend: Some(wgpu::BlendState::ALPHA_BLENDING),
+            write_mask: wgpu::ColorWrites::ALL,
+        })];
+        let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
+            label: Some("xenos_interp pipeline"),
+            layout: Some(&pipeline_layout),
+            vertex: wgpu::VertexState {
+                module: &interp_module,
+                entry_point: "vs_main",
+                compilation_options: Default::default(),
+                buffers: &[],
+            },
+            fragment: Some(wgpu::FragmentState {
+                module: &interp_module,
+                entry_point: "fs_main",
+                compilation_options: Default::default(),
+                targets: &color_targets,
+            }),
+            primitive: wgpu::PrimitiveState {
+                topology: wgpu::PrimitiveTopology::TriangleList,
+                ..Default::default()
+            },
+            depth_stencil: None,
+            multisample: wgpu::MultisampleState::default(),
+            multiview: None,
+            cache: None,
+        });
+
+        Self {
+            pipeline,
+            draw_ctx_buffer,
+            constants_buffer,
+            vs_ucode_buffer,
+            ps_ucode_buffer,
+            vertex_buffer,
+            bind_group,
+            tex_bind_group,
+            tex_bgl,
+            sampler,
+            dummy_view,
+            pipeline_layout,
+            translated_cache: std::collections::HashMap::new(),
+            target_format,
+        }
+    }
+
+    /// P7 — reports whether a translated pipeline is already cached for the
+    /// given `(vs_blob_key, ps_blob_key)` pair.
+    pub fn has_translated(&self, vs_blob_key: u32, ps_blob_key: u32) -> bool {
+        let key = (vs_blob_key, ps_blob_key);
+        self.translated_cache.contains_key(&key)
+    }
+
+    /// P7 — looks up the cached translated pipeline for the given
+    /// `(vs_blob_key, ps_blob_key)` pair; `None` when nothing has been
+    /// inserted for that pair.
+    pub fn translated_pipeline(
+        &self,
+        vs_blob_key: u32,
+        ps_blob_key: u32,
+    ) -> Option<&wgpu::RenderPipeline> {
+        let key = (vs_blob_key, ps_blob_key);
+        self.translated_cache.get(&key)
+    }
+
+    /// P7 — compile a translator-produced WGSL module into a
+    /// `wgpu::RenderPipeline` and insert it into the cache keyed on
+    /// `(vs_blob_key, ps_blob_key)`.
+    ///
+    /// Returns `true` when a pipeline for the key is cached on return
+    /// (freshly built, or already present — duplicate inserts are no-ops).
+    /// Returns `false` when either the shader module or the render pipeline
+    /// could not be created, so the caller can keep using the interpreter
+    /// pipeline. Emits `gpu.shader.compile_ok` on success and
+    /// `gpu.shader.compile_err` (tagged with the failing `stage`) on failure.
+    ///
+    /// Both creation calls are guarded with `catch_unwind`: a module that
+    /// parses but does not fit the pipeline (missing `vs_main` / `fs_main`,
+    /// bindings that disagree with the layout) must not take the process
+    /// down any more than a module that fails to parse.
+    ///
+    /// NOTE(review): this relies on wgpu's uncaptured-error handler
+    /// panicking on validation errors (its default). If the application
+    /// installs a non-panicking handler, failures will not be observed here
+    /// and error scopes would be needed instead — confirm.
+    pub fn insert_translated(
+        &mut self,
+        device: &wgpu::Device,
+        vs_blob_key: u32,
+        ps_blob_key: u32,
+        wgsl: &str,
+    ) -> bool {
+        let key = (vs_blob_key, ps_blob_key);
+        if self.translated_cache.contains_key(&key) {
+            return true;
+        }
+
+        // Stage 1: WGSL -> shader module. The source is only borrowed for
+        // the duration of the call, so no owned copy is needed.
+        let module_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+            device.create_shader_module(wgpu::ShaderModuleDescriptor {
+                label: Some("xenos translated module"),
+                source: wgpu::ShaderSource::Wgsl(wgsl.into()),
+            })
+        }));
+        let Ok(shader) = module_result else {
+            metrics::counter!("gpu.shader.compile_err", "stage" => "module").increment(1);
+            return false;
+        };
+
+        // Stage 2: shader module -> render pipeline, with the same fixed
+        // state as the interpreter pipeline (alpha blending, triangle list,
+        // no depth/stencil).
+        let pipeline_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+            device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
+                label: Some("xenos translated pipeline"),
+                layout: Some(&self.pipeline_layout),
+                vertex: wgpu::VertexState {
+                    module: &shader,
+                    entry_point: "vs_main",
+                    compilation_options: Default::default(),
+                    buffers: &[],
+                },
+                fragment: Some(wgpu::FragmentState {
+                    module: &shader,
+                    entry_point: "fs_main",
+                    compilation_options: Default::default(),
+                    targets: &[Some(wgpu::ColorTargetState {
+                        format: self.target_format,
+                        blend: Some(wgpu::BlendState {
+                            color: wgpu::BlendComponent {
+                                src_factor: wgpu::BlendFactor::SrcAlpha,
+                                dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha,
+                                operation: wgpu::BlendOperation::Add,
+                            },
+                            alpha: wgpu::BlendComponent::OVER,
+                        }),
+                        write_mask: wgpu::ColorWrites::ALL,
+                    })],
+                }),
+                primitive: wgpu::PrimitiveState {
+                    topology: wgpu::PrimitiveTopology::TriangleList,
+                    strip_index_format: None,
+                    front_face: wgpu::FrontFace::Ccw,
+                    cull_mode: None,
+                    polygon_mode: wgpu::PolygonMode::Fill,
+                    unclipped_depth: false,
+                    conservative: false,
+                },
+                depth_stencil: None,
+                multisample: wgpu::MultisampleState::default(),
+                multiview: None,
+                cache: None,
+            })
+        }));
+        let Ok(pipeline) = pipeline_result else {
+            metrics::counter!("gpu.shader.compile_err", "stage" => "pipeline").increment(1);
+            return false;
+        };
+
+        self.translated_cache.insert(key, pipeline);
+        metrics::counter!("gpu.shader.compile_ok").increment(1);
+        true
+    }
+
+    /// Records one draw using a caller-supplied pipeline (normally one from
+    /// the translated cache) instead of the interpreter pipeline. Apart from
+    /// the pipeline and the pass label it does the same work as
+    /// [`Self::render_one`].
+    ///
+    /// The draw constants are written to the uniform buffer, then a render
+    /// pass that loads and stores `target_view` binds groups 0 and 1 and
+    /// issues a single non-instanced draw.
+    ///
+    /// NOTE(review): `queue.write_buffer` is applied when the queue is next
+    /// submitted, not at this point in `encoder`. If several draws are
+    /// recorded before one submit they would presumably all see the last
+    /// constants written — confirm callers submit per draw.
+    pub fn render_one_with_pipeline(
+        &self,
+        queue: &wgpu::Queue,
+        encoder: &mut wgpu::CommandEncoder,
+        target_view: &wgpu::TextureView,
+        req: DrawRequest,
+        pipeline: &wgpu::RenderPipeline,
+    ) {
+        let DrawRequest {
+            draw_index,
+            vertex_count,
+            prim_kind,
+        } = req;
+
+        // The shader is never told about fewer than three vertices.
+        let constants = DrawConstants {
+            draw_index,
+            vertex_count: vertex_count.max(3),
+            prim_kind,
+            _pad: 0,
+        };
+        queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&constants));
+
+        // Preserve what is already in the target; draws accumulate.
+        let attachment = wgpu::RenderPassColorAttachment {
+            view: target_view,
+            resolve_target: None,
+            ops: wgpu::Operations {
+                load: wgpu::LoadOp::Load,
+                store: wgpu::StoreOp::Store,
+            },
+        };
+        let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
+            label: Some("xenos translated draw"),
+            color_attachments: &[Some(attachment)],
+            depth_stencil_attachment: None,
+            timestamp_writes: None,
+            occlusion_query_set: None,
+        });
+        pass.set_pipeline(pipeline);
+        pass.set_bind_group(0, &self.bind_group, &[]);
+        pass.set_bind_group(1, &self.tex_bind_group, &[]);
+
+        // Whole triangles only: round up to a multiple of 3, at least 3.
+        let issued = vertex_count.next_multiple_of(3).max(3);
+        pass.draw(0..issued, 0..1);
+    }
+
+    /// How many `(vs, ps)` pairs currently have a cached translated
+    /// pipeline. Surfaced to the HUD.
+    pub fn translated_pipeline_count(&self) -> usize {
+        let cache = &self.translated_cache;
+        cache.len()
+    }
+
+    /// P5 — replaces the texture bound at `@group(1) @binding(0)`.
+    ///
+    /// `view` is typically a texture view handed out by the host texture
+    /// cache. `None` rebinds the built-in 1×1 magenta placeholder. The
+    /// retained sampler is reused either way; only the bind group is
+    /// rebuilt.
+    pub fn set_texture_view(&mut self, device: &wgpu::Device, view: Option<&wgpu::TextureView>) {
+        let active = match view {
+            Some(guest) => guest,
+            None => &self.dummy_view,
+        };
+        let entries = [
+            wgpu::BindGroupEntry {
+                binding: 0,
+                resource: wgpu::BindingResource::TextureView(active),
+            },
+            wgpu::BindGroupEntry {
+                binding: 1,
+                resource: wgpu::BindingResource::Sampler(&self.sampler),
+            },
+        ];
+        let rebuilt = device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("xenos tex bind group (rebind)"),
+            layout: &self.tex_bgl,
+            entries: &entries,
+        });
+        self.tex_bind_group = rebuilt;
+    }
+
+    /// Clears `target_view` to `color` (`[r, g, b, a]`) and stores the
+    /// result.
+    ///
+    /// The pass records no draw commands: the clear is performed by the
+    /// attachment's load op.
+    pub fn clear(
+        &self,
+        encoder: &mut wgpu::CommandEncoder,
+        target_view: &wgpu::TextureView,
+        color: [f64; 4],
+    ) {
+        let [r, g, b, a] = color;
+        let attachment = wgpu::RenderPassColorAttachment {
+            view: target_view,
+            resolve_target: None,
+            ops: wgpu::Operations {
+                load: wgpu::LoadOp::Clear(wgpu::Color { r, g, b, a }),
+                store: wgpu::StoreOp::Store,
+            },
+        };
+        let pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
+            label: Some("xenos frontbuffer clear"),
+            color_attachments: &[Some(attachment)],
+            depth_stencil_attachment: None,
+            timestamp_writes: None,
+            occlusion_query_set: None,
+        });
+        // Dropping the pass ends it.
+        drop(pass);
+    }
+
+    /// Uploads the constants block and both microcode blobs, once per batch
+    /// of draws that share them.
+    ///
+    /// The constants are always written. An empty blob leaves its buffer
+    /// untouched; a blob longer than `UCODE_BUFFER_MAX_DWORDS` is truncated
+    /// to that many dwords. Every write starts at offset 0.
+    pub fn upload_shader_and_constants(
+        &self,
+        queue: &wgpu::Queue,
+        vs_ucode: &[u32],
+        ps_ucode: &[u32],
+        constants: &XenosConstantsBlock,
+    ) {
+        queue.write_buffer(&self.constants_buffer, 0, bytemuck::bytes_of(constants));
+
+        let dword_cap = UCODE_BUFFER_MAX_DWORDS as usize;
+        let uploads = [
+            (vs_ucode, &self.vs_ucode_buffer),
+            (ps_ucode, &self.ps_ucode_buffer),
+        ];
+        for &(ucode, buffer) in &uploads {
+            if ucode.is_empty() {
+                continue;
+            }
+            // Truncating in dwords gives the same bytes as truncating the
+            // byte view to four times the cap.
+            let kept = &ucode[..ucode.len().min(dword_cap)];
+            queue.write_buffer(buffer, 0, bytemuck::cast_slice(kept));
+        }
+    }
+
+    /// Uploads vertex data as raw big-endian dwords (the WGSL side will
+    /// bswap as needed during format unpacking).
+    ///
+    /// Empty input is ignored. Input larger than `VERTEX_BUFFER_MAX_BYTES`
+    /// is truncated to that size. The write starts at offset 0.
+    pub fn upload_vertex_data(&self, queue: &wgpu::Queue, data: &[u32]) {
+        if data.is_empty() {
+            return;
+        }
+        let all: &[u8] = bytemuck::cast_slice(data);
+        let cap = VERTEX_BUFFER_MAX_BYTES as usize;
+        // `get(..cap)` is `None` exactly when everything already fits.
+        let kept = all.get(..cap).unwrap_or(all);
+        queue.write_buffer(&self.vertex_buffer, 0, kept);
+    }
+
+    /// Records one captured draw with the interpreter pipeline.
+    ///
+    /// The draw constants are written to the uniform buffer, then a render
+    /// pass that loads and stores `target_view` binds groups 0 and 1 and
+    /// issues a single non-instanced draw.
+    ///
+    /// NOTE(review): `queue.write_buffer` is applied when the queue is next
+    /// submitted, not at this point in `encoder`. If several draws are
+    /// recorded before one submit they would presumably all see the last
+    /// constants written — confirm callers submit per draw.
+    pub fn render_one(
+        &self,
+        queue: &wgpu::Queue,
+        encoder: &mut wgpu::CommandEncoder,
+        target_view: &wgpu::TextureView,
+        req: DrawRequest,
+    ) {
+        let DrawRequest {
+            draw_index,
+            vertex_count,
+            prim_kind,
+        } = req;
+
+        // The shader is never told about fewer than three vertices.
+        let constants = DrawConstants {
+            draw_index,
+            vertex_count: vertex_count.max(3),
+            prim_kind,
+            _pad: 0,
+        };
+        queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&constants));
+
+        // Preserve what is already in the target; draws accumulate.
+        let attachment = wgpu::RenderPassColorAttachment {
+            view: target_view,
+            resolve_target: None,
+            ops: wgpu::Operations {
+                load: wgpu::LoadOp::Load,
+                store: wgpu::StoreOp::Store,
+            },
+        };
+        let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
+            label: Some("xenos draw"),
+            color_attachments: &[Some(attachment)],
+            depth_stencil_attachment: None,
+            timestamp_writes: None,
+            occlusion_query_set: None,
+        });
+        pass.set_pipeline(&self.pipeline);
+        pass.set_bind_group(0, &self.bind_group, &[]);
+        pass.set_bind_group(1, &self.tex_bind_group, &[]);
+
+        // Whole triangles only: round up to a multiple of 3, at least 3.
+        let issued = vertex_count.next_multiple_of(3).max(3);
+        pass.draw(0..issued, 0..1);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::DrawConstants;
+    use std::mem::size_of;
+
+    /// The uniform binding's minimum size is taken from `DrawConstants`, so
+    /// the struct must stay at four `u32`s (16 bytes).
+    #[test]
+    fn draw_constants_layout_matches_wgsl_uniform() {
+        let expected_bytes = 16;
+        assert_eq!(size_of::<DrawConstants>(), expected_bytes);
+    }
+}