Add xenia-ui crate; switch analysis store to DuckDB

Workspace gains a new xenia-ui member that owns the winit/wgpu
window, the Xenos display pipeline (xenos_pipeline + render +
texture_cache_host), HUD font/blit shaders, and the input-bridge
plumbing the app uses to surface guest framebuffers and overlays.

Workspace dependencies grow accordingly: rusqlite is replaced with
duckdb (analysis pipeline now writes DuckDB stores), and tracing /
metrics / pprof / winit / wgpu / gilrs / pollster / crossbeam /
bytemuck are added at workspace level so xenia-ui and xenia-app
share versions. Cargo.lock regenerated.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-01 16:26:48 +02:00
parent f166d061be
commit e2b8860e10
13 changed files with 7534 additions and 42 deletions

View File

@@ -0,0 +1,643 @@
//! Host pipeline that consumes PM4 `DRAW_INDX*` captures.
//!
//! Drives [`xenia_gpu::shaders::XENOS_INTERP_WGSL`]. This file owns the
//! wgpu bind-group + pipeline + buffer surface the Xenos WGSL interpreter
//! binds to. The WGSL module is expected to declare:
//!
//! ```text
//! @group(0) @binding(0) var<uniform> xenos_draw : XenosDrawConstants; // 16 B
//! @group(0) @binding(1) var<storage, read> xenos_consts : XenosConstants; // ~9.2 KB
//! @group(0) @binding(2) var<storage, read> vs_ucode : array<u32>;
//! @group(0) @binding(3) var<storage, read> ps_ucode : array<u32>;
//! @group(0) @binding(4) var<storage, read> vertex_buffer : array<u32>;
//! ```
//!
//! Texture bindings (M6) are a single-slot stub for P3b:
//!
//! ```text
//! @group(1) @binding(0) var xenos_tex : texture_2d<f32>;
//! @group(1) @binding(1) var xenos_samp : sampler;
//! ```
//!
//! By default the bound texture is a 1×1 magenta placeholder;
//! [`XenosPipeline::set_texture_view`] swaps in a view from the texture
//! cache (P5), which is where real per-slot guest texture uploads +
//! format decode land.
use bytemuck::{Pod, Zeroable};
use wgpu::util::DeviceExt;
use xenia_gpu::shaders::XENOS_INTERP_WGSL;
use xenia_gpu::xenos_constants::XenosConstantsBlock;
/// Per-draw constants mirroring the WGSL `XenosDrawConstants` uniform
/// block. Ordering / padding matches `xenos_interp.wgsl` exactly.
///
/// The struct is uploaded verbatim with `bytemuck::bytes_of` into the
/// uniform buffer bound at `@group(0) @binding(0)`, so `#[repr(C)]` and the
/// field order are part of the GPU-visible layout (16 bytes in total).
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable)]
struct DrawConstants {
/// Copied from [`DrawRequest::draw_index`].
draw_index: u32,
/// [`DrawRequest::vertex_count`], clamped to at least 3 by the render
/// methods before upload.
vertex_count: u32,
/// Copied from [`DrawRequest::prim_kind`].
prim_kind: u32,
/// Always written as 0; rounds the struct up to 16 bytes.
_pad: u32,
}
/// Submitted to [`XenosPipeline::render_one`] to render one captured draw.
///
/// Plain `Copy` data: the render methods copy the fields into the per-draw
/// uniform block. Requests compare field-by-field, so two captures of the
/// same draw are equal.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct DrawRequest {
    /// Monotonic draw counter; shader uses it for per-draw colour rotation.
    pub draw_index: u32,
    /// Host-normalised vertex count (after primitive-processor rewrite).
    pub vertex_count: u32,
    /// Xenos primitive-type code; shader may branch on it in P3b+.
    pub prim_kind: u32,
}
/// Capacity, in dwords, of each shader-microcode storage buffer (one for
/// the vertex shader, one for the pixel shader). Most Xbox 360 shaders are
/// ≪ 4 KB, so a 64 KB buffer leaves generous slack.
const UCODE_BUFFER_MAX_DWORDS: u64 = 64 * 1024 / 4; // 64 KB / 4 bytes per dword
/// Size in bytes of the shared vertex storage buffer: 16 MB, enough for any
/// realistic Xenos draw.
const VERTEX_BUFFER_MAX_BYTES: u64 = 16 << 20;
/// Host-side wgpu state for replaying captured Xenos draws: the interpreter
/// render pipeline, the buffers and bind groups it reads, and a cache of
/// translator-compiled pipelines.
pub struct XenosPipeline {
/// Interpreter pipeline compiled from `XENOS_INTERP_WGSL`; bound by
/// `render_one`.
pipeline: wgpu::RenderPipeline,
/// Uniform buffer at `@group(0) @binding(0)`; rewritten with a fresh
/// `DrawConstants` by each render call.
draw_ctx_buffer: wgpu::Buffer,
/// Read-only storage buffer at `@group(0) @binding(1)`, sized
/// `XenosConstantsBlock::SIZE` bytes.
constants_buffer: wgpu::Buffer,
/// Vertex-shader microcode, read-only storage at `@group(0) @binding(2)`.
vs_ucode_buffer: wgpu::Buffer,
/// Pixel-shader microcode, read-only storage at `@group(0) @binding(3)`.
ps_ucode_buffer: wgpu::Buffer,
/// Raw vertex dwords, read-only storage at `@group(0) @binding(4)`
/// (vertex stage only).
vertex_buffer: wgpu::Buffer,
/// Group 0: the five buffers above, each bound in full.
bind_group: wgpu::BindGroup,
/// P5: swapped per-draw when a new cached texture becomes active.
tex_bind_group: wgpu::BindGroup,
/// Layout + sampler retained so `set_texture_view` can rebuild
/// `tex_bind_group` on the fly without re-reading the pipeline.
tex_bgl: wgpu::BindGroupLayout,
sampler: wgpu::Sampler,
/// Fallback 1×1 magenta texture — used when no guest texture has been
/// uploaded yet or when a draw references an unsupported format.
dummy_view: wgpu::TextureView,
/// P7 — retained pipeline layout + compiled-pipeline cache for
/// Xenos→WGSL translator output. Keyed on `(vs_blob_key, ps_blob_key)`
/// so every (vs, ps) pair gets compiled once and re-used for every
/// subsequent draw. Interpreter pipeline remains the fallback.
pipeline_layout: wgpu::PipelineLayout,
translated_cache: std::collections::HashMap<(u32, u32), wgpu::RenderPipeline>,
/// Colour-target format that the interpreter pipeline and every
/// translated pipeline are compiled against.
pub target_format: wgpu::TextureFormat,
}
impl XenosPipeline {
/// Build the interpreter pipeline together with every GPU resource it binds.
///
/// Creates, in order: the `XENOS_INTERP_WGSL` shader module, the buffer
/// (group 0) and texture (group 1) bind-group layouts, the pipeline layout,
/// the five group-0 buffers, the 1×1 magenta fallback texture + sampler,
/// both bind groups, and finally the alpha-blended triangle-list render
/// pipeline targeting `target_format`.
///
/// `queue` is only used to upload the single magenta texel.
pub fn new(
    device: &wgpu::Device,
    queue: &wgpu::Queue,
    target_format: wgpu::TextureFormat,
) -> Self {
    // Layout entry for a buffer binding without a dynamic offset.
    // `min_bytes == 0` means "no minimum" (`NonZeroU64::new(0)` is `None`).
    fn buffer_slot(
        binding: u32,
        visibility: wgpu::ShaderStages,
        ty: wgpu::BufferBindingType,
        min_bytes: u64,
    ) -> wgpu::BindGroupLayoutEntry {
        wgpu::BindGroupLayoutEntry {
            binding,
            visibility,
            ty: wgpu::BindingType::Buffer {
                ty,
                has_dynamic_offset: false,
                min_binding_size: std::num::NonZeroU64::new(min_bytes),
            },
            count: None,
        }
    }

    // Uninitialised storage buffer that `queue.write_buffer` can fill
    // later (hence COPY_DST).
    fn storage_buffer(device: &wgpu::Device, label: &str, size: u64) -> wgpu::Buffer {
        device.create_buffer(&wgpu::BufferDescriptor {
            label: Some(label),
            size,
            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        })
    }

    // Bind the whole of `buffer` at `binding`.
    fn whole_buffer(binding: u32, buffer: &wgpu::Buffer) -> wgpu::BindGroupEntry<'_> {
        wgpu::BindGroupEntry {
            binding,
            resource: buffer.as_entire_binding(),
        }
    }

    let vs_and_fs = wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT;
    let read_only_storage = wgpu::BufferBindingType::Storage { read_only: true };
    let one_texel = wgpu::Extent3d {
        width: 1,
        height: 1,
        depth_or_array_layers: 1,
    };

    let interp_module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
        label: Some("xenos_interp.wgsl"),
        source: wgpu::ShaderSource::Wgsl(XENOS_INTERP_WGSL.into()),
    });

    // Group 0:
    //   b0  draw_ctx      — 16 B uniform
    //   b1  XenosConstants — read-only storage (~9.2 KB). Not uniform
    //       because the block contains packed `array<u32>` fields and
    //       WGSL's uniform address space would require 16-byte stride.
    //   b2  vs_ucode      — read-only storage
    //   b3  ps_ucode      — read-only storage
    //   b4  vertex_buffer — read-only storage, vertex stage only
    let bgl = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
        label: Some("xenos bind group layout"),
        entries: &[
            buffer_slot(
                0,
                vs_and_fs,
                wgpu::BufferBindingType::Uniform,
                std::mem::size_of::<DrawConstants>() as u64,
            ),
            buffer_slot(1, vs_and_fs, read_only_storage, XenosConstantsBlock::SIZE as u64),
            buffer_slot(2, vs_and_fs, read_only_storage, 0),
            buffer_slot(3, vs_and_fs, read_only_storage, 0),
            buffer_slot(4, wgpu::ShaderStages::VERTEX, read_only_storage, 0),
        ],
    });

    // Group 1: one filterable 2D float texture plus a filtering sampler.
    let tex_bgl = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
        label: Some("xenos tex bind group layout"),
        entries: &[
            wgpu::BindGroupLayoutEntry {
                binding: 0,
                visibility: vs_and_fs,
                ty: wgpu::BindingType::Texture {
                    sample_type: wgpu::TextureSampleType::Float { filterable: true },
                    view_dimension: wgpu::TextureViewDimension::D2,
                    multisampled: false,
                },
                count: None,
            },
            wgpu::BindGroupLayoutEntry {
                binding: 1,
                visibility: vs_and_fs,
                ty: wgpu::BindingType::Sampler(wgpu::SamplerBindingType::Filtering),
                count: None,
            },
        ],
    });

    let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
        label: Some("xenos pipeline layout"),
        bind_group_layouts: &[&bgl, &tex_bgl],
        push_constant_ranges: &[],
    });

    // Buffers. Every one carries COPY_DST because the upload/render
    // methods refill them through `queue.write_buffer`.
    let draw_ctx_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
        label: Some("xenos draw ctx"),
        contents: bytemuck::bytes_of(&DrawConstants {
            draw_index: 0,
            vertex_count: 3,
            prim_kind: 4,
            _pad: 0,
        }),
        usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
    });
    let constants_buffer =
        storage_buffer(device, "xenos constants", XenosConstantsBlock::SIZE as u64);
    let vs_ucode_buffer = storage_buffer(device, "xenos vs ucode", UCODE_BUFFER_MAX_DWORDS * 4);
    let ps_ucode_buffer = storage_buffer(device, "xenos ps ucode", UCODE_BUFFER_MAX_DWORDS * 4);
    let vertex_buffer = storage_buffer(device, "xenos vertex buffer", VERTEX_BUFFER_MAX_BYTES);

    // Fallback texture: a single magenta texel (255, 0, 255, 255) so that a
    // missing-texture read visibly stands out on-screen.
    let dummy_tex = device.create_texture(&wgpu::TextureDescriptor {
        label: Some("xenos dummy texture"),
        size: one_texel,
        mip_level_count: 1,
        sample_count: 1,
        dimension: wgpu::TextureDimension::D2,
        format: wgpu::TextureFormat::Rgba8Unorm,
        usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST,
        view_formats: &[],
    });
    let magenta: [u8; 4] = [0xFF, 0x00, 0xFF, 0xFF];
    queue.write_texture(
        wgpu::ImageCopyTexture {
            texture: &dummy_tex,
            mip_level: 0,
            origin: wgpu::Origin3d::ZERO,
            aspect: wgpu::TextureAspect::All,
        },
        &magenta,
        wgpu::ImageDataLayout {
            offset: 0,
            bytes_per_row: Some(4),
            rows_per_image: Some(1),
        },
        one_texel,
    );
    let dummy_view = dummy_tex.create_view(&wgpu::TextureViewDescriptor::default());

    // This sampler is kept in `Self` and reused for every texture that
    // `set_texture_view` binds, not just the fallback.
    let sampler = device.create_sampler(&wgpu::SamplerDescriptor {
        label: Some("xenos dummy sampler"),
        address_mode_u: wgpu::AddressMode::ClampToEdge,
        address_mode_v: wgpu::AddressMode::ClampToEdge,
        address_mode_w: wgpu::AddressMode::ClampToEdge,
        mag_filter: wgpu::FilterMode::Linear,
        min_filter: wgpu::FilterMode::Linear,
        mipmap_filter: wgpu::FilterMode::Nearest,
        ..Default::default()
    });

    let tex_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
        label: Some("xenos tex bind group"),
        layout: &tex_bgl,
        entries: &[
            wgpu::BindGroupEntry {
                binding: 0,
                resource: wgpu::BindingResource::TextureView(&dummy_view),
            },
            wgpu::BindGroupEntry {
                binding: 1,
                resource: wgpu::BindingResource::Sampler(&sampler),
            },
        ],
    });
    let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
        label: Some("xenos bind group"),
        layout: &bgl,
        entries: &[
            whole_buffer(0, &draw_ctx_buffer),
            whole_buffer(1, &constants_buffer),
            whole_buffer(2, &vs_ucode_buffer),
            whole_buffer(3, &ps_ucode_buffer),
            whole_buffer(4, &vertex_buffer),
        ],
    });

    // No vertex buffers: the shader pulls vertices from the storage buffer
    // at binding 4. Standard "source over" alpha blending, no culling, no
    // depth/stencil.
    let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
        label: Some("xenos_interp pipeline"),
        layout: Some(&pipeline_layout),
        vertex: wgpu::VertexState {
            module: &interp_module,
            entry_point: "vs_main",
            compilation_options: Default::default(),
            buffers: &[],
        },
        fragment: Some(wgpu::FragmentState {
            module: &interp_module,
            entry_point: "fs_main",
            compilation_options: Default::default(),
            targets: &[Some(wgpu::ColorTargetState {
                format: target_format,
                blend: Some(wgpu::BlendState::ALPHA_BLENDING),
                write_mask: wgpu::ColorWrites::ALL,
            })],
        }),
        primitive: wgpu::PrimitiveState {
            topology: wgpu::PrimitiveTopology::TriangleList,
            front_face: wgpu::FrontFace::Ccw,
            cull_mode: None,
            ..Default::default()
        },
        depth_stencil: None,
        multisample: wgpu::MultisampleState::default(),
        multiview: None,
        cache: None,
    });

    Self {
        pipeline,
        draw_ctx_buffer,
        constants_buffer,
        vs_ucode_buffer,
        ps_ucode_buffer,
        vertex_buffer,
        bind_group,
        tex_bind_group,
        tex_bgl,
        sampler,
        dummy_view,
        pipeline_layout,
        translated_cache: std::collections::HashMap::new(),
        target_format,
    }
}
/// P7 — reports whether a translator pipeline has already been cached for
/// the `(vs_blob_key, ps_blob_key)` pair.
pub fn has_translated(&self, vs_blob_key: u32, ps_blob_key: u32) -> bool {
    self.translated_pipeline(vs_blob_key, ps_blob_key).is_some()
}
/// P7 — look up the cached translator pipeline for a (vs, ps) pair.
/// Returns `None` when nothing has been inserted for that pair yet.
pub fn translated_pipeline(
    &self,
    vs_blob_key: u32,
    ps_blob_key: u32,
) -> Option<&wgpu::RenderPipeline> {
    let key = (vs_blob_key, ps_blob_key);
    self.translated_cache.get(&key)
}
/// P7 — compile a translator-produced WGSL module into a
/// `wgpu::RenderPipeline` and insert it into the cache keyed on
/// `(vs_blob_key, ps_blob_key)`.
///
/// Returns `true` when a pipeline for the key is cached on return (either
/// freshly compiled or already present — duplicate inserts are no-ops), and
/// `false` when compilation failed. Emits `gpu.shader.compile_ok` on
/// success and `gpu.shader.compile_err` (tagged with the failing `stage`,
/// `module` or `pipeline`) on failure.
///
/// Both the shader-module and the render-pipeline creation are guarded, so
/// a translated shader that parses but does not fit the pipeline (missing
/// entry point, binding mismatch, …) is reported as a failure instead of
/// unwinding into the caller.
///
/// NOTE(review): failure detection relies on wgpu's uncaptured-error
/// handler panicking (its default). If the application installs its own
/// handler via `Device::on_uncaptured_error`, validation errors will not be
/// seen here — confirm, or switch to an error scope.
pub fn insert_translated(
    &mut self,
    device: &wgpu::Device,
    vs_blob_key: u32,
    ps_blob_key: u32,
    wgsl: &str,
) -> bool {
    let key = (vs_blob_key, ps_blob_key);
    if self.translated_cache.contains_key(&key) {
        return true;
    }

    let shader = match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
        device.create_shader_module(wgpu::ShaderModuleDescriptor {
            label: Some("xenos translated module"),
            // Borrow the source; the descriptor only needs it for this call.
            source: wgpu::ShaderSource::Wgsl(wgsl.into()),
        })
    })) {
        Ok(m) => m,
        Err(_) => {
            metrics::counter!("gpu.shader.compile_err", "stage" => "module")
                .increment(1);
            return false;
        }
    };

    // Same fixed-function state as the interpreter pipeline so translated
    // and interpreted draws composite identically.
    let layout = &self.pipeline_layout;
    let target_format = self.target_format;
    let pipeline = match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
        device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
            label: Some("xenos translated pipeline"),
            layout: Some(layout),
            vertex: wgpu::VertexState {
                module: &shader,
                entry_point: "vs_main",
                compilation_options: Default::default(),
                buffers: &[],
            },
            fragment: Some(wgpu::FragmentState {
                module: &shader,
                entry_point: "fs_main",
                compilation_options: Default::default(),
                targets: &[Some(wgpu::ColorTargetState {
                    format: target_format,
                    blend: Some(wgpu::BlendState {
                        color: wgpu::BlendComponent {
                            src_factor: wgpu::BlendFactor::SrcAlpha,
                            dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha,
                            operation: wgpu::BlendOperation::Add,
                        },
                        alpha: wgpu::BlendComponent::OVER,
                    }),
                    write_mask: wgpu::ColorWrites::ALL,
                })],
            }),
            primitive: wgpu::PrimitiveState {
                topology: wgpu::PrimitiveTopology::TriangleList,
                strip_index_format: None,
                front_face: wgpu::FrontFace::Ccw,
                cull_mode: None,
                polygon_mode: wgpu::PolygonMode::Fill,
                unclipped_depth: false,
                conservative: false,
            },
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
            cache: None,
        })
    })) {
        Ok(p) => p,
        Err(_) => {
            metrics::counter!("gpu.shader.compile_err", "stage" => "pipeline")
                .increment(1);
            return false;
        }
    };

    self.translated_cache.insert(key, pipeline);
    metrics::counter!("gpu.shader.compile_ok").increment(1);
    true
}
/// Render one draw with the translator-produced pipeline instead of
/// the interpreter. Mirrors [`render_one`] except the bound pipeline
/// is swapped for `pipeline`.
pub fn render_one_with_pipeline(
&self,
queue: &wgpu::Queue,
encoder: &mut wgpu::CommandEncoder,
target_view: &wgpu::TextureView,
req: DrawRequest,
pipeline: &wgpu::RenderPipeline,
) {
let cb = DrawConstants {
draw_index: req.draw_index,
vertex_count: req.vertex_count.max(3),
prim_kind: req.prim_kind,
_pad: 0,
};
queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb));
let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
label: Some("xenos translated draw"),
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
view: target_view,
resolve_target: None,
ops: wgpu::Operations {
load: wgpu::LoadOp::Load,
store: wgpu::StoreOp::Store,
},
})],
depth_stencil_attachment: None,
timestamp_writes: None,
occlusion_query_set: None,
});
pass.set_pipeline(pipeline);
pass.set_bind_group(0, &self.bind_group, &[]);
pass.set_bind_group(1, &self.tex_bind_group, &[]);
let rounded = req.vertex_count.div_ceil(3) * 3;
pass.draw(0..rounded.max(3), 0..1);
}
/// Number of distinct translator pipelines cached. Surfaced to the HUD.
pub fn translated_pipeline_count(&self) -> usize {
self.translated_cache.len()
}
/// P5 — replace the texture bound at `@group(1) @binding(0)`.
///
/// `view` is typically a wgpu texture view obtained from the texture cache
/// host; `None` rebinds the built-in 1×1 magenta fallback. The retained
/// sampler is reused at `@binding(1)` either way. A fresh bind group is
/// created on every call.
pub fn set_texture_view(&mut self, device: &wgpu::Device, view: Option<&wgpu::TextureView>) {
    let texture = match view {
        Some(active) => active,
        None => &self.dummy_view,
    };
    let entries = [
        wgpu::BindGroupEntry {
            binding: 0,
            resource: wgpu::BindingResource::TextureView(texture),
        },
        wgpu::BindGroupEntry {
            binding: 1,
            resource: wgpu::BindingResource::Sampler(&self.sampler),
        },
    ];
    self.tex_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
        label: Some("xenos tex bind group (rebind)"),
        layout: &self.tex_bgl,
        entries: &entries,
    });
}
/// Clear `target_view` to `color` (RGBA order) and store the result.
///
/// Records a render pass with no draws; the clear comes from the pass's
/// load operation.
pub fn clear(
    &self,
    encoder: &mut wgpu::CommandEncoder,
    target_view: &wgpu::TextureView,
    color: [f64; 4],
) {
    let [r, g, b, a] = color;
    let attachment = wgpu::RenderPassColorAttachment {
        view: target_view,
        resolve_target: None,
        ops: wgpu::Operations {
            load: wgpu::LoadOp::Clear(wgpu::Color { r, g, b, a }),
            store: wgpu::StoreOp::Store,
        },
    };
    // Nothing to draw: beginning the pass and dropping it right away is all
    // that is needed.
    drop(encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
        label: Some("xenos frontbuffer clear"),
        color_attachments: &[Some(attachment)],
        depth_stencil_attachment: None,
        timestamp_writes: None,
        occlusion_query_set: None,
    }));
}
/// Upload the constants block plus vertex/pixel shader microcode, once per
/// batch of draws that share them.
///
/// The constants are always written. A zero-length microcode blob is
/// skipped, leaving that buffer's previous contents in place. A blob longer
/// than `UCODE_BUFFER_MAX_DWORDS` is silently truncated to the buffer
/// capacity.
pub fn upload_shader_and_constants(
    &self,
    queue: &wgpu::Queue,
    vs_ucode: &[u32],
    ps_ucode: &[u32],
    constants: &XenosConstantsBlock,
) {
    queue.write_buffer(&self.constants_buffer, 0, bytemuck::bytes_of(constants));

    let dword_cap = UCODE_BUFFER_MAX_DWORDS as usize;
    let uploads = [
        (&self.vs_ucode_buffer, vs_ucode),
        (&self.ps_ucode_buffer, ps_ucode),
    ];
    for (buffer, ucode) in uploads {
        if ucode.is_empty() {
            continue;
        }
        // Clip in dwords; this is the same limit as clipping the byte view
        // at `UCODE_BUFFER_MAX_DWORDS * 4`.
        let clipped = &ucode[..ucode.len().min(dword_cap)];
        queue.write_buffer(buffer, 0, bytemuck::cast_slice(clipped));
    }
}
/// Upload vertex data as raw big-endian dwords (the WGSL side will bswap as
/// needed during format unpacking).
///
/// Empty input is a no-op. Input larger than `VERTEX_BUFFER_MAX_BYTES` is
/// silently truncated to the buffer capacity.
pub fn upload_vertex_data(&self, queue: &wgpu::Queue, data: &[u32]) {
    // The byte capacity is a multiple of 4, so clipping in dwords matches
    // clipping the byte view.
    let dword_cap = (VERTEX_BUFFER_MAX_BYTES / 4) as usize;
    let clipped = &data[..data.len().min(dword_cap)];
    if !clipped.is_empty() {
        queue.write_buffer(&self.vertex_buffer, 0, bytemuck::cast_slice(clipped));
    }
}
/// Render one captured draw.
pub fn render_one(
&self,
queue: &wgpu::Queue,
encoder: &mut wgpu::CommandEncoder,
target_view: &wgpu::TextureView,
req: DrawRequest,
) {
let cb = DrawConstants {
draw_index: req.draw_index,
vertex_count: req.vertex_count.max(3),
prim_kind: req.prim_kind,
_pad: 0,
};
queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb));
let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
label: Some("xenos draw"),
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
view: target_view,
resolve_target: None,
ops: wgpu::Operations {
load: wgpu::LoadOp::Load,
store: wgpu::StoreOp::Store,
},
})],
depth_stencil_attachment: None,
timestamp_writes: None,
occlusion_query_set: None,
});
pass.set_pipeline(&self.pipeline);
pass.set_bind_group(0, &self.bind_group, &[]);
pass.set_bind_group(1, &self.tex_bind_group, &[]);
let rounded = req.vertex_count.div_ceil(3) * 3;
pass.draw(0..rounded.max(3), 0..1);
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The uniform block is declared as 16 bytes on the WGSL side.
    #[test]
    fn draw_constants_layout_matches_wgsl_uniform() {
        assert_eq!(std::mem::size_of::<DrawConstants>(), 16);
    }

    /// Size alone would not catch a field reorder: pin the dword order that
    /// is actually uploaded (draw_index, vertex_count, prim_kind, pad).
    #[test]
    fn draw_constants_field_order_is_stable() {
        let cb = DrawConstants {
            draw_index: 0x1111_1111,
            vertex_count: 0x2222_2222,
            prim_kind: 0x3333_3333,
            _pad: 0x4444_4444,
        };
        let words: &[u32] = bytemuck::cast_slice(bytemuck::bytes_of(&cb));
        let expected: &[u32] = &[0x1111_1111, 0x2222_2222, 0x3333_3333, 0x4444_4444];
        assert_eq!(words, expected);
    }
}