The publisher logo rendered its real artwork in isolation (3X) but was
overpainted in the full composite: every replayed draw used ONE fixed
SrcAlpha/OneMinusSrcAlpha pipeline + an opaque-magenta texture stub, so the
textured RectangleList draws whose sampler slot is shadowed by a vertex-fetch
constant (no resolvable texture) wrote opaque magenta over the logo.
Per-draw render-state inventory at the splash (env-gated probe, removed):
- logo QuadList vs=0x03b7b020 ps=0x03b79001: bc0=0x07010701
(One,OneMinusSrcAlpha — premultiplied alpha), cmask=0xF, ntex=1 (real K8888)
- RectangleList vs=0xd4c14f46 ps=0x03b79001: SAME premult blend, ntex=0
(slot 0 holds a type=3 vertex constant → texture decode rejects) → magenta
- opaque fill vs=0x36660986 ps=0xed732b5a: bc0=0x00010001 (One,Zero) — green
Draw order: the logo is drawn LAST per group, so order was not the problem;
the fixed pipeline state was.
Change (UI-side capture/replay only):
- draw_capture: capture RB_BLENDCONTROL0 + RB_COLOR_MASK (+ colorcontrol /
depthcontrol for follow-ups) per draw.
- xenos_pipeline: new RenderState{blend_control,color_mask}; map Xenos blend
factors/ops -> wgpu mirroring canary kBlendFactorMap/kBlendFactorAlphaMap;
One,Zero,Add => blend:None (opaque); zero-channel mask => ColorWrites; cache
translator AND interpreter pipelines keyed on (vs,ps,RenderState) /
RenderState so each draw composites with its real state.
- render: pass each capture's RenderState through both replay paths.
- dummy texture magenta(255,0,255,255) -> transparent(0,0,0,0): an
unresolvable texture now contributes nothing under its real premult blend
instead of fabricating opaque magenta (removes a fake, adds none).
Readback (env-gated, removed): full 1280x720 composite now shows the logo's
real artwork (maxR=255, 50-102 distinct colors/cell) in a centered strip; no
magenta anywhere. Background is uniform green (the 0xed732b5a opaque fill) — a
separate vertex-color/shader fidelity issue, NOT compositing (next iterate).
Determinism: UI-only; draw_capture additions only run when frame_captures=Some.
check -n50m --gpu-inline --stable-digest --expect = "matches golden" (2x).
cargo test --workspace = 682 passed. Temp probes removed.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
872 lines
35 KiB
Rust
872 lines
35 KiB
Rust
//! Host pipeline that consumes PM4 `DRAW_INDX*` captures.
|
||
//!
|
||
//! Drives [`xenia_gpu::shaders::XENOS_INTERP_WGSL`]. This file owns the
|
||
//! wgpu bind-group + pipeline + buffer surface the Xenos WGSL interpreter
|
||
//! binds to. The WGSL module is expected to declare:
|
||
//!
|
||
//! ```text
|
||
//! @group(0) @binding(0) var<uniform> xenos_draw : XenosDrawConstants; // 32 B
|
||
//! @group(0) @binding(1) var<storage, read> xenos_consts : XenosConstants; // ~9.2 KB
|
||
//! @group(0) @binding(2) var<storage, read> vs_ucode : array<u32>;
|
||
//! @group(0) @binding(3) var<storage, read> ps_ucode : array<u32>;
|
||
//! @group(0) @binding(4) var<storage, read> vertex_buffer : array<u32>;
|
||
//! ```
|
||
//!
|
||
//! Texture bindings (M6) are a single-slot stub for P3b:
|
||
//!
|
||
//! ```text
|
||
//! @group(1) @binding(0) var xenos_tex : texture_2d<f32>;
|
||
//! @group(1) @binding(1) var xenos_samp : sampler;
|
||
//! ```
|
||
//!
|
||
//! The default bound texture is a 1×1 transparent-black placeholder. Real per-slot guest
|
||
//! texture uploads + format decode land with the texture cache (P5).
|
||
|
||
use bytemuck::{Pod, Zeroable};
|
||
use wgpu::util::DeviceExt;
|
||
|
||
use xenia_gpu::shaders::XENOS_INTERP_WGSL;
|
||
use xenia_gpu::xenos_constants::XenosConstantsBlock;
|
||
|
||
/// Per-draw constants mirroring the WGSL `XenosDrawConstants` uniform
|
||
/// block. Ordering / padding matches `xenos_interp.wgsl` exactly.
|
||
#[repr(C)]
|
||
#[derive(Clone, Copy, Pod, Zeroable)]
|
||
struct DrawConstants {
|
||
draw_index: u32,
|
||
vertex_count: u32,
|
||
prim_kind: u32,
|
||
/// iterate-3O: guest dword base of the uploaded `vertex_buffer` window.
|
||
/// The WGSL subtracts this from the absolute vertex-fetch address.
|
||
vertex_base_dwords: u32,
|
||
/// iterate-3S: guest→host NDC XY transform (mirrors canary
|
||
/// `GetHostViewportInfo`). `clip.xy = pos.xy * ndc_scale + ndc_offset*pos.w`.
|
||
/// Y is pre-flipped for wgpu. 16 bytes so the block stays 16-byte aligned.
|
||
ndc_scale: [f32; 2],
|
||
ndc_offset: [f32; 2],
|
||
}
|
||
|
||
/// iterate-3Y: per-draw host colour state (blend equation + channel write
/// mask), taken from the guest registers `RB_BLENDCONTROL0` and
/// `RB_COLOR_MASK`.
///
/// It serves two purposes: it is part of the pipeline-cache key (hence the
/// `Eq`/`Hash` derives), and it is the input from which the
/// `wgpu::ColorTargetState` is built. The decode follows canary's
/// `GetColorBlendStateForRenderTarget` (D3D12 `pipeline_cache.cc`): the
/// factors come straight from `RB_BLENDCONTROL`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct RenderState {
    /// Raw `RB_BLENDCONTROL0` value (render target 0). `0x00010001`
    /// (colour One,Zero,Add / alpha One,Zero,Add) is the opaque case.
    pub blend_control: u32,
    /// Render-target-0 nibble of `RB_COLOR_MASK`: bit0 = R, bit1 = G,
    /// bit2 = B, bit3 = A. Zero means no channel is written.
    pub color_mask: u8,
}

impl RenderState {
|
||
/// Fully-opaque, all-channels state (the legacy fixed behaviour). Used for
|
||
/// procedural/synthetic draws that have no captured guest state.
|
||
pub const OPAQUE: RenderState = RenderState {
|
||
blend_control: 0x0001_0001,
|
||
color_mask: 0xF,
|
||
};
|
||
|
||
/// Map a Xenos `BlendFactor` (5-bit field) to a wgpu `BlendFactor`,
|
||
/// mirroring canary `kBlendFactorMap` (D3D12 `pipeline_cache.cc:1504`).
|
||
fn map_factor(f: u32) -> wgpu::BlendFactor {
|
||
match f {
|
||
0 => wgpu::BlendFactor::Zero,
|
||
1 => wgpu::BlendFactor::One,
|
||
4 => wgpu::BlendFactor::Src,
|
||
5 => wgpu::BlendFactor::OneMinusSrc,
|
||
6 => wgpu::BlendFactor::SrcAlpha,
|
||
7 => wgpu::BlendFactor::OneMinusSrcAlpha,
|
||
8 => wgpu::BlendFactor::Dst,
|
||
9 => wgpu::BlendFactor::OneMinusDst,
|
||
10 => wgpu::BlendFactor::DstAlpha,
|
||
11 => wgpu::BlendFactor::OneMinusDstAlpha,
|
||
12 => wgpu::BlendFactor::Constant,
|
||
13 => wgpu::BlendFactor::OneMinusConstant,
|
||
14 => wgpu::BlendFactor::Constant,
|
||
15 => wgpu::BlendFactor::OneMinusConstant,
|
||
16 => wgpu::BlendFactor::SrcAlphaSaturated,
|
||
// 2/3 and >16 are undefined on Xenos; canary maps to Zero.
|
||
_ => wgpu::BlendFactor::Zero,
|
||
}
|
||
}
|
||
|
||
/// Map a Xenos `BlendFactor` for the *alpha* channel, mirroring canary
|
||
/// `kBlendFactorAlphaMap` (color-mode factors collapse to alpha).
|
||
fn map_factor_alpha(f: u32) -> wgpu::BlendFactor {
|
||
match f {
|
||
4 => wgpu::BlendFactor::SrcAlpha,
|
||
5 => wgpu::BlendFactor::OneMinusSrcAlpha,
|
||
8 => wgpu::BlendFactor::DstAlpha,
|
||
9 => wgpu::BlendFactor::OneMinusDstAlpha,
|
||
other => Self::map_factor(other),
|
||
}
|
||
}
|
||
|
||
fn map_op(o: u32) -> wgpu::BlendOperation {
|
||
match o {
|
||
0 => wgpu::BlendOperation::Add,
|
||
1 => wgpu::BlendOperation::Subtract,
|
||
2 => wgpu::BlendOperation::Min,
|
||
3 => wgpu::BlendOperation::Max,
|
||
4 => wgpu::BlendOperation::ReverseSubtract,
|
||
_ => wgpu::BlendOperation::Add,
|
||
}
|
||
}
|
||
|
||
/// Build the `wgpu::ColorTargetState` for this draw.
|
||
fn color_target(&self, format: wgpu::TextureFormat) -> wgpu::ColorTargetState {
|
||
let bc = self.blend_control;
|
||
let color_src = bc & 0x1F;
|
||
let color_op = (bc >> 5) & 0x7;
|
||
let color_dst = (bc >> 8) & 0x1F;
|
||
let alpha_src = (bc >> 16) & 0x1F;
|
||
let alpha_op = (bc >> 21) & 0x7;
|
||
let alpha_dst = (bc >> 24) & 0x1F;
|
||
|
||
// wgpu requires `blend: None` when nothing would be written; also the
|
||
// `One,Zero,Add` identity is the opaque case (canary's no-blend), which
|
||
// we express as `blend: None` so it's a plain overwrite.
|
||
let is_opaque = color_src == 1
|
||
&& color_dst == 0
|
||
&& color_op == 0
|
||
&& alpha_src == 1
|
||
&& alpha_dst == 0
|
||
&& alpha_op == 0;
|
||
let blend = if is_opaque {
|
||
None
|
||
} else {
|
||
Some(wgpu::BlendState {
|
||
color: wgpu::BlendComponent {
|
||
src_factor: Self::map_factor(color_src),
|
||
dst_factor: Self::map_factor(color_dst),
|
||
operation: Self::map_op(color_op),
|
||
},
|
||
alpha: wgpu::BlendComponent {
|
||
src_factor: Self::map_factor_alpha(alpha_src),
|
||
dst_factor: Self::map_factor_alpha(alpha_dst),
|
||
operation: Self::map_op(alpha_op),
|
||
},
|
||
})
|
||
};
|
||
|
||
let mut write_mask = wgpu::ColorWrites::empty();
|
||
if self.color_mask & 0x1 != 0 {
|
||
write_mask |= wgpu::ColorWrites::RED;
|
||
}
|
||
if self.color_mask & 0x2 != 0 {
|
||
write_mask |= wgpu::ColorWrites::GREEN;
|
||
}
|
||
if self.color_mask & 0x4 != 0 {
|
||
write_mask |= wgpu::ColorWrites::BLUE;
|
||
}
|
||
if self.color_mask & 0x8 != 0 {
|
||
write_mask |= wgpu::ColorWrites::ALPHA;
|
||
}
|
||
|
||
wgpu::ColorTargetState {
|
||
format,
|
||
blend,
|
||
write_mask,
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Parameters for one captured draw, handed to [`XenosPipeline::render_one`]
/// (or its translated counterpart). The values are copied into the per-draw
/// uniform block before the draw is recorded.
#[derive(Debug, Clone, Copy)]
pub struct DrawRequest {
    /// Monotonic draw counter; shader uses it for per-draw colour rotation.
    pub draw_index: u32,
    /// Host-normalised vertex count (after primitive-processor rewrite).
    pub vertex_count: u32,
    /// Xenos primitive-type code; shader may branch on it in P3b+.
    pub prim_kind: u32,
    /// iterate-3O: guest dword base of the per-draw vertex window uploaded to
    /// `vertex_buffer` (b4). 0 = no real vertex window (procedural fallback).
    pub vertex_base_dwords: u32,
    /// iterate-3S: scale of the guest→host NDC XY transform (Y pre-flipped).
    /// When the transform is all-zero the shader leaves the position
    /// untransformed (procedural fallback).
    pub ndc_scale: [f32; 2],
    /// iterate-3S: offset of the guest→host NDC XY transform.
    pub ndc_offset: [f32; 2],
}

/// Capacity, in dwords, of each shader-microcode storage buffer (one for the
/// VS, one for the PS): 16 Ki dwords = 64 KB. Most Xbox 360 shaders are far
/// below 4 KB, so this leaves orders-of-magnitude slack.
const UCODE_BUFFER_MAX_DWORDS: u64 = 16 << 10;
/// Capacity, in bytes, of the shared vertex storage buffer: 16 MB — enough
/// for any realistic Xenos draw.
const VERTEX_BUFFER_MAX_BYTES: u64 = 16 << 20;

pub struct XenosPipeline {
|
||
/// Interpreter pipeline with the legacy fixed (alpha-blend) state. Kept as
|
||
/// the default; per-state variants are built lazily in `interp_cache`.
|
||
pipeline: wgpu::RenderPipeline,
|
||
/// iterate-3Y: the interpreter WGSL module, retained so per-render-state
|
||
/// interpreter pipelines can be compiled on demand.
|
||
interp_shader: wgpu::ShaderModule,
|
||
/// iterate-3Y: interpreter pipelines keyed on the per-draw `RenderState`
|
||
/// (blend + write mask), so flat/alpha/opaque draws composite correctly
|
||
/// even when their (vs,ps) didn't translate.
|
||
interp_cache: std::collections::HashMap<RenderState, wgpu::RenderPipeline>,
|
||
draw_ctx_buffer: wgpu::Buffer,
|
||
constants_buffer: wgpu::Buffer,
|
||
vs_ucode_buffer: wgpu::Buffer,
|
||
ps_ucode_buffer: wgpu::Buffer,
|
||
vertex_buffer: wgpu::Buffer,
|
||
bind_group: wgpu::BindGroup,
|
||
/// P5: swapped per-draw when a new cached texture becomes active.
|
||
tex_bind_group: wgpu::BindGroup,
|
||
/// Layout + sampler retained so `set_texture_view` can rebuild
|
||
/// `tex_bind_group` on the fly without re-reading the pipeline.
|
||
tex_bgl: wgpu::BindGroupLayout,
|
||
sampler: wgpu::Sampler,
|
||
/// Fallback 1×1 magenta texture — used when no guest texture has been
|
||
/// uploaded yet or when a draw references an unsupported format.
|
||
dummy_view: wgpu::TextureView,
|
||
/// P7 — retained pipeline layout + compiled-pipeline cache for
|
||
/// Xenos→WGSL translator output. Keyed on `(vs_blob_key, ps_blob_key)`
|
||
/// so every (vs, ps) pair gets compiled once and re-used for every
|
||
/// subsequent draw. Interpreter pipeline remains the fallback.
|
||
pipeline_layout: wgpu::PipelineLayout,
|
||
/// iterate-3Y: cached translator pipelines keyed on the shader pair AND the
|
||
/// per-draw render state, so the same (vs,ps) with different blend/mask
|
||
/// composites correctly. The translated WGSL module is itself cached per
|
||
/// (vs,ps) so re-translation only happens once.
|
||
translated_cache: std::collections::HashMap<(u32, u32, RenderState), wgpu::RenderPipeline>,
|
||
translated_modules: std::collections::HashMap<(u32, u32), wgpu::ShaderModule>,
|
||
pub target_format: wgpu::TextureFormat,
|
||
}
|
||
|
||
impl XenosPipeline {
|
||
pub fn new(
|
||
device: &wgpu::Device,
|
||
queue: &wgpu::Queue,
|
||
target_format: wgpu::TextureFormat,
|
||
) -> Self {
|
||
let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
|
||
label: Some("xenos_interp.wgsl"),
|
||
source: wgpu::ShaderSource::Wgsl(XENOS_INTERP_WGSL.into()),
|
||
});
|
||
|
||
let bgl = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
|
||
label: Some("xenos bind group layout"),
|
||
entries: &[
|
||
// b0: draw_ctx (16 B uniform)
|
||
wgpu::BindGroupLayoutEntry {
|
||
binding: 0,
|
||
visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
|
||
ty: wgpu::BindingType::Buffer {
|
||
ty: wgpu::BufferBindingType::Uniform,
|
||
has_dynamic_offset: false,
|
||
min_binding_size: std::num::NonZeroU64::new(
|
||
std::mem::size_of::<DrawConstants>() as u64,
|
||
),
|
||
},
|
||
count: None,
|
||
},
|
||
// b1: XenosConstants read-only storage (~9.2 KB). Not uniform
|
||
// because the block contains packed `array<u32>` fields and
|
||
// WGSL's uniform address space would require 16-byte stride.
|
||
wgpu::BindGroupLayoutEntry {
|
||
binding: 1,
|
||
visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
|
||
ty: wgpu::BindingType::Buffer {
|
||
ty: wgpu::BufferBindingType::Storage { read_only: true },
|
||
has_dynamic_offset: false,
|
||
min_binding_size: std::num::NonZeroU64::new(
|
||
XenosConstantsBlock::SIZE as u64,
|
||
),
|
||
},
|
||
count: None,
|
||
},
|
||
// b2: vs_ucode (read-only storage)
|
||
wgpu::BindGroupLayoutEntry {
|
||
binding: 2,
|
||
visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
|
||
ty: wgpu::BindingType::Buffer {
|
||
ty: wgpu::BufferBindingType::Storage { read_only: true },
|
||
has_dynamic_offset: false,
|
||
min_binding_size: None,
|
||
},
|
||
count: None,
|
||
},
|
||
// b3: ps_ucode (read-only storage)
|
||
wgpu::BindGroupLayoutEntry {
|
||
binding: 3,
|
||
visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
|
||
ty: wgpu::BindingType::Buffer {
|
||
ty: wgpu::BufferBindingType::Storage { read_only: true },
|
||
has_dynamic_offset: false,
|
||
min_binding_size: None,
|
||
},
|
||
count: None,
|
||
},
|
||
// b4: vertex_buffer (read-only storage)
|
||
wgpu::BindGroupLayoutEntry {
|
||
binding: 4,
|
||
visibility: wgpu::ShaderStages::VERTEX,
|
||
ty: wgpu::BindingType::Buffer {
|
||
ty: wgpu::BufferBindingType::Storage { read_only: true },
|
||
has_dynamic_offset: false,
|
||
min_binding_size: None,
|
||
},
|
||
count: None,
|
||
},
|
||
],
|
||
});
|
||
let tex_bgl = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
|
||
label: Some("xenos tex bind group layout"),
|
||
entries: &[
|
||
wgpu::BindGroupLayoutEntry {
|
||
binding: 0,
|
||
visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
|
||
ty: wgpu::BindingType::Texture {
|
||
sample_type: wgpu::TextureSampleType::Float { filterable: true },
|
||
view_dimension: wgpu::TextureViewDimension::D2,
|
||
multisampled: false,
|
||
},
|
||
count: None,
|
||
},
|
||
wgpu::BindGroupLayoutEntry {
|
||
binding: 1,
|
||
visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT,
|
||
ty: wgpu::BindingType::Sampler(wgpu::SamplerBindingType::Filtering),
|
||
count: None,
|
||
},
|
||
],
|
||
});
|
||
|
||
let layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
|
||
label: Some("xenos pipeline layout"),
|
||
bind_group_layouts: &[&bgl, &tex_bgl],
|
||
push_constant_ranges: &[],
|
||
});
|
||
|
||
// Buffer allocation. `queue.write_buffer` uses COPY_DST; all
|
||
// interpreter-facing buffers need it.
|
||
let initial_draw = DrawConstants {
|
||
draw_index: 0,
|
||
vertex_count: 3,
|
||
prim_kind: 4,
|
||
vertex_base_dwords: 0,
|
||
ndc_scale: [0.0, 0.0],
|
||
ndc_offset: [0.0, 0.0],
|
||
};
|
||
let draw_ctx_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
|
||
label: Some("xenos draw ctx"),
|
||
contents: bytemuck::bytes_of(&initial_draw),
|
||
usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
|
||
});
|
||
let constants_buffer = device.create_buffer(&wgpu::BufferDescriptor {
|
||
label: Some("xenos constants"),
|
||
size: XenosConstantsBlock::SIZE as u64,
|
||
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
|
||
mapped_at_creation: false,
|
||
});
|
||
let vs_ucode_buffer = device.create_buffer(&wgpu::BufferDescriptor {
|
||
label: Some("xenos vs ucode"),
|
||
size: UCODE_BUFFER_MAX_DWORDS * 4,
|
||
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
|
||
mapped_at_creation: false,
|
||
});
|
||
let ps_ucode_buffer = device.create_buffer(&wgpu::BufferDescriptor {
|
||
label: Some("xenos ps ucode"),
|
||
size: UCODE_BUFFER_MAX_DWORDS * 4,
|
||
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
|
||
mapped_at_creation: false,
|
||
});
|
||
let vertex_buffer = device.create_buffer(&wgpu::BufferDescriptor {
|
||
label: Some("xenos vertex buffer"),
|
||
size: VERTEX_BUFFER_MAX_BYTES,
|
||
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
|
||
mapped_at_creation: false,
|
||
});
|
||
|
||
// Dummy 1×1 magenta texture — placeholder until P5's texture cache
|
||
// lands. Every `interpret_texture_fetch` samples this for now so the
|
||
// interpreter can exercise textureSample paths without a real cache.
|
||
let dummy_tex = device.create_texture(&wgpu::TextureDescriptor {
|
||
label: Some("xenos dummy texture"),
|
||
size: wgpu::Extent3d {
|
||
width: 1,
|
||
height: 1,
|
||
depth_or_array_layers: 1,
|
||
},
|
||
mip_level_count: 1,
|
||
sample_count: 1,
|
||
dimension: wgpu::TextureDimension::D2,
|
||
format: wgpu::TextureFormat::Rgba8Unorm,
|
||
usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST,
|
||
view_formats: &[],
|
||
});
|
||
// iterate-3Y: transparent black (0,0,0,0). When a textured draw's
|
||
// real texture can't be resolved (e.g. its sampler slot is shadowed by
|
||
// a vertex-fetch constant), sampling a *transparent* texel makes the
|
||
// draw a no-op under its real premultiplied-alpha blend — instead of
|
||
// fabricating an opaque magenta that overpaints everything (the old
|
||
// debug stub). This removes a fake rather than adding one: we never
|
||
// invent visible pixels for an unresolved texture.
|
||
queue.write_texture(
|
||
wgpu::ImageCopyTexture {
|
||
texture: &dummy_tex,
|
||
mip_level: 0,
|
||
origin: wgpu::Origin3d::ZERO,
|
||
aspect: wgpu::TextureAspect::All,
|
||
},
|
||
&[0x00u8, 0x00, 0x00, 0x00],
|
||
wgpu::ImageDataLayout {
|
||
offset: 0,
|
||
bytes_per_row: Some(4),
|
||
rows_per_image: Some(1),
|
||
},
|
||
wgpu::Extent3d {
|
||
width: 1,
|
||
height: 1,
|
||
depth_or_array_layers: 1,
|
||
},
|
||
);
|
||
let dummy_view = dummy_tex.create_view(&wgpu::TextureViewDescriptor::default());
|
||
let dummy_sampler = device.create_sampler(&wgpu::SamplerDescriptor {
|
||
label: Some("xenos dummy sampler"),
|
||
address_mode_u: wgpu::AddressMode::ClampToEdge,
|
||
address_mode_v: wgpu::AddressMode::ClampToEdge,
|
||
address_mode_w: wgpu::AddressMode::ClampToEdge,
|
||
mag_filter: wgpu::FilterMode::Linear,
|
||
min_filter: wgpu::FilterMode::Linear,
|
||
mipmap_filter: wgpu::FilterMode::Nearest,
|
||
..Default::default()
|
||
});
|
||
let tex_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
|
||
label: Some("xenos tex bind group"),
|
||
layout: &tex_bgl,
|
||
entries: &[
|
||
wgpu::BindGroupEntry {
|
||
binding: 0,
|
||
resource: wgpu::BindingResource::TextureView(&dummy_view),
|
||
},
|
||
wgpu::BindGroupEntry {
|
||
binding: 1,
|
||
resource: wgpu::BindingResource::Sampler(&dummy_sampler),
|
||
},
|
||
],
|
||
});
|
||
|
||
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
|
||
label: Some("xenos bind group"),
|
||
layout: &bgl,
|
||
entries: &[
|
||
wgpu::BindGroupEntry {
|
||
binding: 0,
|
||
resource: draw_ctx_buffer.as_entire_binding(),
|
||
},
|
||
wgpu::BindGroupEntry {
|
||
binding: 1,
|
||
resource: constants_buffer.as_entire_binding(),
|
||
},
|
||
wgpu::BindGroupEntry {
|
||
binding: 2,
|
||
resource: vs_ucode_buffer.as_entire_binding(),
|
||
},
|
||
wgpu::BindGroupEntry {
|
||
binding: 3,
|
||
resource: ps_ucode_buffer.as_entire_binding(),
|
||
},
|
||
wgpu::BindGroupEntry {
|
||
binding: 4,
|
||
resource: vertex_buffer.as_entire_binding(),
|
||
},
|
||
],
|
||
});
|
||
|
||
let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
|
||
label: Some("xenos_interp pipeline"),
|
||
layout: Some(&layout),
|
||
vertex: wgpu::VertexState {
|
||
module: &shader,
|
||
entry_point: "vs_main",
|
||
compilation_options: Default::default(),
|
||
buffers: &[],
|
||
},
|
||
fragment: Some(wgpu::FragmentState {
|
||
module: &shader,
|
||
entry_point: "fs_main",
|
||
compilation_options: Default::default(),
|
||
targets: &[Some(wgpu::ColorTargetState {
|
||
format: target_format,
|
||
blend: Some(wgpu::BlendState {
|
||
color: wgpu::BlendComponent {
|
||
src_factor: wgpu::BlendFactor::SrcAlpha,
|
||
dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha,
|
||
operation: wgpu::BlendOperation::Add,
|
||
},
|
||
alpha: wgpu::BlendComponent::OVER,
|
||
}),
|
||
write_mask: wgpu::ColorWrites::ALL,
|
||
})],
|
||
}),
|
||
primitive: wgpu::PrimitiveState {
|
||
topology: wgpu::PrimitiveTopology::TriangleList,
|
||
strip_index_format: None,
|
||
front_face: wgpu::FrontFace::Ccw,
|
||
cull_mode: None,
|
||
polygon_mode: wgpu::PolygonMode::Fill,
|
||
unclipped_depth: false,
|
||
conservative: false,
|
||
},
|
||
depth_stencil: None,
|
||
multisample: wgpu::MultisampleState::default(),
|
||
multiview: None,
|
||
cache: None,
|
||
});
|
||
|
||
Self {
|
||
pipeline,
|
||
interp_shader: shader,
|
||
interp_cache: std::collections::HashMap::new(),
|
||
draw_ctx_buffer,
|
||
constants_buffer,
|
||
vs_ucode_buffer,
|
||
ps_ucode_buffer,
|
||
vertex_buffer,
|
||
bind_group,
|
||
tex_bind_group,
|
||
tex_bgl,
|
||
sampler: dummy_sampler,
|
||
dummy_view,
|
||
pipeline_layout: layout,
|
||
translated_cache: std::collections::HashMap::new(),
|
||
translated_modules: std::collections::HashMap::new(),
|
||
target_format,
|
||
}
|
||
}
|
||
|
||
/// P7 — has the translator already produced a WGSL *module* for this
|
||
/// (vs, ps) pair? (A per-render-state pipeline may still need building.)
|
||
pub fn has_translated(&self, vs_blob_key: u32, ps_blob_key: u32) -> bool {
|
||
self.translated_modules
|
||
.contains_key(&(vs_blob_key, ps_blob_key))
|
||
}
|
||
|
||
/// P7 — compile a translator-produced WGSL module and cache it keyed on
|
||
/// `(vs_blob_key, ps_blob_key)`. The actual `RenderPipeline` (which also
|
||
/// depends on the per-draw blend/mask state) is built lazily by
|
||
/// [`render_one_translated`]. Returns `true` on success.
|
||
pub fn insert_translated(
|
||
&mut self,
|
||
device: &wgpu::Device,
|
||
vs_blob_key: u32,
|
||
ps_blob_key: u32,
|
||
wgsl: &str,
|
||
) -> bool {
|
||
let key = (vs_blob_key, ps_blob_key);
|
||
if self.translated_modules.contains_key(&key) {
|
||
return true;
|
||
}
|
||
let shader = match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
||
device.create_shader_module(wgpu::ShaderModuleDescriptor {
|
||
label: Some("xenos translated module"),
|
||
source: wgpu::ShaderSource::Wgsl(wgsl.to_string().into()),
|
||
})
|
||
})) {
|
||
Ok(m) => m,
|
||
Err(_) => {
|
||
metrics::counter!("gpu.shader.compile_err", "stage" => "module")
|
||
.increment(1);
|
||
return false;
|
||
}
|
||
};
|
||
self.translated_modules.insert(key, shader);
|
||
metrics::counter!("gpu.shader.compile_ok").increment(1);
|
||
true
|
||
}
|
||
|
||
/// iterate-3Y: ensure a translator pipeline exists for `(vs,ps,rstate)`,
|
||
/// building it from the cached module + the per-draw color/blend target.
|
||
fn ensure_translated_for_state(
|
||
&mut self,
|
||
device: &wgpu::Device,
|
||
vs_key: u32,
|
||
ps_key: u32,
|
||
rstate: RenderState,
|
||
) -> bool {
|
||
let pkey = (vs_key, ps_key, rstate);
|
||
if self.translated_cache.contains_key(&pkey) {
|
||
return true;
|
||
}
|
||
let Some(module) = self.translated_modules.get(&(vs_key, ps_key)) else {
|
||
return false;
|
||
};
|
||
let target = rstate.color_target(self.target_format);
|
||
let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
|
||
label: Some("xenos translated pipeline"),
|
||
layout: Some(&self.pipeline_layout),
|
||
vertex: wgpu::VertexState {
|
||
module,
|
||
entry_point: "vs_main",
|
||
compilation_options: Default::default(),
|
||
buffers: &[],
|
||
},
|
||
fragment: Some(wgpu::FragmentState {
|
||
module,
|
||
entry_point: "fs_main",
|
||
compilation_options: Default::default(),
|
||
targets: &[Some(target)],
|
||
}),
|
||
primitive: wgpu::PrimitiveState {
|
||
topology: wgpu::PrimitiveTopology::TriangleList,
|
||
strip_index_format: None,
|
||
front_face: wgpu::FrontFace::Ccw,
|
||
cull_mode: None,
|
||
polygon_mode: wgpu::PolygonMode::Fill,
|
||
unclipped_depth: false,
|
||
conservative: false,
|
||
},
|
||
depth_stencil: None,
|
||
multisample: wgpu::MultisampleState::default(),
|
||
multiview: None,
|
||
cache: None,
|
||
});
|
||
self.translated_cache.insert(pkey, pipeline);
|
||
true
|
||
}
|
||
|
||
/// iterate-3Y: ensure an interpreter pipeline exists for `rstate`.
|
||
fn ensure_interp_for_state(&mut self, device: &wgpu::Device, rstate: RenderState) {
|
||
if self.interp_cache.contains_key(&rstate) {
|
||
return;
|
||
}
|
||
let target = rstate.color_target(self.target_format);
|
||
let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
|
||
label: Some("xenos interp pipeline (per-state)"),
|
||
layout: Some(&self.pipeline_layout),
|
||
vertex: wgpu::VertexState {
|
||
module: &self.interp_shader,
|
||
entry_point: "vs_main",
|
||
compilation_options: Default::default(),
|
||
buffers: &[],
|
||
},
|
||
fragment: Some(wgpu::FragmentState {
|
||
module: &self.interp_shader,
|
||
entry_point: "fs_main",
|
||
compilation_options: Default::default(),
|
||
targets: &[Some(target)],
|
||
}),
|
||
primitive: wgpu::PrimitiveState {
|
||
topology: wgpu::PrimitiveTopology::TriangleList,
|
||
strip_index_format: None,
|
||
front_face: wgpu::FrontFace::Ccw,
|
||
cull_mode: None,
|
||
polygon_mode: wgpu::PolygonMode::Fill,
|
||
unclipped_depth: false,
|
||
conservative: false,
|
||
},
|
||
depth_stencil: None,
|
||
multisample: wgpu::MultisampleState::default(),
|
||
multiview: None,
|
||
cache: None,
|
||
});
|
||
self.interp_cache.insert(rstate, pipeline);
|
||
}
|
||
|
||
/// iterate-3Y: render one draw through the translator pipeline built for
|
||
/// this draw's render state. Returns `false` if no module is cached for
|
||
/// `(vs,ps)` (caller should fall back to the interpreter).
|
||
pub fn render_one_translated(
|
||
&mut self,
|
||
device: &wgpu::Device,
|
||
queue: &wgpu::Queue,
|
||
encoder: &mut wgpu::CommandEncoder,
|
||
target_view: &wgpu::TextureView,
|
||
req: DrawRequest,
|
||
vs_key: u32,
|
||
ps_key: u32,
|
||
rstate: RenderState,
|
||
) -> bool {
|
||
if !self.ensure_translated_for_state(device, vs_key, ps_key, rstate) {
|
||
return false;
|
||
}
|
||
let cb = DrawConstants {
|
||
draw_index: req.draw_index,
|
||
vertex_count: req.vertex_count.max(3),
|
||
prim_kind: req.prim_kind,
|
||
vertex_base_dwords: req.vertex_base_dwords,
|
||
ndc_scale: req.ndc_scale,
|
||
ndc_offset: req.ndc_offset,
|
||
};
|
||
queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb));
|
||
let pipeline = self
|
||
.translated_cache
|
||
.get(&(vs_key, ps_key, rstate))
|
||
.expect("just ensured");
|
||
let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
|
||
label: Some("xenos translated draw"),
|
||
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
|
||
view: target_view,
|
||
resolve_target: None,
|
||
ops: wgpu::Operations {
|
||
load: wgpu::LoadOp::Load,
|
||
store: wgpu::StoreOp::Store,
|
||
},
|
||
})],
|
||
depth_stencil_attachment: None,
|
||
timestamp_writes: None,
|
||
occlusion_query_set: None,
|
||
});
|
||
pass.set_pipeline(pipeline);
|
||
pass.set_bind_group(0, &self.bind_group, &[]);
|
||
pass.set_bind_group(1, &self.tex_bind_group, &[]);
|
||
let rounded = req.vertex_count.div_ceil(3) * 3;
|
||
pass.draw(0..rounded.max(3), 0..1);
|
||
true
|
||
}
|
||
|
||
/// Number of distinct translator pipelines cached. Surfaced to the HUD.
|
||
pub fn translated_pipeline_count(&self) -> usize {
|
||
self.translated_cache.len()
|
||
}
|
||
|
||
/// P5 — swap the active texture bound at `@group(1) @binding(0)`.
|
||
/// `view` is typically a wgpu texture view obtained from the
|
||
/// [`TextureCacheHost`]. Pass `None` to revert to the built-in dummy
|
||
/// magenta stub.
|
||
pub fn set_texture_view(&mut self, device: &wgpu::Device, view: Option<&wgpu::TextureView>) {
|
||
let bound = view.unwrap_or(&self.dummy_view);
|
||
self.tex_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
|
||
label: Some("xenos tex bind group (rebind)"),
|
||
layout: &self.tex_bgl,
|
||
entries: &[
|
||
wgpu::BindGroupEntry {
|
||
binding: 0,
|
||
resource: wgpu::BindingResource::TextureView(bound),
|
||
},
|
||
wgpu::BindGroupEntry {
|
||
binding: 1,
|
||
resource: wgpu::BindingResource::Sampler(&self.sampler),
|
||
},
|
||
],
|
||
});
|
||
}
|
||
|
||
/// Clear `target_view` to `color`, store.
|
||
pub fn clear(
|
||
&self,
|
||
encoder: &mut wgpu::CommandEncoder,
|
||
target_view: &wgpu::TextureView,
|
||
color: [f64; 4],
|
||
) {
|
||
let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
|
||
label: Some("xenos frontbuffer clear"),
|
||
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
|
||
view: target_view,
|
||
resolve_target: None,
|
||
ops: wgpu::Operations {
|
||
load: wgpu::LoadOp::Clear(wgpu::Color {
|
||
r: color[0],
|
||
g: color[1],
|
||
b: color[2],
|
||
a: color[3],
|
||
}),
|
||
store: wgpu::StoreOp::Store,
|
||
},
|
||
})],
|
||
depth_stencil_attachment: None,
|
||
timestamp_writes: None,
|
||
occlusion_query_set: None,
|
||
});
|
||
let _ = &mut pass;
|
||
}
|
||
|
||
/// Upload shader microcode + constants once (before the batch of draws
|
||
/// that share them). Skips zero-length blobs.
|
||
pub fn upload_shader_and_constants(
|
||
&self,
|
||
queue: &wgpu::Queue,
|
||
vs_ucode: &[u32],
|
||
ps_ucode: &[u32],
|
||
constants: &XenosConstantsBlock,
|
||
) {
|
||
queue.write_buffer(&self.constants_buffer, 0, bytemuck::bytes_of(constants));
|
||
if !vs_ucode.is_empty() {
|
||
let bytes: &[u8] = bytemuck::cast_slice(vs_ucode);
|
||
let max = (UCODE_BUFFER_MAX_DWORDS * 4) as usize;
|
||
queue.write_buffer(&self.vs_ucode_buffer, 0, &bytes[..bytes.len().min(max)]);
|
||
}
|
||
if !ps_ucode.is_empty() {
|
||
let bytes: &[u8] = bytemuck::cast_slice(ps_ucode);
|
||
let max = (UCODE_BUFFER_MAX_DWORDS * 4) as usize;
|
||
queue.write_buffer(&self.ps_ucode_buffer, 0, &bytes[..bytes.len().min(max)]);
|
||
}
|
||
}
|
||
|
||
/// Upload vertex data (as raw big-endian dwords — the WGSL side will
|
||
/// bswap as needed during format unpacking).
|
||
pub fn upload_vertex_data(&self, queue: &wgpu::Queue, data: &[u32]) {
|
||
if data.is_empty() {
|
||
return;
|
||
}
|
||
let bytes: &[u8] = bytemuck::cast_slice(data);
|
||
let max = VERTEX_BUFFER_MAX_BYTES as usize;
|
||
queue.write_buffer(&self.vertex_buffer, 0, &bytes[..bytes.len().min(max)]);
|
||
}
|
||
|
||
/// Render one captured draw through the interpreter, using the per-draw
|
||
/// `rstate` (blend/write-mask) so flat draws composite correctly even
|
||
/// when their (vs,ps) didn't translate. `RenderState::OPAQUE` reproduces
|
||
/// the legacy fixed behaviour for procedural/synthetic draws.
|
||
pub fn render_one(
|
||
&mut self,
|
||
device: &wgpu::Device,
|
||
queue: &wgpu::Queue,
|
||
encoder: &mut wgpu::CommandEncoder,
|
||
target_view: &wgpu::TextureView,
|
||
req: DrawRequest,
|
||
rstate: RenderState,
|
||
) {
|
||
self.ensure_interp_for_state(device, rstate);
|
||
let cb = DrawConstants {
|
||
draw_index: req.draw_index,
|
||
vertex_count: req.vertex_count.max(3),
|
||
prim_kind: req.prim_kind,
|
||
vertex_base_dwords: req.vertex_base_dwords,
|
||
ndc_scale: req.ndc_scale,
|
||
ndc_offset: req.ndc_offset,
|
||
};
|
||
queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb));
|
||
|
||
let pipeline = self
|
||
.interp_cache
|
||
.get(&rstate)
|
||
.expect("just ensured");
|
||
let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
|
||
label: Some("xenos draw"),
|
||
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
|
||
view: target_view,
|
||
resolve_target: None,
|
||
ops: wgpu::Operations {
|
||
load: wgpu::LoadOp::Load,
|
||
store: wgpu::StoreOp::Store,
|
||
},
|
||
})],
|
||
depth_stencil_attachment: None,
|
||
timestamp_writes: None,
|
||
occlusion_query_set: None,
|
||
});
|
||
pass.set_pipeline(pipeline);
|
||
pass.set_bind_group(0, &self.bind_group, &[]);
|
||
pass.set_bind_group(1, &self.tex_bind_group, &[]);
|
||
let rounded = req.vertex_count.div_ceil(3) * 3;
|
||
pass.draw(0..rounded.max(3), 0..1);
|
||
}
|
||
}
|
||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    /// The host struct is uploaded verbatim into the WGSL uniform, so its
    /// size must stay at the 32 bytes the shader-side block occupies.
    #[test]
    fn draw_constants_layout_matches_wgsl_uniform() {
        let host_size = std::mem::size_of::<DrawConstants>();
        assert_eq!(host_size, 32);
    }
}