xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)

First real GPU implementation. Ring/PM4 frontend (ring_view,
ring_drain, pm4) drains the command processor; gpu_system owns the
threaded backend (DrainFence RPC + parker/fence helpers from M1) and
the MMIO-mapped register block (mmio_region).

Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode
the Xbox 360 microcode, translator.rs lowers it onto the WGSL
xenos_interp interpreter shader (shaders/xenos_interp.wgsl).
shader_metrics.rs counts decode/translate work.

Render state: draw_state, primitive, render_target_cache,
texture_cache, tiled_address (Xenos's swizzled tiled-memory layout),
xenos_constants (register field constants), edram (the 10 MiB EDRAM
model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve
plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs
owns the typed GPU-resource handles the kernel hands out.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-01 16:29:38 +02:00
parent 5f0d6487ea
commit 79eb52c378
24 changed files with 10984 additions and 18 deletions

View File

@@ -0,0 +1,36 @@
//! Embedded WGSL shader sources used by the host pipeline.
/// Xenos uber-shader scaffold (P3): the full text of `xenos_interp.wgsl`,
/// embedded into the binary at compile time via `include_str!`. See the
/// header comment at the top of that WGSL source for scope/limits and the
/// plan's P3b target state.
pub const XENOS_INTERP_WGSL: &str = include_str!("xenos_interp.wgsl");
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the embedded shader through naga's WGSL front end. No device or
    /// pipeline is needed, so typos and front-end-detectable layout mistakes
    /// surface at `cargo test` time.
    #[test]
    fn xenos_interp_wgsl_parses() {
        let module = naga::front::wgsl::parse_str(XENOS_INTERP_WGSL)
            .expect("xenos_interp.wgsl must parse cleanly");

        // True when the module exposes an entry point with this name and stage.
        let has_entry = |name: &str, stage: naga::ShaderStage| {
            module
                .entry_points
                .iter()
                .any(|ep| ep.name == name && ep.stage == stage)
        };

        // Sanity: at least one entry point was declared, and both the vertex
        // and fragment entries the host pipeline looks up are present.
        assert!(!module.entry_points.is_empty());
        assert!(
            has_entry("vs_main", naga::ShaderStage::Vertex),
            "missing vs_main entry"
        );
        assert!(
            has_entry("fs_main", naga::ShaderStage::Fragment),
            "missing fs_main entry"
        );
    }
}

View File

@@ -0,0 +1,974 @@
// xenia-rs Xenos runtime microcode interpreter — P3b WGSL.
//
// Bindings (stable across P3b milestones):
// @group(0) @binding(0) draw_ctx (uniform, 16 B — XenosDrawConstants)
// @group(0) @binding(1) xenos_consts (storage<read>, ~9.2 KB — XenosConstants)
// @group(0) @binding(2) vs_ucode (storage<read>, packed VS shader)
// @group(0) @binding(3) ps_ucode (storage<read>, packed PS shader)
// @group(0) @binding(4) vertex_buffer (storage<read>, raw guest VB dwords)
//
// Packed shader layout (both vs_ucode & ps_ucode):
// [0] = cf_count
// [1 .. 1 + cf_count*3] = CF table: (kind, primary, aux) × cf_count
// [1 + cf_count*3 ..] = instruction triples (3 dwords each)
//
// M3 state (this file): CF walker + operand decode helpers + register file
// scaffold are complete. ALU / fetch bodies are still stubs that fall back
// to the procedural-circle visualisation; M4-M7 fill them in.
// Per-draw parameters, bound as `draw_ctx` at @group(0) @binding(0).
// Four u32 fields, 16 bytes in total.
struct XenosDrawConstants {
// Read by vs_main to vary the procedural fallback colour from draw to draw.
draw_index: u32,
// Read by vs_main (clamped to >= 1) to spread fallback vertices on a circle.
vertex_count: u32,
// Not read by any code visible in this file.
prim_kind: u32,
// Unused filler that brings the struct to four dwords.
_pad: u32,
};
// Shader constant block, bound as `xenos_consts` at @group(0) @binding(1).
struct XenosConstants {
// 512 vec4 float constants. Not read by the code visible in this file
// (read_src currently treats every source as a register).
alu: array<vec4<f32>, 512>,
// 256 fetch-constant dwords; interpret_vertex_fetch reads
// fetch[fetch_const * 2] as the vertex-buffer base.
fetch: array<u32, 256>,
// 8 dwords of boolean constants. Not read by the code visible in this file.
bool_consts: array<u32, 8>,
// 32 loop constants; the CF walkers use the low 8 bits of
// loop_consts[loop_id] as the trip count.
loop_consts: array<u32, 32>,
};
// Per-draw parameters (see XenosDrawConstants).
@group(0) @binding(0) var<uniform> draw_ctx : XenosDrawConstants;
// `xenos_consts` is a read-only storage buffer (not uniform) because the
// block contains tightly-packed `array<u32, N>` fields — WGSL's uniform
// address space requires 16-byte element stride, which would triple the
// allocation; storage accepts the natural 4-byte stride.
@group(0) @binding(1) var<storage, read> xenos_consts : XenosConstants;
// Packed vertex-shader program: [cf_count][CF table][instruction triples].
@group(0) @binding(2) var<storage, read> vs_ucode : array<u32>;
// Packed pixel-shader program, same layout as `vs_ucode`.
@group(0) @binding(3) var<storage, read> ps_ucode : array<u32>;
// Vertex data as raw dwords; indexed in dwords by interpret_vertex_fetch.
@group(0) @binding(4) var<storage, read> vertex_buffer : array<u32>;
// M6 — texture fetch stub. Group 1 is a single 1×1 magenta placeholder for
// all texture slots; the P5 texture cache will replace this with per-slot
// bindings.
@group(1) @binding(0) var xenos_tex : texture_2d<f32>;
@group(1) @binding(1) var xenos_samp : sampler;
// ── CF kind codes; must match `xenia_gpu::ucode::cf_kind`. ─────────────
// These are compared against the low byte of the first dword of each CF
// table entry (see vs_cf_kind / ps_cf_kind).
const CF_KIND_EXEC: u32 = 0u;
const CF_KIND_EXEC_END: u32 = 1u;
const CF_KIND_ALLOC: u32 = 2u;
const CF_KIND_EXIT: u32 = 3u;
const CF_KIND_LOOP_START: u32 = 4u;
const CF_KIND_LOOP_END: u32 = 5u;
const CF_KIND_COND_JMP: u32 = 6u;
const CF_KIND_COND_CALL: u32 = 7u;
const CF_KIND_RETURN: u32 = 8u;
// Not matched explicitly by the walkers; it lands in their `default` arm
// together with any other unlisted code.
const CF_KIND_UNKNOWN: u32 = 15u;
// ── Alloc-kind codes (mirrors `xenia_gpu::ucode::cf_alloc_kind`). ──────
// The walkers store an Alloc clause's `primary` dword into `current_alloc`;
// none of these three names is referenced by the code visible in this file.
const ALLOC_KIND_POSITION: u32 = 0u;
const ALLOC_KIND_INTERPOLATORS: u32 = 1u;
const ALLOC_KIND_COLORS: u32 = 2u;
// Per-invocation Xenos register file + scalar `ps` + predicate.
// All of these are reset by reset_state() at the start of each entry point.
var<private> registers: array<vec4<f32>, 128>;
var<private> ps: f32;
var<private> predicate: bool;
// Currently-active export alloc kind; set by Alloc clauses.
// (Written by the CF walkers; not read by the code visible in this file.)
var<private> current_alloc: u32;
// P3c additions:
// `kill_flag` — set by vector/scalar kill ops; PS calls `discard` after the
// interpreter exits. (`discard` inside a helper function is
// allowed in WGSL, but keeping it at the entry level makes
// control flow easier to read.)
// `loop_depth`/`loop_counters` — bookkeeping for LoopStart/LoopEnd CF
// clauses. Xenos supports up to 4 nested loops.
// `loop_starts` — CF index of the LoopStart clause for each active loop
// level; LoopEnd resumes at loop_starts[level] + 1.
// `reject_mask` — bitfield of op categories we failed to interpret, so the
// PS fallback color + host-side diagnostics can surface it.
// (The visible code only accumulates bits into it.)
var<private> kill_flag: bool;
var<private> loop_depth: u32;
var<private> loop_counters: array<u32, 4>;
var<private> loop_starts: array<u32, 4>;
var<private> reject_mask: u32;
// Bits OR-ed into `reject_mask`:
// ALU_VEC / ALU_SCL — unsupported or approximated vector / scalar opcode.
// TEX_NON2D — declared but never set by the code visible here.
// VFETCH_FMT — vertex format not handled by interpret_vertex_fetch.
// CF_JUMP — set by the walkers' `default` arm (unrecognised CF kind).
// CF_CALL — COND_CALL / RETURN clause (no call stack).
// LOOP_OVERFLOW — loop trip count clamped to the walkers' limit of 16.
const REJECT_ALU_VEC: u32 = 1u;
const REJECT_ALU_SCL: u32 = 2u;
const REJECT_TEX_NON2D: u32 = 4u;
const REJECT_VFETCH_FMT: u32 = 8u;
const REJECT_CF_JUMP: u32 = 16u;
const REJECT_CF_CALL: u32 = 32u;
const REJECT_LOOP_OVERFLOW:u32 = 64u;
// Vertex-stage output / fragment-stage input: clip-space position plus one
// interpolated colour at location 0.
struct VsOut {
@builtin(position) position: vec4<f32>,
@location(0) color: vec4<f32>,
};
// Fragment-stage output: a single colour written to attachment 0.
struct FsOut {
@location(0) color0: vec4<f32>,
};
// ── CF helpers: read (kind,primary,aux) from a packed ucode storage buffer.
// Dword 0 of each buffer is the CF entry count; entry `i` occupies the three
// dwords starting at index 1 + 3*i, and the instruction triples begin right
// after the last CF entry.
fn vs_cf_count() -> u32 {
    return vs_ucode[0];
}
fn ps_cf_count() -> u32 {
    return ps_ucode[0];
}
fn vs_cf_kind(i: u32) -> u32 {
    let entry = 1u + 3u * i;
    // Only the low byte of the first dword carries the kind code.
    return vs_ucode[entry] & 0xFFu;
}
fn vs_cf_primary(i: u32) -> u32 {
    let entry = 1u + 3u * i;
    return vs_ucode[entry + 1u];
}
fn vs_cf_aux(i: u32) -> u32 {
    let entry = 1u + 3u * i;
    return vs_ucode[entry + 2u];
}
fn ps_cf_kind(i: u32) -> u32 {
    let entry = 1u + 3u * i;
    // Only the low byte of the first dword carries the kind code.
    return ps_ucode[entry] & 0xFFu;
}
fn ps_cf_primary(i: u32) -> u32 {
    let entry = 1u + 3u * i;
    return ps_ucode[entry + 1u];
}
fn ps_cf_aux(i: u32) -> u32 {
    let entry = 1u + 3u * i;
    return ps_ucode[entry + 2u];
}
// Index of the first instruction dword (one past the CF table).
fn vs_instr_base() -> u32 {
    return 3u * vs_ucode[0] + 1u;
}
fn ps_instr_base() -> u32 {
    return 3u * ps_ucode[0] + 1u;
}
// Return dword `which` of the ALU/fetch instruction triple at triple index
// `t`. Callers in this file pass 0, 1 or 2 for `which`.
fn vs_instr_dword(t: u32, which: u32) -> u32 {
    let triple_start = vs_instr_base() + 3u * t;
    return vs_ucode[triple_start + which];
}
fn ps_instr_dword(t: u32, which: u32) -> u32 {
    let triple_start = ps_instr_base() + 3u * t;
    return ps_ucode[triple_start + which];
}
// ── Operand helpers: M3 scaffold; M4 wires them into interpret_alu.
// Rearrange the lanes of `v` according to an 8-bit swizzle: bits [1:0] pick
// the source lane for output x, [3:2] for y, [5:4] for z and [7:6] for w.
fn apply_swizzle(v: vec4<f32>, swizzle: u32) -> vec4<f32> {
    let pick_x = swizzle & 3u;
    let pick_y = (swizzle >> 2u) & 3u;
    let pick_z = (swizzle >> 4u) & 3u;
    let pick_w = (swizzle >> 6u) & 3u;
    let out_x = v[pick_x];
    let out_y = v[pick_y];
    let out_z = v[pick_z];
    let out_w = v[pick_w];
    return vec4<f32>(out_x, out_y, out_z, out_w);
}
// Apply the optional source modifiers to `v`. The absolute value is taken
// first and negation second, so both flags together yield -|v|.
fn apply_modifiers(v: vec4<f32>, negate: bool, take_abs: bool) -> vec4<f32> {
    let magnitude = select(v, abs(v), take_abs);
    return select(magnitude, -magnitude, negate);
}
// Write `value` into registers[idx], touching only the lanes whose bit is
// set in `mask` (bit 0 = x, bit 1 = y, bit 2 = z, bit 3 = w). Register
// indices of 128 or more are ignored.
fn write_reg_masked(idx: u32, mask: u32, value: vec4<f32>) {
    if idx < 128u {
        let lane_enabled = vec4<bool>(
            (mask & 1u) != 0u,
            (mask & 2u) != 0u,
            (mask & 4u) != 0u,
            (mask & 8u) != 0u,
        );
        // Enabled lanes take the new value; the rest keep the old contents.
        registers[idx] = select(registers[idx], value, lane_enabled);
    }
}
// ── Xenos ALU opcodes. Values match canary's `AluVectorOpcode` and
// `AluScalarOpcode` enums in `ucode.h:1001,1354` (also mirrored in
// `xenia_gpu::ucode::alu::{vop,sop}`).
// Vector opcodes, dispatched by exec_vector_op. Values with no constant
// here (e.g. 20–23) reach that function's `default` arm.
const VOP_ADD: u32 = 0u;
const VOP_MUL: u32 = 1u;
const VOP_MAX: u32 = 2u;
const VOP_MIN: u32 = 3u;
const VOP_SEQ: u32 = 4u;
const VOP_SGT: u32 = 5u;
const VOP_SGE: u32 = 6u;
const VOP_SNE: u32 = 7u;
const VOP_FRC: u32 = 8u;
const VOP_TRUNC: u32 = 9u;
const VOP_FLOOR: u32 = 10u;
const VOP_MAD: u32 = 11u;
const VOP_CND_EQ: u32 = 12u;
const VOP_CND_GE: u32 = 13u;
const VOP_CND_GT: u32 = 14u;
const VOP_DP4: u32 = 15u;
const VOP_DP3: u32 = 16u;
const VOP_DP2_ADD: u32 = 17u;
const VOP_CUBE: u32 = 18u;
const VOP_MAX4: u32 = 19u;
const VOP_KILL_EQ: u32 = 24u;
const VOP_KILL_GT: u32 = 25u;
const VOP_KILL_GE: u32 = 26u;
const VOP_KILL_NE: u32 = 27u;
const VOP_DST: u32 = 28u;
const VOP_MAX_A: u32 = 29u;
// Scalar opcodes, dispatched by exec_scalar_op. Values with no constant
// here (4, 23, 24, 41–47) reach that function's `default` arm.
const SOP_ADDS: u32 = 0u;
const SOP_ADDS_PREV: u32 = 1u;
const SOP_MULS: u32 = 2u;
const SOP_MULS_PREV: u32 = 3u;
const SOP_MAXS: u32 = 5u;
const SOP_MINS: u32 = 6u;
const SOP_SEQS: u32 = 7u;
const SOP_SGTS: u32 = 8u;
const SOP_SGES: u32 = 9u;
const SOP_SNES: u32 = 10u;
const SOP_FRCS: u32 = 11u;
const SOP_TRUNCS: u32 = 12u;
const SOP_FLOORS: u32 = 13u;
const SOP_EXP: u32 = 14u;
const SOP_LOGC: u32 = 15u;
const SOP_LOG: u32 = 16u;
const SOP_RCPC: u32 = 17u;
const SOP_RCPF: u32 = 18u;
const SOP_RCP: u32 = 19u;
const SOP_RSQC: u32 = 20u;
const SOP_RSQF: u32 = 21u;
const SOP_RSQ: u32 = 22u;
const SOP_SUBS: u32 = 25u;
const SOP_SUBS_PREV: u32 = 26u;
const SOP_SETP_EQ: u32 = 27u;
const SOP_SETP_NE: u32 = 28u;
const SOP_SETP_GT: u32 = 29u;
const SOP_SETP_GE: u32 = 30u;
const SOP_SETP_INV: u32 = 31u;
const SOP_SETP_POP: u32 = 32u;
const SOP_SETP_CLR: u32 = 33u;
const SOP_SETP_RSTR: u32 = 34u;
const SOP_KILLS_EQ: u32 = 35u;
const SOP_KILLS_GT: u32 = 36u;
const SOP_KILLS_GE: u32 = 37u;
const SOP_KILLS_NE: u32 = 38u;
const SOP_KILLS_ONE: u32 = 39u;
const SOP_SQRT: u32 = 40u;
const SOP_SIN: u32 = 48u;
const SOP_COS: u32 = 49u;
const SOP_RETAIN_PREV: u32 = 50u;
// Fetch a vec4 ALU source. For the MVP the low 7 bits of `idx` are used as a
// plain r# register number; the c# selector, swizzle and modifiers are not
// looked at yet (M4+ will decode the full operand header).
fn read_src(idx: u32) -> vec4<f32> {
    let reg = idx & 0x7Fu;
    return registers[reg];
}
// Vector ALU executor. Evaluates vector opcode `op` on the three vec4
// sources and returns the value for the vector destination.
// Side effects: KILL_* ops set `kill_flag` when any lane matches; CUBE,
// MAX_A and unrecognised opcodes OR REJECT_ALU_VEC into `reject_mask`.
fn exec_vector_op(op: u32, a: vec4<f32>, b: vec4<f32>, c: vec4<f32>) -> vec4<f32> {
    let zeros = vec4<f32>(0.0);
    let ones = vec4<f32>(1.0);
    switch op {
        case VOP_ADD: { return a + b; }
        case VOP_MUL: { return a * b; }
        case VOP_MAX: { return max(a, b); }
        case VOP_MIN: { return min(a, b); }
        // Set-on-compare: each lane becomes 1.0 where the test holds, else 0.0.
        case VOP_SEQ: { return select(zeros, ones, a == b); }
        case VOP_SGT: { return select(zeros, ones, a > b); }
        case VOP_SGE: { return select(zeros, ones, a >= b); }
        case VOP_SNE: { return select(zeros, ones, a != b); }
        case VOP_FRC: { return fract(a); }
        case VOP_TRUNC: { return trunc(a); }
        case VOP_FLOOR: { return floor(a); }
        case VOP_MAD: { return a * b + c; }
        // Conditional select per lane: dst = test(src0 vs 0) ? src1 : src2.
        case VOP_CND_EQ: { return select(c, b, a == zeros); }
        case VOP_CND_GE: { return select(c, b, a >= zeros); }
        case VOP_CND_GT: { return select(c, b, a > zeros); }
        // Dot products replicate the scalar result into every lane.
        case VOP_DP4: {
            return vec4<f32>(dot(a, b));
        }
        case VOP_DP3: {
            return vec4<f32>(dot(a.xyz, b.xyz));
        }
        case VOP_DP2_ADD: {
            let sum = a.x * b.x + a.y * b.y + c.x;
            return vec4<f32>(sum);
        }
        case VOP_MAX4: {
            // Largest of the four lanes of src0, replicated.
            let hi_xy = max(a.x, a.y);
            let hi_zw = max(a.z, a.w);
            return vec4<f32>(max(hi_xy, hi_zw));
        }
        // Kill ops: if any lane passes the test, flag the pixel for discard
        // and return all ones; otherwise return all zeros.
        case VOP_KILL_EQ: {
            if any(a == b) {
                kill_flag = true;
                return ones;
            }
            return zeros;
        }
        case VOP_KILL_GT: {
            if any(a > b) {
                kill_flag = true;
                return ones;
            }
            return zeros;
        }
        case VOP_KILL_GE: {
            if any(a >= b) {
                kill_flag = true;
                return ones;
            }
            return zeros;
        }
        case VOP_KILL_NE: {
            if any(a != b) {
                kill_flag = true;
                return ones;
            }
            return zeros;
        }
        case VOP_DST: {
            // dest = (1, src0.y * src1.y, src0.z, src1.w)
            return vec4<f32>(1.0, a.y * b.y, a.z, b.w);
        }
        case VOP_CUBE, VOP_MAX_A: {
            // Neither op is implemented faithfully: both are approximated by
            // a plain per-lane max(src0, src1) and flagged in `reject_mask`.
            reject_mask |= REJECT_ALU_VEC;
            return max(a, b);
        }
        default: {
            // Unsupported opcode: flag it and produce zeros.
            reject_mask |= REJECT_ALU_VEC;
            return zeros;
        }
    }
}
// Scalar ALU executor. Evaluates scalar opcode `op` and returns the new
// value of the `ps` chain.
//   `src_a` / `src_b` — scalar operands; only the two-operand ops
//                       (ADDS/MULS/MAXS/MINS/SUBS) read `src_b`.
//   `prev`            — the current `ps` value, used by the *_PREV ops and
//                       by RETAIN_PREV.
// Side effects: SETP_* ops write `predicate`, KILLS_* ops may set
// `kill_flag`, and unrecognised opcodes OR REJECT_ALU_SCL into `reject_mask`.
fn exec_scalar_op(op: u32, src_a: f32, src_b: f32, prev: f32) -> f32 {
    let a_is_zero = src_a == 0.0;
    switch op {
        case SOP_ADDS: { return src_a + src_b; }
        case SOP_ADDS_PREV: { return src_a + prev; }
        case SOP_MULS: { return src_a * src_b; }
        case SOP_MULS_PREV: { return src_a * prev; }
        case SOP_SUBS: { return src_a - src_b; }
        case SOP_SUBS_PREV: { return src_a - prev; }
        case SOP_MAXS: { return max(src_a, src_b); }
        case SOP_MINS: { return min(src_a, src_b); }
        // Set-on-compare against zero: 1.0 when the test holds, else 0.0.
        case SOP_SEQS: {
            if a_is_zero { return 1.0; }
            return 0.0;
        }
        case SOP_SGTS: {
            if src_a > 0.0 { return 1.0; }
            return 0.0;
        }
        case SOP_SGES: {
            if src_a >= 0.0 { return 1.0; }
            return 0.0;
        }
        case SOP_SNES: {
            if src_a != 0.0 { return 1.0; }
            return 0.0;
        }
        case SOP_FRCS: { return fract(src_a); }
        case SOP_TRUNCS: { return trunc(src_a); }
        case SOP_FLOORS: { return floor(src_a); }
        case SOP_EXP: {
            // 2^src; exp2(0) already yields the 1.0 canary specifies for 0.
            return exp2(src_a);
        }
        case SOP_LOG, SOP_LOGC: {
            // Both variants compute log2 with an exact 0.0 for an input of
            // 1.0. LOGC's -INF clamp is not modelled, and inputs <= 0 are
            // passed to log2 unchanged.
            if src_a == 1.0 { return 0.0; }
            return log2(src_a);
        }
        case SOP_RCP, SOP_RCPC, SOP_RCPF: {
            // Reciprocal, returning 0.0 for a zero input. The clamp variants'
            // INF/NaN handling is not modelled.
            if src_a != 0.0 { return 1.0 / src_a; }
            return 0.0;
        }
        case SOP_RSQ, SOP_RSQC, SOP_RSQF: {
            // Reciprocal square root; non-positive inputs give 0.0.
            if src_a > 0.0 { return inverseSqrt(src_a); }
            return 0.0;
        }
        case SOP_SQRT: {
            // Square root; negative inputs give 0.0.
            if src_a >= 0.0 { return sqrt(src_a); }
            return 0.0;
        }
        case SOP_SIN: { return sin(src_a); }
        case SOP_COS: { return cos(src_a); }
        // Predicate writes: update `predicate` and return the value the
        // instruction stores (0.0 when the predicate was set, per canary).
        case SOP_SETP_EQ: {
            predicate = a_is_zero;
            if a_is_zero { return 0.0; }
            return 1.0;
        }
        case SOP_SETP_NE: {
            let hit = src_a != 0.0;
            predicate = hit;
            if hit { return 0.0; }
            return 1.0;
        }
        case SOP_SETP_GT: {
            let hit = src_a > 0.0;
            predicate = hit;
            if hit { return 0.0; }
            return 1.0;
        }
        case SOP_SETP_GE: {
            let hit = src_a >= 0.0;
            predicate = hit;
            if hit { return 0.0; }
            return 1.0;
        }
        case SOP_SETP_INV: {
            // Predicate set iff src == 1.0. Otherwise a zero input becomes
            // 1.0 and any other value passes through.
            let hit = src_a == 1.0;
            predicate = hit;
            if hit { return 0.0; }
            if a_is_zero { return 1.0; }
            return src_a;
        }
        case SOP_SETP_POP: {
            // Decrement; the predicate is set once the counter reaches <= 0.
            let popped = src_a - 1.0;
            let hit = popped <= 0.0;
            predicate = hit;
            if hit { return 0.0; }
            return popped;
        }
        case SOP_SETP_CLR: {
            predicate = false;
            // Bit pattern of FLT_MAX, used as the sentinel result.
            return bitcast<f32>(0x7F7FFFFFu);
        }
        case SOP_SETP_RSTR: {
            predicate = a_is_zero;
            if a_is_zero { return 0.0; }
            return src_a;
        }
        // Pixel kills: raise `kill_flag` when the test holds (the fragment
        // entry point issues the actual `discard`); the result is 1.0 when
        // killed and 0.0 otherwise.
        case SOP_KILLS_EQ: {
            kill_flag = kill_flag || a_is_zero;
            return select(0.0, 1.0, a_is_zero);
        }
        case SOP_KILLS_GT: {
            let hit = src_a > 0.0;
            kill_flag = kill_flag || hit;
            return select(0.0, 1.0, hit);
        }
        case SOP_KILLS_GE: {
            let hit = src_a >= 0.0;
            kill_flag = kill_flag || hit;
            return select(0.0, 1.0, hit);
        }
        case SOP_KILLS_NE: {
            let hit = src_a != 0.0;
            kill_flag = kill_flag || hit;
            return select(0.0, 1.0, hit);
        }
        case SOP_KILLS_ONE: {
            let hit = src_a == 1.0;
            kill_flag = kill_flag || hit;
            return select(0.0, 1.0, hit);
        }
        case SOP_RETAIN_PREV: { return prev; }
        default: {
            // Unsupported opcode: flag it and produce 0.0.
            reject_mask |= REJECT_ALU_SCL;
            return 0.0;
        }
    }
}
// Execute the ALU instruction triple at triple index `t`, taken from
// `vs_ucode` when `is_vertex` is true and from `ps_ucode` otherwise.
// The vector pipe runs first and writes its masked result; the scalar pipe
// then runs, updates `ps`, and writes its replicated result.
fn interpret_alu(t: u32, is_vertex: bool) {
    var word0: u32;
    var word1: u32;
    var word2: u32;
    if is_vertex {
        word0 = vs_instr_dword(t, 0u);
        word1 = vs_instr_dword(t, 1u);
        word2 = vs_instr_dword(t, 2u);
    } else {
        word0 = ps_instr_dword(t, 0u);
        word1 = ps_instr_dword(t, 1u);
        word2 = ps_instr_dword(t, 2u);
    }
    // word1 carries the per-operand swizzle / negate / abs / c-vs-r flags.
    // The MVP ignores it and treats every source as a full r# register.
    _ = word1;

    // Predication (word0 bits 27 and 28): a predicated instruction is
    // skipped when the live predicate differs from the required condition.
    let uses_predicate = (word0 & 0x08000000u) != 0u;
    let wanted_predicate = (word0 & 0x10000000u) != 0u;
    if uses_predicate && (predicate != wanted_predicate) {
        return;
    }

    // Remaining fields, laid out to match `xenia_gpu::ucode::alu::decode_alu`.
    // NOTE(review): the scalar opcode (word2 bits [11:6]) and the scalar
    // write mask (word2 bits [11:8]) overlap — confirm against decode_alu.
    let vector_opcode = word2 & 0x3Fu;
    let scalar_opcode = (word2 >> 6u) & 0x3Fu;
    let scalar_mask = (word2 >> 8u) & 0xFu;
    let vector_mask = (word2 >> 12u) & 0xFu;
    let vector_target = (word2 >> 16u) & 0x7Fu;
    let scalar_target = (word2 >> 24u) & 0x7Fu;
    let ps_is_source = (word0 & 0x04000000u) != 0u;

    // All three sources are read before anything is written back.
    let operand_a = read_src(word0 & 0xFFu);
    let operand_b = read_src((word0 >> 8u) & 0xFFu);
    let operand_c = read_src((word0 >> 16u) & 0xFFu);

    // Vector pipe.
    let vector_value = exec_vector_op(vector_opcode, operand_a, operand_b, operand_c);
    if vector_mask != 0u {
        write_reg_masked(vector_target, vector_mask, vector_value);
    }

    // Scalar pipe: the first operand is src_a.x, or the running `ps` when
    // word0 bit 26 is set; the second operand is always src_b.x.
    var scalar_in = operand_a.x;
    if ps_is_source {
        scalar_in = ps;
    }
    ps = exec_scalar_op(scalar_opcode, scalar_in, operand_b.x, ps);
    if scalar_mask != 0u {
        write_reg_masked(scalar_target, scalar_mask, vec4<f32>(ps));
    }
}
// Xenos VertexFormat values from `xenos.h:641`.
// interpret_vertex_fetch switches on these. 10_11_11, 11_11_10, 32, 32_32
// and 32_32_32_32 are declared here but have no case there, so they take its
// `default` (rejected) arm.
const VFMT_8_8_8_8: u32 = 6u;
const VFMT_2_10_10_10: u32 = 7u;
const VFMT_10_11_11: u32 = 16u;
const VFMT_11_11_10: u32 = 17u;
const VFMT_16_16: u32 = 25u;
const VFMT_16_16_16_16: u32 = 26u;
const VFMT_16_16_FLOAT: u32 = 31u;
const VFMT_16_16_16_16_FLOAT:u32 = 32u;
const VFMT_32: u32 = 33u;
const VFMT_32_32: u32 = 34u;
const VFMT_32_32_32_32: u32 = 35u;
const VFMT_32_FLOAT: u32 = 36u;
const VFMT_32_32_FLOAT: u32 = 37u;
const VFMT_32_32_32_32_FLOAT:u32 = 38u;
const VFMT_32_32_32_FLOAT: u32 = 57u;
// True when the `dwords`-long span starting at `addr` lies entirely inside a
// buffer of `n` dwords. Written without `addr + dwords` so that a huge
// `addr` cannot wrap around u32 and slip past the check.
fn vb_span_in_bounds(addr: u32, dwords: u32, n: u32) -> bool {
    return dwords <= n && addr <= n - dwords;
}
// Vertex fetch: loads one attribute from `vertex_buffer` into a register.
//
// Fields as decoded below (instruction triple `t` of vs_ucode):
//   w0 [9:5]    fetch-constant index
//   w0 [16:10]  destination register
//   w0 [23:17]  source register; its .x lane holds the vertex index
//   w1 [21:16]  Xenos VertexFormat
//   w2 [7:0]    stride in dwords (0 is treated as 4)
// NOTE(review): the previous header cited canary's VertexFetchInstruction
// (`ucode.h:690`) with src_reg at w0[10:5], dst_reg at w0[17:11] and
// const_index above that — a different field order from the shifts used
// here. Confirm against the host-side packer which one is intended.
//
// Pragmatic subset: the offset, exp_adjust, swizzle, sign/normalisation and
// endian fields are not decoded. Each handled format uses one fixed
// interpretation (see the per-case comments). Unhandled formats set
// `REJECT_VFETCH_FMT` and load (0, 0, 0, 1). A fetch whose address range
// falls outside `vertex_buffer` also loads (0, 0, 0, 1).
fn interpret_vertex_fetch(t: u32) {
    let w0 = vs_instr_dword(t, 0u);
    let w1 = vs_instr_dword(t, 1u);
    let w2 = vs_instr_dword(t, 2u);
    let fetch_const = (w0 >> 5u) & 0x1Fu;
    let dst_reg = (w0 >> 10u) & 0x7Fu;
    let src_reg = (w0 >> 17u) & 0x7Fu;
    let format = (w1 >> 16u) & 0x3Fu;
    let stride = w2 & 0xFFu;
    // Vertex-fetch constant: dword 0 carries (type[1:0], address[31:2]);
    // dword 1 carries (endian[1:0], size[25:2]). Only dword 0 is used:
    // dropping its two low bits and shifting gives the base in dwords.
    let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
    let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;
    let vidx = u32(registers[src_reg].x);
    // Per-vertex step in dwords; a stride of 0 falls back to 4 dwords.
    let effective_stride = select(stride, 4u, stride == 0u);
    // u32 arithmetic wraps, so `addr` can be any value; every read below is
    // guarded by the overflow-free span check.
    let addr = base_dwords + vidx * effective_stride;
    let n = arrayLength(&vertex_buffer);
    var result: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);
    switch format {
        case VFMT_32_32_32_32_FLOAT: {
            if vb_span_in_bounds(addr, 4u, n) {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    bitcast<f32>(vertex_buffer[addr + 2u]),
                    bitcast<f32>(vertex_buffer[addr + 3u]),
                );
            }
        }
        case VFMT_32_32_32_FLOAT: {
            if vb_span_in_bounds(addr, 3u, n) {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    bitcast<f32>(vertex_buffer[addr + 2u]),
                    1.0,
                );
            }
        }
        case VFMT_32_32_FLOAT: {
            if vb_span_in_bounds(addr, 2u, n) {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    0.0,
                    1.0,
                );
            }
        }
        case VFMT_32_FLOAT: {
            if vb_span_in_bounds(addr, 1u, n) {
                result = vec4<f32>(bitcast<f32>(vertex_buffer[addr]), 0.0, 0.0, 1.0);
            }
        }
        case VFMT_8_8_8_8: {
            // Treated as four unsigned-normalised bytes.
            if vb_span_in_bounds(addr, 1u, n) {
                result = unpack4x8unorm(vertex_buffer[addr]);
            }
        }
        case VFMT_16_16_FLOAT: {
            if vb_span_in_bounds(addr, 1u, n) {
                let h = unpack2x16float(vertex_buffer[addr]);
                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
            }
        }
        case VFMT_16_16_16_16_FLOAT: {
            if vb_span_in_bounds(addr, 2u, n) {
                let h0 = unpack2x16float(vertex_buffer[addr]);
                let h1 = unpack2x16float(vertex_buffer[addr + 1u]);
                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
            }
        }
        case VFMT_16_16: {
            if vb_span_in_bounds(addr, 1u, n) {
                // Default to signed normalized; unsigned variants differ
                // only for sign-extension and are less common on Xenos VBs.
                let h = unpack2x16snorm(vertex_buffer[addr]);
                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
            }
        }
        case VFMT_16_16_16_16: {
            // Same signed-normalised default as VFMT_16_16.
            if vb_span_in_bounds(addr, 2u, n) {
                let h0 = unpack2x16snorm(vertex_buffer[addr]);
                let h1 = unpack2x16snorm(vertex_buffer[addr + 1u]);
                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
            }
        }
        case VFMT_2_10_10_10: {
            // Unsigned-normalised 10-bit R/G/B in the low 30 bits plus a
            // 2-bit A on top. Signed and unnormalised variants are not
            // handled.
            if vb_span_in_bounds(addr, 1u, n) {
                let packed = vertex_buffer[addr];
                let r = f32(packed & 0x3FFu) / 1023.0;
                let g = f32((packed >> 10u) & 0x3FFu) / 1023.0;
                let b = f32((packed >> 20u) & 0x3FFu) / 1023.0;
                let a = f32((packed >> 30u) & 0x3u) / 3.0;
                result = vec4<f32>(r, g, b, a);
            }
        }
        default: {
            // Unhandled format: flag it; `result` keeps its (0, 0, 0, 1)
            // initial value.
            reject_mask |= REJECT_VFETCH_FMT;
        }
    }
    registers[dst_reg] = result;
}
// M6 — minimum-viable texture fetch. Whatever the instruction asks for, the
// sample comes from the single texture/sampler pair bound at group(1) (a 1×1
// magenta dummy until the P5 texture cache lands). The coordinates are the
// .xy lanes of the source register (w0 bits [23:17]) and the sample
// overwrites the destination register (w0 bits [16:10]).
// `textureSampleLevel` takes an explicit LOD and needs no implicit
// derivatives, so the same code serves the vertex and fragment stages.
fn interpret_texture_fetch(t: u32, is_vertex: bool) {
    var header: u32 = 0u;
    if is_vertex {
        header = vs_instr_dword(t, 0u);
    } else {
        header = ps_instr_dword(t, 0u);
    }
    let target = (header >> 10u) & 0x7Fu;
    let coord_reg = (header >> 17u) & 0x7Fu;
    let coords = registers[coord_reg].xy;
    registers[target] = textureSampleLevel(xenos_tex, xenos_samp, coords, 0.0);
}
// Run the `count` instruction triples of a vertex-shader Exec clause,
// starting at triple index `address`.
//   sequence: 2-bit-per-triple bitmap. Bit 0 of a pair = serialize flag
//             (ignored in the MVP); bit 1 = is-fetch.
// `sequence` is a 32-bit word, so only the first 16 triples can have flag
// bits. WGSL reduces a run-time shift amount modulo 32, which would make
// triple 16 and later re-read the flags of earlier triples; those positions
// are therefore treated as having no flags set (i.e. as ALU instructions).
// Fetch instructions are dispatched on the low 5 bits of their first dword:
// 0 = vertex fetch, 1 = texture fetch; any other fetch opcode is skipped.
fn exec_vs(address: u32, count: u32, sequence: u32) {
    for (var i: u32 = 0u; i < count; i = i + 1u) {
        let t = address + i;
        let fetch_bit = i * 2u + 1u;
        let is_fetch = (fetch_bit < 32u) && (((sequence >> fetch_bit) & 1u) != 0u);
        if is_fetch {
            let opcode = vs_instr_dword(t, 0u) & 0x1Fu;
            // 0x00 = vertex fetch, 0x01 = texture fetch.
            if opcode == 0u {
                interpret_vertex_fetch(t);
            } else if opcode == 1u {
                interpret_texture_fetch(t, true);
            }
        } else {
            interpret_alu(t, true);
        }
    }
}
// Run the `count` instruction triples of a pixel-shader Exec clause,
// starting at triple index `address`. `sequence` holds two flag bits per
// triple; the upper bit of each pair marks a fetch instruction. Every fetch
// is handled as a texture fetch (the fetch opcode is not inspected here).
// `sequence` is a 32-bit word, so only the first 16 triples can have flag
// bits. WGSL reduces a run-time shift amount modulo 32, which would make
// triple 16 and later re-read the flags of earlier triples; those positions
// are therefore treated as having no flags set (i.e. as ALU instructions).
fn exec_ps(address: u32, count: u32, sequence: u32) {
    for (var i: u32 = 0u; i < count; i = i + 1u) {
        let t = address + i;
        let fetch_bit = i * 2u + 1u;
        let is_fetch = (fetch_bit < 32u) && (((sequence >> fetch_bit) & 1u) != 0u);
        if is_fetch {
            interpret_texture_fetch(t, false);
        } else {
            interpret_alu(t, false);
        }
    }
}
// Put every piece of per-invocation interpreter state back to its baseline:
// scalars and flags cleared, all 128 registers zeroed, and the four loop
// bookkeeping slots emptied. Both entry points call this first.
fn reset_state() {
    ps = 0.0;
    predicate = false;
    kill_flag = false;
    current_alloc = 0u;
    reject_mask = 0u;
    loop_depth = 0u;

    let cleared = vec4<f32>(0.0);
    var reg: u32 = 0u;
    loop {
        if reg >= 128u { break; }
        registers[reg] = cleared;
        reg = reg + 1u;
    }

    var level: u32 = 0u;
    loop {
        if level >= 4u { break; }
        loop_starts[level] = 0u;
        loop_counters[level] = 0u;
        level = level + 1u;
    }
}
// ── Stage entry points.
// M7 register slots for exports. VS writes position at oPos (convention:
// Xenos export 0 within an `Alloc(Position)` range lands in registers[32])
// and a set of interpolators. We track both via `current_alloc`: writes
// inside each alloc range are tagged and copied out at Exit.
// NOTE(review): in the code visible here `current_alloc` is only assigned by
// the CF walkers; no tagging or copy-out step reads it. vs_main simply
// outputs whatever registers[32] and registers[33] hold after the walk.
const OPOS_REG: u32 = 32u; // synthetic slot used by the interpreter
const OCOLOR_REG: u32 = 33u; // color0 scratch slot
// Vertex entry point. Resets the interpreter, seeds r0 and the two export
// slots, runs the vertex shader's CF program, and then outputs
// registers[OPOS_REG] as the position and registers[OCOLOR_REG] as the colour.
@vertex
fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {
    reset_state();

    // r0.x carries the vertex index so a shader (or the fallback below) can
    // use it without a real vertex fetch.
    let vertex_f = f32(vidx);
    registers[0] = vec4<f32>(vertex_f, 0.0, 0.0, 1.0);

    // Procedural fallback position: vertices are spread around a circle of
    // radius 0.35, so a shader that never writes oPos still rasterises
    // instead of collapsing every vertex onto one point.
    let vertex_total = max(draw_ctx.vertex_count, 1u);
    let angle = (vertex_f / f32(vertex_total)) * 6.2831853;
    registers[OPOS_REG] = vec4<f32>(cos(angle) * 0.35, sin(angle) * 0.35, 0.0, 1.0);

    // Procedural fallback colour: a hue that varies with the draw index.
    let d = f32(draw_ctx.draw_index);
    let fallback_r = 0.5 + 0.5 * sin(d * 0.37);
    let fallback_g = 0.5 + 0.5 * sin(d * 0.51 + 2.0);
    let fallback_b = 0.5 + 0.5 * sin(d * 0.73 + 4.0);
    registers[OCOLOR_REG] = vec4<f32>(fallback_r, fallback_g, fallback_b, 1.0);

    // Dead-binding guard: an always-zero term derived from vertex_buffer[0]
    // keeps the VERTEX-stage-only binding referenced by this entry point.
    let vb_live = f32(vertex_buffer[0]) * 0.0;

    // Interpret the vertex shader's control-flow program.
    walk_cf_vs();

    let position = registers[OPOS_REG];
    let base_color = registers[OCOLOR_REG];
    let color = vec4<f32>(base_color.rgb + vec3<f32>(vb_live), base_color.a);
    return VsOut(position, color);
}
// Fragment entry point. Resets the interpreter, runs the pixel shader's CF
// program, and discards the fragment if any kill op fired. Surviving
// fragments output the colour interpolated from the vertex stage.
@fragment
fn fs_main(in: VsOut) -> FsOut {
    reset_state();
    walk_cf_ps();
    // The `kill*` / `kills_*` opcodes only record their verdict in
    // `kill_flag`; the single `discard` lives here at the entry level.
    if kill_flag {
        discard;
    }
    return FsOut(in.color);
}
// ── CF walker per stage. Shared logic: LOOP_START/LOOP_END iterate up to 16×
// from `xenos_consts.loop_consts[loop_id]`, COND_JMP honours the instruction's
// predicate flags, COND_CALL / RETURN set the reject mask (needs a call stack
// we don't have). A hard iteration cap keeps the GPU from hanging on
// malformed or extreme shaders.
// The cap counts CF clauses visited by one walker invocation (each trip
// round the walker's loop handles exactly one clause), not ALU instructions.
const CF_WALKER_MAX_ITER: u32 = 4096u;
// Returns the CF index just past the LOOP_END that closes the LOOP_START at
// index `start`, counting nested LOOP_START/LOOP_END pairs on the way.
// Returns `cf_n` (which ends the walk) when no matching LOOP_END exists.
fn vs_skip_loop(start: u32, cf_n: u32) -> u32 {
    var nesting: u32 = 0u;
    var i: u32 = start + 1u;
    loop {
        if i >= cf_n { break; }
        let kind = vs_cf_kind(i);
        if kind == CF_KIND_LOOP_START {
            nesting = nesting + 1u;
        } else if kind == CF_KIND_LOOP_END {
            if nesting == 0u {
                return i + 1u;
            }
            nesting = nesting - 1u;
        }
        i = i + 1u;
    }
    return cf_n;
}
// Walk the vertex shader's control-flow table from entry 0 until an
// EXEC_END / EXIT clause, the end of the table, or CF_WALKER_MAX_ITER
// visited clauses — whichever comes first.
//   EXEC / EXEC_END  run the clause's triples via exec_vs (aux[7:0] = count,
//                    aux[31:8] = sequence bitmap, primary = first triple).
//   ALLOC            records `primary` in `current_alloc`.
//   LOOP_START       trip count = low 8 bits of loop_consts[aux & 0x1F],
//                    clamped to 16 (REJECT_LOOP_OVERFLOW when clamped).
//   LOOP_END         repeats the body until the counter runs out.
//   COND_JMP         jumps to `primary` when unpredicated (aux bit 0 clear)
//                    or when `predicate` equals aux bit 1.
//   COND_CALL/RETURN unsupported; flagged with REJECT_CF_CALL.
//   anything else    flagged with REJECT_CF_JUMP.
fn walk_cf_vs() {
    let cf_n = vs_cf_count();
    var cf_i: u32 = 0u;
    var iter: u32 = 0u;
    loop {
        if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
        iter = iter + 1u;
        let kind = vs_cf_kind(cf_i);
        let primary = vs_cf_primary(cf_i);
        let aux = vs_cf_aux(cf_i);
        // `advance` is cleared by clauses that set `cf_i` themselves;
        // `stop` ends the walk after the current clause.
        var advance: bool = true;
        var stop: bool = false;
        switch kind {
            case CF_KIND_EXEC, CF_KIND_EXEC_END: {
                let count = aux & 0xFFu;
                let sequence = aux >> 8u;
                exec_vs(primary, count, sequence);
                if kind == CF_KIND_EXEC_END { stop = true; }
            }
            case CF_KIND_ALLOC: { current_alloc = primary; }
            case CF_KIND_EXIT: { stop = true; }
            case CF_KIND_LOOP_START: {
                let loop_id = aux & 0x1Fu;
                var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu;
                if loop_count > 16u {
                    loop_count = 16u;
                    reject_mask |= REJECT_LOOP_OVERFLOW;
                }
                if loop_count == 0u || loop_depth >= 4u {
                    // No frame can be pushed for this loop. Running the body
                    // anyway would let its LOOP_END decrement or pop the
                    // enclosing loop's frame, so skip the whole body.
                    if loop_count != 0u {
                        // Nesting deeper than the 4 tracked levels.
                        reject_mask |= REJECT_LOOP_OVERFLOW;
                    }
                    cf_i = vs_skip_loop(cf_i, cf_n);
                    advance = false;
                } else {
                    loop_starts[loop_depth] = cf_i;
                    loop_counters[loop_depth] = loop_count;
                    loop_depth = loop_depth + 1u;
                }
            }
            case CF_KIND_LOOP_END: {
                if loop_depth > 0u {
                    let d = loop_depth - 1u;
                    if loop_counters[d] > 1u {
                        // More trips left: resume right after the LOOP_START.
                        loop_counters[d] = loop_counters[d] - 1u;
                        cf_i = loop_starts[d] + 1u;
                        advance = false;
                    } else {
                        // Last trip done: pop the frame and fall out.
                        loop_counters[d] = 0u;
                        loop_depth = d;
                    }
                }
            }
            case CF_KIND_COND_JMP: {
                let pred_bits = aux;
                let is_pred = (pred_bits & 1u) != 0u;
                let pred_cnd = (pred_bits & 2u) != 0u;
                if !is_pred || predicate == pred_cnd {
                    cf_i = primary;
                    advance = false;
                }
            }
            case CF_KIND_COND_CALL, CF_KIND_RETURN: {
                // No call stack — mark and continue.
                reject_mask |= REJECT_CF_CALL;
            }
            default: { reject_mask |= REJECT_CF_JUMP; }
        }
        if stop { break; }
        if advance { cf_i = cf_i + 1u; }
    }
}
// Returns the CF index just past the LOOP_END that closes the LOOP_START at
// index `start`, counting nested LOOP_START/LOOP_END pairs on the way.
// Returns `cf_n` (which ends the walk) when no matching LOOP_END exists.
fn ps_skip_loop(start: u32, cf_n: u32) -> u32 {
    var nesting: u32 = 0u;
    var i: u32 = start + 1u;
    loop {
        if i >= cf_n { break; }
        let kind = ps_cf_kind(i);
        if kind == CF_KIND_LOOP_START {
            nesting = nesting + 1u;
        } else if kind == CF_KIND_LOOP_END {
            if nesting == 0u {
                return i + 1u;
            }
            nesting = nesting - 1u;
        }
        i = i + 1u;
    }
    return cf_n;
}
// Walk the pixel shader's control-flow table from entry 0 until an
// EXEC_END / EXIT clause, the end of the table, or CF_WALKER_MAX_ITER
// visited clauses — whichever comes first.
//   EXEC / EXEC_END  run the clause's triples via exec_ps (aux[7:0] = count,
//                    aux[31:8] = sequence bitmap, primary = first triple).
//   ALLOC            records `primary` in `current_alloc`.
//   LOOP_START       trip count = low 8 bits of loop_consts[aux & 0x1F],
//                    clamped to 16 (REJECT_LOOP_OVERFLOW when clamped).
//   LOOP_END         repeats the body until the counter runs out.
//   COND_JMP         jumps to `primary` when unpredicated (aux bit 0 clear)
//                    or when `predicate` equals aux bit 1.
//   COND_CALL/RETURN unsupported; flagged with REJECT_CF_CALL.
//   anything else    flagged with REJECT_CF_JUMP.
fn walk_cf_ps() {
    let cf_n = ps_cf_count();
    var cf_i: u32 = 0u;
    var iter: u32 = 0u;
    loop {
        if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
        iter = iter + 1u;
        let kind = ps_cf_kind(cf_i);
        let primary = ps_cf_primary(cf_i);
        let aux = ps_cf_aux(cf_i);
        // `advance` is cleared by clauses that set `cf_i` themselves;
        // `stop` ends the walk after the current clause.
        var advance: bool = true;
        var stop: bool = false;
        switch kind {
            case CF_KIND_EXEC, CF_KIND_EXEC_END: {
                let count = aux & 0xFFu;
                let sequence = aux >> 8u;
                exec_ps(primary, count, sequence);
                if kind == CF_KIND_EXEC_END { stop = true; }
            }
            case CF_KIND_ALLOC: { current_alloc = primary; }
            case CF_KIND_EXIT: { stop = true; }
            case CF_KIND_LOOP_START: {
                let loop_id = aux & 0x1Fu;
                var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu;
                if loop_count > 16u {
                    loop_count = 16u;
                    reject_mask |= REJECT_LOOP_OVERFLOW;
                }
                if loop_count == 0u || loop_depth >= 4u {
                    // No frame can be pushed for this loop. Running the body
                    // anyway would let its LOOP_END decrement or pop the
                    // enclosing loop's frame, so skip the whole body.
                    if loop_count != 0u {
                        // Nesting deeper than the 4 tracked levels.
                        reject_mask |= REJECT_LOOP_OVERFLOW;
                    }
                    cf_i = ps_skip_loop(cf_i, cf_n);
                    advance = false;
                } else {
                    loop_starts[loop_depth] = cf_i;
                    loop_counters[loop_depth] = loop_count;
                    loop_depth = loop_depth + 1u;
                }
            }
            case CF_KIND_LOOP_END: {
                if loop_depth > 0u {
                    let d = loop_depth - 1u;
                    if loop_counters[d] > 1u {
                        // More trips left: resume right after the LOOP_START.
                        loop_counters[d] = loop_counters[d] - 1u;
                        cf_i = loop_starts[d] + 1u;
                        advance = false;
                    } else {
                        // Last trip done: pop the frame and fall out.
                        loop_counters[d] = 0u;
                        loop_depth = d;
                    }
                }
            }
            case CF_KIND_COND_JMP: {
                let pred_bits = aux;
                let is_pred = (pred_bits & 1u) != 0u;
                let pred_cnd = (pred_bits & 2u) != 0u;
                if !is_pred || predicate == pred_cnd {
                    cf_i = primary;
                    advance = false;
                }
            }
            case CF_KIND_COND_CALL, CF_KIND_RETURN: {
                // No call stack — mark and continue.
                reject_mask |= REJECT_CF_CALL;
            }
            default: { reject_mask |= REJECT_CF_JUMP; }
        }
        if stop { break; }
        if advance { cf_i = cf_i + 1u; }
    }
}