Files
xenia-rs/crates/xenia-gpu/src/shaders/xenos_interp.wgsl
MechaCat02 79eb52c378 xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)
First real GPU implementation. Ring/PM4 frontend (ring_view,
ring_drain, pm4) drains the command processor; gpu_system owns the
threaded backend (DrainFence RPC + parker/fence helpers from M1) and
the MMIO-mapped register block (mmio_region).

Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode
the Xbox 360 microcode, translator.rs lowers it onto the WGSL
xenos_interp interpreter shader (shaders/xenos_interp.wgsl).
shader_metrics.rs counts decode/translate work.

Render state: draw_state, primitive, render_target_cache,
texture_cache, tiled_address (Xenos's swizzled tiled-memory layout),
xenos_constants (register field constants), edram (the 10 MiB EDRAM
model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve
plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs
owns the typed GPU-resource handles the kernel hands out.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 16:29:38 +02:00

975 lines
35 KiB
WebGPU Shading Language
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// xenia-rs Xenos runtime microcode interpreter — P3b WGSL.
//
// Bindings (stable across P3b milestones):
// @group(0) @binding(0) draw_ctx (uniform, 16 B — XenosDrawConstants)
// @group(0) @binding(1) xenos_consts (storage<read>, ~9.2 KB — XenosConstants)
// @group(0) @binding(2) vs_ucode (storage<read>, packed VS shader)
// @group(0) @binding(3) ps_ucode (storage<read>, packed PS shader)
// @group(0) @binding(4) vertex_buffer (storage<read>, raw guest VB dwords)
//
// Packed shader layout (both vs_ucode & ps_ucode):
// [0] = cf_count
// [1 .. 1 + cf_count*3] = CF table: (kind, primary, aux) × cf_count
// [1 + cf_count*3 ..] = instruction triples (3 dwords each)
//
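// Current state (this file): the CF walker, the vector/scalar ALU executors
// and the vertex/texture fetch interpreters are implemented for a
// register-only operand subset (no c# constants, swizzles or modifiers yet).
// The VS still seeds a procedural-circle fallback into its export slots and
// the PS forwards the interpolated VS colour.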
// Per-draw parameters supplied by the host: four u32 fields, 16 bytes total.
struct XenosDrawConstants {
// Read by `vs_main` to vary the procedural fallback colour from draw to draw.
draw_index: u32,
// Read by `vs_main` (clamped to >= 1) as the divisor for the fallback position.
vertex_count: u32,
// Not read by any function visible in this file.
prim_kind: u32,
// Explicit padding that rounds the struct up to 16 bytes.
_pad: u32,
};
// Guest constant block, bound as read-only storage at @group(0) @binding(1).
struct XenosConstants {
// 512 vec4 ALU constants. Not read by the visible interpreter yet: `read_src`
// only addresses the register file.
alu: array<vec4<f32>, 512>,
// Fetch-constant dwords; `interpret_vertex_fetch` reads dword `fetch_const * 2`.
fetch: array<u32, 256>,
// Boolean constants; not read by any function visible in this file.
bool_consts: array<u32, 8>,
// Loop constants; the CF walkers use the low 8 bits of an entry as the trip count.
loop_consts: array<u32, 32>,
};
// Group 0: per-draw constants, guest constants, packed shaders and the raw
// vertex buffer.
@group(0) @binding(0) var<uniform> draw_ctx : XenosDrawConstants;
// `xenos_consts` is a read-only storage buffer (not uniform) because the
// block contains tightly-packed `array<u32, N>` fields — WGSL's uniform
// address space requires 16-byte element stride, which would triple the
// allocation; storage accepts the natural 4-byte stride.
@group(0) @binding(1) var<storage, read> xenos_consts : XenosConstants;
// Packed shader streams: dword 0 is the CF entry count, followed by the CF
// table (3 dwords per entry) and then the instruction triples.
@group(0) @binding(2) var<storage, read> vs_ucode : array<u32>;
@group(0) @binding(3) var<storage, read> ps_ucode : array<u32>;
// Guest vertex data addressed in dwords by `interpret_vertex_fetch`.
@group(0) @binding(4) var<storage, read> vertex_buffer : array<u32>;
// M6 — texture fetch stub. Group 1 is a single 1×1 magenta placeholder for
// all texture slots; the P5 texture cache will replace this with per-slot
// bindings.
@group(1) @binding(0) var xenos_tex : texture_2d<f32>;
@group(1) @binding(1) var xenos_samp : sampler;
// ── CF kind codes; must match `xenia_gpu::ucode::cf_kind`. ─────────────
// The walkers compare the low 8 bits of the first dword of a CF entry against
// these values; anything not handled explicitly (including CF_KIND_UNKNOWN)
// lands in the walkers' `default` arm and sets REJECT_CF_JUMP.
const CF_KIND_EXEC: u32 = 0u;
const CF_KIND_EXEC_END: u32 = 1u;
const CF_KIND_ALLOC: u32 = 2u;
const CF_KIND_EXIT: u32 = 3u;
const CF_KIND_LOOP_START: u32 = 4u;
const CF_KIND_LOOP_END: u32 = 5u;
const CF_KIND_COND_JMP: u32 = 6u;
const CF_KIND_COND_CALL: u32 = 7u;
const CF_KIND_RETURN: u32 = 8u;
const CF_KIND_UNKNOWN: u32 = 15u;
// ── Alloc-kind codes (mirrors `xenia_gpu::ucode::cf_alloc_kind`). ──────
// Not referenced by the code visible in this file: the walkers store the raw
// `primary` dword of an Alloc clause into `current_alloc` without comparing it.
const ALLOC_KIND_POSITION: u32 = 0u;
const ALLOC_KIND_INTERPOLATORS: u32 = 1u;
const ALLOC_KIND_COLORS: u32 = 2u;
// Per-invocation Xenos register file + scalar `ps` + predicate.
// All of this state is re-initialised by `reset_state` at the start of each
// entry point.
var<private> registers: array<vec4<f32>, 128>;
// Result of the most recent scalar op (`interpret_alu` stores it after every
// instruction that is not skipped by its predicate).
var<private> ps: f32;
// Written by the SETP_* scalar ops; tested by predicated ALU instructions and
// by COND_JMP clauses.
var<private> predicate: bool;
// Currently-active export alloc kind; set by Alloc clauses.
// It is written by the walkers but not read by any function visible here.
var<private> current_alloc: u32;
// P3c additions:
// `kill_flag` — set by vector/scalar kill ops; `fs_main` executes `discard`
// after the interpreter exits when it is set.
// `loop_depth`/`loop_counters`/`loop_starts` — bookkeeping for
// LoopStart/LoopEnd CF clauses: number of active loops, remaining
// iterations per loop, and the CF index of each loop's LOOP_START.
// The arrays hold 4 entries (Xenos supports up to 4 nested loops).
// `reject_mask` — bitfield of op categories we failed to interpret.
// NOTE(review): nothing visible in this file reads it back; presumably a
// later milestone surfaces it in the PS output or host diagnostics.
var<private> kill_flag: bool;
var<private> loop_depth: u32;
var<private> loop_counters: array<u32, 4>;
var<private> loop_starts: array<u32, 4>;
var<private> reject_mask: u32;
// Bits OR-ed into `reject_mask`.
// Unsupported or approximated vector ALU opcode (`exec_vector_op`).
const REJECT_ALU_VEC: u32 = 1u;
// Unsupported scalar ALU opcode (`exec_scalar_op`).
const REJECT_ALU_SCL: u32 = 2u;
// Not set by any function visible in this file.
const REJECT_TEX_NON2D: u32 = 4u;
// Vertex format without a decode path (`interpret_vertex_fetch`).
const REJECT_VFETCH_FMT: u32 = 8u;
// CF clause kind the walkers do not recognise.
const REJECT_CF_JUMP: u32 = 16u;
// COND_CALL / RETURN clause (no call stack is modelled).
const REJECT_CF_CALL: u32 = 32u;
// Loop trip count above the walkers' 16-iteration clamp.
const REJECT_LOOP_OVERFLOW:u32 = 64u;
// Vertex-stage output and fragment-stage input: clip-space position plus one
// interpolated colour at location 0.
struct VsOut {
@builtin(position) position: vec4<f32>,
@location(0) color: vec4<f32>,
};
// Fragment-stage output: a single colour attachment at location 0.
struct FsOut {
@location(0) color0: vec4<f32>,
};
// ── CF helpers: read (kind, primary, aux) from a packed ucode storage buffer.
// Dword 0 of each buffer is the CF entry count; entry `i` then occupies the
// three dwords starting at `1 + 3 * i` in (kind, primary, aux) order. Only the
// low 8 bits of the kind dword are returned.
fn vs_cf_count() -> u32 {
    return vs_ucode[0];
}
fn ps_cf_count() -> u32 {
    return ps_ucode[0];
}
fn vs_cf_kind(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return vs_ucode[entry] & 0xFFu;
}
fn vs_cf_primary(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return vs_ucode[entry + 1u];
}
fn vs_cf_aux(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return vs_ucode[entry + 2u];
}
fn ps_cf_kind(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return ps_ucode[entry] & 0xFFu;
}
fn ps_cf_primary(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return ps_ucode[entry + 1u];
}
fn ps_cf_aux(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return ps_ucode[entry + 2u];
}
// First dword after the CF table, i.e. where the instruction triples begin.
fn vs_instr_base() -> u32 {
    let cf_dwords = vs_cf_count() * 3u;
    return 1u + cf_dwords;
}
fn ps_instr_base() -> u32 {
    let cf_dwords = ps_cf_count() * 3u;
    return 1u + cf_dwords;
}
// Return dword `which` (0, 1 or 2) of the ALU/fetch instruction triple with
// index `t`. Triples are stored back to back after the CF table.
fn vs_instr_dword(t: u32, which: u32) -> u32 {
    let triple_start = vs_instr_base() + t * 3u;
    return vs_ucode[triple_start + which];
}
fn ps_instr_dword(t: u32, which: u32) -> u32 {
    let triple_start = ps_instr_base() + t * 3u;
    return ps_ucode[triple_start + which];
}
// ── Operand helpers: M3 scaffold; M4 wires them into interpret_alu.
// Rearrange the lanes of `v`. `swizzle` holds four 2-bit source-lane selectors
// packed from bit 0 upward in x, y, z, w output order; bits above the low 8
// are ignored.
fn apply_swizzle(v: vec4<f32>, swizzle: u32) -> vec4<f32> {
    let shifts = vec4<u32>(0u, 2u, 4u, 6u);
    let picks = (vec4<u32>(swizzle) >> shifts) & vec4<u32>(3u);
    return vec4<f32>(v[picks.x], v[picks.y], v[picks.z], v[picks.w]);
}
// Apply the optional source modifiers to `v`: absolute value first, then
// negation, so both flags together yield -|v|.
fn apply_modifiers(v: vec4<f32>, negate: bool, take_abs: bool) -> vec4<f32> {
    let magnitude = select(v, abs(v), take_abs);
    return select(magnitude, -magnitude, negate);
}
// Write `value` into register `idx`, touching only the lanes enabled in
// `mask` (bit 0 = x, bit 1 = y, bit 2 = z, bit 3 = w). Indices outside the
// 128-entry register file are ignored.
fn write_reg_masked(idx: u32, mask: u32, value: vec4<f32>) {
    if idx < 128u {
        let lanes = vec4<bool>(
            (mask & 1u) != 0u,
            (mask & 2u) != 0u,
            (mask & 4u) != 0u,
            (mask & 8u) != 0u,
        );
        registers[idx] = select(registers[idx], value, lanes);
    }
}
// ── Xenos ALU opcodes. Values match canary's `AluVectorOpcode` and
// `AluScalarOpcode` enums in `ucode.h:1001,1354` (also mirrored in
// `xenia_gpu::ucode::alu::{vop,sop}`).
// Vector opcodes, dispatched by `exec_vector_op`. Values with no constant
// here (20-23) take that function's `default` arm.
const VOP_ADD: u32 = 0u;
const VOP_MUL: u32 = 1u;
const VOP_MAX: u32 = 2u;
const VOP_MIN: u32 = 3u;
const VOP_SEQ: u32 = 4u;
const VOP_SGT: u32 = 5u;
const VOP_SGE: u32 = 6u;
const VOP_SNE: u32 = 7u;
const VOP_FRC: u32 = 8u;
const VOP_TRUNC: u32 = 9u;
const VOP_FLOOR: u32 = 10u;
const VOP_MAD: u32 = 11u;
const VOP_CND_EQ: u32 = 12u;
const VOP_CND_GE: u32 = 13u;
const VOP_CND_GT: u32 = 14u;
const VOP_DP4: u32 = 15u;
const VOP_DP3: u32 = 16u;
const VOP_DP2_ADD: u32 = 17u;
const VOP_CUBE: u32 = 18u;
const VOP_MAX4: u32 = 19u;
const VOP_KILL_EQ: u32 = 24u;
const VOP_KILL_GT: u32 = 25u;
const VOP_KILL_GE: u32 = 26u;
const VOP_KILL_NE: u32 = 27u;
const VOP_DST: u32 = 28u;
const VOP_MAX_A: u32 = 29u;
// Scalar opcodes, dispatched by `exec_scalar_op`. Values with no constant
// here (4, 23, 24, 41-47) take that function's `default` arm.
const SOP_ADDS: u32 = 0u;
const SOP_ADDS_PREV: u32 = 1u;
const SOP_MULS: u32 = 2u;
const SOP_MULS_PREV: u32 = 3u;
const SOP_MAXS: u32 = 5u;
const SOP_MINS: u32 = 6u;
const SOP_SEQS: u32 = 7u;
const SOP_SGTS: u32 = 8u;
const SOP_SGES: u32 = 9u;
const SOP_SNES: u32 = 10u;
const SOP_FRCS: u32 = 11u;
const SOP_TRUNCS: u32 = 12u;
const SOP_FLOORS: u32 = 13u;
const SOP_EXP: u32 = 14u;
const SOP_LOGC: u32 = 15u;
const SOP_LOG: u32 = 16u;
const SOP_RCPC: u32 = 17u;
const SOP_RCPF: u32 = 18u;
const SOP_RCP: u32 = 19u;
const SOP_RSQC: u32 = 20u;
const SOP_RSQF: u32 = 21u;
const SOP_RSQ: u32 = 22u;
const SOP_SUBS: u32 = 25u;
const SOP_SUBS_PREV: u32 = 26u;
const SOP_SETP_EQ: u32 = 27u;
const SOP_SETP_NE: u32 = 28u;
const SOP_SETP_GT: u32 = 29u;
const SOP_SETP_GE: u32 = 30u;
const SOP_SETP_INV: u32 = 31u;
const SOP_SETP_POP: u32 = 32u;
const SOP_SETP_CLR: u32 = 33u;
const SOP_SETP_RSTR: u32 = 34u;
const SOP_KILLS_EQ: u32 = 35u;
const SOP_KILLS_GT: u32 = 36u;
const SOP_KILLS_GE: u32 = 37u;
const SOP_KILLS_NE: u32 = 38u;
const SOP_KILLS_ONE: u32 = 39u;
const SOP_SQRT: u32 = 40u;
const SOP_SIN: u32 = 48u;
const SOP_COS: u32 = 49u;
const SOP_RETAIN_PREV: u32 = 50u;
// Fetch a vec4 ALU source operand. For the MVP the operand is always a
// direct r# reference: only the low 7 bits of `idx` are used, and the c#
// selector, swizzle and modifiers are ignored. M4+ will extend this to
// decode the full operand header.
fn read_src(idx: u32) -> vec4<f32> {
    let slot = idx & 0x7Fu;
    return registers[slot];
}
// Vector ALU executor: evaluates vector opcode `op` on sources `a`, `b`, `c`
// and returns the vec4 result. The caller applies the write mask.
// Side effects:
//   * VOP_KILL_* set `kill_flag` when any lane satisfies the comparison.
//   * VOP_CUBE / VOP_MAX_A are approximated by max(a, b) and flagged with
//     REJECT_ALU_VEC.
//   * Any other opcode without a case returns zero and sets REJECT_ALU_VEC.
fn exec_vector_op(op: u32, a: vec4<f32>, b: vec4<f32>, c: vec4<f32>) -> vec4<f32> {
    let zeros = vec4<f32>(0.0);
    let ones = vec4<f32>(1.0);
    // Every arm assigns `result`; arms that leave it alone yield zero.
    var result = zeros;
    switch op {
        case VOP_ADD: { result = a + b; }
        case VOP_MUL: { result = a * b; }
        case VOP_MAX: { result = max(a, b); }
        case VOP_MIN: { result = min(a, b); }
        // Set-on-compare: 1.0 per lane where the comparison holds, else 0.0.
        case VOP_SEQ: { result = select(zeros, ones, a == b); }
        case VOP_SGT: { result = select(zeros, ones, a > b); }
        case VOP_SGE: { result = select(zeros, ones, a >= b); }
        case VOP_SNE: { result = select(zeros, ones, a != b); }
        case VOP_FRC: { result = fract(a); }
        case VOP_TRUNC: { result = trunc(a); }
        case VOP_FLOOR: { result = floor(a); }
        case VOP_MAD: { result = a * b + c; }
        // Conditional select: per lane, pick src1 when src0 passes the test
        // against zero, otherwise src2.
        case VOP_CND_EQ: { result = select(c, b, a == zeros); }
        case VOP_CND_GE: { result = select(c, b, a >= zeros); }
        case VOP_CND_GT: { result = select(c, b, a > zeros); }
        // Dot products broadcast the scalar result to all four lanes.
        case VOP_DP4: { result = vec4<f32>(dot(a, b)); }
        case VOP_DP3: { result = vec4<f32>(dot(a.xyz, b.xyz)); }
        case VOP_DP2_ADD: { result = vec4<f32>(a.x * b.x + a.y * b.y + c.x); }
        case VOP_MAX4: { result = vec4<f32>(max(max(a.x, a.y), max(a.z, a.w))); }
        // Pixel kill: flag the invocation and return all ones when any lane
        // satisfies the comparison; otherwise the result stays zero.
        case VOP_KILL_EQ: {
            if any(a == b) {
                kill_flag = true;
                result = ones;
            }
        }
        case VOP_KILL_GT: {
            if any(a > b) {
                kill_flag = true;
                result = ones;
            }
        }
        case VOP_KILL_GE: {
            if any(a >= b) {
                kill_flag = true;
                result = ones;
            }
        }
        case VOP_KILL_NE: {
            if any(a != b) {
                kill_flag = true;
                result = ones;
            }
        }
        case VOP_DST: {
            // dest = (1, src0.y * src1.y, src0.z, src1.w)
            result = vec4<f32>(1.0, a.y * b.y, a.z, b.w);
        }
        case VOP_CUBE, VOP_MAX_A: {
            // Cube face projection and MAX+AR are rare in P3c's target set.
            // Both degrade to a plain max() and are reported via the mask.
            reject_mask |= REJECT_ALU_VEC;
            result = max(a, b);
        }
        default: {
            // Unsupported opcode: zero result plus diagnostic flag.
            reject_mask |= REJECT_ALU_VEC;
        }
    }
    return result;
}
// Scalar ALU executor.
//   `src_a` / `src_b` — scalar sources. Most ops read only `src_a`; the
//                       binary ops ADDS/MULS/MAXS/MINS/SUBS read both.
//   `prev`            — current `ps` chain value, used by the *_PREV ops and
//                       RETAIN_PREV.
// Returns the scalar result. SETP_* ops also write `predicate`, KILLS_* ops
// may set `kill_flag`, and an opcode without a case returns 0.0 and sets
// REJECT_ALU_SCL.
fn exec_scalar_op(op: u32, src_a: f32, src_b: f32, prev: f32) -> f32 {
    // Arms that do not assign `result` yield 0.0.
    var result: f32 = 0.0;
    switch op {
        case SOP_ADDS: { result = src_a + src_b; }
        case SOP_ADDS_PREV: { result = src_a + prev; }
        case SOP_MULS: { result = src_a * src_b; }
        case SOP_MULS_PREV: { result = src_a * prev; }
        case SOP_SUBS: { result = src_a - src_b; }
        case SOP_SUBS_PREV: { result = src_a - prev; }
        case SOP_MAXS: { result = max(src_a, src_b); }
        case SOP_MINS: { result = min(src_a, src_b); }
        // Set-on-compare against zero.
        case SOP_SEQS: { if src_a == 0.0 { result = 1.0; } }
        case SOP_SGTS: { if src_a > 0.0 { result = 1.0; } }
        case SOP_SGES: { if src_a >= 0.0 { result = 1.0; } }
        case SOP_SNES: { if src_a != 0.0 { result = 1.0; } }
        case SOP_FRCS: { result = fract(src_a); }
        case SOP_TRUNCS: { result = trunc(src_a); }
        case SOP_FLOORS: { result = floor(src_a); }
        case SOP_EXP: {
            // Base-2 exponential.
            result = exp2(src_a);
        }
        case SOP_LOG, SOP_LOGC: {
            // Both variants compute log2; an input of exactly 1.0 is pinned
            // to 0.0. No clamping is applied for inputs <= 0.
            if src_a != 1.0 {
                result = log2(src_a);
            }
        }
        case SOP_RCP, SOP_RCPC, SOP_RCPF: {
            // Reciprocal with a zero guard: 1/0 yields 0.0 here. The clamp
            // variants are not distinguished.
            if src_a != 0.0 {
                result = 1.0 / src_a;
            }
        }
        case SOP_RSQ, SOP_RSQC, SOP_RSQF: {
            // Reciprocal square root; non-positive inputs yield 0.0.
            if src_a > 0.0 {
                result = inverseSqrt(src_a);
            }
        }
        case SOP_SQRT: {
            // Negative inputs yield 0.0.
            if src_a >= 0.0 {
                result = sqrt(src_a);
            }
        }
        case SOP_SIN: { result = sin(src_a); }
        case SOP_COS: { result = cos(src_a); }
        // Predicate writes: update `predicate`, and return 0.0 when the test
        // passes or 1.0 when it fails.
        case SOP_SETP_EQ: {
            predicate = (src_a == 0.0);
            if !predicate { result = 1.0; }
        }
        case SOP_SETP_NE: {
            predicate = (src_a != 0.0);
            if !predicate { result = 1.0; }
        }
        case SOP_SETP_GT: {
            predicate = (src_a > 0.0);
            if !predicate { result = 1.0; }
        }
        case SOP_SETP_GE: {
            predicate = (src_a >= 0.0);
            if !predicate { result = 1.0; }
        }
        case SOP_SETP_INV: {
            // 1.0 sets the predicate. Otherwise it is cleared and the result
            // is the source, with 0.0 mapped to 1.0.
            predicate = (src_a == 1.0);
            if !predicate {
                if src_a == 0.0 {
                    result = 1.0;
                } else {
                    result = src_a;
                }
            }
        }
        case SOP_SETP_POP: {
            // Decrement, then set the predicate once the value is <= 0.
            let popped = src_a - 1.0;
            predicate = (popped <= 0.0);
            if !predicate { result = popped; }
        }
        case SOP_SETP_CLR: {
            predicate = false;
            // FLT_MAX sentinel, spelled as a bit pattern.
            result = bitcast<f32>(0x7F7FFFFFu);
        }
        case SOP_SETP_RSTR: {
            predicate = (src_a == 0.0);
            if !predicate { result = src_a; }
        }
        // Pixel kill: set `kill_flag` and return 1.0 when the test passes.
        // `discard` itself runs at fragment exit.
        case SOP_KILLS_EQ: {
            if src_a == 0.0 { kill_flag = true; result = 1.0; }
        }
        case SOP_KILLS_GT: {
            if src_a > 0.0 { kill_flag = true; result = 1.0; }
        }
        case SOP_KILLS_GE: {
            if src_a >= 0.0 { kill_flag = true; result = 1.0; }
        }
        case SOP_KILLS_NE: {
            if src_a != 0.0 { kill_flag = true; result = 1.0; }
        }
        case SOP_KILLS_ONE: {
            if src_a == 1.0 { kill_flag = true; result = 1.0; }
        }
        case SOP_RETAIN_PREV: { result = prev; }
        default: {
            // Unsupported opcode: zero result plus diagnostic flag.
            reject_mask |= REJECT_ALU_SCL;
        }
    }
    return result;
}
// Execute one ALU instruction triple `t` from the vertex (`is_vertex`) or
// pixel shader stream: run the vector op, then the scalar op, and write each
// result through its own write mask. A predicated instruction whose condition
// does not match `predicate` is skipped entirely, leaving `ps` untouched.
fn interpret_alu(t: u32, is_vertex: bool) {
    // Load the three instruction dwords from the stage's packed stream.
    var words: vec3<u32>;
    if is_vertex {
        words = vec3<u32>(vs_instr_dword(t, 0u), vs_instr_dword(t, 1u), vs_instr_dword(t, 2u));
    } else {
        words = vec3<u32>(ps_instr_dword(t, 0u), ps_instr_dword(t, 1u), ps_instr_dword(t, 2u));
    }
    let src_word = words.x;
    let op_word = words.z;
    // Dword 1 holds per-operand swizzle + negate/abs/c-vs-r flags. The MVP
    // treats every source as a full r# with no modifiers — M4+ decodes it.
    _ = words.y;

    // Field extraction matches `xenia_gpu::ucode::alu::decode_alu`.
    // NOTE(review): the scalar write mask (bits 8..11) overlaps the scalar
    // opcode (bits 6..11) — confirm against the host-side packer.
    let vector_opcode = op_word & 0x3Fu;
    let scalar_opcode = (op_word >> 6u) & 0x3Fu;
    let scalar_mask = (op_word >> 8u) & 0xFu;
    let vector_mask = (op_word >> 12u) & 0xFu;
    let vector_dest = (op_word >> 16u) & 0x7Fu;
    let scalar_dest = (op_word >> 24u) & 0x7Fu;
    let ps_is_scalar_src = ((src_word >> 26u) & 1u) != 0u;
    let is_predicated = ((src_word >> 27u) & 1u) != 0u;
    let wanted_predicate = ((src_word >> 28u) & 1u) != 0u;

    // Honour the per-instruction predicate.
    if is_predicated && (predicate != wanted_predicate) {
        return;
    }

    // Vector pipe: the three sources are the low three bytes of dword 0.
    let operand_a = read_src(src_word & 0xFFu);
    let operand_b = read_src((src_word >> 8u) & 0xFFu);
    let operand_c = read_src((src_word >> 16u) & 0xFFu);
    let vector_out = exec_vector_op(vector_opcode, operand_a, operand_b, operand_c);
    if vector_mask != 0u {
        write_reg_masked(vector_dest, vector_mask, vector_out);
    }

    // Scalar pipe: the first operand is src_a.x, or the running `ps` when the
    // selector bit is set; the second is src_b.x. The result always becomes
    // the new `ps` and is broadcast to the masked lanes of the destination.
    var scalar_a = operand_a.x;
    if ps_is_scalar_src {
        scalar_a = ps;
    }
    let scalar_out = exec_scalar_op(scalar_opcode, scalar_a, operand_b.x, ps);
    ps = scalar_out;
    if scalar_mask != 0u {
        write_reg_masked(scalar_dest, scalar_mask, vec4<f32>(scalar_out));
    }
}
// Xenos VertexFormat values from `xenos.h:641`.
// `interpret_vertex_fetch` has decode paths for 8_8_8_8, 2_10_10_10, 16_16,
// 16_16_16_16, 16_16_FLOAT, 16_16_16_16_FLOAT and the four 32-bit FLOAT
// formats; the remaining values take its reject path.
const VFMT_8_8_8_8: u32 = 6u;
const VFMT_2_10_10_10: u32 = 7u;
const VFMT_10_11_11: u32 = 16u;
const VFMT_11_11_10: u32 = 17u;
const VFMT_16_16: u32 = 25u;
const VFMT_16_16_16_16: u32 = 26u;
const VFMT_16_16_FLOAT: u32 = 31u;
const VFMT_16_16_16_16_FLOAT:u32 = 32u;
const VFMT_32: u32 = 33u;
const VFMT_32_32: u32 = 34u;
const VFMT_32_32_32_32: u32 = 35u;
const VFMT_32_FLOAT: u32 = 36u;
const VFMT_32_32_FLOAT: u32 = 37u;
const VFMT_32_32_32_32_FLOAT:u32 = 38u;
const VFMT_32_32_32_FLOAT: u32 = 57u;
// Execute the vertex-fetch instruction triple `t` of the vertex shader:
// read one element from `vertex_buffer` and store it, expanded to a vec4,
// in the destination register.
//
// Fields as decoded below:
//   w0 [9:5]    fetch-constant index
//   w0 [16:10]  destination register
//   w0 [23:17]  source register (its .x holds the vertex index)
//   w1 [21:16]  vertex format (VFMT_*)
//   w2 [7:0]    stride in dwords (0 is treated as 4)
// NOTE(review): the previous header comment quoted canary's
// VertexFetchInstruction layout (src_reg at [10:5], dst_reg at [17:11]),
// which disagrees with these shifts. The shifts are kept unchanged, assuming
// the host repacks the instruction — confirm against the translator.
//
// Pragmatic subset: only format and stride are honoured. Offset, exp_adjust,
// swizzle and sign/normalisation flags are ignored, and each format uses its
// most common interpretation. Formats without a decode path set
// `REJECT_VFETCH_FMT`. An element that does not fit entirely inside
// `vertex_buffer` yields the default (0, 0, 0, 1).
fn interpret_vertex_fetch(t: u32) {
    let w0 = vs_instr_dword(t, 0u);
    let w1 = vs_instr_dword(t, 1u);
    let w2 = vs_instr_dword(t, 2u);
    let fetch_const = (w0 >> 5u) & 0x1Fu;
    let dst_reg = (w0 >> 10u) & 0x7Fu;
    let src_reg = (w0 >> 17u) & 0x7Fu;
    let format = (w1 >> 16u) & 0x3Fu;
    let stride = w2 & 0xFFu;
    // Vertex-fetch constant: dword 0 carries (type[1:0], address[31:2]);
    // dword 1 carries (endian[1:0], size[25:2]). Only dword 0 is used.
    let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
    let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;
    let vidx = u32(registers[src_reg].x);
    // Per-vertex offset in dwords; a zero stride falls back to 4 dwords.
    let effective_stride = select(stride, 4u, stride == 0u);
    let n = arrayLength(&vertex_buffer);

    // u32 arithmetic wraps, so `base + vidx * stride` and `addr + k < n` can
    // both wrap for a large guest-supplied index and pass a naive guard.
    // Instead, reject indices whose address would overflow and work out how
    // many dwords are readable from `addr` onward.
    let max_vidx = (0xFFFFFFFFu - base_dwords) / effective_stride;
    let addr = base_dwords + vidx * effective_stride;
    var avail: u32 = 0u;
    if vidx <= max_vidx && addr < n {
        avail = n - addr;
    }

    var result: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);
    switch format {
        case VFMT_32_32_32_32_FLOAT: {
            if avail >= 4u {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    bitcast<f32>(vertex_buffer[addr + 2u]),
                    bitcast<f32>(vertex_buffer[addr + 3u]),
                );
            }
        }
        case VFMT_32_32_32_FLOAT: {
            if avail >= 3u {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    bitcast<f32>(vertex_buffer[addr + 2u]),
                    1.0,
                );
            }
        }
        case VFMT_32_32_FLOAT: {
            if avail >= 2u {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    0.0,
                    1.0,
                );
            }
        }
        case VFMT_32_FLOAT: {
            if avail >= 1u {
                result = vec4<f32>(bitcast<f32>(vertex_buffer[addr]), 0.0, 0.0, 1.0);
            }
        }
        case VFMT_8_8_8_8: {
            if avail >= 1u {
                result = unpack4x8unorm(vertex_buffer[addr]);
            }
        }
        case VFMT_16_16_FLOAT: {
            if avail >= 1u {
                let h = unpack2x16float(vertex_buffer[addr]);
                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
            }
        }
        case VFMT_16_16_16_16_FLOAT: {
            if avail >= 2u {
                let h0 = unpack2x16float(vertex_buffer[addr]);
                let h1 = unpack2x16float(vertex_buffer[addr + 1u]);
                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
            }
        }
        case VFMT_16_16: {
            if avail >= 1u {
                // Decoded as signed normalized; the unsigned variants are
                // not distinguished here.
                let h = unpack2x16snorm(vertex_buffer[addr]);
                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
            }
        }
        case VFMT_16_16_16_16: {
            if avail >= 2u {
                let h0 = unpack2x16snorm(vertex_buffer[addr]);
                let h1 = unpack2x16snorm(vertex_buffer[addr + 1u]);
                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
            }
        }
        case VFMT_2_10_10_10: {
            // 10-bit R/G/B plus 2-bit A, decoded as unsigned normalized.
            // Signed and unnormalized variants are not handled.
            if avail >= 1u {
                let packed = vertex_buffer[addr];
                let r = f32(packed & 0x3FFu) / 1023.0;
                let g = f32((packed >> 10u) & 0x3FFu) / 1023.0;
                let b = f32((packed >> 20u) & 0x3FFu) / 1023.0;
                let a = f32((packed >> 30u) & 0x3u) / 3.0;
                result = vec4<f32>(r, g, b, a);
            }
        }
        default: {
            reject_mask |= REJECT_VFETCH_FMT;
            // Unsupported format: the dword is still read, but multiplying by
            // 0.0 leaves the result at (0, 0, 0, 1).
            if avail >= 1u {
                result = vec4<f32>(f32(vertex_buffer[addr]) * 0.0, 0.0, 0.0, 1.0);
            }
        }
    }
    registers[dst_reg] = result;
}
// M6 — minimum-viable texture fetch. Every fetch samples the single texture
// bound at group(1) (a 1×1 magenta dummy until the P5 texture cache lands).
// The (u, v) coordinate comes from the source register's .xy and the texel
// is written to the destination register. `textureSampleLevel` needs no
// implicit derivatives, so the same code serves both stages; `is_vertex`
// only selects which packed stream the instruction is read from.
fn interpret_texture_fetch(t: u32, is_vertex: bool) {
    var word0: u32;
    if is_vertex {
        word0 = vs_instr_dword(t, 0u);
    } else {
        word0 = ps_instr_dword(t, 0u);
    }
    let result_slot = (word0 >> 10u) & 0x7Fu;
    let coord_slot = (word0 >> 17u) & 0x7Fu;
    let coord = registers[coord_slot].xy;
    registers[result_slot] = textureSampleLevel(xenos_tex, xenos_samp, coord, 0.0);
}
// Run the `count` instruction triples of a vertex-shader Exec clause,
// starting at triple index `address`.
// `sequence` is a bitmap with two bits per triple: the low bit of each pair
// is the serialize flag (ignored in the MVP), the high bit marks a fetch.
// Fetch opcode 0x00 is a vertex fetch and 0x01 a texture fetch; any other
// fetch opcode is skipped.
fn exec_vs(address: u32, count: u32, sequence: u32) {
    var slot: u32 = 0u;
    loop {
        if slot >= count { break; }
        let triple = address + slot;
        let fetch_bit = (sequence >> (slot * 2u + 1u)) & 1u;
        if fetch_bit == 0u {
            interpret_alu(triple, true);
        } else {
            switch vs_instr_dword(triple, 0u) & 0x1Fu {
                case 0u: { interpret_vertex_fetch(triple); }
                case 1u: { interpret_texture_fetch(triple, true); }
                default: {}
            }
        }
        continuing {
            slot = slot + 1u;
        }
    }
}
// Run the `count` instruction triples of a pixel-shader Exec clause, starting
// at triple index `address`. `sequence` uses the same two-bits-per-triple
// encoding as in `exec_vs`; every triple flagged as a fetch is treated as a
// texture fetch.
fn exec_ps(address: u32, count: u32, sequence: u32) {
    var slot: u32 = 0u;
    loop {
        if slot >= count { break; }
        let triple = address + slot;
        let fetch_bit = (sequence >> (slot * 2u + 1u)) & 1u;
        if fetch_bit == 0u {
            interpret_alu(triple, false);
        } else {
            interpret_texture_fetch(triple, false);
        }
        continuing {
            slot = slot + 1u;
        }
    }
}
// Return all per-invocation interpreter state to its baseline: registers,
// `ps`, loop bookkeeping and the reject mask are zeroed, and the predicate
// and kill flag are cleared.
fn reset_state() {
    // Zero-value constructors clear whole arrays in one assignment.
    registers = array<vec4<f32>, 128>();
    loop_counters = array<u32, 4>();
    loop_starts = array<u32, 4>();
    loop_depth = 0u;
    ps = 0.0;
    predicate = false;
    kill_flag = false;
    current_alloc = 0u;
    reject_mask = 0u;
}
// ── Stage entry points.
// M7 register slots for exports. `vs_main` takes the position from
// registers[32] and the colour from registers[33] once the CF walk finishes.
// NOTE(review): Alloc clauses only record their kind in `current_alloc`;
// nothing visible in this file tags export writes or copies them out, so a
// shader reaches these slots only by writing r32 / r33 directly.
const OPOS_REG: u32 = 32u; // synthetic slot used by the interpreter
const OCOLOR_REG: u32 = 33u; // color0 scratch slot
// Vertex entry point: reset the interpreter, seed its inputs and fallback
// outputs, run the vertex shader's CF program, then emit whatever ended up in
// the position and colour slots.
@vertex
fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {
    reset_state();

    // r0.x carries the vertex index so shaders (and the fallback) can use it
    // without a real vertex fetch.
    let vertex_f = f32(vidx);
    registers[0] = vec4<f32>(vertex_f, 0.0, 0.0, 1.0);

    // Procedural fallback position: vertices are spread around a circle so a
    // shader that never writes the position slot still rasterizes something
    // visible instead of collapsing to a single point.
    let t_param = vertex_f / f32(max(draw_ctx.vertex_count, 1u));
    let theta = t_param * 6.2831853;
    let ring_radius = 0.35;
    registers[OPOS_REG] = vec4<f32>(cos(theta) * ring_radius, sin(theta) * ring_radius, 0.0, 1.0);

    // Procedural fallback colour: a tint that varies with the draw index.
    let draw_f = f32(draw_ctx.draw_index);
    let tint = vec3<f32>(
        0.5 + 0.5 * sin(draw_f * 0.37),
        0.5 + 0.5 * sin(draw_f * 0.51 + 2.0),
        0.5 + 0.5 * sin(draw_f * 0.73 + 4.0),
    );
    registers[OCOLOR_REG] = vec4<f32>(tint, 1.0);

    // Always touch `vertex_buffer` so the binding stays live in this stage.
    // The value is multiplied by 0.0 and adds nothing to the colour.
    let vb_keepalive = f32(vertex_buffer[0]) * 0.0;

    // Walk the VS CF table.
    walk_cf_vs();

    let clip_pos = registers[OPOS_REG];
    let shade = registers[OCOLOR_REG];
    return VsOut(clip_pos, vec4<f32>(shade.rgb + vec3<f32>(vb_keepalive), shade.a));
}
// Fragment entry point: run the pixel shader's CF program for its side
// effects, discard the fragment if any kill opcode fired, and otherwise
// forward the interpolated vertex colour. Register results of the pixel
// shader are not used for the output.
@fragment
fn fs_main(in: VsOut) -> FsOut {
    reset_state();
    walk_cf_ps();
    // `kill*` / `kills_*` opcodes only raise `kill_flag`; the discard itself
    // happens here, once the interpreter has finished.
    if kill_flag {
        discard;
    }
    return FsOut(in.color);
}
// ── CF walker per stage. Shared logic: LOOP_START/LOOP_END iterate up to 16×
// using the low 8 bits of `xenos_consts.loop_consts[loop_id]` as the trip
// count, COND_JMP honours the clause's predicate flags, and COND_CALL /
// RETURN only set the reject mask (they would need a call stack we don't
// have).
// CF_WALKER_MAX_ITER caps the number of CF clauses a walker will process, so
// a malformed or extreme shader cannot hang the GPU.
const CF_WALKER_MAX_ITER: u32 = 4096u;
// Walk the vertex shader's control-flow table until an EXEC_END or EXIT
// clause, the end of the table, or the CF_WALKER_MAX_ITER budget is reached.
//
// Loop handling: a LOOP_START with a non-zero trip count pushes a frame (up
// to 4 deep) and the matching LOOP_END jumps back until the count runs out.
// A LOOP_START that cannot push a frame — trip count 0, or the 4-entry loop
// stack is full — skips straight past its matching LOOP_END. Otherwise that
// LOOP_END would pop the enclosing loop's frame, and a zero-trip body would
// still execute once.
fn walk_cf_vs() {
    let cf_n = vs_cf_count();
    var cf_i: u32 = 0u;
    var iter: u32 = 0u;
    loop {
        if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
        iter = iter + 1u;
        let kind = vs_cf_kind(cf_i);
        let primary = vs_cf_primary(cf_i);
        let aux = vs_cf_aux(cf_i);
        // `advance` is cleared by clauses that set `cf_i` themselves.
        var advance: bool = true;
        var stop: bool = false;
        switch kind {
            case CF_KIND_EXEC, CF_KIND_EXEC_END: {
                // aux = (sequence << 8) | count; primary = first triple index.
                let count = aux & 0xFFu;
                let sequence = aux >> 8u;
                exec_vs(primary, count, sequence);
                if kind == CF_KIND_EXEC_END { stop = true; }
            }
            case CF_KIND_ALLOC: { current_alloc = primary; }
            case CF_KIND_EXIT: { stop = true; }
            case CF_KIND_LOOP_START: {
                let loop_id = aux & 0x1Fu;
                var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu;
                if loop_count > 16u {
                    loop_count = 16u;
                    reject_mask |= REJECT_LOOP_OVERFLOW;
                }
                if loop_count > 0u && loop_depth < 4u {
                    loop_starts[loop_depth] = cf_i;
                    loop_counters[loop_depth] = loop_count;
                    loop_depth = loop_depth + 1u;
                } else {
                    // No frame can be pushed for this loop.
                    if loop_count > 0u {
                        // The loop stack is full: report the dropped loop.
                        reject_mask |= REJECT_LOOP_OVERFLOW;
                    }
                    // Scan forward to the clause after the matching LOOP_END,
                    // counting nested LOOP_START/LOOP_END pairs. Each scanned
                    // clause is charged to the iteration budget so malformed
                    // programs stay bounded.
                    var nesting: u32 = 1u;
                    var scan: u32 = cf_i + 1u;
                    loop {
                        if scan >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
                        iter = iter + 1u;
                        let scan_kind = vs_cf_kind(scan);
                        scan = scan + 1u;
                        if scan_kind == CF_KIND_LOOP_START {
                            nesting = nesting + 1u;
                        } else if scan_kind == CF_KIND_LOOP_END {
                            nesting = nesting - 1u;
                            if nesting == 0u { break; }
                        }
                    }
                    cf_i = scan;
                    advance = false;
                }
            }
            case CF_KIND_LOOP_END: {
                if loop_depth > 0u {
                    let d = loop_depth - 1u;
                    if loop_counters[d] > 1u {
                        // More iterations left: resume just after LOOP_START.
                        loop_counters[d] = loop_counters[d] - 1u;
                        cf_i = loop_starts[d] + 1u;
                        advance = false;
                    } else {
                        // Last iteration done: pop the frame.
                        loop_counters[d] = 0u;
                        loop_depth = d;
                    }
                }
            }
            case CF_KIND_COND_JMP: {
                // aux bit 0 = jump is predicated, bit 1 = required predicate.
                let pred_bits = aux;
                let is_pred = (pred_bits & 1u) != 0u;
                let pred_cnd = (pred_bits & 2u) != 0u;
                if !is_pred || predicate == pred_cnd {
                    cf_i = primary;
                    advance = false;
                }
            }
            case CF_KIND_COND_CALL, CF_KIND_RETURN: {
                // No call stack — mark and continue.
                reject_mask |= REJECT_CF_CALL;
            }
            default: { reject_mask |= REJECT_CF_JUMP; }
        }
        if stop { break; }
        if advance { cf_i = cf_i + 1u; }
    }
}
// Walk the pixel shader's control-flow table until an EXEC_END or EXIT
// clause, the end of the table, or the CF_WALKER_MAX_ITER budget is reached.
//
// Loop handling: a LOOP_START with a non-zero trip count pushes a frame (up
// to 4 deep) and the matching LOOP_END jumps back until the count runs out.
// A LOOP_START that cannot push a frame — trip count 0, or the 4-entry loop
// stack is full — skips straight past its matching LOOP_END. Otherwise that
// LOOP_END would pop the enclosing loop's frame, and a zero-trip body would
// still execute once.
fn walk_cf_ps() {
    let cf_n = ps_cf_count();
    var cf_i: u32 = 0u;
    var iter: u32 = 0u;
    loop {
        if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
        iter = iter + 1u;
        let kind = ps_cf_kind(cf_i);
        let primary = ps_cf_primary(cf_i);
        let aux = ps_cf_aux(cf_i);
        // `advance` is cleared by clauses that set `cf_i` themselves.
        var advance: bool = true;
        var stop: bool = false;
        switch kind {
            case CF_KIND_EXEC, CF_KIND_EXEC_END: {
                // aux = (sequence << 8) | count; primary = first triple index.
                let count = aux & 0xFFu;
                let sequence = aux >> 8u;
                exec_ps(primary, count, sequence);
                if kind == CF_KIND_EXEC_END { stop = true; }
            }
            case CF_KIND_ALLOC: { current_alloc = primary; }
            case CF_KIND_EXIT: { stop = true; }
            case CF_KIND_LOOP_START: {
                let loop_id = aux & 0x1Fu;
                var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu;
                if loop_count > 16u {
                    loop_count = 16u;
                    reject_mask |= REJECT_LOOP_OVERFLOW;
                }
                if loop_count > 0u && loop_depth < 4u {
                    loop_starts[loop_depth] = cf_i;
                    loop_counters[loop_depth] = loop_count;
                    loop_depth = loop_depth + 1u;
                } else {
                    // No frame can be pushed for this loop.
                    if loop_count > 0u {
                        // The loop stack is full: report the dropped loop.
                        reject_mask |= REJECT_LOOP_OVERFLOW;
                    }
                    // Scan forward to the clause after the matching LOOP_END,
                    // counting nested LOOP_START/LOOP_END pairs. Each scanned
                    // clause is charged to the iteration budget so malformed
                    // programs stay bounded.
                    var nesting: u32 = 1u;
                    var scan: u32 = cf_i + 1u;
                    loop {
                        if scan >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
                        iter = iter + 1u;
                        let scan_kind = ps_cf_kind(scan);
                        scan = scan + 1u;
                        if scan_kind == CF_KIND_LOOP_START {
                            nesting = nesting + 1u;
                        } else if scan_kind == CF_KIND_LOOP_END {
                            nesting = nesting - 1u;
                            if nesting == 0u { break; }
                        }
                    }
                    cf_i = scan;
                    advance = false;
                }
            }
            case CF_KIND_LOOP_END: {
                if loop_depth > 0u {
                    let d = loop_depth - 1u;
                    if loop_counters[d] > 1u {
                        // More iterations left: resume just after LOOP_START.
                        loop_counters[d] = loop_counters[d] - 1u;
                        cf_i = loop_starts[d] + 1u;
                        advance = false;
                    } else {
                        // Last iteration done: pop the frame.
                        loop_counters[d] = 0u;
                        loop_depth = d;
                    }
                }
            }
            case CF_KIND_COND_JMP: {
                // aux bit 0 = jump is predicated, bit 1 = required predicate.
                let pred_bits = aux;
                let is_pred = (pred_bits & 1u) != 0u;
                let pred_cnd = (pred_bits & 2u) != 0u;
                if !is_pred || predicate == pred_cnd {
                    cf_i = primary;
                    advance = false;
                }
            }
            case CF_KIND_COND_CALL, CF_KIND_RETURN: {
                // No call stack — mark and continue.
                reject_mask |= REJECT_CF_CALL;
            }
            default: { reject_mask |= REJECT_CF_JUMP; }
        }
        if stop { break; }
        if advance { cf_i = cf_i + 1u; }
    }
}