xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)

First real GPU implementation. Ring/PM4 frontend (ring_view, ring_drain, pm4) drains the command processor; gpu_system owns the threaded backend (DrainFence RPC + parker/fence helpers from M1) and the MMIO-mapped register block (mmio_region). Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode the Xbox 360 microcode, translator.rs lowers it onto the WGSL xenos_interp interpreter shader (shaders/xenos_interp.wgsl). shader_metrics.rs counts decode/translate work. Render state: draw_state, primitive, render_target_cache, texture_cache, tiled_address (Xenos's swizzled tiled-memory layout), xenos_constants (register field constants), edram (the 10 MiB EDRAM model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs owns the typed GPU-resource handles the kernel hands out. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 16:29:38 +02:00
parent 5f0d6487ea
commit 79eb52c378
24 changed files with 10984 additions and 18 deletions
--- a/crates/xenia-gpu/src/shaders/xenos_interp.wgsl
+++ b/crates/xenia-gpu/src/shaders/xenos_interp.wgsl
@@ -0,0 +1,974 @@
+// xenia-rs Xenos runtime microcode interpreter — P3b WGSL.
+//
+// Bindings (stable across P3b milestones):
+//   @group(0) @binding(0) draw_ctx      (uniform, 16 B — XenosDrawConstants)
+//   @group(0) @binding(1) xenos_consts  (storage<read>, ~9.2 KB — XenosConstants)
+//   @group(0) @binding(2) vs_ucode      (storage<read>, packed VS shader)
+//   @group(0) @binding(3) ps_ucode      (storage<read>, packed PS shader)
+//   @group(0) @binding(4) vertex_buffer (storage<read>, raw guest VB dwords)
+//
+// Packed shader layout (both vs_ucode & ps_ucode):
+//   [0]                            = cf_count
+//   [1 .. 1 + cf_count*3]          = CF table: (kind, primary, aux) × cf_count
+//   [1 + cf_count*3 ..]            = instruction triples (3 dwords each)
+//
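+// Current state (this file): CF walker, register file, and ALU / vertex-fetch
+// / texture-fetch interpreters are in place for a subset of opcodes and
+// formats. Operand swizzle/modifier decode is not wired into the ALU yet, and
+// the procedural-circle visualisation remains the fallback output when a
+// shader never writes the export slots.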
+
+struct XenosDrawConstants {  // Per-draw parameters, bound at group(0)/binding(0) as `draw_ctx`.
+    draw_index: u32,    // vs_main derives the fallback colour from this value.
+    vertex_count: u32,  // vs_main clamps this to >= 1 and uses it to spread fallback vertices.
+    prim_kind: u32,     // Not read in the visible part of this shader.
+    _pad: u32,          // Fourth dword; brings the struct to 16 bytes.
+};
+
+struct XenosConstants {  // Constant block, bound at group(0)/binding(1) as `xenos_consts`.
+    alu:          array<vec4<f32>, 512>,  // presumably the c# ALU constants — TODO confirm with the host uploader
+    fetch:        array<u32, 256>,        // fetch-constant dwords; interpret_vertex_fetch reads index `fetch_const * 2`
+    bool_consts:  array<u32, 8>,          // not read in the visible part of this shader
+    loop_consts:  array<u32, 32>,         // named by the CF-walker comment as the LOOP_START/LOOP_END source
+};
+
+@group(0) @binding(0) var<uniform>            draw_ctx      : XenosDrawConstants;  // per-draw parameters
+// `xenos_consts` is a read-only storage buffer (not uniform) because the
+// block contains tightly-packed `array<u32, N>` fields — WGSL's uniform
+// address space requires 16-byte element stride, which would triple the
+// allocation; storage accepts the natural 4-byte stride.
+@group(0) @binding(1) var<storage, read>      xenos_consts  : XenosConstants;
+@group(0) @binding(2) var<storage, read>      vs_ucode      : array<u32>;  // packed VS program: cf_count, CF table, instruction triples
+@group(0) @binding(3) var<storage, read>      ps_ucode      : array<u32>;  // packed PS program, same layout as vs_ucode
+@group(0) @binding(4) var<storage, read>      vertex_buffer : array<u32>;  // dword-indexed vertex data read by interpret_vertex_fetch
+
+// M6 — texture fetch stub. Group 1 is a single 1×1 magenta placeholder for
+// all texture slots; the P5 texture cache will replace this with per-slot
+// bindings.
+@group(1) @binding(0) var xenos_tex  : texture_2d<f32>;  // sampled by interpret_texture_fetch
+@group(1) @binding(1) var xenos_samp : sampler;          // sampler paired with xenos_tex
+
+// ── CF kind codes; must match `xenia_gpu::ucode::cf_kind`. ─────────────
+const CF_KIND_EXEC:        u32 = 0u;  // walk_cf_vs runs the clause's triples through exec_vs
+const CF_KIND_EXEC_END:    u32 = 1u;  // same as EXEC, then walk_cf_vs stops
+const CF_KIND_ALLOC:       u32 = 2u;  // walk_cf_vs stores `primary` into current_alloc
+const CF_KIND_EXIT:        u32 = 3u;  // walk_cf_vs stops
+const CF_KIND_LOOP_START:  u32 = 4u;
+const CF_KIND_LOOP_END:    u32 = 5u;
+const CF_KIND_COND_JMP:    u32 = 6u;
+const CF_KIND_COND_CALL:   u32 = 7u;
+const CF_KIND_RETURN:      u32 = 8u;
+const CF_KIND_UNKNOWN:     u32 = 15u;
+
+// ── Alloc-kind codes (mirrors `xenia_gpu::ucode::cf_alloc_kind`). ──────
+const ALLOC_KIND_POSITION:      u32 = 0u;  // also the value reset_state() leaves in current_alloc
+const ALLOC_KIND_INTERPOLATORS: u32 = 1u;
+const ALLOC_KIND_COLORS:        u32 = 2u;
+
+// Per-invocation Xenos register file + scalar `ps` + predicate.
+var<private> registers: array<vec4<f32>, 128>;  // indexed with a 7-bit register number
+var<private> ps: f32;                           // previous scalar result; interpret_alu updates it after every scalar op
+var<private> predicate: bool;                   // written by the SOP_SETP_* scalar ops
+
+// Currently-active export alloc kind; set by Alloc clauses.
+var<private> current_alloc: u32;
+
+// P3c additions:
+//   `kill_flag` — set by vector/scalar kill ops; PS calls `discard` after the
+//                 interpreter exits. (`discard` inside a helper function is
+//                 allowed in WGSL, but keeping it at the entry level makes
+//                 control flow easier to read.)
+//   `loop_depth`/`loop_counters` — bookkeeping for LoopStart/LoopEnd CF
+//                 clauses. Xenos supports up to 4 nested loops.
+//   `reject_mask` — bitfield of op categories we failed to interpret, so the
+//                 PS fallback color + host-side diagnostics can surface it.
+var<private> kill_flag: bool;
+var<private> loop_depth: u32;
+var<private> loop_counters: array<u32, 4>;
+var<private> loop_starts: array<u32, 4>;  // presumably the CF index of each open LoopStart — TODO confirm in the walker
+var<private> reject_mask: u32;
+
+const REJECT_ALU_VEC:      u32 = 1u;   // set by exec_vector_op for CUBE, MAX_A and unknown opcodes
+const REJECT_ALU_SCL:      u32 = 2u;   // set by exec_scalar_op for unknown opcodes
+const REJECT_TEX_NON2D:    u32 = 4u;
+const REJECT_VFETCH_FMT:   u32 = 8u;   // set by interpret_vertex_fetch for unhandled vertex formats
+const REJECT_CF_JUMP:      u32 = 16u;
+const REJECT_CF_CALL:      u32 = 32u;
+const REJECT_LOOP_OVERFLOW:u32 = 64u;
+
+struct VsOut {  // Output of vs_main; also the input type of fs_main.
+    @builtin(position) position: vec4<f32>,  // vs_main fills this from registers[OPOS_REG]
+    @location(0) color: vec4<f32>,           // vs_main fills this from registers[OCOLOR_REG]; fs_main passes it through
+};
+
+struct FsOut {  // Output of fs_main.
+    @location(0) color0: vec4<f32>,  // fs_main copies the interpolated VsOut.color here
+};
+
+// ── CF table accessors. Dword 0 of a packed shader is cf_count; entry `i`
+// then occupies three consecutive dwords (kind, primary, aux) starting at
+// dword 1 + 3*i.
+
+fn vs_cf_count() -> u32 {
+    return vs_ucode[0u];
+}
+fn ps_cf_count() -> u32 {
+    return ps_ucode[0u];
+}
+
+fn vs_cf_kind(i: u32) -> u32 {
+    // Only the low byte of the first dword is the kind code.
+    let entry = 3u * i + 1u;
+    return vs_ucode[entry] & 0xFFu;
+}
+fn vs_cf_primary(i: u32) -> u32 {
+    let entry = 3u * i + 1u;
+    return vs_ucode[entry + 1u];
+}
+fn vs_cf_aux(i: u32) -> u32 {
+    let entry = 3u * i + 1u;
+    return vs_ucode[entry + 2u];
+}
+fn ps_cf_kind(i: u32) -> u32 {
+    // Only the low byte of the first dword is the kind code.
+    let entry = 3u * i + 1u;
+    return ps_ucode[entry] & 0xFFu;
+}
+fn ps_cf_primary(i: u32) -> u32 {
+    let entry = 3u * i + 1u;
+    return ps_ucode[entry + 1u];
+}
+fn ps_cf_aux(i: u32) -> u32 {
+    let entry = 3u * i + 1u;
+    return ps_ucode[entry + 2u];
+}
+
+// First dword after the CF table, i.e. where the instruction triples begin.
+fn vs_instr_base() -> u32 {
+    return 3u * vs_cf_count() + 1u;
+}
+fn ps_instr_base() -> u32 {
+    return 3u * ps_cf_count() + 1u;
+}
+
+// Return dword `which` (0, 1 or 2) of instruction triple `t`, counted from
+// the start of the instruction area that follows the CF table.
+fn vs_instr_dword(t: u32, which: u32) -> u32 {
+    let triple_start = vs_instr_base() + 3u * t;
+    return vs_ucode[triple_start + which];
+}
+fn ps_instr_dword(t: u32, which: u32) -> u32 {
+    let triple_start = ps_instr_base() + 3u * t;
+    return ps_ucode[triple_start + which];
+}
+
+// ── Operand helpers: M3 scaffold; M4 wires them into interpret_alu.
+
+// Rearrange the lanes of `v`. `swizzle` packs four 2-bit source-lane
+// selectors, lowest bits first: bits [1:0] pick the source of output x,
+// [3:2] of y, [5:4] of z and [7:6] of w.
+fn apply_swizzle(v: vec4<f32>, swizzle: u32) -> vec4<f32> {
+    // Extract all four selectors at once with a per-lane shift and mask.
+    let lane_shift = vec4<u32>(0u, 2u, 4u, 6u);
+    let pick = (vec4<u32>(swizzle) >> lane_shift) & vec4<u32>(3u);
+    return vec4<f32>(v[pick.x], v[pick.y], v[pick.z], v[pick.w]);
+}
+
+// Apply the optional source modifiers to `v`: absolute value first (when
+// `take_abs`), then negation (when `negate`).
+fn apply_modifiers(v: vec4<f32>, negate: bool, take_abs: bool) -> vec4<f32> {
+    let magnitude = select(v, abs(v), take_abs);
+    return select(magnitude, -magnitude, negate);
+}
+
+// Write `value` into registers[idx], touching only the lanes whose bit is
+// set in `mask` (bit 0 = x, bit 1 = y, bit 2 = z, bit 3 = w). Indices of
+// 128 or more are ignored.
+fn write_reg_masked(idx: u32, mask: u32, value: vec4<f32>) {
+    if idx < 128u {
+        // One boolean per lane: true where the write mask enables the lane.
+        let lane_bits = vec4<u32>(1u, 2u, 4u, 8u);
+        let enabled = (vec4<u32>(mask) & lane_bits) != vec4<u32>(0u);
+        registers[idx] = select(registers[idx], value, enabled);
+    }
+}
+
+// ── Xenos ALU opcodes. Values match canary's `AluVectorOpcode` and
+// `AluScalarOpcode` enums in `ucode.h:1001,1354` (also mirrored in
+// `xenia_gpu::ucode::alu::{vop,sop}`).
+const VOP_ADD:          u32 = 0u;
+const VOP_MUL:          u32 = 1u;
+const VOP_MAX:          u32 = 2u;
+const VOP_MIN:          u32 = 3u;
+const VOP_SEQ:          u32 = 4u;
+const VOP_SGT:          u32 = 5u;
+const VOP_SGE:          u32 = 6u;
+const VOP_SNE:          u32 = 7u;
+const VOP_FRC:          u32 = 8u;
+const VOP_TRUNC:        u32 = 9u;
+const VOP_FLOOR:        u32 = 10u;
+const VOP_MAD:          u32 = 11u;
+const VOP_CND_EQ:       u32 = 12u;
+const VOP_CND_GE:       u32 = 13u;
+const VOP_CND_GT:       u32 = 14u;
+const VOP_DP4:          u32 = 15u;
+const VOP_DP3:          u32 = 16u;
+const VOP_DP2_ADD:      u32 = 17u;
+const VOP_CUBE:         u32 = 18u;  // exec_vector_op approximates this with max(a, b) and flags REJECT_ALU_VEC
+const VOP_MAX4:         u32 = 19u;  // 20..23 are not named here; exec_vector_op's default case rejects them
+const VOP_KILL_EQ:      u32 = 24u;
+const VOP_KILL_GT:      u32 = 25u;
+const VOP_KILL_GE:      u32 = 26u;
+const VOP_KILL_NE:      u32 = 27u;
+const VOP_DST:          u32 = 28u;
+const VOP_MAX_A:        u32 = 29u;  // exec_vector_op evaluates only the max(a, b) part and flags REJECT_ALU_VEC
+
+const SOP_ADDS:         u32 = 0u;
+const SOP_ADDS_PREV:    u32 = 1u;
+const SOP_MULS:         u32 = 2u;
+const SOP_MULS_PREV:    u32 = 3u;  // 4 is not named here; exec_scalar_op's default case rejects it
+const SOP_MAXS:         u32 = 5u;
+const SOP_MINS:         u32 = 6u;
+const SOP_SEQS:         u32 = 7u;
+const SOP_SGTS:         u32 = 8u;
+const SOP_SGES:         u32 = 9u;
+const SOP_SNES:         u32 = 10u;
+const SOP_FRCS:         u32 = 11u;
+const SOP_TRUNCS:       u32 = 12u;
+const SOP_FLOORS:       u32 = 13u;
+const SOP_EXP:          u32 = 14u;
+const SOP_LOGC:         u32 = 15u;
+const SOP_LOG:          u32 = 16u;
+const SOP_RCPC:         u32 = 17u;
+const SOP_RCPF:         u32 = 18u;
+const SOP_RCP:          u32 = 19u;
+const SOP_RSQC:         u32 = 20u;
+const SOP_RSQF:         u32 = 21u;
+const SOP_RSQ:          u32 = 22u;  // 23 and 24 are not named here; exec_scalar_op's default case rejects them
+const SOP_SUBS:         u32 = 25u;
+const SOP_SUBS_PREV:    u32 = 26u;
+const SOP_SETP_EQ:      u32 = 27u;
+const SOP_SETP_NE:      u32 = 28u;
+const SOP_SETP_GT:      u32 = 29u;
+const SOP_SETP_GE:      u32 = 30u;
+const SOP_SETP_INV:     u32 = 31u;
+const SOP_SETP_POP:     u32 = 32u;
+const SOP_SETP_CLR:     u32 = 33u;
+const SOP_SETP_RSTR:    u32 = 34u;
+const SOP_KILLS_EQ:     u32 = 35u;
+const SOP_KILLS_GT:     u32 = 36u;
+const SOP_KILLS_GE:     u32 = 37u;
+const SOP_KILLS_NE:     u32 = 38u;
+const SOP_KILLS_ONE:    u32 = 39u;
+const SOP_SQRT:         u32 = 40u;  // 41..47 are not named here; exec_scalar_op's default case rejects them
+const SOP_SIN:          u32 = 48u;
+const SOP_COS:          u32 = 49u;
+const SOP_RETAIN_PREV:  u32 = 50u;
+
+// Fetch a vec4 ALU source. For now the low 7 bits of `idx` are used as a
+// plain r# register number; the constant-vs-register selector, swizzle and
+// negate/abs modifiers are not decoded yet (planned for M4+).
+fn read_src(idx: u32) -> vec4<f32> {
+    let reg_slot = idx & 0x7Fu;
+    return registers[reg_slot];
+}
+
+// Vector-pipe executor: evaluates vector opcode `op` on operands `a`, `b`
+// and `c` and returns the vec4 result.
+//
+// Side effects:
+//   * VOP_KILL_* raise `kill_flag` when any lane satisfies the comparison.
+//   * VOP_CUBE / VOP_MAX_A are approximated by max(a, b) and set
+//     REJECT_ALU_VEC.
+//   * Any other unrecognised opcode sets REJECT_ALU_VEC and yields zeros.
+fn exec_vector_op(op: u32, a: vec4<f32>, b: vec4<f32>, c: vec4<f32>) -> vec4<f32> {
+    let zeros = vec4<f32>(0.0);
+    let ones = vec4<f32>(1.0);
+    var result = zeros;
+    switch op {
+        case VOP_ADD:   { result = a + b; }
+        case VOP_MUL:   { result = a * b; }
+        case VOP_MAX:   { result = max(a, b); }
+        case VOP_MIN:   { result = min(a, b); }
+        // Set-on-compare: each lane becomes 1.0 where the test holds, else 0.0.
+        case VOP_SEQ:   { result = select(zeros, ones, a == b); }
+        case VOP_SGT:   { result = select(zeros, ones, a > b); }
+        case VOP_SGE:   { result = select(zeros, ones, a >= b); }
+        case VOP_SNE:   { result = select(zeros, ones, a != b); }
+        case VOP_FRC:   { result = fract(a); }
+        case VOP_TRUNC: { result = trunc(a); }
+        case VOP_FLOOR: { result = floor(a); }
+        case VOP_MAD:   { result = a * b + c; }
+        // Conditional select: per lane, pick `b` where `a` passes the test
+        // against zero, otherwise `c`.
+        case VOP_CND_EQ: { result = select(c, b, a == zeros); }
+        case VOP_CND_GE: { result = select(c, b, a >= zeros); }
+        case VOP_CND_GT: { result = select(c, b, a > zeros); }
+        // Dot products broadcast the scalar result to every lane.
+        case VOP_DP4: { result = vec4<f32>(dot(a, b)); }
+        case VOP_DP3: { result = vec4<f32>(dot(a.xyz, b.xyz)); }
+        case VOP_DP2_ADD: { result = vec4<f32>(a.x * b.x + a.y * b.y + c.x); }
+        case VOP_MAX4: { result = vec4<f32>(max(max(a.x, a.y), max(a.z, a.w))); }
+        // Kill ops: all-ones plus `kill_flag` when any lane matches.
+        case VOP_KILL_EQ: {
+            let hit = any(a == b);
+            kill_flag = kill_flag || hit;
+            result = select(zeros, ones, hit);
+        }
+        case VOP_KILL_GT: {
+            let hit = any(a > b);
+            kill_flag = kill_flag || hit;
+            result = select(zeros, ones, hit);
+        }
+        case VOP_KILL_GE: {
+            let hit = any(a >= b);
+            kill_flag = kill_flag || hit;
+            result = select(zeros, ones, hit);
+        }
+        case VOP_KILL_NE: {
+            let hit = any(a != b);
+            kill_flag = kill_flag || hit;
+            result = select(zeros, ones, hit);
+        }
+        case VOP_DST: {
+            // dest = (1, src0.y * src1.y, src0.z, src1.w)
+            result = vec4<f32>(1.0, a.y * b.y, a.z, b.w);
+        }
+        case VOP_CUBE, VOP_MAX_A: {
+            // Neither is implemented properly; both degrade to max(a, b)
+            // and are reported through the reject mask.
+            reject_mask |= REJECT_ALU_VEC;
+            result = max(a, b);
+        }
+        default: {
+            // Unsupported opcode: report it and leave the zero result.
+            reject_mask |= REJECT_ALU_VEC;
+        }
+    }
+    return result;
+}
+
+// Scalar-pipe executor.
+//   `src_a`, `src_b` — scalar operands. Only ADDS / MULS / SUBS / MAXS / MINS
+//                      read `src_b`; every other op looks at `src_a` alone.
+//   `prev`           — the running `ps` value, consumed by the *_PREV ops
+//                      and by RETAIN_PREV.
+// Returns the value for the scalar destination and the next `ps`.
+// Side effects: SETP_* write `predicate`, KILLS_* may raise `kill_flag`,
+// and unrecognised opcodes set REJECT_ALU_SCL and return 0.0.
+fn exec_scalar_op(op: u32, src_a: f32, src_b: f32, prev: f32) -> f32 {
+    var result: f32 = 0.0;
+    switch op {
+        case SOP_ADDS:      { result = src_a + src_b; }
+        case SOP_ADDS_PREV: { result = src_a + prev; }
+        case SOP_MULS:      { result = src_a * src_b; }
+        case SOP_MULS_PREV: { result = src_a * prev; }
+        case SOP_SUBS:      { result = src_a - src_b; }
+        case SOP_SUBS_PREV: { result = src_a - prev; }
+        case SOP_MAXS:      { result = max(src_a, src_b); }
+        case SOP_MINS:      { result = min(src_a, src_b); }
+        // Set-on-compare against zero: 1.0 when the test holds, else 0.0.
+        case SOP_SEQS:      { result = select(0.0, 1.0, src_a == 0.0); }
+        case SOP_SGTS:      { result = select(0.0, 1.0, src_a >  0.0); }
+        case SOP_SGES:      { result = select(0.0, 1.0, src_a >= 0.0); }
+        case SOP_SNES:      { result = select(0.0, 1.0, src_a != 0.0); }
+        case SOP_FRCS:      { result = fract(src_a); }
+        case SOP_TRUNCS:    { result = trunc(src_a); }
+        case SOP_FLOORS:    { result = floor(src_a); }
+        case SOP_EXP: {
+            // Base-2 exponential.
+            result = exp2(src_a);
+        }
+        case SOP_LOG, SOP_LOGC: {
+            // Base-2 logarithm for both variants; an input of exactly 1.0
+            // is pinned to 0.0. Non-positive inputs are not special-cased.
+            result = select(log2(src_a), 0.0, src_a == 1.0);
+        }
+        case SOP_RCP, SOP_RCPC, SOP_RCPF: {
+            // Reciprocal; a zero input yields 0.0 instead of dividing.
+            result = select(0.0, 1.0 / src_a, src_a != 0.0);
+        }
+        case SOP_RSQ, SOP_RSQC, SOP_RSQF: {
+            // Reciprocal square root; non-positive inputs yield 0.0.
+            result = select(0.0, inverseSqrt(src_a), src_a > 0.0);
+        }
+        case SOP_SQRT: { result = select(0.0, sqrt(src_a), src_a >= 0.0); }
+        case SOP_SIN:  { result = sin(src_a); }
+        case SOP_COS:  { result = cos(src_a); }
+        // Predicate setters: `predicate` takes the test outcome, and the
+        // returned value is 0.0 when the test holds, 1.0 otherwise.
+        case SOP_SETP_EQ: {
+            let holds = src_a == 0.0;
+            predicate = holds;
+            result = select(1.0, 0.0, holds);
+        }
+        case SOP_SETP_NE: {
+            let holds = src_a != 0.0;
+            predicate = holds;
+            result = select(1.0, 0.0, holds);
+        }
+        case SOP_SETP_GT: {
+            let holds = src_a > 0.0;
+            predicate = holds;
+            result = select(1.0, 0.0, holds);
+        }
+        case SOP_SETP_GE: {
+            let holds = src_a >= 0.0;
+            predicate = holds;
+            result = select(1.0, 0.0, holds);
+        }
+        case SOP_SETP_INV: {
+            // Predicate is set only for an input of exactly 1.0; otherwise
+            // the input passes through, with 0.0 mapped to 1.0.
+            let holds = src_a == 1.0;
+            predicate = holds;
+            if holds {
+                result = 0.0;
+            } else {
+                result = select(src_a, 1.0, src_a == 0.0);
+            }
+        }
+        case SOP_SETP_POP: {
+            // Decrement; the predicate is set once the counter reaches <= 0.
+            let popped = src_a - 1.0;
+            let holds = popped <= 0.0;
+            predicate = holds;
+            result = select(popped, 0.0, holds);
+        }
+        case SOP_SETP_CLR: {
+            predicate = false;
+            // Bit pattern 0x7F7FFFFF is the largest finite f32 (FLT_MAX).
+            result = bitcast<f32>(0x7F7FFFFFu);
+        }
+        case SOP_SETP_RSTR: {
+            let holds = src_a == 0.0;
+            predicate = holds;
+            result = select(src_a, 0.0, holds);
+        }
+        // Scalar kills: raise `kill_flag` and return 1.0 when the test
+        // holds, else return 0.0 and leave the flag as it was.
+        case SOP_KILLS_EQ: {
+            let hit = src_a == 0.0;
+            kill_flag = kill_flag || hit;
+            result = select(0.0, 1.0, hit);
+        }
+        case SOP_KILLS_GT: {
+            let hit = src_a > 0.0;
+            kill_flag = kill_flag || hit;
+            result = select(0.0, 1.0, hit);
+        }
+        case SOP_KILLS_GE: {
+            let hit = src_a >= 0.0;
+            kill_flag = kill_flag || hit;
+            result = select(0.0, 1.0, hit);
+        }
+        case SOP_KILLS_NE: {
+            let hit = src_a != 0.0;
+            kill_flag = kill_flag || hit;
+            result = select(0.0, 1.0, hit);
+        }
+        case SOP_KILLS_ONE: {
+            let hit = src_a == 1.0;
+            kill_flag = kill_flag || hit;
+            result = select(0.0, 1.0, hit);
+        }
+        case SOP_RETAIN_PREV: { result = prev; }
+        default: {
+            // Unsupported opcode: report it and leave the 0.0 result.
+            reject_mask |= REJECT_ALU_SCL;
+        }
+    }
+    return result;
+}
+
+// Execute ALU triple `t` from the VS program (`is_vertex`) or the PS
+// program: one vector op plus one co-issued scalar op.
+//
+// Field layout used here (stated to match
+// `xenia_gpu::ucode::alu::decode_alu`):
+//   w0 [7:0] / [15:8] / [23:16]  source operands a / b / c
+//   w0 [26]                      scalar pipe reads `ps` instead of a.x
+//   w0 [27], [28]                predicated flag, required predicate value
+//   w2 [5:0], [11:6]             vector opcode, scalar opcode
+//   w2 [11:8], [15:12]           scalar write mask, vector write mask
+//   w2 [22:16], [30:24]          vector destination, scalar destination
+// NOTE(review): the scalar write mask bits [11:8] overlap the scalar opcode
+// bits [11:6] — confirm this is what the host-side packer intends.
+fn interpret_alu(t: u32, is_vertex: bool) {
+    // Pull the three dwords of the triple from the stage's program.
+    var words: array<u32, 3>;
+    for (var k: u32 = 0u; k < 3u; k = k + 1u) {
+        if is_vertex {
+            words[k] = vs_instr_dword(t, k);
+        } else {
+            words[k] = ps_instr_dword(t, k);
+        }
+    }
+    let w0 = words[0];
+    let w2 = words[2];
+    // The middle dword (swizzles, negate/abs, c-vs-r flags) is read but not
+    // decoded yet; every source is treated as an unmodified r# register.
+    _ = words[1];
+
+    // Skip the whole instruction when it is predicated and the current
+    // predicate differs from the value it requires.
+    let is_predicated = ((w0 >> 27u) & 1u) != 0u;
+    let wanted_predicate = ((w0 >> 28u) & 1u) != 0u;
+    if is_predicated && (predicate != wanted_predicate) {
+        return;
+    }
+
+    // Sources are all read before either pipe writes a register.
+    let a = read_src(w0 & 0xFFu);
+    let b = read_src((w0 >> 8u) & 0xFFu);
+    let c = read_src((w0 >> 16u) & 0xFFu);
+
+    // Vector pipe. The op always runs (kill ops have side effects); the
+    // register write only happens when the write mask is non-empty.
+    let vector_mask = (w2 >> 12u) & 0xFu;
+    let vector_value = exec_vector_op(w2 & 0x3Fu, a, b, c);
+    if vector_mask != 0u {
+        write_reg_masked((w2 >> 16u) & 0x7Fu, vector_mask, vector_value);
+    }
+
+    // Scalar pipe. The first operand is a.x, or the running `ps` when bit 26
+    // of w0 is set; the second operand is b.x. The result always becomes the
+    // new `ps` and is broadcast to the destination lanes enabled by the mask.
+    let use_ps_as_source = ((w0 >> 26u) & 1u) != 0u;
+    let scalar_in = select(a.x, ps, use_ps_as_source);
+    let scalar_value = exec_scalar_op((w2 >> 6u) & 0x3Fu, scalar_in, b.x, ps);
+    ps = scalar_value;
+    let scalar_mask = (w2 >> 8u) & 0xFu;
+    if scalar_mask != 0u {
+        write_reg_masked((w2 >> 24u) & 0x7Fu, scalar_mask, vec4<f32>(scalar_value));
+    }
+}
+// Xenos VertexFormat values from `xenos.h:641`.
+const VFMT_8_8_8_8:         u32 = 6u;   // decoded with unpack4x8unorm
+const VFMT_2_10_10_10:      u32 = 7u;   // decoded as unsigned-normalised 10/10/10/2
+const VFMT_10_11_11:        u32 = 16u;  // not decoded: interpret_vertex_fetch sets REJECT_VFETCH_FMT
+const VFMT_11_11_10:        u32 = 17u;  // not decoded: interpret_vertex_fetch sets REJECT_VFETCH_FMT
+const VFMT_16_16:           u32 = 25u;  // decoded with unpack2x16snorm
+const VFMT_16_16_16_16:     u32 = 26u;  // decoded with unpack2x16snorm (two dwords)
+const VFMT_16_16_FLOAT:     u32 = 31u;
+const VFMT_16_16_16_16_FLOAT:u32 = 32u;
+const VFMT_32:              u32 = 33u;  // not decoded: interpret_vertex_fetch sets REJECT_VFETCH_FMT
+const VFMT_32_32:           u32 = 34u;  // not decoded: interpret_vertex_fetch sets REJECT_VFETCH_FMT
+const VFMT_32_32_32_32:     u32 = 35u;  // not decoded: interpret_vertex_fetch sets REJECT_VFETCH_FMT
+const VFMT_32_FLOAT:        u32 = 36u;
+const VFMT_32_32_FLOAT:     u32 = 37u;
+const VFMT_32_32_32_32_FLOAT:u32 = 38u;
+const VFMT_32_32_32_FLOAT:  u32 = 57u;
+
+// Vertex fetch: load one attribute from `vertex_buffer` into a register.
+//
+// Fields as extracted by the code below (3-dword fetch triple `t` of the VS
+// program):
+//   w0 [9:5]    fetch-constant index
+//   w0 [16:10]  destination register
+//   w0 [23:17]  source register; its .x holds the vertex index
+//   w1 [21:16]  vertex format (VFMT_*)
+//   w2 [7:0]    stride, in dwords
+// NOTE(review): canary's VertexFetchInstruction (`ucode.h:690`) appears to
+// place src_reg / dst_reg / const_index at different bit positions, and it
+// also carries an offset, exp_adjust, swizzle and sign/normalisation flags
+// that are not decoded here — confirm against the host-side packer.
+//
+// The destination register is always overwritten. When the format is not
+// handled (REJECT_VFETCH_FMT is set) or the element does not fit inside
+// `vertex_buffer`, it receives (0, 0, 0, 1).
+fn interpret_vertex_fetch(t: u32) {
+    let w0 = vs_instr_dword(t, 0u);
+    let w1 = vs_instr_dword(t, 1u);
+    let w2 = vs_instr_dword(t, 2u);
+    let fetch_const = (w0 >> 5u) & 0x1Fu;
+    let dst_reg = (w0 >> 10u) & 0x7Fu;
+    let src_reg = (w0 >> 17u) & 0x7Fu;
+    let format  = (w1 >> 16u) & 0x3Fu;
+    let stride  = w2 & 0xFFu;
+
+    // Vertex-fetch constant: dword 0 carries (type[1:0], address[31:2]);
+    // dword 1 carries (endian[1:0], size[25:2]). Only dword 0 is used.
+    let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
+    let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;
+
+    let vidx = u32(registers[src_reg].x);
+    // Stride is in dwords; a zero stride is treated as 4 dwords.
+    let effective_stride = select(stride, 4u, stride == 0u);
+    let addr = base_dwords + vidx * effective_stride;
+
+    // Number of dwords available starting at `addr` (0 when `addr` is past
+    // the end). Using a subtraction instead of `addr + k < n` keeps the
+    // check correct when `addr` is close to the u32 maximum, where the
+    // addition would wrap around and wrongly pass.
+    let n = arrayLength(&vertex_buffer);
+    let avail = n - min(addr, n);
+
+    var result: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+
+    switch format {
+        case VFMT_32_32_32_32_FLOAT: {
+            if avail >= 4u {
+                result = vec4<f32>(
+                    bitcast<f32>(vertex_buffer[addr + 0u]),
+                    bitcast<f32>(vertex_buffer[addr + 1u]),
+                    bitcast<f32>(vertex_buffer[addr + 2u]),
+                    bitcast<f32>(vertex_buffer[addr + 3u]),
+                );
+            }
+        }
+        case VFMT_32_32_32_FLOAT: {
+            if avail >= 3u {
+                result = vec4<f32>(
+                    bitcast<f32>(vertex_buffer[addr + 0u]),
+                    bitcast<f32>(vertex_buffer[addr + 1u]),
+                    bitcast<f32>(vertex_buffer[addr + 2u]),
+                    1.0,
+                );
+            }
+        }
+        case VFMT_32_32_FLOAT: {
+            if avail >= 2u {
+                result = vec4<f32>(
+                    bitcast<f32>(vertex_buffer[addr + 0u]),
+                    bitcast<f32>(vertex_buffer[addr + 1u]),
+                    0.0,
+                    1.0,
+                );
+            }
+        }
+        case VFMT_32_FLOAT: {
+            if avail >= 1u {
+                result = vec4<f32>(bitcast<f32>(vertex_buffer[addr]), 0.0, 0.0, 1.0);
+            }
+        }
+        case VFMT_8_8_8_8: {
+            if avail >= 1u {
+                result = unpack4x8unorm(vertex_buffer[addr]);
+            }
+        }
+        case VFMT_16_16_FLOAT: {
+            if avail >= 1u {
+                let h = unpack2x16float(vertex_buffer[addr]);
+                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
+            }
+        }
+        case VFMT_16_16_16_16_FLOAT: {
+            if avail >= 2u {
+                let h0 = unpack2x16float(vertex_buffer[addr]);
+                let h1 = unpack2x16float(vertex_buffer[addr + 1u]);
+                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
+            }
+        }
+        case VFMT_16_16: {
+            if avail >= 1u {
+                // Always decoded as signed-normalised; the instruction's
+                // sign/normalisation flags are not consulted.
+                let h = unpack2x16snorm(vertex_buffer[addr]);
+                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
+            }
+        }
+        case VFMT_16_16_16_16: {
+            if avail >= 2u {
+                let h0 = unpack2x16snorm(vertex_buffer[addr]);
+                let h1 = unpack2x16snorm(vertex_buffer[addr + 1u]);
+                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
+            }
+        }
+        case VFMT_2_10_10_10: {
+            // Always decoded as unsigned-normalised: 10-bit R/G/B in the
+            // low 30 bits, 2-bit A in the top two.
+            if avail >= 1u {
+                let packed = vertex_buffer[addr];
+                let r = f32(packed & 0x3FFu) / 1023.0;
+                let g = f32((packed >> 10u) & 0x3FFu) / 1023.0;
+                let b = f32((packed >> 20u) & 0x3FFu) / 1023.0;
+                let a = f32((packed >> 30u) & 0x3u) / 3.0;
+                result = vec4<f32>(r, g, b, a);
+            }
+        }
+        default: {
+            // Unhandled format: report it and keep the (0, 0, 0, 1) default.
+            // (The previous fallback multiplied the fetched dword by 0.0,
+            // which produced this same value.)
+            reject_mask |= REJECT_VFETCH_FMT;
+        }
+    }
+
+    registers[dst_reg] = result;
+}
+
+// M6 — placeholder texture fetch. Whatever the instruction asks for, the
+// single texture/sampler pair bound at group(1) is sampled (a 1×1 magenta
+// dummy until the P5 texture cache provides per-slot bindings).
+//
+// Only dword 0 of triple `t` is decoded: bits [16:10] give the destination
+// register and bits [23:17] the source register, whose .xy is used as the
+// (u, v) coordinate. `textureSampleLevel` at level 0 needs no implicit
+// derivatives, so the same code serves both VS (`is_vertex`) and PS.
+fn interpret_texture_fetch(t: u32, is_vertex: bool) {
+    var header: u32;
+    if is_vertex {
+        header = vs_instr_dword(t, 0u);
+    } else {
+        header = ps_instr_dword(t, 0u);
+    }
+    let dst_slot = (header >> 10u) & 0x7Fu;
+    let src_slot = (header >> 17u) & 0x7Fu;
+    let coords = registers[src_slot].xy;
+    registers[dst_slot] = textureSampleLevel(xenos_tex, xenos_samp, coords, 0.0);
+}
+
+// Walk an Exec clause's instruction triples (vertex stage).
+//   address:  index of the clause's first triple.
+//   count:    number of triples to run.
+//   sequence: 2-bit-per-triple bitmap. Bit 0 of a pair = serialize flag
+//             (we ignore in MVP); bit 1 = is-fetch.
+// `sequence` is 32 bits wide, so it can describe at most 16 triples. Triples
+// beyond that are treated as ALU (as if their bits were zero): WGSL reduces
+// a run-time shift amount modulo 32, so shifting without this guard would
+// silently re-read the fetch bits of earlier triples.
+fn exec_vs(address: u32, count: u32, sequence: u32) {
+    for (var i: u32 = 0u; i < count; i = i + 1u) {
+        let t = address + i;
+        let is_fetch = (i < 16u) && (((sequence >> (i * 2u + 1u)) & 1u) != 0u);
+        if is_fetch {
+            let opcode = vs_instr_dword(t, 0u) & 0x1Fu;
+            // 0x00 = vertex fetch, 0x01 = texture fetch. Other fetch opcodes
+            // are skipped.
+            if opcode == 0u {
+                interpret_vertex_fetch(t);
+            } else if opcode == 1u {
+                interpret_texture_fetch(t, true);
+            }
+        } else {
+            interpret_alu(t, true);
+        }
+    }
+}
+// Walk an Exec clause's instruction triples (pixel stage). `sequence` holds
+// two bits per triple; the upper bit of each pair marks a fetch, and every
+// fetch in the pixel stage is handled as a texture fetch.
+// `sequence` is 32 bits wide, so it can describe at most 16 triples. Triples
+// beyond that are treated as ALU (as if their bits were zero): WGSL reduces
+// a run-time shift amount modulo 32, so shifting without this guard would
+// silently re-read the fetch bits of earlier triples.
+fn exec_ps(address: u32, count: u32, sequence: u32) {
+    for (var i: u32 = 0u; i < count; i = i + 1u) {
+        let t = address + i;
+        let is_fetch = (i < 16u) && (((sequence >> (i * 2u + 1u)) & 1u) != 0u);
+        if is_fetch {
+            interpret_texture_fetch(t, false);
+        } else {
+            interpret_alu(t, false);
+        }
+    }
+}
+
+// Put all per-invocation interpreter state back to its baseline: every
+// register and loop slot zeroed, flags cleared, `current_alloc` at 0.
+fn reset_state() {
+    // Zero-value constructors clear the arrays in one assignment each.
+    registers = array<vec4<f32>, 128>();
+    loop_counters = array<u32, 4>();
+    loop_starts = array<u32, 4>();
+
+    ps = 0.0;
+    predicate = false;
+    kill_flag = false;
+    current_alloc = 0u;
+    loop_depth = 0u;
+    reject_mask = 0u;
+}
+
+// ── Stage entry points.
+
+// M7 register slots for exports. VS writes position at oPos (convention:
+// Xenos export 0 within an `Alloc(Position)` range lands in registers[32])
+// and a set of interpolators. We track both via `current_alloc`: writes
+// inside each alloc range are tagged and copied out at Exit.
+const OPOS_REG:       u32 = 32u;  // synthetic slot used by the interpreter; vs_main outputs it as the clip-space position
+const OCOLOR_REG:     u32 = 33u;  // color0 scratch slot; vs_main outputs it as VsOut.color
+
+// Vertex entry point: resets the interpreter, seeds fallback values, runs
+// the VS control flow, then outputs registers[OPOS_REG] as the position and
+// registers[OCOLOR_REG] as the colour.
+@vertex
+fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {
+    reset_state();
+
+    // r0.x carries the vertex index so shaders can reach it without a real
+    // vertex fetch.
+    let vertex_f = f32(vidx);
+    registers[0] = vec4<f32>(vertex_f, 0.0, 0.0, 1.0);
+
+    // Fallback position: vertices are spread around a circle of radius 0.35
+    // so a shader that never writes oPos still rasterizes something visible.
+    let vertex_total = max(draw_ctx.vertex_count, 1u);
+    let turn = vertex_f / f32(vertex_total);
+    let theta = turn * 6.2831853;
+    let ring_radius = 0.35;
+    registers[OPOS_REG] = vec4<f32>(cos(theta) * ring_radius, sin(theta) * ring_radius, 0.0, 1.0);
+
+    // Fallback colour: a per-draw tint derived from the draw index.
+    let draw_f = f32(draw_ctx.draw_index);
+    let tint_r = 0.5 + 0.5 * sin(draw_f * 0.37);
+    let tint_g = 0.5 + 0.5 * sin(draw_f * 0.51 + 2.0);
+    let tint_b = 0.5 + 0.5 * sin(draw_f * 0.73 + 4.0);
+    registers[OCOLOR_REG] = vec4<f32>(tint_r, tint_g, tint_b, 1.0);
+
+    // Touch `vertex_buffer` so the binding stays live in the vertex stage;
+    // the multiply by 0.0 makes the contribution zero.
+    let vb_live = f32(vertex_buffer[0]) * 0.0;
+
+    walk_cf_vs();
+
+    // Read the export slots back after the shader has run.
+    let final_pos = registers[OPOS_REG];
+    let final_color = registers[OCOLOR_REG];
+    var out: VsOut;
+    out.position = final_pos;
+    out.color = vec4<f32>(final_color.rgb + vec3<f32>(vb_live), final_color.a);
+    return out;
+}
+
+// Fragment-stage entry point: resets interpreter state, runs the PS
+// control-flow walker, honours a requested kill, and otherwise passes the
+// interpolated vertex color through to color0.
+@fragment
+fn fs_main(in: VsOut) -> FsOut {
+    reset_state();
+    walk_cf_ps();
+
+    // Pixel-shader `kill*` / `kills_*` opcodes only raise `kill_flag`; the
+    // actual `discard` is issued here, at the entry point, once the whole
+    // CF walk has finished.
+    if kill_flag {
+        discard;
+    }
+
+    var frag: FsOut;
+    frag.color0 = in.color;
+    return frag;
+}
+
+// ── CF walker per stage. Shared logic: LOOP_START/LOOP_END iterate up to 16×
+// from `xenos_consts.loop_consts[loop_id]`, COND_JMP honours the instruction's
+// predicate flags, and COND_CALL / RETURN are not executed (there is no call
+// stack); they only set REJECT_CF_CALL in the reject mask. A hard iteration
+// cap keeps the GPU from hanging on malformed or extreme shaders.
+//
+// The cap counts visited CF entries (one per walker loop iteration, including
+// re-visits caused by loops and jumps). When it is reached the walker simply
+// stops; no reject bit is set for that case.
+const CF_WALKER_MAX_ITER: u32 = 4096u;
+
+// Interprets the packed vertex-shader control-flow (CF) table.
+//
+// Each CF entry is a (kind, primary, aux) triple read through the
+// vs_cf_* accessors. The walker visits entries starting at index 0 until
+// it runs off the table, hits EXIT / EXEC_END, or has visited
+// CF_WALKER_MAX_ITER entries. Unsupported constructs are recorded in
+// `reject_mask` rather than executed.
+fn walk_cf_vs() {
+    let cf_n = vs_cf_count();
+    var cf_i: u32 = 0u;
+    var iter: u32 = 0u;
+    loop {
+        // End of table, or safety cap on the number of visited entries.
+        if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
+        iter = iter + 1u;
+        let kind = vs_cf_kind(cf_i);
+        let primary = vs_cf_primary(cf_i);
+        let aux = vs_cf_aux(cf_i);
+        // `advance` is cleared by entries that set cf_i themselves.
+        var advance: bool = true;
+        var stop: bool = false;
+        switch kind {
+            case CF_KIND_EXEC, CF_KIND_EXEC_END: {
+                // aux packs the instruction count (low 8 bits) and the
+                // sequence bits (remaining upper bits).
+                let count = aux & 0xFFu;
+                let sequence = aux >> 8u;
+                exec_vs(primary, count, sequence);
+                if kind == CF_KIND_EXEC_END { stop = true; }
+            }
+            case CF_KIND_ALLOC: { current_alloc = primary; }
+            case CF_KIND_EXIT:  { stop = true; }
+            case CF_KIND_LOOP_START: {
+                let loop_id = aux & 0x1Fu;
+                var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu;
+                if loop_count > 16u {
+                    // Clamp the trip count and report it.
+                    loop_count = 16u;
+                    reject_mask |= REJECT_LOOP_OVERFLOW;
+                }
+                if loop_count > 0u && loop_depth < 4u {
+                    loop_starts[loop_depth] = cf_i;
+                    loop_counters[loop_depth] = loop_count;
+                    loop_depth = loop_depth + 1u;
+                } else {
+                    // The loop is not entered: either its trip count is
+                    // zero or the 4-deep loop stack is full. Falling
+                    // through would run the body once and let its
+                    // LOOP_END pop/decrement the *enclosing* loop's
+                    // entry, so jump past the matching LOOP_END instead.
+                    if loop_count > 0u {
+                        reject_mask |= REJECT_LOOP_OVERFLOW;
+                    }
+                    cf_i = vs_loop_skip_target(cf_i, cf_n);
+                    advance = false;
+                }
+            }
+            case CF_KIND_LOOP_END: {
+                if loop_depth > 0u {
+                    let d = loop_depth - 1u;
+                    if loop_counters[d] > 1u {
+                        // More iterations left: jump back to the entry
+                        // right after the matching LOOP_START.
+                        loop_counters[d] = loop_counters[d] - 1u;
+                        cf_i = loop_starts[d] + 1u;
+                        advance = false;
+                    } else {
+                        // Last iteration done: pop the loop stack.
+                        loop_counters[d] = 0u;
+                        loop_depth = d;
+                    }
+                }
+            }
+            case CF_KIND_COND_JMP: {
+                // aux bit 0: jump is predicated; aux bit 1: predicate
+                // value required to take it. Unpredicated jumps are
+                // always taken.
+                let pred_bits = aux;
+                let is_pred  = (pred_bits & 1u) != 0u;
+                let pred_cnd = (pred_bits & 2u) != 0u;
+                if !is_pred || predicate == pred_cnd {
+                    cf_i = primary;
+                    advance = false;
+                }
+            }
+            case CF_KIND_COND_CALL, CF_KIND_RETURN: {
+                // No call stack — mark and continue.
+                reject_mask |= REJECT_CF_CALL;
+            }
+            default: { reject_mask |= REJECT_CF_JUMP; }
+        }
+        if stop { break; }
+        if advance { cf_i = cf_i + 1u; }
+    }
+}
+
+// Returns the CF index just past the LOOP_END that matches the LOOP_START
+// at `loop_start` in the VS CF table, accounting for nested loops. Returns
+// `cf_n` (which ends the walk) when no matching LOOP_END exists.
+fn vs_loop_skip_target(loop_start: u32, cf_n: u32) -> u32 {
+    var nest: u32 = 0u;
+    for (var i: u32 = loop_start + 1u; i < cf_n; i = i + 1u) {
+        let k = vs_cf_kind(i);
+        if k == CF_KIND_LOOP_START {
+            nest = nest + 1u;
+        } else if k == CF_KIND_LOOP_END {
+            if nest == 0u { return i + 1u; }
+            nest = nest - 1u;
+        }
+    }
+    return cf_n;
+}
+
+// Interprets the packed pixel-shader control-flow (CF) table.
+//
+// Each CF entry is a (kind, primary, aux) triple read through the
+// ps_cf_* accessors. The walker visits entries starting at index 0 until
+// it runs off the table, hits EXIT / EXEC_END, or has visited
+// CF_WALKER_MAX_ITER entries. Unsupported constructs are recorded in
+// `reject_mask` rather than executed.
+fn walk_cf_ps() {
+    let cf_n = ps_cf_count();
+    var cf_i: u32 = 0u;
+    var iter: u32 = 0u;
+    loop {
+        // End of table, or safety cap on the number of visited entries.
+        if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
+        iter = iter + 1u;
+        let kind = ps_cf_kind(cf_i);
+        let primary = ps_cf_primary(cf_i);
+        let aux = ps_cf_aux(cf_i);
+        // `advance` is cleared by entries that set cf_i themselves.
+        var advance: bool = true;
+        var stop: bool = false;
+        switch kind {
+            case CF_KIND_EXEC, CF_KIND_EXEC_END: {
+                // aux packs the instruction count (low 8 bits) and the
+                // sequence bits (remaining upper bits).
+                let count = aux & 0xFFu;
+                let sequence = aux >> 8u;
+                exec_ps(primary, count, sequence);
+                if kind == CF_KIND_EXEC_END { stop = true; }
+            }
+            case CF_KIND_ALLOC: { current_alloc = primary; }
+            case CF_KIND_EXIT:  { stop = true; }
+            case CF_KIND_LOOP_START: {
+                let loop_id = aux & 0x1Fu;
+                var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu;
+                if loop_count > 16u {
+                    // Clamp the trip count and report it.
+                    loop_count = 16u;
+                    reject_mask |= REJECT_LOOP_OVERFLOW;
+                }
+                if loop_count > 0u && loop_depth < 4u {
+                    loop_starts[loop_depth] = cf_i;
+                    loop_counters[loop_depth] = loop_count;
+                    loop_depth = loop_depth + 1u;
+                } else {
+                    // The loop is not entered: either its trip count is
+                    // zero or the 4-deep loop stack is full. Falling
+                    // through would run the body once and let its
+                    // LOOP_END pop/decrement the *enclosing* loop's
+                    // entry, so jump past the matching LOOP_END instead.
+                    if loop_count > 0u {
+                        reject_mask |= REJECT_LOOP_OVERFLOW;
+                    }
+                    cf_i = ps_loop_skip_target(cf_i, cf_n);
+                    advance = false;
+                }
+            }
+            case CF_KIND_LOOP_END: {
+                if loop_depth > 0u {
+                    let d = loop_depth - 1u;
+                    if loop_counters[d] > 1u {
+                        // More iterations left: jump back to the entry
+                        // right after the matching LOOP_START.
+                        loop_counters[d] = loop_counters[d] - 1u;
+                        cf_i = loop_starts[d] + 1u;
+                        advance = false;
+                    } else {
+                        // Last iteration done: pop the loop stack.
+                        loop_counters[d] = 0u;
+                        loop_depth = d;
+                    }
+                }
+            }
+            case CF_KIND_COND_JMP: {
+                // aux bit 0: jump is predicated; aux bit 1: predicate
+                // value required to take it. Unpredicated jumps are
+                // always taken.
+                let pred_bits = aux;
+                let is_pred  = (pred_bits & 1u) != 0u;
+                let pred_cnd = (pred_bits & 2u) != 0u;
+                if !is_pred || predicate == pred_cnd {
+                    cf_i = primary;
+                    advance = false;
+                }
+            }
+            case CF_KIND_COND_CALL, CF_KIND_RETURN: {
+                // No call stack — mark and continue.
+                reject_mask |= REJECT_CF_CALL;
+            }
+            default: { reject_mask |= REJECT_CF_JUMP; }
+        }
+        if stop { break; }
+        if advance { cf_i = cf_i + 1u; }
+    }
+}
+
+// Returns the CF index just past the LOOP_END that matches the LOOP_START
+// at `loop_start` in the PS CF table, accounting for nested loops. Returns
+// `cf_n` (which ends the walk) when no matching LOOP_END exists.
+fn ps_loop_skip_target(loop_start: u32, cf_n: u32) -> u32 {
+    var nest: u32 = 0u;
+    for (var i: u32 = loop_start + 1u; i < cf_n; i = i + 1u) {
+        let k = ps_cf_kind(i);
+        if k == CF_KIND_LOOP_START {
+            nest = nest + 1u;
+        } else if k == CF_KIND_LOOP_END {
+            if nest == 0u { return i + 1u; }
+            nest = nest - 1u;
+        }
+    }
+    return cf_n;
+}