xenia-rs/crates/xenia-gpu/src/shaders/xenos_interp.wgsl

// xenia-rs Xenos runtime microcode interpreter — P3b WGSL.
//
// Bindings (stable across P3b milestones):
//   @group(0) @binding(0) draw_ctx      (uniform, 16 B — XenosDrawConstants)
//   @group(0) @binding(1) xenos_consts  (storage<read>, ~9.2 KB — XenosConstants)
//   @group(0) @binding(2) vs_ucode      (storage<read>, packed VS shader)
//   @group(0) @binding(3) ps_ucode      (storage<read>, packed PS shader)
//   @group(0) @binding(4) vertex_buffer (storage<read>, raw guest VB dwords)
//
// Packed shader layout (both vs_ucode & ps_ucode):
//   [0]                            = cf_count
//   [1 .. 1 + cf_count*3]          = CF table: (kind, primary, aux) × cf_count
//   [1 + cf_count*3 ..]            = instruction triples (3 dwords each)
//
// M3 state (this file): CF walker + operand decode helpers + register file
// scaffold are complete. ALU / fetch bodies are still stubs that fall back
// to the procedural-circle visualisation; M4-M7 fill them in.

// Per-draw parameters, bound as the `draw_ctx` uniform. Four u32 fields
// (16 bytes in total).
struct XenosDrawConstants {
    // Index of the draw; vs_main uses it to derive the fallback colour.
    draw_index: u32,
    // Vertex count of the draw; vs_main uses it (clamped to at least 1) to
    // spread vertices around the fallback circle.
    vertex_count: u32,
    // Primitive kind code. Not read by the code visible in this part of the file.
    prim_kind: u32,
    // Padding that rounds the struct up to 16 bytes.
    _pad: u32,
};

// Guest constant block, bound as the `xenos_consts` storage buffer.
struct XenosConstants {
    // 512 vec4 float constants.
    alu:          array<vec4<f32>, 512>,
    // 256 fetch-constant dwords; interpret_vertex_fetch reads them as
    // two-dword vertex-fetch constants (index * 2).
    fetch:        array<u32, 256>,
    // 8 dwords of boolean constants.
    bool_consts:  array<u32, 8>,
    // 32 loop constants; the CF walker uses the low 8 bits of an entry as the
    // iteration count.
    loop_consts:  array<u32, 32>,
};

// Per-draw constants; vs_main reads them for the procedural fallback.
@group(0) @binding(0) var<uniform>            draw_ctx      : XenosDrawConstants;
// `xenos_consts` is a read-only storage buffer (not uniform) because the
// block contains tightly-packed `array<u32, N>` fields — WGSL's uniform
// address space requires 16-byte element stride, which would triple the
// allocation; storage accepts the natural 4-byte stride.
@group(0) @binding(1) var<storage, read>      xenos_consts  : XenosConstants;
// Packed vertex-shader microcode: [0] = cf_count, then the CF table, then the
// instruction triples.
@group(0) @binding(2) var<storage, read>      vs_ucode      : array<u32>;
// Packed pixel-shader microcode, same layout as `vs_ucode`.
@group(0) @binding(3) var<storage, read>      ps_ucode      : array<u32>;
// Vertex data as raw dwords; interpret_vertex_fetch indexes it and bounds-checks
// against arrayLength().
@group(0) @binding(4) var<storage, read>      vertex_buffer : array<u32>;

// M6 — texture fetch stub. Group 1 is a single 1×1 magenta placeholder for
// all texture slots; the P5 texture cache will replace this with per-slot
// bindings.
// interpret_texture_fetch samples this pair with textureSampleLevel at LOD 0.
@group(1) @binding(0) var xenos_tex  : texture_2d<f32>;
@group(1) @binding(1) var xenos_samp : sampler;

// ── CF kind codes; must match `xenia_gpu::ucode::cf_kind`. ─────────────
// These codes are compared against the low 8 bits of the first dword of each
// CF table entry (see vs_cf_kind / ps_cf_kind).
const CF_KIND_EXEC:        u32 = 0u;
const CF_KIND_EXEC_END:    u32 = 1u;
const CF_KIND_ALLOC:       u32 = 2u;
const CF_KIND_EXIT:        u32 = 3u;
const CF_KIND_LOOP_START:  u32 = 4u;
const CF_KIND_LOOP_END:    u32 = 5u;
const CF_KIND_COND_JMP:    u32 = 6u;
const CF_KIND_COND_CALL:   u32 = 7u;
const CF_KIND_RETURN:      u32 = 8u;
const CF_KIND_UNKNOWN:     u32 = 15u;

// ── Alloc-kind codes (mirrors `xenia_gpu::ucode::cf_alloc_kind`). ──────
// An Alloc clause stores its `primary` dword into `current_alloc`; these are
// the values that dword is expected to take.
const ALLOC_KIND_POSITION:      u32 = 0u;
const ALLOC_KIND_INTERPOLATORS: u32 = 1u;
const ALLOC_KIND_COLORS:        u32 = 2u;

// Per-invocation Xenos register file + scalar `ps` + predicate.
// 128 general registers; all register indices are masked or range-checked to
// 0..127 before use.
var<private> registers: array<vec4<f32>, 128>;
// Result of the most recent scalar op; fed back as `prev` to the next one.
var<private> ps: f32;
// Predicate flag written by the SETP_* scalar ops and tested by predicated
// ALU instructions.
var<private> predicate: bool;

// Currently-active export alloc kind; set by Alloc clauses.
var<private> current_alloc: u32;

// P3c additions:
//   `kill_flag` — set by vector/scalar kill ops; PS calls `discard` after the
//                 interpreter exits. (`discard` inside a helper function is
//                 allowed in WGSL, but keeping it at the entry level makes
//                 control flow easier to read.)
//   `loop_depth`/`loop_counters` — bookkeeping for LoopStart/LoopEnd CF
//                 clauses. Xenos supports up to 4 nested loops.
//   `loop_starts` — CF index of the LoopStart clause for each active nesting
//                 level; LoopEnd jumps back to the entry after it.
//   `reject_mask` — bitfield of op categories we failed to interpret, so the
//                 PS fallback color + host-side diagnostics can surface it.
var<private> kill_flag: bool;
var<private> loop_depth: u32;
var<private> loop_counters: array<u32, 4>;
var<private> loop_starts: array<u32, 4>;
var<private> reject_mask: u32;

// Bit flags OR-ed into `reject_mask`; each is a distinct power of two.
// Unsupported or approximated vector opcode.
const REJECT_ALU_VEC:      u32 = 1u;
// Unsupported scalar opcode.
const REJECT_ALU_SCL:      u32 = 2u;
const REJECT_TEX_NON2D:    u32 = 4u;
// Vertex fetch with a format that has no decode path.
const REJECT_VFETCH_FMT:   u32 = 8u;
const REJECT_CF_JUMP:      u32 = 16u;
const REJECT_CF_CALL:      u32 = 32u;
// Loop constant asked for more than 16 iterations and was clamped.
const REJECT_LOOP_OVERFLOW:u32 = 64u;

// Vertex-stage output; also the input type of fs_main.
struct VsOut {
    // Clip-space position handed to the rasterizer.
    @builtin(position) position: vec4<f32>,
    // Colour interpolated to the fragment stage at location 0.
    @location(0) color: vec4<f32>,
};

// Fragment-stage output: a single colour written to attachment 0.
struct FsOut {
    @location(0) color0: vec4<f32>,
};

// ── CF helpers: read (kind,primary,aux) from a packed ucode storage buffer.

// Number of CF entries in the packed vertex shader (dword 0 of `vs_ucode`).
fn vs_cf_count() -> u32 {
    let clause_total = vs_ucode[0];
    return clause_total;
}

// Number of CF entries in the packed pixel shader (dword 0 of `ps_ucode`).
fn ps_cf_count() -> u32 {
    let clause_total = ps_ucode[0];
    return clause_total;
}

// CF table accessors. Entry `i` occupies three consecutive dwords starting at
// index 1 + 3*i, in the order (kind, primary, aux). Only the low 8 bits of
// the kind dword are returned.
fn vs_cf_kind(i: u32) -> u32 {
    let slot = i * 3u + 1u;
    return vs_ucode[slot] & 0xFFu;
}

fn vs_cf_primary(i: u32) -> u32 {
    let slot = i * 3u + 1u;
    return vs_ucode[slot + 1u];
}

fn vs_cf_aux(i: u32) -> u32 {
    let slot = i * 3u + 1u;
    return vs_ucode[slot + 2u];
}

fn ps_cf_kind(i: u32) -> u32 {
    let slot = i * 3u + 1u;
    return ps_ucode[slot] & 0xFFu;
}

fn ps_cf_primary(i: u32) -> u32 {
    let slot = i * 3u + 1u;
    return ps_ucode[slot + 1u];
}

fn ps_cf_aux(i: u32) -> u32 {
    let slot = i * 3u + 1u;
    return ps_ucode[slot + 2u];
}

// Dword index of the first instruction triple in `vs_ucode`: one header dword
// plus three dwords per CF entry.
fn vs_instr_base() -> u32 {
    let cf_dwords = vs_ucode[0] * 3u;
    return cf_dwords + 1u;
}

// Dword index of the first instruction triple in `ps_ucode`.
fn ps_instr_base() -> u32 {
    let cf_dwords = ps_ucode[0] * 3u;
    return cf_dwords + 1u;
}

// Fetch one of the 3 dwords of an ALU/fetch triple at triple index `t`.
// Returns dword `which` of instruction triple `t` from the vertex shader.
fn vs_instr_dword(t: u32, which: u32) -> u32 {
    let triple_start = vs_instr_base() + t * 3u;
    return vs_ucode[triple_start + which];
}

// Returns dword `which` of instruction triple `t` from the pixel shader.
fn ps_instr_dword(t: u32, which: u32) -> u32 {
    let triple_start = ps_instr_base() + t * 3u;
    return ps_ucode[triple_start + which];
}

// ── Operand helpers: M3 scaffold; M4 wires them into interpret_alu.

// Rearranges the lanes of `v`. `swizzle` holds four 2-bit source-lane
// selectors, lowest bits first: bits 1:0 pick the source of output x,
// bits 3:2 of y, bits 5:4 of z, bits 7:6 of w.
fn apply_swizzle(v: vec4<f32>, swizzle: u32) -> vec4<f32> {
    let lane = (vec4<u32>(swizzle) >> vec4<u32>(0u, 2u, 4u, 6u)) & vec4<u32>(3u);
    return vec4<f32>(v[lane.x], v[lane.y], v[lane.z], v[lane.w]);
}

// Applies the optional source modifiers to `v`: absolute value first (when
// `take_abs`), then negation (when `negate`), so both together give -|v|.
fn apply_modifiers(v: vec4<f32>, negate: bool, take_abs: bool) -> vec4<f32> {
    let magnitude = select(v, abs(v), take_abs);
    return select(magnitude, -magnitude, negate);
}

// Writes `value` into registers[idx], touching only the lanes whose bit is set
// in `mask` (bit 0 = x, bit 1 = y, bit 2 = z, bit 3 = w). Indices of 128 or
// more are ignored.
fn write_reg_masked(idx: u32, mask: u32, value: vec4<f32>) {
    if idx < 128u {
        let lane_enabled = vec4<bool>(
            (mask & 1u) != 0u,
            (mask & 2u) != 0u,
            (mask & 4u) != 0u,
            (mask & 8u) != 0u,
        );
        // Enabled lanes take the new value, the rest keep the old contents.
        registers[idx] = select(registers[idx], value, lane_enabled);
    }
}

// ── Xenos ALU opcodes. Values match canary's `AluVectorOpcode` and
// `AluScalarOpcode` enums in `ucode.h:1001,1354` (also mirrored in
// `xenia_gpu::ucode::alu::{vop,sop}`).
// Vector-pipe opcodes, dispatched by exec_vector_op. Values 20–23 have no
// constant here, so they take that function's default (reject) branch.
const VOP_ADD:          u32 = 0u;
const VOP_MUL:          u32 = 1u;
const VOP_MAX:          u32 = 2u;
const VOP_MIN:          u32 = 3u;
const VOP_SEQ:          u32 = 4u;
const VOP_SGT:          u32 = 5u;
const VOP_SGE:          u32 = 6u;
const VOP_SNE:          u32 = 7u;
const VOP_FRC:          u32 = 8u;
const VOP_TRUNC:        u32 = 9u;
const VOP_FLOOR:        u32 = 10u;
const VOP_MAD:          u32 = 11u;
const VOP_CND_EQ:       u32 = 12u;
const VOP_CND_GE:       u32 = 13u;
const VOP_CND_GT:       u32 = 14u;
const VOP_DP4:          u32 = 15u;
const VOP_DP3:          u32 = 16u;
const VOP_DP2_ADD:      u32 = 17u;
const VOP_CUBE:         u32 = 18u;
const VOP_MAX4:         u32 = 19u;
const VOP_KILL_EQ:      u32 = 24u;
const VOP_KILL_GT:      u32 = 25u;
const VOP_KILL_GE:      u32 = 26u;
const VOP_KILL_NE:      u32 = 27u;
const VOP_DST:          u32 = 28u;
const VOP_MAX_A:        u32 = 29u;

// Scalar-pipe opcodes, dispatched by exec_scalar_op. Values without a constant
// here (4, 23, 24, 41–47) take that function's default (reject) branch.
const SOP_ADDS:         u32 = 0u;
const SOP_ADDS_PREV:    u32 = 1u;
const SOP_MULS:         u32 = 2u;
const SOP_MULS_PREV:    u32 = 3u;
const SOP_MAXS:         u32 = 5u;
const SOP_MINS:         u32 = 6u;
const SOP_SEQS:         u32 = 7u;
const SOP_SGTS:         u32 = 8u;
const SOP_SGES:         u32 = 9u;
const SOP_SNES:         u32 = 10u;
const SOP_FRCS:         u32 = 11u;
const SOP_TRUNCS:       u32 = 12u;
const SOP_FLOORS:       u32 = 13u;
const SOP_EXP:          u32 = 14u;
const SOP_LOGC:         u32 = 15u;
const SOP_LOG:          u32 = 16u;
const SOP_RCPC:         u32 = 17u;
const SOP_RCPF:         u32 = 18u;
const SOP_RCP:          u32 = 19u;
const SOP_RSQC:         u32 = 20u;
const SOP_RSQF:         u32 = 21u;
const SOP_RSQ:          u32 = 22u;
const SOP_SUBS:         u32 = 25u;
const SOP_SUBS_PREV:    u32 = 26u;
const SOP_SETP_EQ:      u32 = 27u;
const SOP_SETP_NE:      u32 = 28u;
const SOP_SETP_GT:      u32 = 29u;
const SOP_SETP_GE:      u32 = 30u;
const SOP_SETP_INV:     u32 = 31u;
const SOP_SETP_POP:     u32 = 32u;
const SOP_SETP_CLR:     u32 = 33u;
const SOP_SETP_RSTR:    u32 = 34u;
const SOP_KILLS_EQ:     u32 = 35u;
const SOP_KILLS_GT:     u32 = 36u;
const SOP_KILLS_GE:     u32 = 37u;
const SOP_KILLS_NE:     u32 = 38u;
const SOP_KILLS_ONE:    u32 = 39u;
const SOP_SQRT:         u32 = 40u;
const SOP_SIN:          u32 = 48u;
const SOP_COS:          u32 = 49u;
const SOP_RETAIN_PREV:  u32 = 50u;

// Fetches a vec4 source operand. For the MVP the operand byte is treated as a
// plain r# index: only its low 7 bits are used, and the c# selector, swizzle
// and modifiers are not decoded. M4+ will extend this to the full operand
// header.
fn read_src(idx: u32) -> vec4<f32> {
    let reg_index = idx & 0x7Fu;
    return registers[reg_index];
}

// Vector-pipe executor: applies vector opcode `op` to the source operands
// `a`, `b`, `c` and returns the vec4 result.
// Side effects: the KILL_* ops set `kill_flag` when any lane satisfies the
// comparison; CUBE, MAX_A and unknown opcodes set REJECT_ALU_VEC in
// `reject_mask`.
fn exec_vector_op(op: u32, a: vec4<f32>, b: vec4<f32>, c: vec4<f32>) -> vec4<f32> {
    let zeros = vec4<f32>(0.0);
    let ones = vec4<f32>(1.0);
    switch op {
        case VOP_ADD: { return a + b; }
        case VOP_MUL: { return a * b; }
        case VOP_MAX: { return max(a, b); }
        case VOP_MIN: { return min(a, b); }

        // Set-on-compare: each lane becomes 1.0 where the comparison of
        // a against b holds and 0.0 elsewhere.
        case VOP_SEQ: { return select(zeros, ones, a == b); }
        case VOP_SGT: { return select(zeros, ones, a > b); }
        case VOP_SGE: { return select(zeros, ones, a >= b); }
        case VOP_SNE: { return select(zeros, ones, a != b); }

        case VOP_FRC:   { return fract(a); }
        case VOP_TRUNC: { return trunc(a); }
        case VOP_FLOOR: { return floor(a); }
        case VOP_MAD:   { return a * b + c; }

        // Conditional move: per lane, pick b where a compares against zero
        // as named, otherwise c.
        case VOP_CND_EQ: { return select(c, b, a == zeros); }
        case VOP_CND_GE: { return select(c, b, a >= zeros); }
        case VOP_CND_GT: { return select(c, b, a > zeros); }

        // Dot products and MAX4 broadcast one scalar to all four lanes.
        case VOP_DP4: { return vec4<f32>(dot(a, b)); }
        case VOP_DP3: { return vec4<f32>(dot(a.xyz, b.xyz)); }
        case VOP_DP2_ADD: {
            return vec4<f32>(a.x * b.x + a.y * b.y + c.x);
        }
        case VOP_MAX4: {
            return vec4<f32>(max(max(a.x, a.y), max(a.z, a.w)));
        }

        // Kill ops: if any lane passes the comparison, flag the pixel for
        // discard and return all ones; otherwise return all zeros.
        case VOP_KILL_EQ: {
            if any(a == b) {
                kill_flag = true;
                return ones;
            }
            return zeros;
        }
        case VOP_KILL_GT: {
            if any(a > b) {
                kill_flag = true;
                return ones;
            }
            return zeros;
        }
        case VOP_KILL_GE: {
            if any(a >= b) {
                kill_flag = true;
                return ones;
            }
            return zeros;
        }
        case VOP_KILL_NE: {
            if any(a != b) {
                kill_flag = true;
                return ones;
            }
            return zeros;
        }

        case VOP_DST: {
            // dest = (1, src0.y * src1.y, src0.z, src1.w)
            return vec4<f32>(1.0, a.y * b.y, a.z, b.w);
        }
        case VOP_CUBE, VOP_MAX_A: {
            // Neither op is implemented; both are flagged as rejects and
            // approximated with a plain component-wise max(a, b).
            reject_mask |= REJECT_ALU_VEC;
            return max(a, b);
        }
        default: {
            // Unknown opcode: flag it and produce all zeros.
            reject_mask |= REJECT_ALU_VEC;
            return zeros;
        }
    }
}

// Scalar-pipe executor. Returns the result of scalar opcode `op`.
//   `src_a` / `src_b` — scalar operands; only ADDS, MULS, SUBS, MAXS and MINS
//                       read `src_b`.
//   `prev`            — the running `ps` value, used by the *_PREV ops and
//                       RETAIN_PREV.
// Side effects: SETP_* ops write `predicate`, KILLS_* ops may set `kill_flag`,
// and an unknown opcode sets REJECT_ALU_SCL in `reject_mask`.
fn exec_scalar_op(op: u32, src_a: f32, src_b: f32, prev: f32) -> f32 {
    switch op {
        // ── Arithmetic.
        case SOP_ADDS:      { return src_a + src_b; }
        case SOP_ADDS_PREV: { return src_a + prev; }
        case SOP_MULS:      { return src_a * src_b; }
        case SOP_MULS_PREV: { return src_a * prev; }
        case SOP_SUBS:      { return src_a - src_b; }
        case SOP_SUBS_PREV: { return src_a - prev; }
        case SOP_MAXS:      { return max(src_a, src_b); }
        case SOP_MINS:      { return min(src_a, src_b); }

        // ── Compare against zero; yields 1.0 when the test holds, else 0.0.
        case SOP_SEQS: {
            if src_a == 0.0 { return 1.0; }
            return 0.0;
        }
        case SOP_SGTS: {
            if src_a > 0.0 { return 1.0; }
            return 0.0;
        }
        case SOP_SGES: {
            if src_a >= 0.0 { return 1.0; }
            return 0.0;
        }
        case SOP_SNES: {
            if src_a != 0.0 { return 1.0; }
            return 0.0;
        }

        // ── Rounding.
        case SOP_FRCS:   { return fract(src_a); }
        case SOP_TRUNCS: { return trunc(src_a); }
        case SOP_FLOORS: { return floor(src_a); }

        // ── Transcendentals.
        case SOP_EXP: {
            // Base-2 exponential.
            return exp2(src_a);
        }
        case SOP_LOG, SOP_LOGC: {
            // Both variants compute log2; an input of exactly 1.0 returns an
            // exact 0.0. No clamping is applied for inputs <= 0.
            if src_a == 1.0 { return 0.0; }
            return log2(src_a);
        }
        case SOP_RCP, SOP_RCPC, SOP_RCPF: {
            // Reciprocal; a zero input yields 0.0 instead of dividing.
            if src_a != 0.0 { return 1.0 / src_a; }
            return 0.0;
        }
        case SOP_RSQ, SOP_RSQC, SOP_RSQF: {
            // Reciprocal square root for positive inputs, 0.0 otherwise.
            if src_a > 0.0 { return inverseSqrt(src_a); }
            return 0.0;
        }
        case SOP_SQRT: {
            // Square root for non-negative inputs, 0.0 otherwise.
            if src_a >= 0.0 { return sqrt(src_a); }
            return 0.0;
        }
        case SOP_SIN: { return sin(src_a); }
        case SOP_COS: { return cos(src_a); }

        // ── Predicate writes. Each sets `predicate` and also returns a
        // value; for EQ/NE/GT/GE that value is 0.0 when the predicate
        // became true and 1.0 when it became false.
        case SOP_SETP_EQ: {
            predicate = (src_a == 0.0);
            if predicate { return 0.0; }
            return 1.0;
        }
        case SOP_SETP_NE: {
            predicate = (src_a != 0.0);
            if predicate { return 0.0; }
            return 1.0;
        }
        case SOP_SETP_GT: {
            predicate = (src_a > 0.0);
            if predicate { return 0.0; }
            return 1.0;
        }
        case SOP_SETP_GE: {
            predicate = (src_a >= 0.0);
            if predicate { return 0.0; }
            return 1.0;
        }
        case SOP_SETP_INV: {
            // True only for an input of exactly 1.0. Otherwise the input is
            // passed through, except that 0.0 is turned into 1.0.
            predicate = (src_a == 1.0);
            if predicate { return 0.0; }
            if src_a == 0.0 { return 1.0; }
            return src_a;
        }
        case SOP_SETP_POP: {
            // Decrement; the predicate becomes true once the decremented
            // value is <= 0.
            let popped = src_a - 1.0;
            predicate = (popped <= 0.0);
            if predicate { return 0.0; }
            return popped;
        }
        case SOP_SETP_CLR: {
            predicate = false;
            // Bit pattern 0x7F7FFFFF is the largest finite f32 (FLT_MAX).
            return bitcast<f32>(0x7F7FFFFFu);
        }
        case SOP_SETP_RSTR: {
            predicate = (src_a == 0.0);
            if predicate { return 0.0; }
            return src_a;
        }

        // ── Pixel kills. When the test holds, `kill_flag` is set and the
        // result is 1.0; otherwise the result is 0.0. The fragment entry
        // point performs the actual `discard`.
        case SOP_KILLS_EQ: {
            let hit = src_a == 0.0;
            if hit { kill_flag = true; }
            return select(0.0, 1.0, hit);
        }
        case SOP_KILLS_GT: {
            let hit = src_a > 0.0;
            if hit { kill_flag = true; }
            return select(0.0, 1.0, hit);
        }
        case SOP_KILLS_GE: {
            let hit = src_a >= 0.0;
            if hit { kill_flag = true; }
            return select(0.0, 1.0, hit);
        }
        case SOP_KILLS_NE: {
            let hit = src_a != 0.0;
            if hit { kill_flag = true; }
            return select(0.0, 1.0, hit);
        }
        case SOP_KILLS_ONE: {
            let hit = src_a == 1.0;
            if hit { kill_flag = true; }
            return select(0.0, 1.0, hit);
        }

        case SOP_RETAIN_PREV: { return prev; }
        default: {
            // Unknown opcode: flag it and produce 0.0.
            reject_mask |= REJECT_ALU_SCL;
            return 0.0;
        }
    }
}

// Executes ALU instruction triple `t` from the vertex shader (`is_vertex`) or
// the pixel shader: runs the vector op, then the scalar op, and writes each
// result through its write mask. A predicated instruction whose condition does
// not match `predicate` does nothing.
fn interpret_alu(t: u32, is_vertex: bool) {
    // Load all three dwords of the triple from the stage's microcode buffer.
    var words: vec3<u32>;
    if is_vertex {
        words = vec3<u32>(
            vs_instr_dword(t, 0u),
            vs_instr_dword(t, 1u),
            vs_instr_dword(t, 2u),
        );
    } else {
        words = vec3<u32>(
            ps_instr_dword(t, 0u),
            ps_instr_dword(t, 1u),
            ps_instr_dword(t, 2u),
        );
    }
    // Field positions follow `xenia_gpu::ucode::alu::decode_alu`.
    // words.x: source indices and predicate/ps flags. words.z: opcodes,
    // destinations and write masks. words.y (swizzles, negate/abs, c-vs-r
    // flags) is not decoded in the MVP: every source is a plain r#.
    let operands = words.x;
    let control = words.z;

    // Predication: bit 27 enables it, bit 28 is the required predicate value.
    let is_predicated = ((operands >> 27u) & 1u) != 0u;
    let wanted_predicate = ((operands >> 28u) & 1u) != 0u;
    if is_predicated && (predicate != wanted_predicate) {
        return;
    }

    // Sources are read before either pipe writes, so the scalar pipe sees
    // the pre-instruction register contents.
    let a = read_src(operands & 0xFFu);
    let b = read_src((operands >> 8u) & 0xFFu);
    let c = read_src((operands >> 16u) & 0xFFu);

    // Vector pipe. The op always runs (it may set kill_flag / reject_mask);
    // the register write happens only with a non-zero mask.
    let vec_result = exec_vector_op(control & 0x3Fu, a, b, c);
    let vec_mask = (control >> 12u) & 0xFu;
    if vec_mask != 0u {
        write_reg_masked((control >> 16u) & 0x7Fu, vec_mask, vec_result);
    }

    // Scalar pipe. The primary operand is a.x, or the running `ps` when bit 26
    // of the first dword is set; the secondary operand is b.x.
    // NOTE(review): the scalar opcode is taken from bits 11:6 and the scalar
    // write mask from bits 11:8, so the two fields overlap — confirm against
    // decode_alu.
    var scalar_in = a.x;
    if ((operands >> 26u) & 1u) != 0u {
        scalar_in = ps;
    }
    ps = exec_scalar_op((control >> 6u) & 0x3Fu, scalar_in, b.x, ps);
    let scl_mask = (control >> 8u) & 0xFu;
    if scl_mask != 0u {
        // The scalar result is broadcast to every enabled lane.
        write_reg_masked((control >> 24u) & 0x7Fu, scl_mask, vec4<f32>(ps));
    }
}
// Xenos VertexFormat values from `xenos.h:641`.
// Format codes matched against the 6-bit format field in
// interpret_vertex_fetch. Codes that function has no case for
// (10_11_11, 11_11_10, 32, 32_32, 32_32_32_32) take its default branch and
// set REJECT_VFETCH_FMT.
const VFMT_8_8_8_8:         u32 = 6u;
const VFMT_2_10_10_10:      u32 = 7u;
const VFMT_10_11_11:        u32 = 16u;
const VFMT_11_11_10:        u32 = 17u;
const VFMT_16_16:           u32 = 25u;
const VFMT_16_16_16_16:     u32 = 26u;
const VFMT_16_16_FLOAT:     u32 = 31u;
const VFMT_16_16_16_16_FLOAT:u32 = 32u;
const VFMT_32:              u32 = 33u;
const VFMT_32_32:           u32 = 34u;
const VFMT_32_32_32_32:     u32 = 35u;
const VFMT_32_FLOAT:        u32 = 36u;
const VFMT_32_32_FLOAT:     u32 = 37u;
const VFMT_32_32_32_32_FLOAT:u32 = 38u;
const VFMT_32_32_32_FLOAT:  u32 = 57u;

// Executes vertex-fetch instruction triple `t` of the vertex shader: reads one
// element from `vertex_buffer` and stores it in the destination register.
//
// Fields as extracted by this function:
//   w0 [9:5]    fetch-constant index
//   w0 [16:10]  destination register
//   w0 [23:17]  source register (its .x supplies the vertex index)
//   w1 [21:16]  format (one of the VFMT_* codes)
//   w2 [7:0]    stride (in dwords)
//
// NOTE(review): this header previously documented canary's
// VertexFetchInstruction layout (`ucode.h:690`) as
//   w0 [4:0]   opcode
//   w0 [10:5]  src_reg[5:0]
//   w0 [17:11] dst_reg[6:0] + must-be-one
//   w0 [21:17] const_index[4:0], [23:22] const_index_sel[1:0]
//   w1 [21:16] format[5:0]
//   w2 [7:0]   stride (in dwords)
//   w2 [30:8]  offset (signed, in dwords)
// which does not match the shifts used below — confirm which one the packed
// microcode actually follows.
//
// Pragmatic subset: only format and stride are decoded. Offset, exp_adjust,
// swizzle and sign/normalization flags are ignored, and each format uses one
// fixed interpretation. Formats without a decode path set `REJECT_VFETCH_FMT`.
// An element that does not fit inside `vertex_buffer` leaves the default
// result (0, 0, 0, 1).
fn interpret_vertex_fetch(t: u32) {
    let w0 = vs_instr_dword(t, 0u);
    let w1 = vs_instr_dword(t, 1u);
    let w2 = vs_instr_dword(t, 2u);
    let fetch_const = (w0 >> 5u) & 0x1Fu;
    let dst_reg = (w0 >> 10u) & 0x7Fu;
    let src_reg = (w0 >> 17u) & 0x7Fu;
    let format  = (w1 >> 16u) & 0x3Fu;
    let stride  = w2 & 0xFFu;

    // Vertex-fetch constant: dword 0 carries (type[1:0], address[31:2]);
    // dword 1 carries (endian[1:0], size[25:2]). Only dword 0 is used: the
    // type bits are dropped and the rest is used as a dword index.
    let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
    let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;

    let vidx = u32(registers[src_reg].x);
    // Element address in dwords. A zero stride field is replaced by 4 dwords.
    // NOTE(review): the earlier comment described stride==0 as "1 element
    // total (non-indexed)", which would suggest a stride of 0 — confirm.
    let effective_stride = select(stride, 4u, stride == 0u);
    let addr = base_dwords + vidx * effective_stride;

    let n = arrayLength(&vertex_buffer);
    // Number of dwords readable starting at `addr` (0 when `addr` is already
    // past the end). Comparing element sizes against this, rather than
    // testing `addr + k < n`, cannot wrap around when `addr` is within k of
    // 0xFFFFFFFF.
    var avail: u32 = 0u;
    if addr < n {
        avail = n - addr;
    }
    var result: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);

    switch format {
        case VFMT_32_32_32_32_FLOAT: {
            if avail >= 4u {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    bitcast<f32>(vertex_buffer[addr + 2u]),
                    bitcast<f32>(vertex_buffer[addr + 3u]),
                );
            }
        }
        case VFMT_32_32_32_FLOAT: {
            if avail >= 3u {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    bitcast<f32>(vertex_buffer[addr + 2u]),
                    1.0,
                );
            }
        }
        case VFMT_32_32_FLOAT: {
            if avail >= 2u {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    0.0,
                    1.0,
                );
            }
        }
        case VFMT_32_FLOAT: {
            if avail >= 1u {
                result = vec4<f32>(bitcast<f32>(vertex_buffer[addr]), 0.0, 0.0, 1.0);
            }
        }
        case VFMT_8_8_8_8: {
            // Always decoded as unsigned normalized.
            if avail >= 1u {
                result = unpack4x8unorm(vertex_buffer[addr]);
            }
        }
        case VFMT_16_16_FLOAT: {
            if avail >= 1u {
                let h = unpack2x16float(vertex_buffer[addr]);
                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
            }
        }
        case VFMT_16_16_16_16_FLOAT: {
            if avail >= 2u {
                let h0 = unpack2x16float(vertex_buffer[addr]);
                let h1 = unpack2x16float(vertex_buffer[addr + 1u]);
                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
            }
        }
        case VFMT_16_16: {
            if avail >= 1u {
                // Default to signed normalized; unsigned variants differ
                // only for sign-extension and are less common on Xenos VBs.
                let h = unpack2x16snorm(vertex_buffer[addr]);
                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
            }
        }
        case VFMT_16_16_16_16: {
            // Signed normalized, as for VFMT_16_16.
            if avail >= 2u {
                let h0 = unpack2x16snorm(vertex_buffer[addr]);
                let h1 = unpack2x16snorm(vertex_buffer[addr + 1u]);
                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
            }
        }
        case VFMT_2_10_10_10: {
            // Unpack 10-bit R/G/B + 2-bit A as unsigned normalized. Signed
            // and unnormalized variants are not handled here.
            if avail >= 1u {
                let packed = vertex_buffer[addr];
                let r = f32(packed & 0x3FFu) / 1023.0;
                let g = f32((packed >> 10u) & 0x3FFu) / 1023.0;
                let b = f32((packed >> 20u) & 0x3FFu) / 1023.0;
                let a = f32((packed >> 30u) & 0x3u) / 3.0;
                result = vec4<f32>(r, g, b, a);
            }
        }
        default: {
            reject_mask |= REJECT_VFETCH_FMT;
            // The buffer is still read, but the value is multiplied by 0.0,
            // so x is 0.0 whatever the fetched dword holds.
            if avail >= 1u {
                result = vec4<f32>(f32(vertex_buffer[addr]) * 0.0, 0.0, 0.0, 1.0);
            }
        }
    }

    registers[dst_reg] = result;
}

// M6 — minimum-viable texture fetch. Every fetch samples the single texture
// bound at group(1) (a 1×1 magenta dummy until the P5 texture cache provides
// per-slot bindings). The (u, v) coordinate comes from the source register's
// .xy, and the sample overwrites the whole destination register.
// `textureSampleLevel` takes an explicit LOD, so it needs no implicit
// derivatives and the same code serves both the vertex and fragment stage.
fn interpret_texture_fetch(t: u32, is_vertex: bool) {
    // Only the first dword of the triple is decoded.
    var header: u32;
    if is_vertex {
        header = vs_instr_dword(t, 0u);
    } else {
        header = ps_instr_dword(t, 0u);
    }
    let dst = (header >> 10u) & 0x7Fu;
    let src = (header >> 17u) & 0x7Fu;
    let coord = registers[src].xy;
    registers[dst] = textureSampleLevel(xenos_tex, xenos_samp, coord, 0.0);
}

// Walk a vertex-shader Exec clause: run `count` instruction triples starting
// at triple index `address`.
//   sequence: 2-bit-per-triple bitmap. Bit 0 of a pair = serialize flag
//             (we ignore in MVP); bit 1 = is-fetch.
// Fetch triples dispatch on the low 5 bits of their first dword (0 = vertex
// fetch, 1 = texture fetch; any other fetch opcode is skipped). All other
// triples are executed as ALU instructions.
fn exec_vs(address: u32, count: u32, sequence: u32) {
    for (var i: u32 = 0u; i < count; i = i + 1u) {
        let t = address + i;
        // WGSL takes a runtime shift amount modulo 32, so without this guard
        // triples 16 and up would re-read the fetch bits of earlier triples.
        // Positions past the end of `sequence` are treated as "not a fetch".
        let fetch_bit = i * 2u + 1u;
        let is_fetch = (fetch_bit < 32u) && (((sequence >> fetch_bit) & 1u) != 0u);
        if is_fetch {
            let opcode = vs_instr_dword(t, 0u) & 0x1Fu;
            // 0x00 = vertex fetch, 0x01 = texture fetch.
            if opcode == 0u {
                interpret_vertex_fetch(t);
            } else if opcode == 1u {
                interpret_texture_fetch(t, true);
            }
        } else {
            interpret_alu(t, true);
        }
    }
}
// Walk a pixel-shader Exec clause: run `count` instruction triples starting at
// triple index `address`. In `sequence`, bit (2*i + 1) marks triple i as a
// fetch; every fetch is executed as a texture fetch, everything else as an ALU
// instruction.
fn exec_ps(address: u32, count: u32, sequence: u32) {
    for (var i: u32 = 0u; i < count; i = i + 1u) {
        let t = address + i;
        // WGSL takes a runtime shift amount modulo 32, so without this guard
        // triples 16 and up would re-read the fetch bits of earlier triples.
        // Positions past the end of `sequence` are treated as "not a fetch".
        let fetch_bit = i * 2u + 1u;
        let is_fetch = (fetch_bit < 32u) && (((sequence >> fetch_bit) & 1u) != 0u);
        if is_fetch {
            interpret_texture_fetch(t, false);
        } else {
            interpret_alu(t, false);
        }
    }
}

// Puts all per-invocation interpreter state back to its baseline: every
// register and loop slot zeroed, flags cleared, `ps` at 0.0.
fn reset_state() {
    // Zero-value constructors clear whole arrays in one assignment.
    registers = array<vec4<f32>, 128>();
    loop_counters = array<u32, 4>();
    loop_starts = array<u32, 4>();

    ps = 0.0;
    predicate = false;
    kill_flag = false;
    current_alloc = 0u;
    loop_depth = 0u;
    reject_mask = 0u;
}

// ── Stage entry points.

// M7 register slots for exports. VS writes position at oPos (convention:
// Xenos export 0 within an `Alloc(Position)` range lands in registers[32])
// and a set of interpolators. We track both via `current_alloc`: writes
// inside each alloc range are tagged and copied out at Exit.
// Indices into `registers`. vs_main seeds both slots with fallback values
// before walking the CF table and reads them back for its output afterwards.
const OPOS_REG:       u32 = 32u;  // synthetic slot used by the interpreter
const OCOLOR_REG:     u32 = 33u;  // color0 scratch slot

// Vertex entry point. Seeds the register file, interprets the vertex shader's
// CF table, then outputs whatever ended up in the position and colour slots.
@vertex
fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {
    reset_state();

    // r0.x carries the vertex index, so shaders can use it without a real
    // vertex fetch.
    registers[0] = vec4<f32>(f32(vidx), 0.0, 0.0, 1.0);

    // Fallback position: vertices are spread evenly around a circle of radius
    // 0.35, so a shader that never writes the position slot still produces
    // visible geometry.
    let vertex_total = max(draw_ctx.vertex_count, 1u);
    let theta = (f32(vidx) / f32(vertex_total)) * 6.2831853;
    let ring_radius = 0.35;
    registers[OPOS_REG] = vec4<f32>(cos(theta) * ring_radius, sin(theta) * ring_radius, 0.0, 1.0);

    // Fallback colour: a per-draw tint derived from the draw index.
    let draw = f32(draw_ctx.draw_index);
    let phase = vec3<f32>(draw * 0.37, draw * 0.51 + 2.0, draw * 0.73 + 4.0);
    registers[OCOLOR_REG] = vec4<f32>(0.5 + 0.5 * sin(phase), 1.0);

    // Always-zero term that still reads `vertex_buffer`, so the binding stays
    // referenced by this entry point.
    let vb_live = f32(vertex_buffer[0]) * 0.0;

    // Interpret the vertex shader.
    walk_cf_vs();

    let clip_pos = registers[OPOS_REG];
    let tint = registers[OCOLOR_REG];
    return VsOut(clip_pos, vec4<f32>(tint.rgb + vec3<f32>(vb_live), tint.a));
}

// Fragment entry point. Interprets the pixel shader's CF table for its side
// effects, then outputs the colour interpolated from the vertex stage.
@fragment
fn fs_main(in: VsOut) -> FsOut {
    reset_state();
    walk_cf_ps();

    // The kill opcodes only raise `kill_flag`; the discard itself is issued
    // here, once interpretation has finished.
    if kill_flag {
        discard;
    }

    return FsOut(in.color);
}

// ── CF walker per stage. Shared logic: LOOP_START/LOOP_END iterate up to 16×
// from `xenos_consts.loop_consts[loop_id]`, COND_JMP honours the instruction's
// predicate flags, COND_CALL / RETURN set the reject mask (needs a call stack
// we don't have). A hard iteration cap keeps the GPU from hanging on
// malformed or extreme shaders.
// walk_cf_vs leaves its loop once its iteration counter reaches this value.
const CF_WALKER_MAX_ITER: u32 = 4096u;

// Walks the packed vertex-shader control-flow table, dispatching each entry
// by kind until an EXEC_END / EXIT entry is reached, the table is exhausted,
// or CF_WALKER_MAX_ITER entries have been processed.
//
// Loop bookkeeping lives in the module-scope `loop_starts` / `loop_counters`
// / `loop_depth` state. Every LOOP_START is paired with exactly one LOOP_END:
//   * while fewer than 4 frames are live, LOOP_START always pushes a frame,
//     even for a trip count of 0, so the matching LOOP_END retires that frame
//     rather than one belonging to an enclosing loop;
//   * LOOP_STARTs nested deeper than 4 are only counted (`overflow_depth`)
//     so that their LOOP_ENDs are consumed without touching real frames.
// A zero-trip loop still runs its body once (the walker has no skip path).
fn walk_cf_vs() {
    let cf_n = vs_cf_count();
    var cf_i: u32 = 0u;
    var iter: u32 = 0u;
    // Number of currently open LOOP_STARTs that found the frame stack full.
    // They are always the innermost loops, so a plain counter keeps LIFO order.
    var overflow_depth: u32 = 0u;
    loop {
        if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
        iter = iter + 1u;
        let kind = vs_cf_kind(cf_i);
        let primary = vs_cf_primary(cf_i);
        let aux = vs_cf_aux(cf_i);
        var advance: bool = true;
        var stop: bool = false;
        switch kind {
            case CF_KIND_EXEC, CF_KIND_EXEC_END: {
                // aux: low 8 bits = instruction count, remaining bits = sequence.
                let count = aux & 0xFFu;
                let sequence = aux >> 8u;
                exec_vs(primary, count, sequence);
                if kind == CF_KIND_EXEC_END { stop = true; }
            }
            case CF_KIND_ALLOC: { current_alloc = primary; }
            case CF_KIND_EXIT:  { stop = true; }
            case CF_KIND_LOOP_START: {
                let loop_id = aux & 0x1Fu;
                var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu;
                if loop_count > 16u {
                    loop_count = 16u;
                    reject_mask |= REJECT_LOOP_OVERFLOW;
                }
                if loop_depth < 4u {
                    // Push unconditionally: a count of 0 or 1 makes the
                    // matching LOOP_END retire the frame on first arrival.
                    loop_starts[loop_depth] = cf_i;
                    loop_counters[loop_depth] = loop_count;
                    loop_depth = loop_depth + 1u;
                } else {
                    // Frame stack is full: remember the open loop so its
                    // LOOP_END does not pop an outer frame, and flag it.
                    overflow_depth = overflow_depth + 1u;
                    reject_mask |= REJECT_LOOP_OVERFLOW;
                }
            }
            case CF_KIND_LOOP_END: {
                if overflow_depth > 0u {
                    // Closes a loop that never received a frame.
                    overflow_depth = overflow_depth - 1u;
                } else if loop_depth > 0u {
                    let d = loop_depth - 1u;
                    if loop_counters[d] > 1u {
                        // More trips remain: resume just after LOOP_START.
                        loop_counters[d] = loop_counters[d] - 1u;
                        cf_i = loop_starts[d] + 1u;
                        advance = false;
                    } else {
                        loop_counters[d] = 0u;
                        loop_depth = d;
                    }
                }
            }
            case CF_KIND_COND_JMP: {
                // aux bit 0: jump is predicated; bit 1: required predicate value.
                let pred_bits = aux;
                let is_pred  = (pred_bits & 1u) != 0u;
                let pred_cnd = (pred_bits & 2u) != 0u;
                if !is_pred || predicate == pred_cnd {
                    cf_i = primary;
                    advance = false;
                }
            }
            case CF_KIND_COND_CALL, CF_KIND_RETURN: {
                // No call stack — mark and continue.
                reject_mask |= REJECT_CF_CALL;
            }
            default: { reject_mask |= REJECT_CF_JUMP; }
        }
        if stop { break; }
        if advance { cf_i = cf_i + 1u; }
    }
}

// Walks the packed pixel-shader control-flow table, dispatching each entry
// by kind until an EXEC_END / EXIT entry is reached, the table is exhausted,
// or CF_WALKER_MAX_ITER entries have been processed.
//
// Loop bookkeeping lives in the module-scope `loop_starts` / `loop_counters`
// / `loop_depth` state. Every LOOP_START is paired with exactly one LOOP_END:
//   * while fewer than 4 frames are live, LOOP_START always pushes a frame,
//     even for a trip count of 0, so the matching LOOP_END retires that frame
//     rather than one belonging to an enclosing loop;
//   * LOOP_STARTs nested deeper than 4 are only counted (`overflow_depth`)
//     so that their LOOP_ENDs are consumed without touching real frames.
// A zero-trip loop still runs its body once (the walker has no skip path).
fn walk_cf_ps() {
    let cf_n = ps_cf_count();
    var cf_i: u32 = 0u;
    var iter: u32 = 0u;
    // Number of currently open LOOP_STARTs that found the frame stack full.
    // They are always the innermost loops, so a plain counter keeps LIFO order.
    var overflow_depth: u32 = 0u;
    loop {
        if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
        iter = iter + 1u;
        let kind = ps_cf_kind(cf_i);
        let primary = ps_cf_primary(cf_i);
        let aux = ps_cf_aux(cf_i);
        var advance: bool = true;
        var stop: bool = false;
        switch kind {
            case CF_KIND_EXEC, CF_KIND_EXEC_END: {
                // aux: low 8 bits = instruction count, remaining bits = sequence.
                let count = aux & 0xFFu;
                let sequence = aux >> 8u;
                exec_ps(primary, count, sequence);
                if kind == CF_KIND_EXEC_END { stop = true; }
            }
            case CF_KIND_ALLOC: { current_alloc = primary; }
            case CF_KIND_EXIT:  { stop = true; }
            case CF_KIND_LOOP_START: {
                let loop_id = aux & 0x1Fu;
                var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu;
                if loop_count > 16u {
                    loop_count = 16u;
                    reject_mask |= REJECT_LOOP_OVERFLOW;
                }
                if loop_depth < 4u {
                    // Push unconditionally: a count of 0 or 1 makes the
                    // matching LOOP_END retire the frame on first arrival.
                    loop_starts[loop_depth] = cf_i;
                    loop_counters[loop_depth] = loop_count;
                    loop_depth = loop_depth + 1u;
                } else {
                    // Frame stack is full: remember the open loop so its
                    // LOOP_END does not pop an outer frame, and flag it.
                    overflow_depth = overflow_depth + 1u;
                    reject_mask |= REJECT_LOOP_OVERFLOW;
                }
            }
            case CF_KIND_LOOP_END: {
                if overflow_depth > 0u {
                    // Closes a loop that never received a frame.
                    overflow_depth = overflow_depth - 1u;
                } else if loop_depth > 0u {
                    let d = loop_depth - 1u;
                    if loop_counters[d] > 1u {
                        // More trips remain: resume just after LOOP_START.
                        loop_counters[d] = loop_counters[d] - 1u;
                        cf_i = loop_starts[d] + 1u;
                        advance = false;
                    } else {
                        loop_counters[d] = 0u;
                        loop_depth = d;
                    }
                }
            }
            case CF_KIND_COND_JMP: {
                // aux bit 0: jump is predicated; bit 1: required predicate value.
                let pred_bits = aux;
                let is_pred  = (pred_bits & 1u) != 0u;
                let pred_cnd = (pred_bits & 2u) != 0u;
                if !is_pred || predicate == pred_cnd {
                    cf_i = primary;
                    advance = false;
                }
            }
            case CF_KIND_COND_CALL, CF_KIND_RETURN: {
                // No call stack — mark and continue.
                reject_mask |= REJECT_CF_CALL;
            }
            default: { reject_mask |= REJECT_CF_JUMP; }
        }
        if stop { break; }
        if advance { cf_i = cf_i + 1u; }
    }
}