xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)
First real GPU implementation. Ring/PM4 frontend (ring_view,
ring_drain, pm4) drains the command processor; gpu_system owns the
threaded backend (DrainFence RPC + parker/fence helpers from M1) and
the MMIO-mapped register block (mmio_region).
Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode
the Xbox 360 microcode, translator.rs lowers it onto the WGSL
xenos_interp interpreter shader (shaders/xenos_interp.wgsl).
shader_metrics.rs counts decode/translate work.
Render state: draw_state, primitive, render_target_cache,
texture_cache, tiled_address (Xenos's swizzled tiled-memory layout),
xenos_constants (register field constants), edram (the 10 MiB EDRAM
model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve
plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs
owns the typed GPU-resource handles the kernel hands out.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
974
crates/xenia-gpu/src/shaders/xenos_interp.wgsl
Normal file
974
crates/xenia-gpu/src/shaders/xenos_interp.wgsl
Normal file
@@ -0,0 +1,974 @@
|
||||
// xenia-rs Xenos runtime microcode interpreter — P3b WGSL.
|
||||
//
|
||||
// Bindings (stable across P3b milestones):
|
||||
// @group(0) @binding(0) draw_ctx (uniform, 16 B — XenosDrawConstants)
|
||||
// @group(0) @binding(1) xenos_consts (storage<read>, ~9.2 KB — XenosConstants)
|
||||
// @group(0) @binding(2) vs_ucode (storage<read>, packed VS shader)
|
||||
// @group(0) @binding(3) ps_ucode (storage<read>, packed PS shader)
|
||||
// @group(0) @binding(4) vertex_buffer (storage<read>, raw guest VB dwords)
|
||||
//
|
||||
// Packed shader layout (both vs_ucode & ps_ucode):
|
||||
// [0] = cf_count
|
||||
// [1 .. 1 + cf_count*3] = CF table: (kind, primary, aux) × cf_count
|
||||
// [1 + cf_count*3 ..] = instruction triples (3 dwords each)
|
||||
//
|
||||
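// Status (this file): the CF walker, operand decode helpers and register
// file are in place, and ALU, vertex-fetch and placeholder texture-fetch
// interpretation are implemented below. Unsupported ops fall back to default
// values and flag `reject_mask`; the procedural-circle visualisation seeded
// in `vs_main` remains the output whenever a shader never overwrites it.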
|
||||
|
||||
// Per-draw constants (four u32 fields, 16 bytes) bound as `draw_ctx`.
struct XenosDrawConstants {
    // Index of the current draw; `vs_main` uses it to vary the fallback colour.
    draw_index: u32,
    // Vertex count of the draw; `vs_main` uses it to spread the fallback circle.
    vertex_count: u32,
    // Primitive kind. Not read by the visible part of this shader.
    prim_kind: u32,
    // Explicit padding that rounds the struct up to 16 bytes.
    _pad: u32,
};
|
||||
|
||||
// Guest constant block exposed to the interpreter through `xenos_consts`.
struct XenosConstants {
    // 512 float4 ALU constants.
    alu: array<vec4<f32>, 512>,
    // Fetch-constant dwords; vertex fetch reads the dword at `fetch_const * 2`.
    fetch: array<u32, 256>,
    // Boolean constants, 8 dwords (presumably one bit per constant — not read
    // by the visible part of this shader; confirm against the host packer).
    bool_consts: array<u32, 8>,
    // Loop constants; the CF walker uses the low 8 bits as the iteration count.
    loop_consts: array<u32, 32>,
};
|
||||
|
||||
// ── Group 0: per-draw state, guest constants, packed shaders, vertex data.

@group(0) @binding(0)
var<uniform> draw_ctx : XenosDrawConstants;

// `xenos_consts` lives in read-only storage rather than the uniform address
// space: the block contains tightly packed `array<u32, N>` members, and the
// uniform layout rules require a 16-byte array element stride, which would
// pad every u32 element out to 16 bytes. Storage accepts the natural 4-byte
// stride.
@group(0) @binding(1)
var<storage, read> xenos_consts : XenosConstants;

// Packed vertex / pixel shader microcode (layout described in the file header).
@group(0) @binding(2)
var<storage, read> vs_ucode : array<u32>;

@group(0) @binding(3)
var<storage, read> ps_ucode : array<u32>;

// Raw guest vertex-buffer dwords.
@group(0) @binding(4)
var<storage, read> vertex_buffer : array<u32>;

// ── Group 1: M6 texture-fetch stub. A single 1×1 magenta placeholder stands
// in for every texture slot; the P5 texture cache will replace this with
// per-slot bindings.

@group(1) @binding(0)
var xenos_tex : texture_2d<f32>;

@group(1) @binding(1)
var xenos_samp : sampler;
|
||||
|
||||
// ── Control-flow clause kinds stored in the low byte of each CF entry's first
// dword. Must stay in sync with `xenia_gpu::ucode::cf_kind`.
const CF_KIND_EXEC: u32 = 0u;
const CF_KIND_EXEC_END: u32 = 1u;
const CF_KIND_ALLOC: u32 = 2u;
const CF_KIND_EXIT: u32 = 3u;
const CF_KIND_LOOP_START: u32 = 4u;
const CF_KIND_LOOP_END: u32 = 5u;
const CF_KIND_COND_JMP: u32 = 6u;
const CF_KIND_COND_CALL: u32 = 7u;
const CF_KIND_RETURN: u32 = 8u;
const CF_KIND_UNKNOWN: u32 = 15u;

// ── Export allocation kinds carried by Alloc clauses. Mirrors
// `xenia_gpu::ucode::cf_alloc_kind`.
const ALLOC_KIND_POSITION: u32 = 0u;
const ALLOC_KIND_INTERPOLATORS: u32 = 1u;
const ALLOC_KIND_COLORS: u32 = 2u;
|
||||
|
||||
// ── Per-invocation interpreter state (all `private`, reset by `reset_state`).

// 128-entry float4 register file.
var<private> registers: array<vec4<f32>, 128>;
// Running scalar-pipe result (`ps`), chained between ALU instructions.
var<private> ps: f32;
// Predicate bit written by the SETP_* scalar ops.
var<private> predicate: bool;

// Export alloc kind most recently selected by an Alloc clause.
var<private> current_alloc: u32;

// P3c additions.
//
// `kill_flag`: raised by the vector/scalar kill ops; the fragment entry point
// issues `discard` after the interpreter finishes. (`discard` inside a helper
// function is allowed in WGSL, but keeping it at the entry level makes the
// control flow easier to read.)
var<private> kill_flag: bool;

// Loop bookkeeping for LoopStart/LoopEnd clauses; Xenos supports up to four
// nested loops. `loop_depth` is the number of active loops, `loop_counters`
// the remaining iterations per level, and `loop_starts` the CF index of the
// LoopStart clause per level.
var<private> loop_depth: u32;
var<private> loop_counters: array<u32, 4>;
var<private> loop_starts: array<u32, 4>;

// Bitfield of op categories the interpreter could not handle, so the PS
// fallback colour and host-side diagnostics can surface them.
var<private> reject_mask: u32;

// Bits of `reject_mask`.
const REJECT_ALU_VEC: u32 = 1u;
const REJECT_ALU_SCL: u32 = 2u;
const REJECT_TEX_NON2D: u32 = 4u;
const REJECT_VFETCH_FMT: u32 = 8u;
const REJECT_CF_JUMP: u32 = 16u;
const REJECT_CF_CALL: u32 = 32u;
const REJECT_LOOP_OVERFLOW: u32 = 64u;
|
||||
|
||||
// Vertex-stage output, also consumed as the fragment-stage input.
struct VsOut {
    // Clip-space position.
    @builtin(position) position: vec4<f32>,
    // Colour carried to the fragment stage at location 0.
    @location(0) color: vec4<f32>,
};
|
||||
|
||||
// Fragment-stage output: a single colour attachment at location 0.
struct FsOut {
    @location(0) color0: vec4<f32>,
};
|
||||
|
||||
// ── CF table accessors for the packed ucode buffers.
//
// Dword 0 of each buffer is the CF entry count. Entry `i` occupies the three
// dwords starting at `1 + 3 * i`, in the order (kind, primary, aux); only the
// low 8 bits of the kind dword are used. Instruction triples start right
// after the table.

fn vs_cf_count() -> u32 {
    return vs_ucode[0];
}

fn ps_cf_count() -> u32 {
    return ps_ucode[0];
}

fn vs_cf_kind(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return vs_ucode[entry] & 0xFFu;
}

fn vs_cf_primary(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return vs_ucode[entry + 1u];
}

fn vs_cf_aux(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return vs_ucode[entry + 2u];
}

fn ps_cf_kind(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return ps_ucode[entry] & 0xFFu;
}

fn ps_cf_primary(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return ps_ucode[entry + 1u];
}

fn ps_cf_aux(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return ps_ucode[entry + 2u];
}

// Index of the first instruction dword (just past the CF table).
fn vs_instr_base() -> u32 {
    return 1u + 3u * vs_cf_count();
}

fn ps_instr_base() -> u32 {
    return 1u + 3u * ps_cf_count();
}
|
||||
|
||||
// Returns dword `which` (0..2) of the VS instruction triple at index `t`,
// counted from the end of the CF table.
fn vs_instr_dword(t: u32, which: u32) -> u32 {
    let triple_start = vs_instr_base() + t * 3u;
    return vs_ucode[triple_start + which];
}
|
||||
// Returns dword `which` (0..2) of the PS instruction triple at index `t`,
// counted from the end of the CF table.
fn ps_instr_dword(t: u32, which: u32) -> u32 {
    let triple_start = ps_instr_base() + t * 3u;
    return ps_ucode[triple_start + which];
}
|
||||
|
||||
// ── Operand helpers: M3 scaffold; M4 wires them into interpret_alu.
|
||||
|
||||
// Rearranges the lanes of `v`. `swizzle` holds four 2-bit source-lane
// selectors, lowest bits first: bits [1:0] pick the source for .x, [3:2] for
// .y, [5:4] for .z and [7:6] for .w. Bits above 7 are ignored.
fn apply_swizzle(v: vec4<f32>, swizzle: u32) -> vec4<f32> {
    return vec4<f32>(
        v[swizzle & 3u],
        v[(swizzle >> 2u) & 3u],
        v[(swizzle >> 4u) & 3u],
        v[(swizzle >> 6u) & 3u],
    );
}
|
||||
|
||||
// Applies the optional source modifiers to `v`: absolute value first (when
// `take_abs`), then negation (when `negate`), so both together yield -|v|.
fn apply_modifiers(v: vec4<f32>, negate: bool, take_abs: bool) -> vec4<f32> {
    let magnitude = select(v, abs(v), take_abs);
    return select(magnitude, -magnitude, negate);
}
|
||||
|
||||
// Writes `value` into `registers[idx]`, touching only the lanes enabled in
// `mask` (bit 0 = x, bit 1 = y, bit 2 = z, bit 3 = w). Indices of 128 or
// more are ignored.
fn write_reg_masked(idx: u32, mask: u32, value: vec4<f32>) {
    if idx < 128u {
        let lane_bits = vec4<u32>(1u, 2u, 4u, 8u);
        let lane_enabled = (vec4<u32>(mask) & lane_bits) != vec4<u32>(0u);
        registers[idx] = select(registers[idx], value, lane_enabled);
    }
}
|
||||
|
||||
// ── Xenos ALU opcodes.
//
// Per the original author's note these values match canary's
// `AluVectorOpcode` / `AluScalarOpcode` enums (`ucode.h:1001,1354`) and are
// mirrored in `xenia_gpu::ucode::alu::{vop,sop}`. Gaps in the numbering are
// opcodes this interpreter does not name.

// Vector pipe.
const VOP_ADD: u32 = 0u;
const VOP_MUL: u32 = 1u;
const VOP_MAX: u32 = 2u;
const VOP_MIN: u32 = 3u;
const VOP_SEQ: u32 = 4u;
const VOP_SGT: u32 = 5u;
const VOP_SGE: u32 = 6u;
const VOP_SNE: u32 = 7u;
const VOP_FRC: u32 = 8u;
const VOP_TRUNC: u32 = 9u;
const VOP_FLOOR: u32 = 10u;
const VOP_MAD: u32 = 11u;
const VOP_CND_EQ: u32 = 12u;
const VOP_CND_GE: u32 = 13u;
const VOP_CND_GT: u32 = 14u;
const VOP_DP4: u32 = 15u;
const VOP_DP3: u32 = 16u;
const VOP_DP2_ADD: u32 = 17u;
const VOP_CUBE: u32 = 18u;
const VOP_MAX4: u32 = 19u;
const VOP_KILL_EQ: u32 = 24u;
const VOP_KILL_GT: u32 = 25u;
const VOP_KILL_GE: u32 = 26u;
const VOP_KILL_NE: u32 = 27u;
const VOP_DST: u32 = 28u;
const VOP_MAX_A: u32 = 29u;

// Scalar pipe.
const SOP_ADDS: u32 = 0u;
const SOP_ADDS_PREV: u32 = 1u;
const SOP_MULS: u32 = 2u;
const SOP_MULS_PREV: u32 = 3u;
const SOP_MAXS: u32 = 5u;
const SOP_MINS: u32 = 6u;
const SOP_SEQS: u32 = 7u;
const SOP_SGTS: u32 = 8u;
const SOP_SGES: u32 = 9u;
const SOP_SNES: u32 = 10u;
const SOP_FRCS: u32 = 11u;
const SOP_TRUNCS: u32 = 12u;
const SOP_FLOORS: u32 = 13u;
const SOP_EXP: u32 = 14u;
const SOP_LOGC: u32 = 15u;
const SOP_LOG: u32 = 16u;
const SOP_RCPC: u32 = 17u;
const SOP_RCPF: u32 = 18u;
const SOP_RCP: u32 = 19u;
const SOP_RSQC: u32 = 20u;
const SOP_RSQF: u32 = 21u;
const SOP_RSQ: u32 = 22u;
const SOP_SUBS: u32 = 25u;
const SOP_SUBS_PREV: u32 = 26u;
const SOP_SETP_EQ: u32 = 27u;
const SOP_SETP_NE: u32 = 28u;
const SOP_SETP_GT: u32 = 29u;
const SOP_SETP_GE: u32 = 30u;
const SOP_SETP_INV: u32 = 31u;
const SOP_SETP_POP: u32 = 32u;
const SOP_SETP_CLR: u32 = 33u;
const SOP_SETP_RSTR: u32 = 34u;
const SOP_KILLS_EQ: u32 = 35u;
const SOP_KILLS_GT: u32 = 36u;
const SOP_KILLS_GE: u32 = 37u;
const SOP_KILLS_NE: u32 = 38u;
const SOP_KILLS_ONE: u32 = 39u;
const SOP_SQRT: u32 = 40u;
const SOP_SIN: u32 = 48u;
const SOP_COS: u32 = 49u;
const SOP_RETAIN_PREV: u32 = 50u;
|
||||
|
||||
// Fetches a float4 ALU source operand. For the MVP the operand index is
// treated as a plain r# reference: only the low 7 bits are used, and the c#
// selector, swizzle and modifiers are ignored. M4+ is expected to decode the
// full operand header here.
fn read_src(idx: u32) -> vec4<f32> {
    let reg_index = idx & 0x7Fu;
    return registers[reg_index];
}
|
||||
|
||||
// Executes one vector-pipe ALU opcode on the sources `a`, `b`, `c` and
// returns the float4 result.
//
// Side effects: the KILL_* ops raise `kill_flag` when any lane satisfies the
// comparison; CUBE, MAX_A and unrecognised opcodes set `REJECT_ALU_VEC` in
// `reject_mask`.
fn exec_vector_op(op: u32, a: vec4<f32>, b: vec4<f32>, c: vec4<f32>) -> vec4<f32> {
    let zeros = vec4<f32>(0.0);
    let ones = vec4<f32>(1.0);
    switch op {
        case VOP_ADD: { return a + b; }
        case VOP_MUL: { return a * b; }
        case VOP_MAX: { return max(a, b); }
        case VOP_MIN: { return min(a, b); }

        // Per-lane "set on compare": 1.0 where the comparison holds, else 0.0.
        case VOP_SEQ: { return select(zeros, ones, a == b); }
        case VOP_SGT: { return select(zeros, ones, a > b); }
        case VOP_SGE: { return select(zeros, ones, a >= b); }
        case VOP_SNE: { return select(zeros, ones, a != b); }

        case VOP_FRC: { return fract(a); }
        case VOP_TRUNC: { return trunc(a); }
        case VOP_FLOOR: { return floor(a); }
        case VOP_MAD: { return a * b + c; }

        // Per-lane conditional move: dst = (src0 <cmp> 0) ? src1 : src2.
        case VOP_CND_EQ: { return select(c, b, a == zeros); }
        case VOP_CND_GE: { return select(c, b, a >= zeros); }
        case VOP_CND_GT: { return select(c, b, a > zeros); }

        // Dot products broadcast their scalar result to every lane.
        case VOP_DP4: { return vec4<f32>(dot(a, b)); }
        case VOP_DP3: { return vec4<f32>(dot(a.xyz, b.xyz)); }
        case VOP_DP2_ADD: { return vec4<f32>(a.x * b.x + a.y * b.y + c.x); }

        // Largest of the four lanes of src0, broadcast.
        case VOP_MAX4: { return vec4<f32>(max(max(a.x, a.y), max(a.z, a.w))); }

        // Kill ops: raise `kill_flag` and return all-ones when ANY lane
        // satisfies the comparison, otherwise return all-zeros.
        case VOP_KILL_EQ: {
            let hit = any(a == b);
            kill_flag = kill_flag || hit;
            return select(zeros, ones, hit);
        }
        case VOP_KILL_GT: {
            let hit = any(a > b);
            kill_flag = kill_flag || hit;
            return select(zeros, ones, hit);
        }
        case VOP_KILL_GE: {
            let hit = any(a >= b);
            kill_flag = kill_flag || hit;
            return select(zeros, ones, hit);
        }
        case VOP_KILL_NE: {
            let hit = any(a != b);
            kill_flag = kill_flag || hit;
            return select(zeros, ones, hit);
        }

        // dest = (1, src0.y * src1.y, src0.z, src1.w)
        case VOP_DST: { return vec4<f32>(1.0, a.y * b.y, a.z, b.w); }

        // Cube-face projection and MAX + address-register load are rare in
        // P3c's target set. Both are flagged as rejected and approximated by
        // a plain per-lane max().
        case VOP_CUBE, VOP_MAX_A: {
            reject_mask |= REJECT_ALU_VEC;
            return max(a, b);
        }

        // Anything else is unsupported: flag it and produce zeros.
        default: {
            reject_mask |= REJECT_ALU_VEC;
            return zeros;
        }
    }
}
|
||||
|
||||
// Executes one scalar-pipe ALU opcode and returns the new `ps` value.
//
//   `src_a`, `src_b` — scalar source components. Most ops read only `src_a`;
//                      ADDS / MULS / SUBS / MAXS / MINS read both.
//   `prev`           — current value of the running `ps` chain, read by the
//                      *_PREV ops and RETAIN_PREV.
//
// Side effects: SETP_* ops write `predicate`, KILLS_* ops may raise
// `kill_flag`, and unrecognised opcodes set `REJECT_ALU_SCL` in `reject_mask`.
fn exec_scalar_op(op: u32, src_a: f32, src_b: f32, prev: f32) -> f32 {
    // Comparisons of the primary source against zero, shared by the
    // set / setp / kills families.
    let is_zero = src_a == 0.0;
    let is_nonzero = src_a != 0.0;
    let is_pos = src_a > 0.0;
    let is_nonneg = src_a >= 0.0;

    switch op {
        case SOP_ADDS: { return src_a + src_b; }
        case SOP_ADDS_PREV: { return src_a + prev; }
        case SOP_MULS: { return src_a * src_b; }
        case SOP_MULS_PREV: { return src_a * prev; }
        case SOP_SUBS: { return src_a - src_b; }
        case SOP_SUBS_PREV: { return src_a - prev; }
        case SOP_MAXS: { return max(src_a, src_b); }
        case SOP_MINS: { return min(src_a, src_b); }

        // Set-on-compare against zero: 1.0 when the test holds, else 0.0.
        case SOP_SEQS: { return select(0.0, 1.0, is_zero); }
        case SOP_SGTS: { return select(0.0, 1.0, is_pos); }
        case SOP_SGES: { return select(0.0, 1.0, is_nonneg); }
        case SOP_SNES: { return select(0.0, 1.0, is_nonzero); }

        case SOP_FRCS: { return fract(src_a); }
        case SOP_TRUNCS: { return trunc(src_a); }
        case SOP_FLOORS: { return floor(src_a); }

        // 2^src. exp2(0) is already 1.0, so no special case is needed.
        case SOP_EXP: { return exp2(src_a); }

        // LOG and LOGC are both computed as log2. An input of exactly 1.0 is
        // pinned to 0.0; non-positive inputs are not special-cased, and the
        // -INF clamp the original comment attributes to canary's LOGC is not
        // implemented.
        case SOP_LOG, SOP_LOGC: {
            if src_a == 1.0 {
                return 0.0;
            }
            return log2(src_a);
        }

        // Reciprocal with a zero guard (0 in, 0 out). The clamped variants
        // are not distinguished.
        case SOP_RCP, SOP_RCPC, SOP_RCPF: {
            if is_nonzero {
                return 1.0 / src_a;
            }
            return 0.0;
        }

        // Reciprocal square root; inputs that are not strictly positive give 0.
        case SOP_RSQ, SOP_RSQC, SOP_RSQF: {
            if is_pos {
                return inverseSqrt(src_a);
            }
            return 0.0;
        }

        // Square root; inputs that fail `src_a >= 0` give 0.
        case SOP_SQRT: {
            if is_nonneg {
                return sqrt(src_a);
            }
            return 0.0;
        }

        case SOP_SIN: { return sin(src_a); }
        case SOP_COS: { return cos(src_a); }

        // Predicate writes: update `predicate` and also yield a value that
        // the surrounding ALU slot can consume via `ps`. The comparison ops
        // return 0.0 when the predicate becomes true and 1.0 otherwise.
        case SOP_SETP_EQ: {
            predicate = is_zero;
            return select(1.0, 0.0, is_zero);
        }
        case SOP_SETP_NE: {
            predicate = is_nonzero;
            return select(1.0, 0.0, is_nonzero);
        }
        case SOP_SETP_GT: {
            predicate = is_pos;
            return select(1.0, 0.0, is_pos);
        }
        case SOP_SETP_GE: {
            predicate = is_nonneg;
            return select(1.0, 0.0, is_nonneg);
        }

        // predicate = (src == 1). Result is 0 when set; otherwise the source
        // is passed through, with 0 mapped to 1.
        case SOP_SETP_INV: {
            let is_one = src_a == 1.0;
            predicate = is_one;
            if is_one {
                return 0.0;
            }
            return select(src_a, 1.0, is_zero);
        }

        // Decrement: predicate = (src - 1 <= 0). Result is 0 when set,
        // otherwise the decremented value.
        case SOP_SETP_POP: {
            let popped = src_a - 1.0;
            let done = popped <= 0.0;
            predicate = done;
            return select(popped, 0.0, done);
        }

        // Clear the predicate and return the largest finite f32 (bit pattern
        // 0x7F7FFFFF) as a sentinel.
        case SOP_SETP_CLR: {
            predicate = false;
            return bitcast<f32>(0x7F7FFFFFu);
        }

        // predicate = (src == 0). Result is 0 when set, otherwise the source.
        case SOP_SETP_RSTR: {
            predicate = is_zero;
            return select(src_a, 0.0, is_zero);
        }

        // Pixel kills: raise `kill_flag` and return 1.0 when the test holds,
        // else return 0.0. The actual `discard` happens at fragment exit.
        case SOP_KILLS_EQ: {
            if is_zero { kill_flag = true; }
            return select(0.0, 1.0, is_zero);
        }
        case SOP_KILLS_GT: {
            if is_pos { kill_flag = true; }
            return select(0.0, 1.0, is_pos);
        }
        case SOP_KILLS_GE: {
            if is_nonneg { kill_flag = true; }
            return select(0.0, 1.0, is_nonneg);
        }
        case SOP_KILLS_NE: {
            if is_nonzero { kill_flag = true; }
            return select(0.0, 1.0, is_nonzero);
        }
        case SOP_KILLS_ONE: {
            let is_one = src_a == 1.0;
            if is_one { kill_flag = true; }
            return select(0.0, 1.0, is_one);
        }

        case SOP_RETAIN_PREV: { return prev; }

        // Unsupported scalar opcode: flag it and produce 0.
        default: {
            reject_mask |= REJECT_ALU_SCL;
            return 0.0;
        }
    }
}
|
||||
|
||||
// Interprets the ALU instruction triple at index `t` of the VS (`is_vertex`)
// or PS microcode: runs the vector op, then the scalar op, and performs the
// masked destination writes. The whole instruction is skipped when it is
// predicated and `predicate` does not equal the required condition.
//
// Fields decoded here (stated to match `xenia_gpu::ucode::alu::decode_alu`):
//   w0 [7:0] / [15:8] / [23:16]  source a / b / c register selectors
//   w0 [26]                      scalar pipe reads `ps` instead of a.x
//   w0 [27], [28]                predicated flag, required predicate value
//   w2 [5:0]                     vector opcode
//   w2 [11:6]                    scalar opcode
//   w2 [11:8]                    scalar write mask
//   w2 [15:12]                   vector write mask
//   w2 [22:16], [30:24]          vector / scalar destination register
//
// NOTE(review): the scalar write mask [11:8] overlaps the scalar opcode
// [11:6], so one of those two extractions is probably wrong — confirm
// against the host-side packer before relying on scalar writes.
fn interpret_alu(t: u32, is_vertex: bool) {
    // Pull the three instruction dwords from the stage's ucode buffer.
    var words: array<u32, 3>;
    for (var k: u32 = 0u; k < 3u; k = k + 1u) {
        if is_vertex {
            words[k] = vs_instr_dword(t, k);
        } else {
            words[k] = ps_instr_dword(t, k);
        }
    }
    let ctrl = words[0];
    let ops = words[2];
    // Dword 1 carries per-operand swizzle / negate / abs / c-vs-r flags. The
    // MVP treats every source as an unmodified r#, so it is read and dropped.
    _ = words[1];

    // Per-instruction predication.
    let pred_enabled = (ctrl & (1u << 27u)) != 0u;
    let pred_wanted = (ctrl & (1u << 28u)) != 0u;
    if pred_enabled && (predicate != pred_wanted) {
        return;
    }

    // Sources are all read before any destination write, so the scalar pipe
    // sees the pre-instruction register values.
    let a = read_src(ctrl & 0xFFu);
    let b = read_src((ctrl >> 8u) & 0xFFu);
    let c = read_src((ctrl >> 16u) & 0xFFu);

    // Vector pipe. The op always executes (it may have side effects); the
    // register write only happens for a non-empty mask.
    let vec_mask = (ops >> 12u) & 0xFu;
    let vec_value = exec_vector_op(ops & 0x3Fu, a, b, c);
    if vec_mask != 0u {
        write_reg_masked((ops >> 16u) & 0x7Fu, vec_mask, vec_value);
    }

    // Scalar pipe. The primary operand is a.x, or the running `ps` when bit
    // 26 of dword 0 is set; the secondary operand is b.x. The result always
    // becomes the new `ps`.
    var scalar_in = a.x;
    if (ctrl & (1u << 26u)) != 0u {
        scalar_in = ps;
    }
    ps = exec_scalar_op((ops >> 6u) & 0x3Fu, scalar_in, b.x, ps);

    let scl_mask = (ops >> 8u) & 0xFu;
    if scl_mask != 0u {
        // The scalar result is broadcast to every enabled lane.
        write_reg_masked((ops >> 24u) & 0x7Fu, scl_mask, vec4<f32>(ps));
    }
}
|
||||
// Xenos `VertexFormat` codes (per the original note, from `xenos.h:641`).
// Not every format listed here has a decode path in `interpret_vertex_fetch`.
const VFMT_8_8_8_8: u32 = 6u;
const VFMT_2_10_10_10: u32 = 7u;
const VFMT_10_11_11: u32 = 16u;
const VFMT_11_11_10: u32 = 17u;
const VFMT_16_16: u32 = 25u;
const VFMT_16_16_16_16: u32 = 26u;
const VFMT_16_16_FLOAT: u32 = 31u;
const VFMT_16_16_16_16_FLOAT: u32 = 32u;
const VFMT_32: u32 = 33u;
const VFMT_32_32: u32 = 34u;
const VFMT_32_32_32_32: u32 = 35u;
const VFMT_32_FLOAT: u32 = 36u;
const VFMT_32_32_FLOAT: u32 = 37u;
const VFMT_32_32_32_32_FLOAT: u32 = 38u;
const VFMT_32_32_32_FLOAT: u32 = 57u;
|
||||
|
||||
// Interprets the vertex-fetch instruction triple at VS index `t`: loads one
// element from `vertex_buffer`, converts it to a float4 and stores it in the
// destination register.
//
// Fields extracted by this function:
//   w0 [9:5]    fetch-constant index
//   w0 [16:10]  destination register
//   w0 [23:17]  source register (its .x lane holds the vertex index)
//   w1 [21:16]  vertex format (`VFMT_*`)
//   w2 [7:0]    stride, in dwords (0 is treated as 4)
//
// NOTE(review): the previous header comment cited canary's
// `VertexFetchInstruction` (`ucode.h:690`) with src_reg at [10:5] and
// const_index further up, which does not match the shifts used here. Confirm
// whether the packed triples are raw guest ucode or re-encoded by the
// translator, and fix whichever side is wrong.
//
// Pragmatic subset: only the format and stride are decoded. The offset,
// exp_adjust, destination swizzle, sign / normalisation flags and the fetch
// constant's endian field are ignored; each format uses one fixed
// interpretation (see the cases below). Formats without a decode path set
// `REJECT_VFETCH_FMT` and yield (0, 0, 0, 1). A fetch that would read outside
// `vertex_buffer` also yields (0, 0, 0, 1).
fn interpret_vertex_fetch(t: u32) {
    let w0 = vs_instr_dword(t, 0u);
    let w1 = vs_instr_dword(t, 1u);
    let w2 = vs_instr_dword(t, 2u);
    let fetch_const = (w0 >> 5u) & 0x1Fu;
    let dst_reg = (w0 >> 10u) & 0x7Fu;
    let src_reg = (w0 >> 17u) & 0x7Fu;
    let format = (w1 >> 16u) & 0x3Fu;
    let stride = w2 & 0xFFu;

    // Vertex-fetch constant dword 0 carries (type[1:0], address[31:2]);
    // dropping the two type bits leaves the base as a dword index.
    let fc0 = xenos_consts.fetch[fetch_const * 2u];
    let base_dwords = fc0 >> 2u;

    let vidx = u32(registers[src_reg].x);
    // Per-vertex offset in dwords; a stride of 0 falls back to 4 dwords.
    let effective_stride = select(stride, 4u, stride == 0u);
    let addr = base_dwords + vidx * effective_stride;

    // Overflow-safe bounds handling. `addr` is only trusted when the
    // multiply-add above cannot have wrapped, and `avail` is the number of
    // dwords readable starting at `addr` (0 when out of range). Comparing
    // `avail` instead of `addr + k < n` avoids a second wrap near 2^32.
    let n = arrayLength(&vertex_buffer);
    let max_vidx = (0xFFFFFFFFu - base_dwords) / effective_stride;
    let addr_valid = (vidx <= max_vidx) && (addr < n);
    let avail = select(0u, n - addr, addr_valid);

    var result: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);

    switch format {
        case VFMT_32_32_32_32_FLOAT: {
            if avail >= 4u {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    bitcast<f32>(vertex_buffer[addr + 2u]),
                    bitcast<f32>(vertex_buffer[addr + 3u]),
                );
            }
        }
        case VFMT_32_32_32_FLOAT: {
            if avail >= 3u {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    bitcast<f32>(vertex_buffer[addr + 2u]),
                    1.0,
                );
            }
        }
        case VFMT_32_32_FLOAT: {
            if avail >= 2u {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    0.0,
                    1.0,
                );
            }
        }
        case VFMT_32_FLOAT: {
            if avail >= 1u {
                result = vec4<f32>(bitcast<f32>(vertex_buffer[addr]), 0.0, 0.0, 1.0);
            }
        }
        case VFMT_8_8_8_8: {
            // Always decoded as unsigned normalised.
            if avail >= 1u {
                result = unpack4x8unorm(vertex_buffer[addr]);
            }
        }
        case VFMT_16_16_FLOAT: {
            if avail >= 1u {
                let h = unpack2x16float(vertex_buffer[addr]);
                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
            }
        }
        case VFMT_16_16_16_16_FLOAT: {
            if avail >= 2u {
                let h0 = unpack2x16float(vertex_buffer[addr]);
                let h1 = unpack2x16float(vertex_buffer[addr + 1u]);
                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
            }
        }
        case VFMT_16_16: {
            // Always decoded as signed normalised; the unsigned variants are
            // not distinguished.
            if avail >= 1u {
                let h = unpack2x16snorm(vertex_buffer[addr]);
                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
            }
        }
        case VFMT_16_16_16_16: {
            if avail >= 2u {
                let h0 = unpack2x16snorm(vertex_buffer[addr]);
                let h1 = unpack2x16snorm(vertex_buffer[addr + 1u]);
                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
            }
        }
        case VFMT_2_10_10_10: {
            // Three 10-bit channels in the low bits plus a 2-bit channel on
            // top, all decoded as unsigned normalised.
            if avail >= 1u {
                let packed = vertex_buffer[addr];
                let r = f32(packed & 0x3FFu) / 1023.0;
                let g = f32((packed >> 10u) & 0x3FFu) / 1023.0;
                let b = f32((packed >> 20u) & 0x3FFu) / 1023.0;
                let a = f32((packed >> 30u) & 0x3u) / 3.0;
                result = vec4<f32>(r, g, b, a);
            }
        }
        default: {
            // No decode path for this format: flag it and keep the default
            // (0, 0, 0, 1) result.
            reject_mask |= REJECT_VFETCH_FMT;
        }
    }

    registers[dst_reg] = result;
}
|
||||
|
||||
// M6 — minimum-viable texture fetch for the triple at index `t`.
//
// Every fetch samples the single placeholder texture bound at group(1); the
// real per-slot texture cache lands with P5. The (u, v) coordinates come from
// the .xy lanes of the source register (w0 bits [23:17]) and the sample
// overwrites the whole destination register (w0 bits [16:10]).
// `textureSampleLevel` needs no implicit derivatives, so the same code serves
// both the vertex and fragment stages.
fn interpret_texture_fetch(t: u32, is_vertex: bool) {
    var header: u32 = 0u;
    if is_vertex {
        header = vs_instr_dword(t, 0u);
    } else {
        header = ps_instr_dword(t, 0u);
    }

    let dst_index = (header >> 10u) & 0x7Fu;
    let src_index = (header >> 17u) & 0x7Fu;

    let coords = registers[src_index].xy;
    registers[dst_index] = textureSampleLevel(xenos_tex, xenos_samp, coords, 0.0);
}
|
||||
|
||||
// Runs the `count` instruction triples of a VS Exec clause, starting at
// triple index `address`.
//
// `sequence` is a bitmap with two bits per triple: bit 0 of each pair is the
// serialize flag (ignored in the MVP) and bit 1 marks the triple as a fetch.
// Triples that lie beyond the 32-bit bitmap are treated as ALU instructions.
// Fetch opcode 0 is a vertex fetch and 1 a texture fetch; any other fetch
// opcode is skipped.
fn exec_vs(address: u32, count: u32, sequence: u32) {
    for (var i: u32 = 0u; i < count; i = i + 1u) {
        let t = address + i;
        // WGSL reduces a dynamic shift amount modulo the bit width, so
        // without this guard triple 16 and later would re-read the flags of
        // earlier triples.
        let shift = i * 2u + 1u;
        let is_fetch = (shift < 32u) && (((sequence >> shift) & 1u) != 0u);
        if is_fetch {
            let opcode = vs_instr_dword(t, 0u) & 0x1Fu;
            if opcode == 0u {
                interpret_vertex_fetch(t);
            } else if opcode == 1u {
                interpret_texture_fetch(t, true);
            }
        } else {
            interpret_alu(t, true);
        }
    }
}
|
||||
// Runs the `count` instruction triples of a PS Exec clause, starting at
// triple index `address`.
//
// `sequence` uses the same two-bits-per-triple bitmap as `exec_vs` (bit 1 of
// each pair marks a fetch); triples beyond the 32-bit bitmap are treated as
// ALU instructions. As in `exec_vs`, only fetch opcode 1 (texture fetch) is
// interpreted; other fetch opcodes are skipped instead of being run as a
// texture sample that would clobber the destination register.
fn exec_ps(address: u32, count: u32, sequence: u32) {
    for (var i: u32 = 0u; i < count; i = i + 1u) {
        let t = address + i;
        // WGSL reduces a dynamic shift amount modulo the bit width, so
        // without this guard triple 16 and later would re-read the flags of
        // earlier triples.
        let shift = i * 2u + 1u;
        let is_fetch = (shift < 32u) && (((sequence >> shift) & 1u) != 0u);
        if is_fetch {
            let opcode = ps_instr_dword(t, 0u) & 0x1Fu;
            if opcode == 1u {
                interpret_texture_fetch(t, false);
            }
        } else {
            interpret_alu(t, false);
        }
    }
}
|
||||
|
||||
// Puts every piece of per-invocation interpreter state back to its baseline:
// all registers and loop slots zeroed, `ps` = 0, predicate / kill flag clear,
// no active loops, no rejects, alloc kind 0.
fn reset_state() {
    // Zero-value constructors clear the arrays in one assignment each.
    registers = array<vec4<f32>, 128>();
    loop_counters = array<u32, 4>();
    loop_starts = array<u32, 4>();

    ps = 0.0;
    predicate = false;
    kill_flag = false;
    current_alloc = 0u;
    loop_depth = 0u;
    reject_mask = 0u;
}
|
||||
|
||||
// ── Stage entry points.
|
||||
|
||||
// M7 export slots. The interpreter reserves two ordinary register-file
// entries as synthetic export targets: `vs_main` seeds both with the
// procedural fallback before walking the CF table and reads them back
// afterwards as the stage outputs. By convention Xenos export 0 inside an
// `Alloc(Position)` range lands in registers[32]; the active alloc kind is
// tracked in `current_alloc`.
const OPOS_REG: u32 = 32u;   // synthetic oPos slot
const OCOLOR_REG: u32 = 33u; // colour-0 scratch slot
|
||||
|
||||
// Vertex entry point: resets the interpreter, seeds the register file, walks
// the VS control-flow table and emits whatever ends up in the export slots.
@vertex
fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {
    reset_state();

    // r0.x carries the vertex index so simple shaders (and the procedural
    // fallback) can use it without a real vertex fetch.
    let vertex_f = f32(vidx);
    registers[0] = vec4<f32>(vertex_f, 0.0, 0.0, 1.0);

    // Procedural fallback for the export slots. If the shader never writes
    // oPos / oColor, vertices are spread around a circle of radius 0.35
    // rather than collapsing to (0, 0), which would skip rasterisation.
    let total = max(draw_ctx.vertex_count, 1u);
    let angle = (vertex_f / f32(total)) * 6.2831853;
    let radius = 0.35;
    registers[OPOS_REG] = vec4<f32>(cos(angle) * radius, sin(angle) * radius, 0.0, 1.0);

    // The fallback colour varies with the draw index so draws can be told apart.
    let d = f32(draw_ctx.draw_index);
    registers[OCOLOR_REG] = vec4<f32>(
        0.5 + 0.5 * sin(d * 0.37),
        0.5 + 0.5 * sin(d * 0.51 + 2.0),
        0.5 + 0.5 * sin(d * 0.73 + 4.0),
        1.0,
    );

    // Dead-binding guard: reads `vertex_buffer` and multiplies by zero, so
    // the vertex-stage-only binding stays referenced without affecting output.
    let vb_live = f32(vertex_buffer[0]) * 0.0;

    // Interpret the shader.
    walk_cf_vs();

    // Emit the export slots as left by the interpreter, or by the seeding
    // above when the shader never wrote them.
    let exported_color = registers[OCOLOR_REG];
    return VsOut(
        registers[OPOS_REG],
        vec4<f32>(exported_color.rgb + vec3<f32>(vb_live), exported_color.a),
    );
}
|
||||
|
||||
// Fragment entry point: resets the interpreter, walks the PS control-flow
// table and outputs the colour interpolated from the vertex stage.
@fragment
fn fs_main(in: VsOut) -> FsOut {
    reset_state();

    walk_cf_ps();

    // The pixel-shader `kill*` / `kills_*` opcodes only raise `kill_flag`;
    // the fragment is actually discarded here, at the entry level, once the
    // walk has finished.
    if kill_flag {
        discard;
    }

    return FsOut(in.color);
}
|
||||
|
||||
// ── CF walker per stage. Shared logic: LOOP_START/LOOP_END iterate up to 16×
|
||||
// from `xenos_consts.loop_consts[loop_id]`, COND_JMP honours the instruction's
|
||||
// predicate flags, COND_CALL / RETURN set the reject mask (needs a call stack
|
||||
// we don't have). A hard iteration cap keeps the GPU from hanging on
|
||||
// malformed or extreme shaders.
|
||||
// Maximum number of CF entries a single walker invocation may visit. It is
// checked at the top of every pass of the walker loop in walk_cf_vs and
// walk_cf_ps; once reached the walker just stops (no reject bit is raised at
// that point).
const CF_WALKER_MAX_ITER: u32 = 4096u;
|
||||
|
||||
// Walks the vertex-shader control-flow (CF) table.
//
// Each CF entry is read as (kind, primary, aux) through the vs_cf_* accessors
// and dispatched on `kind`:
//   EXEC / EXEC_END  - run an instruction clause via exec_vs(); `aux` carries
//                      the count in its low 8 bits and the sequence bits
//                      above them. EXEC_END also terminates the walk.
//   ALLOC            - record `primary` in `current_alloc`.
//   EXIT             - terminate the walk.
//   LOOP_START/_END  - counted loop using the loop_starts / loop_counters /
//                      loop_depth frame stack (at most 4 frames, at most 16
//                      iterations per loop).
//   COND_JMP         - jump to CF index `primary`, optionally predicated.
//   COND_CALL/RETURN - unsupported (no call stack); flagged in `reject_mask`.
//   anything else    - flagged in `reject_mask` as REJECT_CF_JUMP.
//
// The walk stops when the index runs off the table, when a terminating entry
// is hit, or after CF_WALKER_MAX_ITER entries have been visited.
fn walk_cf_vs() {
    let cf_n = vs_cf_count();
    var cf_i: u32 = 0u;
    var iter: u32 = 0u;
    // Number of currently open LOOP_STARTs that could not get a frame because
    // the 4-entry frame stack was full. Their LOOP_ENDs must be swallowed
    // instead of being applied to an enclosing loop's frame.
    var frameless_loops: u32 = 0u;

    loop {
        if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
        iter = iter + 1u;

        let kind = vs_cf_kind(cf_i);
        let primary = vs_cf_primary(cf_i);
        let aux = vs_cf_aux(cf_i);
        var advance: bool = true; // false when a case already set cf_i itself
        var stop: bool = false;

        switch kind {
            case CF_KIND_EXEC, CF_KIND_EXEC_END: {
                let count = aux & 0xFFu;
                let sequence = aux >> 8u;
                exec_vs(primary, count, sequence);
                if kind == CF_KIND_EXEC_END { stop = true; }
            }
            case CF_KIND_ALLOC: { current_alloc = primary; }
            case CF_KIND_EXIT: { stop = true; }
            case CF_KIND_LOOP_START: {
                let loop_id = aux & 0x1Fu;
                var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu;
                if loop_count > 16u {
                    loop_count = 16u;
                    reject_mask |= REJECT_LOOP_OVERFLOW;
                }
                if loop_depth < 4u {
                    // Always push a frame, even for a zero count, so that the
                    // matching LOOP_END pops *this* loop's frame and never an
                    // enclosing loop's. A zero count still falls through the
                    // body once, exactly as before.
                    // NOTE(review): real hardware presumably skips the body
                    // when the count is 0 - confirm and add a skip target if
                    // the packed CF entry carries one.
                    loop_starts[loop_depth] = cf_i;
                    loop_counters[loop_depth] = loop_count;
                    loop_depth = loop_depth + 1u;
                } else {
                    // Nesting deeper than the frame stack: run the body once
                    // and remember that one LOOP_END has no frame to pop.
                    frameless_loops = frameless_loops + 1u;
                    reject_mask |= REJECT_LOOP_OVERFLOW;
                }
            }
            case CF_KIND_LOOP_END: {
                if frameless_loops > 0u {
                    // Closes a loop that never received a frame.
                    frameless_loops = frameless_loops - 1u;
                } else if loop_depth > 0u {
                    let d = loop_depth - 1u;
                    if loop_counters[d] > 1u {
                        // More iterations left: jump back to the entry right
                        // after the owning LOOP_START.
                        loop_counters[d] = loop_counters[d] - 1u;
                        cf_i = loop_starts[d] + 1u;
                        advance = false;
                    } else {
                        // Last (or only) iteration finished: pop the frame.
                        loop_counters[d] = 0u;
                        loop_depth = d;
                    }
                }
            }
            case CF_KIND_COND_JMP: {
                // aux bit 0: jump is predicated; aux bit 1: required predicate
                // value. Unpredicated jumps are always taken.
                let is_pred = (aux & 1u) != 0u;
                let pred_cnd = (aux & 2u) != 0u;
                if !is_pred || predicate == pred_cnd {
                    cf_i = primary;
                    advance = false;
                }
            }
            case CF_KIND_COND_CALL, CF_KIND_RETURN: {
                // No call stack - mark and continue.
                reject_mask |= REJECT_CF_CALL;
            }
            default: { reject_mask |= REJECT_CF_JUMP; }
        }

        if stop { break; }
        if advance { cf_i = cf_i + 1u; }
    }
}
|
||||
|
||||
// Walks the pixel-shader control-flow (CF) table.
//
// Each CF entry is read as (kind, primary, aux) through the ps_cf_* accessors
// and dispatched on `kind`:
//   EXEC / EXEC_END  - run an instruction clause via exec_ps(); `aux` carries
//                      the count in its low 8 bits and the sequence bits
//                      above them. EXEC_END also terminates the walk.
//   ALLOC            - record `primary` in `current_alloc`.
//   EXIT             - terminate the walk.
//   LOOP_START/_END  - counted loop using the loop_starts / loop_counters /
//                      loop_depth frame stack (at most 4 frames, at most 16
//                      iterations per loop).
//   COND_JMP         - jump to CF index `primary`, optionally predicated.
//   COND_CALL/RETURN - unsupported (no call stack); flagged in `reject_mask`.
//   anything else    - flagged in `reject_mask` as REJECT_CF_JUMP.
//
// The walk stops when the index runs off the table, when a terminating entry
// is hit, or after CF_WALKER_MAX_ITER entries have been visited.
fn walk_cf_ps() {
    let cf_n = ps_cf_count();
    var cf_i: u32 = 0u;
    var iter: u32 = 0u;
    // Number of currently open LOOP_STARTs that could not get a frame because
    // the 4-entry frame stack was full. Their LOOP_ENDs must be swallowed
    // instead of being applied to an enclosing loop's frame.
    var frameless_loops: u32 = 0u;

    loop {
        if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
        iter = iter + 1u;

        let kind = ps_cf_kind(cf_i);
        let primary = ps_cf_primary(cf_i);
        let aux = ps_cf_aux(cf_i);
        var advance: bool = true; // false when a case already set cf_i itself
        var stop: bool = false;

        switch kind {
            case CF_KIND_EXEC, CF_KIND_EXEC_END: {
                let count = aux & 0xFFu;
                let sequence = aux >> 8u;
                exec_ps(primary, count, sequence);
                if kind == CF_KIND_EXEC_END { stop = true; }
            }
            case CF_KIND_ALLOC: { current_alloc = primary; }
            case CF_KIND_EXIT: { stop = true; }
            case CF_KIND_LOOP_START: {
                let loop_id = aux & 0x1Fu;
                var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu;
                if loop_count > 16u {
                    loop_count = 16u;
                    reject_mask |= REJECT_LOOP_OVERFLOW;
                }
                if loop_depth < 4u {
                    // Always push a frame, even for a zero count, so that the
                    // matching LOOP_END pops *this* loop's frame and never an
                    // enclosing loop's. A zero count still falls through the
                    // body once, exactly as before.
                    // NOTE(review): real hardware presumably skips the body
                    // when the count is 0 - confirm and add a skip target if
                    // the packed CF entry carries one.
                    loop_starts[loop_depth] = cf_i;
                    loop_counters[loop_depth] = loop_count;
                    loop_depth = loop_depth + 1u;
                } else {
                    // Nesting deeper than the frame stack: run the body once
                    // and remember that one LOOP_END has no frame to pop.
                    frameless_loops = frameless_loops + 1u;
                    reject_mask |= REJECT_LOOP_OVERFLOW;
                }
            }
            case CF_KIND_LOOP_END: {
                if frameless_loops > 0u {
                    // Closes a loop that never received a frame.
                    frameless_loops = frameless_loops - 1u;
                } else if loop_depth > 0u {
                    let d = loop_depth - 1u;
                    if loop_counters[d] > 1u {
                        // More iterations left: jump back to the entry right
                        // after the owning LOOP_START.
                        loop_counters[d] = loop_counters[d] - 1u;
                        cf_i = loop_starts[d] + 1u;
                        advance = false;
                    } else {
                        // Last (or only) iteration finished: pop the frame.
                        loop_counters[d] = 0u;
                        loop_depth = d;
                    }
                }
            }
            case CF_KIND_COND_JMP: {
                // aux bit 0: jump is predicated; aux bit 1: required predicate
                // value. Unpredicated jumps are always taken.
                let is_pred = (aux & 1u) != 0u;
                let pred_cnd = (aux & 2u) != 0u;
                if !is_pred || predicate == pred_cnd {
                    cf_i = primary;
                    advance = false;
                }
            }
            case CF_KIND_COND_CALL, CF_KIND_RETURN: {
                // No call stack - mark and continue.
                reject_mask |= REJECT_CF_CALL;
            }
            default: { reject_mask |= REJECT_CF_JUMP; }
        }

        if stop { break; }
        if advance { cf_i = cf_i + 1u; }
    }
}
|
||||
Reference in New Issue
Block a user