Merge audit-2026-05-fix/shader-p0-operand-modifiers: GPUBUG-100/101/102
This commit is contained in:
@@ -129,11 +129,15 @@ fn ps_instr_dword(t: u32, which: u32) -> u32 {
|
||||
// ── Operand helpers: M3 scaffold; M4 wires them into interpret_alu.
|
||||
|
||||
fn apply_swizzle(v: vec4<f32>, swizzle: u32) -> vec4<f32> {
|
||||
// Swizzle is 8 bits: 2 bits per output lane, xyzw order.
|
||||
let sx = (swizzle >> 0u) & 3u;
|
||||
let sy = (swizzle >> 2u) & 3u;
|
||||
let sz = (swizzle >> 4u) & 3u;
|
||||
let sw = (swizzle >> 6u) & 3u;
|
||||
// Per canary `xenia-canary/src/xenia/gpu/ucode.h:1996`
|
||||
// (`AluInstruction::GetSwizzledComponentIndex`): the 8-bit swizzle is
|
||||
// **component-relative**, not absolute. For output component i, the
|
||||
// selected source component is `((swizzle >> (2*i)) + i) & 3`. Identity
|
||||
// swizzle is therefore 0x00 (all bits zero), NOT 0xE4. GPUBUG-100.
|
||||
let sx = ((swizzle >> 0u) + 0u) & 3u;
|
||||
let sy = ((swizzle >> 2u) + 1u) & 3u;
|
||||
let sz = ((swizzle >> 4u) + 2u) & 3u;
|
||||
let sw = ((swizzle >> 6u) + 3u) & 3u;
|
||||
return vec4<f32>(v[sx], v[sy], v[sz], v[sw]);
|
||||
}
|
||||
|
||||
@@ -148,6 +152,24 @@ fn apply_modifiers(v: vec4<f32>, negate: bool, take_abs: bool) -> vec4<f32> {
|
||||
return r;
|
||||
}
|
||||
|
||||
// GPUBUG-102: per-format byte-swap matching canary `GpuSwapInline`
|
||||
// (xenos.h:1090-1109). Xbox 360 vertex data is big-endian; host is
|
||||
// little-endian. The fetch constant's `endian` field (low 2 bits of
|
||||
// dword_1) selects the swap pattern.
|
||||
// Returns `value` with its bytes rearranged according to `endian`
// (0 or any unlisted value = returned unchanged).
fn gpu_swap(value: u32, endian: u32) -> u32 {
|
||||
switch endian {
|
||||
// 1 (k8in16): swap the two bytes inside each 16-bit half.
case 1u: { return ((value << 8u) & 0xFF00FF00u) | ((value >> 8u) & 0x00FF00FFu); }
|
||||
// 2 (k8in32): reverse all four bytes of the dword.
case 2u: {
|
||||
return ((value & 0x000000FFu) << 24u)
|
||||
| ((value & 0x0000FF00u) << 8u)
|
||||
| ((value & 0x00FF0000u) >> 8u)
|
||||
| ((value & 0xFF000000u) >> 24u);
|
||||
}
|
||||
// 3 (k16in32): exchange the high and low 16-bit halves.
case 3u: { return ((value >> 16u) & 0xFFFFu) | (value << 16u); }
|
||||
// 0 (kNone) and anything else: no swap.
default: { return value; }
|
||||
}
|
||||
}
|
||||
|
||||
fn write_reg_masked(idx: u32, mask: u32, value: vec4<f32>) {
|
||||
if idx >= 128u {
|
||||
return;
|
||||
@@ -233,11 +255,24 @@ const SOP_SIN: u32 = 48u;
|
||||
const SOP_COS: u32 = 49u;
|
||||
const SOP_RETAIN_PREV: u32 = 50u;
|
||||
|
||||
// Read a vec4 source from the register file. Treats the src index as a
|
||||
// direct r# reference (ignores c# selector + swizzle/modifiers for MVP).
|
||||
// M4+ will extend this to decode the full operand header.
|
||||
fn read_src(idx: u32) -> vec4<f32> {
|
||||
return registers[idx & 0x7Fu];
|
||||
// Read a vec4 source. Per canary `xenia-canary/src/xenia/gpu/ucode.h`
|
||||
// the temp-vs-constant selector lives in word-0 bits 29-31
|
||||
// (`srcN_sel`); the corresponding 8-bit src_byte is **either** a temp
|
||||
// register descriptor (bit 7 = abs flag, bit 6 = relative, bits 5:0 =
|
||||
// temp index) **or** a flat constant index (full byte). Pre-fix, the
|
||||
// MVP masked `& 0x7F` and read `registers[low7]` regardless — every
|
||||
// shader's WVP matrix / light constant / per-frame uniform read came
|
||||
// back as r[low7] (typically zero → invisible rendering). GPUBUG-101.
|
||||
// Fetches the raw (unswizzled, unmodified) vec4 for one ALU source:
// from `registers` when `is_temp` is true, otherwise from
// `xenos_consts.alu`.
fn read_src(src_byte: u32, is_temp: bool) -> vec4<f32> {
|
||||
if is_temp {
|
||||
// Temp operand: only bits 5:0 index the register file. Bit 7 (abs)
|
||||
// and bit 6 (relative) are masked off here and not interpreted; the
|
||||
// original note defers them to `read_src_full` / Phase D2 (not visible here).
|
||||
return registers[src_byte & 0x3Fu];
|
||||
}
|
||||
// Constant operand: the whole byte is the index, so the mask below
|
||||
// admits 0..255. The original note scopes this to bank 0 (c0..c127),
// with higher banks via cf-level relative addressing in a later phase.
// NOTE(review): confirm xenos_consts.alu is sized for indices above 127.
|
||||
return xenos_consts.alu[src_byte & 0xFFu];
|
||||
}
|
||||
|
||||
fn exec_vector_op(op: u32, a: vec4<f32>, b: vec4<f32>, c: vec4<f32>) -> vec4<f32> {
|
||||
@@ -520,12 +555,26 @@ fn interpret_alu(t: u32, is_vertex: bool) {
|
||||
let src_a = w0 & 0xFFu;
|
||||
let src_b = (w0 >> 8u) & 0xFFu;
|
||||
let src_c = (w0 >> 16u) & 0xFFu;
|
||||
// GPUBUG-101: word-0 bits 29-31 are the per-operand temp-vs-constant
|
||||
// selectors (canary `srcN_sel`, ucode.h:2078-2086). `src_a` is
|
||||
// canary's third operand (low byte), so its selector is bit 29.
|
||||
let src_a_is_temp = ((w0 >> 29u) & 1u) != 0u;
|
||||
let src_b_is_temp = ((w0 >> 30u) & 1u) != 0u;
|
||||
let src_c_is_temp = ((w0 >> 31u) & 1u) != 0u;
|
||||
let predicated = ((w0 >> 27u) & 1u) != 0u;
|
||||
let predicate_condition = ((w0 >> 28u) & 1u) != 0u;
|
||||
let scalar_src_is_ps = ((w0 >> 26u) & 1u) != 0u;
|
||||
// `w1` holds per-operand swizzle + negate/abs/c-vs-r flags. The MVP
|
||||
// treats every source as a full r#, no modifiers — M4+ decodes it.
|
||||
_ = w1;
|
||||
// GPUBUG-100: word-1 holds per-operand swizzle (8 bits each, low to
|
||||
// high = src3/src2/src1 in canary's numbering) + per-operand negate
|
||||
// (bits 24/25/26). Pre-fix `_ = w1;` discarded these → every ALU
|
||||
// result was missing its swizzle (e.g. `.zyxw`, `.xxxx` broadcast)
|
||||
// and any negated operand was used positive instead.
|
||||
let src_a_swiz = w1 & 0xFFu;
|
||||
let src_b_swiz = (w1 >> 8u) & 0xFFu;
|
||||
let src_c_swiz = (w1 >> 16u) & 0xFFu;
|
||||
let src_a_neg = ((w1 >> 24u) & 1u) != 0u;
|
||||
let src_b_neg = ((w1 >> 25u) & 1u) != 0u;
|
||||
let src_c_neg = ((w1 >> 26u) & 1u) != 0u;
|
||||
|
||||
// Honor per-instruction predicate: skip when predicated and the
|
||||
// predicate doesn't match the required condition.
|
||||
@@ -533,10 +582,15 @@ fn interpret_alu(t: u32, is_vertex: bool) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Vector pipe.
|
||||
let a = read_src(src_a);
|
||||
let b = read_src(src_b);
|
||||
let c = read_src(src_c);
|
||||
// Vector pipe — apply swizzle then negate per operand. Abs is
|
||||
// omitted (the abs flag is dual-meaning and lives at different
|
||||
// bits depending on temp-vs-constant; left for a follow-up).
|
||||
let a_raw = read_src(src_a, src_a_is_temp);
|
||||
let b_raw = read_src(src_b, src_b_is_temp);
|
||||
let c_raw = read_src(src_c, src_c_is_temp);
|
||||
let a = apply_modifiers(apply_swizzle(a_raw, src_a_swiz), src_a_neg, false);
|
||||
let b = apply_modifiers(apply_swizzle(b_raw, src_b_swiz), src_b_neg, false);
|
||||
let c = apply_modifiers(apply_swizzle(c_raw, src_c_swiz), src_c_neg, false);
|
||||
let vec_result = exec_vector_op(vec_op, a, b, c);
|
||||
if vec_wm != 0u {
|
||||
write_reg_masked(vec_dst, vec_wm, vec_result);
|
||||
@@ -596,7 +650,12 @@ fn interpret_vertex_fetch(t: u32) {
|
||||
// Vertex-fetch constant: dword 0 carries (type[1:0], address[31:2]);
|
||||
// dword 1 carries (endian[1:0], size[25:2]).
|
||||
let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
|
||||
let fc1 = xenos_consts.fetch[fetch_const * 2u + 1u];
|
||||
let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;
|
||||
// GPUBUG-102: per-format endian byte-swap. Xbox 360 vertex data is
|
||||
// big-endian; the host is little-endian. Pre-fix every dword was
|
||||
// bitcast as-is — vertex positions were byte-reversed garbage.
|
||||
let endian = fc1 & 0x3u;
|
||||
|
||||
let vidx = u32(registers[src_reg & 0x7Fu].x);
|
||||
// Per-vertex byte offset; stride==0 means 1 element total (non-indexed).
|
||||
@@ -610,19 +669,19 @@ fn interpret_vertex_fetch(t: u32) {
|
||||
case VFMT_32_32_32_32_FLOAT: {
|
||||
if addr + 3u < n {
|
||||
result = vec4<f32>(
|
||||
bitcast<f32>(vertex_buffer[addr + 0u]),
|
||||
bitcast<f32>(vertex_buffer[addr + 1u]),
|
||||
bitcast<f32>(vertex_buffer[addr + 2u]),
|
||||
bitcast<f32>(vertex_buffer[addr + 3u]),
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 0u], endian)),
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 1u], endian)),
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 2u], endian)),
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 3u], endian)),
|
||||
);
|
||||
}
|
||||
}
|
||||
case VFMT_32_32_32_FLOAT: {
|
||||
if addr + 2u < n {
|
||||
result = vec4<f32>(
|
||||
bitcast<f32>(vertex_buffer[addr + 0u]),
|
||||
bitcast<f32>(vertex_buffer[addr + 1u]),
|
||||
bitcast<f32>(vertex_buffer[addr + 2u]),
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 0u], endian)),
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 1u], endian)),
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 2u], endian)),
|
||||
1.0,
|
||||
);
|
||||
}
|
||||
@@ -630,8 +689,8 @@ fn interpret_vertex_fetch(t: u32) {
|
||||
case VFMT_32_32_FLOAT: {
|
||||
if addr + 1u < n {
|
||||
result = vec4<f32>(
|
||||
bitcast<f32>(vertex_buffer[addr + 0u]),
|
||||
bitcast<f32>(vertex_buffer[addr + 1u]),
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 0u], endian)),
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 1u], endian)),
|
||||
0.0,
|
||||
1.0,
|
||||
);
|
||||
@@ -639,17 +698,20 @@ fn interpret_vertex_fetch(t: u32) {
|
||||
}
|
||||
case VFMT_32_FLOAT: {
|
||||
if addr < n {
|
||||
result = vec4<f32>(bitcast<f32>(vertex_buffer[addr]), 0.0, 0.0, 1.0);
|
||||
result = vec4<f32>(
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr], endian)),
|
||||
0.0, 0.0, 1.0,
|
||||
);
|
||||
}
|
||||
}
|
||||
case VFMT_8_8_8_8: {
|
||||
if addr < n {
|
||||
result = unpack4x8unorm(vertex_buffer[addr]);
|
||||
result = unpack4x8unorm(gpu_swap(vertex_buffer[addr], endian));
|
||||
}
|
||||
}
|
||||
case VFMT_16_16_FLOAT: {
|
||||
if addr < n {
|
||||
let h = unpack2x16float(vertex_buffer[addr]);
|
||||
let h = unpack2x16float(gpu_swap(vertex_buffer[addr], endian));
|
||||
result = vec4<f32>(h.x, h.y, 0.0, 1.0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -126,6 +126,28 @@ struct FsOut {
|
||||
fn xe_rcp(x: f32) -> f32 {
|
||||
return select(0.0, 1.0 / x, x != 0.0);
|
||||
}
|
||||
|
||||
// GPUBUG-102: per-format byte-swap matching canary's `GpuSwapInline`
|
||||
// (xenos.h:1090-1109). Xbox 360 vertex data is big-endian; the host is
|
||||
// little-endian. The fetch constant's `endian` field (low 2 bits of
|
||||
// dword_1) selects:
|
||||
// 0 (kNone) — no swap
|
||||
// 1 (k8in16) — swap bytes within halfwords
|
||||
// 2 (k8in32) — full byte reverse
|
||||
// 3 (k16in32) — swap halfwords
|
||||
fn gpu_swap(value: u32, endian: u32) -> u32 {
|
||||
switch endian {
|
||||
case 1u: { return ((value << 8u) & 0xFF00FF00u) | ((value >> 8u) & 0x00FF00FFu); }
|
||||
case 2u: {
|
||||
return ((value & 0x000000FFu) << 24u)
|
||||
| ((value & 0x0000FF00u) << 8u)
|
||||
| ((value & 0x00FF0000u) >> 8u)
|
||||
| ((value & 0xFF000000u) >> 24u);
|
||||
}
|
||||
case 3u: { return ((value >> 16u) & 0xFFFFu) | (value << 16u); }
|
||||
default: { return value; }
|
||||
}
|
||||
}
|
||||
"#;
|
||||
|
||||
struct EmitCtx {
|
||||
@@ -287,9 +309,15 @@ impl EmitCtx {
|
||||
alu: &AluInstruction,
|
||||
current_alloc: AllocKind,
|
||||
) -> Result<(), &'static str> {
|
||||
let a = format!("r[{}u]", alu.src_a & 0x7F);
|
||||
let b = format!("r[{}u]", alu.src_b & 0x7F);
|
||||
let c = format!("r[{}u]", alu.src_c & 0x7F);
|
||||
// GPUBUG-100/101: per-operand temp-vs-constant selector (w0
|
||||
// bits 29-31), 8-bit component-relative swizzle (w1 bytes 0-2),
|
||||
// and 1-bit negate (w1 bits 24-26). Pre-fix all three were
|
||||
// discarded, so every ALU read came back as r[low7] without
|
||||
// any swizzle / negation, dropping every shader's uniforms +
|
||||
// negative operands.
|
||||
let a = src_operand(alu.src_a, alu.src_a_is_temp, alu.src_a_swiz, alu.src_a_negate);
|
||||
let b = src_operand(alu.src_b, alu.src_b_is_temp, alu.src_b_swiz, alu.src_b_negate);
|
||||
let c = src_operand(alu.src_c, alu.src_c_is_temp, alu.src_c_swiz, alu.src_c_negate);
|
||||
|
||||
// Vector pipe.
|
||||
if alu.vector_write_mask != 0 {
|
||||
@@ -370,23 +398,33 @@ impl EmitCtx {
|
||||
// v1: treat all vertex fetches as R32G32B32A32_FLOAT, stride = 4
|
||||
// dwords. Matches the interpreter's MVP semantics; unlocks more
|
||||
// formats alongside the CPU texture cache's format expansion.
|
||||
//
|
||||
// GPUBUG-102: the fetch constant (xe_gpu_vertex_fetch_t,
|
||||
// xenos.h:1158-1172) holds the endian field in dword_1's low
|
||||
// 2 bits. Vertex data on Xbox 360 is big-endian; the host is
|
||||
// little-endian. Pre-fix, every dword was bitcast as-is →
|
||||
// vertex positions were byte-reversed garbage and any draw
|
||||
// that did reach the host produced clipped / NaN positions.
|
||||
let fetch_const = (vf.raw[0] >> 5) & 0x1F;
|
||||
let src_reg = vf.src_register & 0x7F;
|
||||
let dst_reg = vf.dest_register & 0x7F;
|
||||
self.push(&format!(
|
||||
"{{ let fc0 = xenos_consts.fetch[{}u]; \
|
||||
"{{ let fc0 = xenos_consts.fetch[{fc0_idx}u]; \
|
||||
let fc1 = xenos_consts.fetch[{fc1_idx}u]; \
|
||||
let endian = fc1 & 0x3u; \
|
||||
let base = (fc0 & 0xFFFFFFFCu) >> 2u; \
|
||||
let vidx = u32(r[{src_reg}u].x); \
|
||||
let addr = base + vidx * 4u; \
|
||||
let n = arrayLength(&vertex_buffer); \
|
||||
if (addr + 3u < n) {{ \
|
||||
r[{dst_reg}u] = vec4<f32>( \
|
||||
bitcast<f32>(vertex_buffer[addr + 0u]), \
|
||||
bitcast<f32>(vertex_buffer[addr + 1u]), \
|
||||
bitcast<f32>(vertex_buffer[addr + 2u]), \
|
||||
bitcast<f32>(vertex_buffer[addr + 3u])); \
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 0u], endian)), \
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 1u], endian)), \
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 2u], endian)), \
|
||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 3u], endian))); \
|
||||
}} }}",
|
||||
fetch_const * 2,
|
||||
fc0_idx = fetch_const * 2,
|
||||
fc1_idx = fetch_const * 2 + 1,
|
||||
));
|
||||
Ok(())
|
||||
}
|
||||
@@ -403,6 +441,41 @@ impl EmitCtx {
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the WGSL expression that reads one ALU source operand with its
/// swizzle and negate modifiers applied. The abs modifier is not handled
/// here (deferred, see GPUBUG-100). Mirrors the interpreter shader's
/// `read_src` + `apply_swizzle` + the negate half of `apply_modifiers`.
///
/// * `src_byte` — raw 8-bit source field. For a temp operand only bits
///   5:0 are used as the `r[..]` index (bits 6 and 7 are masked off); for
///   a constant operand the whole byte indexes `xenos_consts.alu`.
/// * `is_temp` — `true` reads the register file `r`, `false` reads
///   `xenos_consts.alu`. GPUBUG-101.
/// * `swizzle` — 8-bit component-relative swizzle, two bits per output
///   lane in x/y/z/w order. Output lane `i` reads source lane
///   `((swizzle >> (2 * i)) + i) & 3`, so `0x00` is the identity and, for
///   example, `0x77` is `.wzyx`. GPUBUG-100.
/// * `negate` — when set, the whole expression is wrapped as `(-…)`.
///
/// An identity swizzle yields the bare base expression; any other value
/// yields an explicit `vec4<f32>(…)` constructor made of four lane reads.
fn src_operand(src_byte: u8, is_temp: bool, swizzle: u8, negate: bool) -> String {
    const LANES: [char; 4] = ['x', 'y', 'z', 'w'];

    let base = if is_temp {
        format!("r[{}u]", src_byte & 0x3F)
    } else {
        format!("xenos_consts.alu[{src_byte}u]")
    };

    let swizzled = if swizzle == 0 {
        // Identity (.xyzw): no constructor needed.
        base
    } else {
        let field = u32::from(swizzle);
        // Bits above the lane's own 2-bit field are multiples of four
        // after the shift, so the trailing `& 3` discards them.
        let lane = |i: u32| LANES[(((field >> (2 * i)) + i) & 3) as usize];
        let [x, y, z, w] = [0u32, 1, 2, 3].map(lane);
        format!("vec4<f32>({base}.{x}, {base}.{y}, {base}.{z}, {base}.{w})")
    };

    if negate {
        format!("(-{swizzled})")
    } else {
        swizzled
    }
}
|
||||
|
||||
fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option<String> {
|
||||
let s = match op {
|
||||
vop::ADD => format!("({a} + {b})"),
|
||||
@@ -459,6 +532,9 @@ mod tests {
|
||||
// Single Exec clause: ALU add r0 = r0 + r0; scalar_op = RETAIN_PREV
|
||||
// with full write-mask on vector, zero on scalar. Alloc(Position)
|
||||
// precedes so the ALU's export (if it were one) would target oPos.
|
||||
// Word-0 bits 29-31 set so all three operands resolve as temps —
|
||||
// matches the prior assertion `r[0u] = (r[0u] + r[0u])`.
|
||||
let w0 = (1u32 << 29) | (1u32 << 30) | (1u32 << 31);
|
||||
let w2 = (vop::ADD as u32)
|
||||
| ((sop::RETAIN_PREV as u32) << 6)
|
||||
| (0xF << 12) // vector_write_mask
|
||||
@@ -478,7 +554,7 @@ mod tests {
|
||||
predicate_condition: false,
|
||||
},
|
||||
],
|
||||
instructions: vec![0, 0, w2],
|
||||
instructions: vec![w0, 0, w2],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -518,6 +594,117 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn src_operand_decodes_temp_vs_constant_no_modifiers() {
|
||||
// GPUBUG-101: is_temp=true → r[src_byte & 0x3F]; is_temp=false →
|
||||
// xenos_consts.alu[src_byte]. Identity swizzle (0x00) and no negate, so only the bare base expression is checked.
|
||||
assert_eq!(src_operand(0x00, true, 0x00, false), "r[0u]");
|
||||
assert_eq!(src_operand(0x05, true, 0x00, false), "r[5u]");
|
||||
assert_eq!(src_operand(0x3F, true, 0x00, false), "r[63u]");
|
||||
// For temps, bits 6/7 (abs/rel) are masked off — setting them must
|
||||
// not change the emitted register index. Phase D2 will consume them.
|
||||
assert_eq!(src_operand(0x80, true, 0x00, false), "r[0u]");
|
||||
assert_eq!(src_operand(0xFF, true, 0x00, false), "r[63u]");
|
||||
// Constants: the full 8-bit value is the index, including 255.
|
||||
assert_eq!(src_operand(0x00, false, 0x00, false), "xenos_consts.alu[0u]");
|
||||
assert_eq!(src_operand(0x05, false, 0x00, false), "xenos_consts.alu[5u]");
|
||||
assert_eq!(src_operand(0xFF, false, 0x00, false), "xenos_consts.alu[255u]");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn src_operand_applies_swizzle_and_negate() {
|
||||
// GPUBUG-100. The swizzle is component-relative: output lane i reads
|
||||
// source lane ((swizzle >> (2*i)) + i) & 3, so the 2-bit field that
|
||||
// selects source lane s for output lane i is (s - i) & 3. A full
|
||||
// reverse (.wzyx) is therefore fields (3, 1, 3, 1) = 0x77; the
|
||||
// absolute-style encoding 0x1B decodes to .wwww under this rule.
|
||||
// Neither of those is asserted here — the cases below only check
|
||||
// the mechanics. First case: swizzle=0x00 (identity) selects .xyzw
|
||||
// and takes the no-swizzle branch, so the bare base expression is
|
||||
// emitted and negate wraps it in `(-…)`.
|
||||
assert_eq!(src_operand(0x05, true, 0x00, true), "(-r[5u])");
|
||||
// swizzle=0xFF → for each i, ((0xFF >> (2i)) + i) & 3:
|
||||
// i=0: ((0xFF) + 0) & 3 = 255 & 3 = 3 → w
|
||||
// i=1: ((0x3F) + 1) & 3 = (63+1)&3 = 0 → x
|
||||
// i=2: ((0x0F) + 2) & 3 = (15+2)&3 = 1 → y
|
||||
// i=3: ((0x03) + 3) & 3 = (3+3)&3 = 2 → z
|
||||
// Output: .wxyz
|
||||
assert_eq!(
|
||||
src_operand(0x05, true, 0xFF, false),
|
||||
"vec4<f32>(r[5u].w, r[5u].x, r[5u].y, r[5u].z)"
|
||||
);
|
||||
// Combined: negate of a constant operand with the same .wxyz swizzle.
|
||||
assert_eq!(
|
||||
src_operand(0x07, false, 0xFF, true),
|
||||
"(-vec4<f32>(xenos_consts.alu[7u].w, xenos_consts.alu[7u].x, xenos_consts.alu[7u].y, xenos_consts.alu[7u].z))"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn shader_using_c0_emits_xenos_consts_read() {
|
||||
// ALU: r0 = c0 + r0. src_a (low byte) is constant index 0;
|
||||
// src_b (next byte) is temp index 0. src_a_is_temp=false →
|
||||
// src1_sel-style bit at w0 bit 29 = 0; src_b_is_temp=true →
|
||||
// bit 30 = 1. src_c's byte is 0 and bit 31 stays clear, so it decodes as
// constant-selected (not temp); ADD only combines the first two operands.
|
||||
let w0 = 0x00u32 // src_a = c0
|
||||
| (0x00u32 << 8) // src_b = r0
|
||||
| (0x00u32 << 16) // src_c byte = 0 (selector bit 31 left clear)
|
||||
| (0u32 << 29) // src_a_is_temp = false (constant)
|
||||
| (1u32 << 30); // src_b_is_temp = true (register)
|
||||
let w2 = (vop::ADD as u32)
|
||||
| ((sop::RETAIN_PREV as u32) << 6)
|
||||
| (0xF << 12)
|
||||
| (0u32 << 16);
|
||||
let shader = ParsedShader {
|
||||
cf: vec![
|
||||
ControlFlowInstruction::Alloc {
|
||||
size: 1,
|
||||
kind: AllocKind::Position,
|
||||
},
|
||||
ControlFlowInstruction::Exec {
|
||||
address: 0,
|
||||
count: 1,
|
||||
sequence: 0,
|
||||
is_end: true,
|
||||
predicated: false,
|
||||
predicate_condition: false,
|
||||
},
|
||||
],
|
||||
// w1 = 0: identity swizzle and no negate on every operand.
instructions: vec![w0, 0, w2],
|
||||
};
|
||||
match translate(&shader, Stage::Vertex) {
|
||||
Translation::Ok(body) => {
|
||||
assert!(
|
||||
body.contains("xenos_consts.alu[0u]"),
|
||||
"expected c0 operand, got: {body}"
|
||||
);
|
||||
// NOTE(review): if the destination is also emitted as `r[0u]`, this
// substring check passes even when src_b is mis-emitted — consider
// asserting the full `(xenos_consts.alu[0u] + r[0u])` expression.
assert!(
|
||||
body.contains("r[0u]"),
|
||||
"expected r0 temp operand, got: {body}"
|
||||
);
|
||||
}
|
||||
Translation::Reject(r) => panic!("rejected: {r}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vfetch_emit_includes_gpu_swap_helper_call() {
|
||||
// GPUBUG-102: emit_vfetch should route fetched dwords through
|
||||
// `gpu_swap(...)`. This only checks that the call appears at least
|
||||
// once in the emitted body — it does not count one call per lane.
|
||||
let mut ctx = EmitCtx::new(Stage::Vertex);
|
||||
let vf = crate::ucode::fetch::VertexFetch {
|
||||
fetch_const: 0,
|
||||
src_register: 0,
|
||||
dest_register: 0,
|
||||
dest_write_mask: 0xF,
|
||||
// emit_vfetch derives the fetch-constant slot from raw[0] bits 9:5,
// so an all-zero raw selects slot 0.
raw: [0; 3],
|
||||
};
|
||||
ctx.emit_vfetch(&vf).expect("emit_vfetch");
|
||||
let body = ctx.finish();
|
||||
assert!(body.contains("gpu_swap("), "emitted vfetch body: {body}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn loop_clause_rejected() {
|
||||
let shader = ParsedShader {
|
||||
|
||||
@@ -29,10 +29,34 @@ pub struct AluInstruction {
|
||||
pub vector_dest_is_export: bool,
|
||||
/// Selects `ps` (previous scalar result) as the scalar operand when set.
|
||||
pub scalar_src_is_ps: bool,
|
||||
/// Source register indices (at most 3 for vector ops).
|
||||
/// Source register indices (at most 3 for vector ops). The src bytes
|
||||
/// are the canary `srcN_reg` fields (8 bits each); for **temp-typed**
|
||||
/// operands (see `src_a_is_temp` etc.), bit 7 of the byte is the abs
|
||||
/// flag and bit 6 is the loop-relative flag — bits 5:0 give the temp
|
||||
/// index. For **constant-typed** operands the full byte is the
|
||||
/// constant index.
|
||||
pub src_a: u8,
|
||||
pub src_b: u8,
|
||||
pub src_c: u8,
|
||||
/// Per-operand "is temporary" flag — when true, the corresponding
|
||||
/// `src_X` byte indexes a general register (r#); when false, it
|
||||
/// indexes an ALU constant (c#). Decoded from word-0 bits 29-31
|
||||
/// (canary's `src3_sel`/`src2_sel`/`src1_sel`). GPUBUG-101.
|
||||
pub src_a_is_temp: bool,
|
||||
pub src_b_is_temp: bool,
|
||||
pub src_c_is_temp: bool,
|
||||
/// Per-operand 8-bit component-relative swizzle (canary's
|
||||
/// `srcN_swiz`, ucode.h:2064-2066). For output component i, the
|
||||
/// selected source component is `((swizzle >> (2*i)) + i) & 3`.
|
||||
/// Identity swizzle is `0x00`. GPUBUG-100.
|
||||
pub src_a_swiz: u8,
|
||||
pub src_b_swiz: u8,
|
||||
pub src_c_swiz: u8,
|
||||
/// Per-operand negate flags (canary's `srcN_reg_negate`, w1 bits
|
||||
/// 24/25/26). Applied after the swizzle. GPUBUG-100.
|
||||
pub src_a_negate: bool,
|
||||
pub src_b_negate: bool,
|
||||
pub src_c_negate: bool,
|
||||
/// Set when the instruction is predicated; skipped if the predicate
|
||||
/// doesn't match `predicate_condition`.
|
||||
pub predicated: bool,
|
||||
@@ -45,7 +69,7 @@ pub struct AluInstruction {
|
||||
/// Decode a 3-dword ALU triple.
|
||||
pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
|
||||
let w0 = words[0];
|
||||
let _w1 = words[1];
|
||||
let w1 = words[1];
|
||||
let w2 = words[2];
|
||||
AluInstruction {
|
||||
vector_opcode: (w2 & 0x3F) as u8,
|
||||
@@ -59,6 +83,19 @@ pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
|
||||
src_a: (w0 & 0xFF) as u8,
|
||||
src_b: ((w0 >> 8) & 0xFF) as u8,
|
||||
src_c: ((w0 >> 16) & 0xFF) as u8,
|
||||
// Word-0 bits 29-31 are the per-operand temp-vs-constant
|
||||
// selector (canary `src3_sel`/`src2_sel`/`src1_sel`,
|
||||
// ucode.h:2078-2086). Our `src_a` is canary's third operand
|
||||
// (low byte of w0), so its selector is bit 29.
|
||||
src_a_is_temp: ((w0 >> 29) & 1) != 0,
|
||||
src_b_is_temp: ((w0 >> 30) & 1) != 0,
|
||||
src_c_is_temp: ((w0 >> 31) & 1) != 0,
|
||||
src_a_swiz: (w1 & 0xFF) as u8,
|
||||
src_b_swiz: ((w1 >> 8) & 0xFF) as u8,
|
||||
src_c_swiz: ((w1 >> 16) & 0xFF) as u8,
|
||||
src_a_negate: ((w1 >> 24) & 1) != 0,
|
||||
src_b_negate: ((w1 >> 25) & 1) != 0,
|
||||
src_c_negate: ((w1 >> 26) & 1) != 0,
|
||||
predicated: ((w0 >> 27) & 1) != 0,
|
||||
predicate_condition: ((w0 >> 28) & 1) != 0,
|
||||
raw: words,
|
||||
|
||||
Reference in New Issue
Block a user