fix(gpu): GPUBUG-101 — decode src1/2/3_sel temp-vs-constant selector
Per canary AluInstruction layout (xenia-canary/src/xenia/gpu/ucode.h:
2078-2086), word-0 bits 29-31 are the per-operand `srcN_sel` flags
selecting temp register (1) vs ALU constant (0); the corresponding
8-bit src byte indexes either:
- a temp register (bits 5:0 = index, bits 6/7 reserved for
relative-addressing / abs flags consumed by Phase D2), or
- an ALU constant (full 8-bit index).
Pre-fix, the WGSL interpreter and AOT translator both masked `& 0x7F`
on the src byte and emitted `r[low7]` regardless of the operand class.
Every shader's WVP matrix / light constant / per-frame uniform read
came back as r[low7] — typically zero — yielding invisible rendering.
Mechanical changes:
- crates/xenia-gpu/src/ucode/alu.rs: decode src_a_is_temp /
src_b_is_temp / src_c_is_temp from w0 bits 29/30/31. Note that our
src_a (low byte of w0) is canary's third operand, hence its selector
is bit 29 (canary src3_sel), not bit 31.
- crates/xenia-gpu/src/shaders/xenos_interp.wgsl: `read_src` now takes
the is_temp flag; constants index xenos_consts.alu directly.
- crates/xenia-gpu/src/translator.rs: `src_operand` mirrors the
interpreter — `r[idx]` when temp, `xenos_consts.alu[idx]` when
constant.
The trivial-shader synthetic test was updated to set the temp flags so
its `r[0u] = (r[0u] + r[0u])` assertion remains valid; without the
flags set, all sources would now resolve as constants.
Bank-selection (cf-level relative addressing for higher banks of the
512 ALU constants) remains a Phase G+ extension; this change covers
c0..c127 in bank 0, which most Sylpheed shaders use directly.
Verification at -n 100M lockstep:
swaps: 2 → 2 (unchanged — gated by D2/D3/E for draws)
draws: 0 → 0
packets: ~61M (within noise)
Tests: 552 → 554 (+2 translator tests for the temp/constant decode).
Closes GPUBUG-101 (P0).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -233,11 +233,24 @@ const SOP_SIN: u32 = 48u;
|
|||||||
const SOP_COS: u32 = 49u;
|
const SOP_COS: u32 = 49u;
|
||||||
const SOP_RETAIN_PREV: u32 = 50u;
|
const SOP_RETAIN_PREV: u32 = 50u;
|
||||||
|
|
||||||
// Read a vec4 source from the register file. Treats the src index as a
|
// Read a vec4 source. Per canary `xenia-canary/src/xenia/gpu/ucode.h`
|
||||||
// direct r# reference (ignores c# selector + swizzle/modifiers for MVP).
|
// the temp-vs-constant selector lives in word-0 bits 29-31
|
||||||
// M4+ will extend this to decode the full operand header.
|
// (`srcN_sel`); the corresponding 8-bit src_byte is **either** a temp
|
||||||
fn read_src(idx: u32) -> vec4<f32> {
|
// register descriptor (bit 7 = abs flag, bit 6 = relative, bits 5:0 =
|
||||||
return registers[idx & 0x7Fu];
|
// temp index) **or** a flat constant index (full byte). Pre-fix, the
|
||||||
|
// MVP masked `& 0x7F` and read `registers[low7]` regardless — every
|
||||||
|
// shader's WVP matrix / light constant / per-frame uniform read came
|
||||||
|
// back as r[low7] (typically zero → invisible rendering). GPUBUG-101.
|
||||||
|
fn read_src(src_byte: u32, is_temp: bool) -> vec4<f32> {
|
||||||
|
if is_temp {
|
||||||
|
// Bits 5:0 of the byte give the temp index; bit 7 (abs) and
|
||||||
|
// bit 6 (relative) are handled in read_src_full when modifiers
|
||||||
|
// land in Phase D2.
|
||||||
|
return registers[src_byte & 0x3Fu];
|
||||||
|
}
|
||||||
|
// Constant index — full byte (covers c0..c127 in bank 0; higher
|
||||||
|
// banks via cf-level relative addressing land in a later phase).
|
||||||
|
return xenos_consts.alu[src_byte & 0xFFu];
|
||||||
}
|
}
|
||||||
|
|
||||||
fn exec_vector_op(op: u32, a: vec4<f32>, b: vec4<f32>, c: vec4<f32>) -> vec4<f32> {
|
fn exec_vector_op(op: u32, a: vec4<f32>, b: vec4<f32>, c: vec4<f32>) -> vec4<f32> {
|
||||||
@@ -520,11 +533,17 @@ fn interpret_alu(t: u32, is_vertex: bool) {
|
|||||||
let src_a = w0 & 0xFFu;
|
let src_a = w0 & 0xFFu;
|
||||||
let src_b = (w0 >> 8u) & 0xFFu;
|
let src_b = (w0 >> 8u) & 0xFFu;
|
||||||
let src_c = (w0 >> 16u) & 0xFFu;
|
let src_c = (w0 >> 16u) & 0xFFu;
|
||||||
|
// GPUBUG-101: word-0 bits 29-31 are the per-operand temp-vs-constant
|
||||||
|
// selectors (canary `srcN_sel`, ucode.h:2078-2086). `src_a` is
|
||||||
|
// canary's third operand (low byte), so its selector is bit 29.
|
||||||
|
let src_a_is_temp = ((w0 >> 29u) & 1u) != 0u;
|
||||||
|
let src_b_is_temp = ((w0 >> 30u) & 1u) != 0u;
|
||||||
|
let src_c_is_temp = ((w0 >> 31u) & 1u) != 0u;
|
||||||
let predicated = ((w0 >> 27u) & 1u) != 0u;
|
let predicated = ((w0 >> 27u) & 1u) != 0u;
|
||||||
let predicate_condition = ((w0 >> 28u) & 1u) != 0u;
|
let predicate_condition = ((w0 >> 28u) & 1u) != 0u;
|
||||||
let scalar_src_is_ps = ((w0 >> 26u) & 1u) != 0u;
|
let scalar_src_is_ps = ((w0 >> 26u) & 1u) != 0u;
|
||||||
// `w1` holds per-operand swizzle + negate/abs/c-vs-r flags. The MVP
|
// `w1` holds per-operand swizzle + negate/abs flags. Phase D2 decodes
|
||||||
// treats every source as a full r#, no modifiers — M4+ decodes it.
|
// them; Phase D1 only resolved the temp/constant selector.
|
||||||
_ = w1;
|
_ = w1;
|
||||||
|
|
||||||
// Honor per-instruction predicate: skip when predicated and the
|
// Honor per-instruction predicate: skip when predicated and the
|
||||||
@@ -534,9 +553,9 @@ fn interpret_alu(t: u32, is_vertex: bool) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Vector pipe.
|
// Vector pipe.
|
||||||
let a = read_src(src_a);
|
let a = read_src(src_a, src_a_is_temp);
|
||||||
let b = read_src(src_b);
|
let b = read_src(src_b, src_b_is_temp);
|
||||||
let c = read_src(src_c);
|
let c = read_src(src_c, src_c_is_temp);
|
||||||
let vec_result = exec_vector_op(vec_op, a, b, c);
|
let vec_result = exec_vector_op(vec_op, a, b, c);
|
||||||
if vec_wm != 0u {
|
if vec_wm != 0u {
|
||||||
write_reg_masked(vec_dst, vec_wm, vec_result);
|
write_reg_masked(vec_dst, vec_wm, vec_result);
|
||||||
|
|||||||
@@ -287,9 +287,15 @@ impl EmitCtx {
|
|||||||
alu: &AluInstruction,
|
alu: &AluInstruction,
|
||||||
current_alloc: AllocKind,
|
current_alloc: AllocKind,
|
||||||
) -> Result<(), &'static str> {
|
) -> Result<(), &'static str> {
|
||||||
let a = format!("r[{}u]", alu.src_a & 0x7F);
|
// GPUBUG-101: word-0 bits 29-31 select temp vs constant for
|
||||||
let b = format!("r[{}u]", alu.src_b & 0x7F);
|
// each operand (canary `srcN_sel`); the corresponding src byte
|
||||||
let c = format!("r[{}u]", alu.src_c & 0x7F);
|
// indexes either a general register (temp) or an ALU constant
|
||||||
|
// (c#). Pre-fix the translator unconditionally emitted r[low7]
|
||||||
|
// for both, dropping every shader's uniform read. Mirrors the
|
||||||
|
// interpreter's `read_src`.
|
||||||
|
let a = src_operand(alu.src_a, alu.src_a_is_temp);
|
||||||
|
let b = src_operand(alu.src_b, alu.src_b_is_temp);
|
||||||
|
let c = src_operand(alu.src_c, alu.src_c_is_temp);
|
||||||
|
|
||||||
// Vector pipe.
|
// Vector pipe.
|
||||||
if alu.vector_write_mask != 0 {
|
if alu.vector_write_mask != 0 {
|
||||||
@@ -403,6 +409,20 @@ impl EmitCtx {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Emit the WGSL expression that reads an ALU source operand. Per
|
||||||
|
/// canary `xenia-canary/src/xenia/gpu/ucode.h`, the temp-vs-constant
|
||||||
|
/// selector is in word-0 bits 29-31 (`srcN_sel`); the src byte is then
|
||||||
|
/// either a temp descriptor (bits 5:0 = temp index, bits 6/7 reserved
|
||||||
|
/// for relative/abs flags decoded in Phase D2) or a flat constant
|
||||||
|
/// index. Mirrors the interpreter shader's `read_src`. GPUBUG-101.
|
||||||
|
fn src_operand(src_byte: u8, is_temp: bool) -> String {
|
||||||
|
if is_temp {
|
||||||
|
format!("r[{}u]", (src_byte & 0x3F) as u32)
|
||||||
|
} else {
|
||||||
|
format!("xenos_consts.alu[{}u]", src_byte as u32)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option<String> {
|
fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option<String> {
|
||||||
let s = match op {
|
let s = match op {
|
||||||
vop::ADD => format!("({a} + {b})"),
|
vop::ADD => format!("({a} + {b})"),
|
||||||
@@ -459,6 +479,9 @@ mod tests {
|
|||||||
// Single Exec clause: ALU add r0 = r0 + r0; scalar_op = RETAIN_PREV
|
// Single Exec clause: ALU add r0 = r0 + r0; scalar_op = RETAIN_PREV
|
||||||
// with full write-mask on vector, zero on scalar. Alloc(Position)
|
// with full write-mask on vector, zero on scalar. Alloc(Position)
|
||||||
// precedes so the ALU's export (if it were one) would target oPos.
|
// precedes so the ALU's export (if it were one) would target oPos.
|
||||||
|
// Word-0 bits 29-31 set so all three operands resolve as temps —
|
||||||
|
// matches the prior assertion `r[0u] = (r[0u] + r[0u])`.
|
||||||
|
let w0 = (1u32 << 29) | (1u32 << 30) | (1u32 << 31);
|
||||||
let w2 = (vop::ADD as u32)
|
let w2 = (vop::ADD as u32)
|
||||||
| ((sop::RETAIN_PREV as u32) << 6)
|
| ((sop::RETAIN_PREV as u32) << 6)
|
||||||
| (0xF << 12) // vector_write_mask
|
| (0xF << 12) // vector_write_mask
|
||||||
@@ -478,7 +501,7 @@ mod tests {
|
|||||||
predicate_condition: false,
|
predicate_condition: false,
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
instructions: vec![0, 0, w2],
|
instructions: vec![w0, 0, w2],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -518,6 +541,69 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn src_operand_decodes_temp_vs_constant() {
|
||||||
|
// GPUBUG-101: is_temp=true → r[low6]; is_temp=false → xenos_consts.alu[full].
|
||||||
|
assert_eq!(src_operand(0x00, true), "r[0u]");
|
||||||
|
assert_eq!(src_operand(0x05, true), "r[5u]");
|
||||||
|
assert_eq!(src_operand(0x3F, true), "r[63u]");
|
||||||
|
// For temps, bits 6/7 are reserved (abs/rel) — they don't widen
|
||||||
|
// the register index even if set. Phase D2 will consume them.
|
||||||
|
assert_eq!(src_operand(0x80, true), "r[0u]");
|
||||||
|
assert_eq!(src_operand(0xFF, true), "r[63u]");
|
||||||
|
// Constants: full 8-bit index.
|
||||||
|
assert_eq!(src_operand(0x00, false), "xenos_consts.alu[0u]");
|
||||||
|
assert_eq!(src_operand(0x05, false), "xenos_consts.alu[5u]");
|
||||||
|
assert_eq!(src_operand(0xFF, false), "xenos_consts.alu[255u]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn shader_using_c0_emits_xenos_consts_read() {
|
||||||
|
// ALU: r0 = c0 + r0. src_a (low byte) is constant index 0;
|
||||||
|
// src_b (next byte) is temp index 0. src_a_is_temp=false →
|
||||||
|
// canary `src3_sel` bit at w0 bit 29 = 0; src_b_is_temp=true →
|
||||||
|
// bit 30 = 1. (src_c left as 0 with bit 31 clear → constant c0; unused by ADD.)
|
||||||
|
let w0 = 0x00u32 // src_a = c0
|
||||||
|
| (0x00u32 << 8) // src_b = r0
|
||||||
|
| (0x00u32 << 16) // src_c
|
||||||
|
| (0u32 << 29) // src_a_is_temp = false (constant)
|
||||||
|
| (1u32 << 30); // src_b_is_temp = true (register)
|
||||||
|
let w2 = (vop::ADD as u32)
|
||||||
|
| ((sop::RETAIN_PREV as u32) << 6)
|
||||||
|
| (0xF << 12)
|
||||||
|
| (0u32 << 16);
|
||||||
|
let shader = ParsedShader {
|
||||||
|
cf: vec![
|
||||||
|
ControlFlowInstruction::Alloc {
|
||||||
|
size: 1,
|
||||||
|
kind: AllocKind::Position,
|
||||||
|
},
|
||||||
|
ControlFlowInstruction::Exec {
|
||||||
|
address: 0,
|
||||||
|
count: 1,
|
||||||
|
sequence: 0,
|
||||||
|
is_end: true,
|
||||||
|
predicated: false,
|
||||||
|
predicate_condition: false,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
instructions: vec![w0, 0, w2],
|
||||||
|
};
|
||||||
|
match translate(&shader, Stage::Vertex) {
|
||||||
|
Translation::Ok(body) => {
|
||||||
|
assert!(
|
||||||
|
body.contains("xenos_consts.alu[0u]"),
|
||||||
|
"expected c0 operand, got: {body}"
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
body.contains("r[0u]"),
|
||||||
|
"expected r0 temp operand, got: {body}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Translation::Reject(r) => panic!("rejected: {r}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn loop_clause_rejected() {
|
fn loop_clause_rejected() {
|
||||||
let shader = ParsedShader {
|
let shader = ParsedShader {
|
||||||
|
|||||||
@@ -29,10 +29,22 @@ pub struct AluInstruction {
|
|||||||
pub vector_dest_is_export: bool,
|
pub vector_dest_is_export: bool,
|
||||||
/// Selects `ps` (previous scalar result) as the scalar operand when set.
|
/// Selects `ps` (previous scalar result) as the scalar operand when set.
|
||||||
pub scalar_src_is_ps: bool,
|
pub scalar_src_is_ps: bool,
|
||||||
/// Source register indices (at most 3 for vector ops).
|
/// Source register indices (at most 3 for vector ops). The src bytes
|
||||||
|
/// are the canary `srcN_reg` fields (8 bits each); for **temp-typed**
|
||||||
|
/// operands (see `src_a_is_temp` etc.), bit 7 of the byte is the abs
|
||||||
|
/// flag and bit 6 is the loop-relative flag — bits 5:0 give the temp
|
||||||
|
/// index. For **constant-typed** operands the full byte is the
|
||||||
|
/// constant index.
|
||||||
pub src_a: u8,
|
pub src_a: u8,
|
||||||
pub src_b: u8,
|
pub src_b: u8,
|
||||||
pub src_c: u8,
|
pub src_c: u8,
|
||||||
|
/// Per-operand "is temporary" flag — when true, the corresponding
|
||||||
|
/// `src_X` byte indexes a general register (r#); when false, it
|
||||||
|
/// indexes an ALU constant (c#). Decoded from word-0 bits 29-31
|
||||||
|
/// (canary's `src3_sel`/`src2_sel`/`src1_sel`). GPUBUG-101.
|
||||||
|
pub src_a_is_temp: bool,
|
||||||
|
pub src_b_is_temp: bool,
|
||||||
|
pub src_c_is_temp: bool,
|
||||||
/// Set when the instruction is predicated; skipped if the predicate
|
/// Set when the instruction is predicated; skipped if the predicate
|
||||||
/// doesn't match `predicate_condition`.
|
/// doesn't match `predicate_condition`.
|
||||||
pub predicated: bool,
|
pub predicated: bool,
|
||||||
@@ -59,6 +71,13 @@ pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
|
|||||||
src_a: (w0 & 0xFF) as u8,
|
src_a: (w0 & 0xFF) as u8,
|
||||||
src_b: ((w0 >> 8) & 0xFF) as u8,
|
src_b: ((w0 >> 8) & 0xFF) as u8,
|
||||||
src_c: ((w0 >> 16) & 0xFF) as u8,
|
src_c: ((w0 >> 16) & 0xFF) as u8,
|
||||||
|
// Word-0 bits 29-31 are the per-operand temp-vs-constant
|
||||||
|
// selector (canary `src3_sel`/`src2_sel`/`src1_sel`,
|
||||||
|
// ucode.h:2078-2086). Our `src_a` is canary's third operand
|
||||||
|
// (low byte of w0), so its selector is bit 29.
|
||||||
|
src_a_is_temp: ((w0 >> 29) & 1) != 0,
|
||||||
|
src_b_is_temp: ((w0 >> 30) & 1) != 0,
|
||||||
|
src_c_is_temp: ((w0 >> 31) & 1) != 0,
|
||||||
predicated: ((w0 >> 27) & 1) != 0,
|
predicated: ((w0 >> 27) & 1) != 0,
|
||||||
predicate_condition: ((w0 >> 28) & 1) != 0,
|
predicate_condition: ((w0 >> 28) & 1) != 0,
|
||||||
raw: words,
|
raw: words,
|
||||||
|
|||||||
Reference in New Issue
Block a user