diff --git a/crates/xenia-gpu/src/shaders/xenos_interp.wgsl b/crates/xenia-gpu/src/shaders/xenos_interp.wgsl index 8fcab3e..bdb93eb 100644 --- a/crates/xenia-gpu/src/shaders/xenos_interp.wgsl +++ b/crates/xenia-gpu/src/shaders/xenos_interp.wgsl @@ -233,11 +233,24 @@ const SOP_SIN: u32 = 48u; const SOP_COS: u32 = 49u; const SOP_RETAIN_PREV: u32 = 50u; -// Read a vec4 source from the register file. Treats the src index as a -// direct r# reference (ignores c# selector + swizzle/modifiers for MVP). -// M4+ will extend this to decode the full operand header. -fn read_src(idx: u32) -> vec4 { - return registers[idx & 0x7Fu]; +// Read a vec4 source. Per canary `xenia-canary/src/xenia/gpu/ucode.h` +// the temp-vs-constant selector lives in word-0 bits 29-31 +// (`srcN_sel`); the corresponding 8-bit src_byte is **either** a temp +// register descriptor (bit 7 = abs flag, bit 6 = relative, bits 5:0 = +// temp index) **or** a flat constant index (full byte). Pre-fix, the +// MVP masked `& 0x7F` and read `registers[low7]` regardless — every +// shader's WVP matrix / light constant / per-frame uniform read came +// back as r[low7] (typically zero → invisible rendering). GPUBUG-101. +fn read_src(src_byte: u32, is_temp: bool) -> vec4 { + if is_temp { + // Bits 5:0 of the byte give the temp index; bit 7 (abs) and + // bit 6 (relative) are handled in read_src_full when modifiers + // land in Phase D2. + return registers[src_byte & 0x3Fu]; + } + // Constant index — full byte (covers c0..c127 in bank 0; higher + // banks via cf-level relative addressing land in a later phase). + return xenos_consts.alu[src_byte & 0xFFu]; } fn exec_vector_op(op: u32, a: vec4, b: vec4, c: vec4) -> vec4 { @@ -520,11 +533,17 @@ fn interpret_alu(t: u32, is_vertex: bool) { let src_a = w0 & 0xFFu; let src_b = (w0 >> 8u) & 0xFFu; let src_c = (w0 >> 16u) & 0xFFu; + // GPUBUG-101: word-0 bits 29-31 are the per-operand temp-vs-constant + // selectors (canary `srcN_sel`, ucode.h:2078-2086). 
`src_a` is + // canary's third operand (low byte), so its selector is bit 29. + let src_a_is_temp = ((w0 >> 29u) & 1u) != 0u; + let src_b_is_temp = ((w0 >> 30u) & 1u) != 0u; + let src_c_is_temp = ((w0 >> 31u) & 1u) != 0u; let predicated = ((w0 >> 27u) & 1u) != 0u; let predicate_condition = ((w0 >> 28u) & 1u) != 0u; let scalar_src_is_ps = ((w0 >> 26u) & 1u) != 0u; - // `w1` holds per-operand swizzle + negate/abs/c-vs-r flags. The MVP - // treats every source as a full r#, no modifiers — M4+ decodes it. + // `w1` holds per-operand swizzle + negate/abs flags. Phase D2 decodes + // them; Phase D1 only resolved the temp/constant selector. _ = w1; // Honor per-instruction predicate: skip when predicated and the @@ -534,9 +553,9 @@ fn interpret_alu(t: u32, is_vertex: bool) { } // Vector pipe. - let a = read_src(src_a); - let b = read_src(src_b); - let c = read_src(src_c); + let a = read_src(src_a, src_a_is_temp); + let b = read_src(src_b, src_b_is_temp); + let c = read_src(src_c, src_c_is_temp); let vec_result = exec_vector_op(vec_op, a, b, c); if vec_wm != 0u { write_reg_masked(vec_dst, vec_wm, vec_result); diff --git a/crates/xenia-gpu/src/translator.rs b/crates/xenia-gpu/src/translator.rs index 9a8d8c1..0261a93 100644 --- a/crates/xenia-gpu/src/translator.rs +++ b/crates/xenia-gpu/src/translator.rs @@ -287,9 +287,15 @@ impl EmitCtx { alu: &AluInstruction, current_alloc: AllocKind, ) -> Result<(), &'static str> { - let a = format!("r[{}u]", alu.src_a & 0x7F); - let b = format!("r[{}u]", alu.src_b & 0x7F); - let c = format!("r[{}u]", alu.src_c & 0x7F); + // GPUBUG-101: word-0 bits 29-31 select temp vs constant for + // each operand (canary `srcN_sel`); the corresponding src byte + // indexes either a general register (temp) or an ALU constant + // (c#). Pre-fix the translator unconditionally emitted r[low7] + // for both, dropping every shader's uniform read. Mirrors the + // interpreter's `read_src`. 
+ let a = src_operand(alu.src_a, alu.src_a_is_temp); + let b = src_operand(alu.src_b, alu.src_b_is_temp); + let c = src_operand(alu.src_c, alu.src_c_is_temp); // Vector pipe. if alu.vector_write_mask != 0 { @@ -403,6 +409,20 @@ impl EmitCtx { } } +/// Emit the WGSL expression that reads an ALU source operand. Per +/// canary `xenia-canary/src/xenia/gpu/ucode.h`, the temp-vs-constant +/// selector is in word-0 bits 29-31 (`srcN_sel`); the src byte is then +/// either a temp descriptor (bits 5:0 = temp index, bits 6/7 reserved +/// for relative/abs flags decoded in Phase D2) or a flat constant +/// index. Mirrors the interpreter shader's `read_src`. GPUBUG-101. +fn src_operand(src_byte: u8, is_temp: bool) -> String { + if is_temp { + format!("r[{}u]", (src_byte & 0x3F) as u32) + } else { + format!("xenos_consts.alu[{}u]", src_byte as u32) + } +} + fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option { let s = match op { vop::ADD => format!("({a} + {b})"), @@ -459,6 +479,9 @@ mod tests { // Single Exec clause: ALU add r0 = r0 + r0; scalar_op = RETAIN_PREV // with full write-mask on vector, zero on scalar. Alloc(Position) // precedes so the ALU's export (if it were one) would target oPos. + // Word-0 bits 29-31 set so all three operands resolve as temps — + // matches the prior assertion `r[0u] = (r[0u] + r[0u])`. + let w0 = (1u32 << 29) | (1u32 << 30) | (1u32 << 31); let w2 = (vop::ADD as u32) | ((sop::RETAIN_PREV as u32) << 6) | (0xF << 12) // vector_write_mask @@ -478,7 +501,7 @@ mod tests { predicate_condition: false, }, ], - instructions: vec![0, 0, w2], + instructions: vec![w0, 0, w2], } } @@ -518,6 +541,69 @@ mod tests { } } + #[test] + fn src_operand_decodes_temp_vs_constant() { + // GPUBUG-101: is_temp=true → r[low6]; is_temp=false → xenos_consts.alu[full]. 
+ assert_eq!(src_operand(0x00, true), "r[0u]"); + assert_eq!(src_operand(0x05, true), "r[5u]"); + assert_eq!(src_operand(0x3F, true), "r[63u]"); + // For temps, bits 6/7 are reserved (rel/abs) — they don't widen + // the register index even if set. Phase D2 will consume them. + assert_eq!(src_operand(0x80, true), "r[0u]"); + assert_eq!(src_operand(0xFF, true), "r[63u]"); + // Constants: full 8-bit index. + assert_eq!(src_operand(0x00, false), "xenos_consts.alu[0u]"); + assert_eq!(src_operand(0x05, false), "xenos_consts.alu[5u]"); + assert_eq!(src_operand(0xFF, false), "xenos_consts.alu[255u]"); + } + + #[test] + fn shader_using_c0_emits_xenos_consts_read() { + // ALU: r0 = c0 + r0. src_a (low byte) is constant index 0; + // src_b (next byte) is temp index 0. src_a_is_temp=false → + // src1_sel-style bit at w0 bit 29 = 0; src_b_is_temp=true → + // bit 30 = 1. (src_c: byte 0, bit 31 clear → c0; unused by ADD.) + let w0 = 0x00u32 // src_a = c0 + | (0x00u32 << 8) // src_b = r0 + | (0x00u32 << 16) // src_c + | (0u32 << 29) // src_a_is_temp = false (constant) + | (1u32 << 30); // src_b_is_temp = true (register) + let w2 = (vop::ADD as u32) + | ((sop::RETAIN_PREV as u32) << 6) + | (0xF << 12) + | (0u32 << 16); + let shader = ParsedShader { + cf: vec![ + ControlFlowInstruction::Alloc { + size: 1, + kind: AllocKind::Position, + }, + ControlFlowInstruction::Exec { + address: 0, + count: 1, + sequence: 0, + is_end: true, + predicated: false, + predicate_condition: false, + }, + ], + instructions: vec![w0, 0, w2], + }; + match translate(&shader, Stage::Vertex) { + Translation::Ok(body) => { + assert!( + body.contains("xenos_consts.alu[0u]"), + "expected c0 operand, got: {body}" + ); + assert!( + body.contains("r[0u]"), + "expected r0 temp operand, got: {body}" + ); + } + Translation::Reject(r) => panic!("rejected: {r}"), + } + } + #[test] fn loop_clause_rejected() { let shader = ParsedShader { diff --git a/crates/xenia-gpu/src/ucode/alu.rs b/crates/xenia-gpu/src/ucode/alu.rs index 
4130a29..a4506c9 100644 --- a/crates/xenia-gpu/src/ucode/alu.rs +++ b/crates/xenia-gpu/src/ucode/alu.rs @@ -29,10 +29,22 @@ pub struct AluInstruction { pub vector_dest_is_export: bool, /// Selects `ps` (previous scalar result) as the scalar operand when set. pub scalar_src_is_ps: bool, - /// Source register indices (at most 3 for vector ops). + /// Source register indices (at most 3 for vector ops). The src bytes + /// are the canary `srcN_reg` fields (8 bits each); for **temp-typed** + /// operands (see `src_a_is_temp` etc.), bit 7 of the byte is the abs + /// flag and bit 6 is the loop-relative flag — bits 5:0 give the temp + /// index. For **constant-typed** operands the full byte is the + /// constant index. pub src_a: u8, pub src_b: u8, pub src_c: u8, + /// Per-operand "is temporary" flag — when true, the corresponding + /// `src_X` byte indexes a general register (r#); when false, it + /// indexes an ALU constant (c#). Decoded from word-0 bits 29-31 + /// (canary's `src3_sel`/`src2_sel`/`src1_sel`). GPUBUG-101. + pub src_a_is_temp: bool, + pub src_b_is_temp: bool, + pub src_c_is_temp: bool, /// Set when the instruction is predicated; skipped if the predicate /// doesn't match `predicate_condition`. pub predicated: bool, @@ -59,6 +71,13 @@ pub fn decode_alu(words: [u32; 3]) -> AluInstruction { src_a: (w0 & 0xFF) as u8, src_b: ((w0 >> 8) & 0xFF) as u8, src_c: ((w0 >> 16) & 0xFF) as u8, + // Word-0 bits 29-31 are the per-operand temp-vs-constant + // selector (canary `src3_sel`/`src2_sel`/`src1_sel`, + // ucode.h:2078-2086). Our `src_a` is canary's third operand + // (low byte of w0), so its selector is bit 29. + src_a_is_temp: ((w0 >> 29) & 1) != 0, + src_b_is_temp: ((w0 >> 30) & 1) != 0, + src_c_is_temp: ((w0 >> 31) & 1) != 0, predicated: ((w0 >> 27) & 1) != 0, predicate_condition: ((w0 >> 28) & 1) != 0, raw: words,