Files
xenia-rs/crates/xenia-gpu/src/ucode/alu.rs
MechaCat02 78ea81c12a fix(gpu): GPUBUG-101 — decode src1/2/3_sel temp-vs-constant selector
Per canary AluInstruction layout (xenia-canary/src/xenia/gpu/ucode.h:
2078-2086), word-0 bits 29-31 are the per-operand `srcN_sel` flags
selecting temp register (1) vs ALU constant (0); the corresponding
8-bit src byte indexes either:
  - a temp register (bits 5:0 = index, bits 6/7 reserved for
    relative-addressing / abs flags consumed by Phase D2), or
  - an ALU constant (full 8-bit index).

Pre-fix, the WGSL interpreter and AOT translator both masked `& 0x7F`
on the src byte and emitted `r[low7]` regardless of the operand class.
Every shader's WVP matrix / light constant / per-frame uniform read
came back as r[low7] — typically zero — yielding invisible rendering.

Mechanical changes:
- crates/xenia-gpu/src/ucode/alu.rs: decode src_a_is_temp /
  src_b_is_temp / src_c_is_temp from w0 bits 29/30/31. Note that our
  src_a (low byte of w0) is canary's third operand, hence its selector
  is bit 29 (canary src3_sel), not bit 31.
- crates/xenia-gpu/src/shaders/xenos_interp.wgsl: `read_src` now takes
  the is_temp flag; constants index xenos_consts.alu directly.
- crates/xenia-gpu/src/translator.rs: `src_operand` mirrors the
  interpreter — `r[idx]` when temp, `xenos_consts.alu[idx]` when
  constant.

The trivial-shader synthetic test was updated to set the temp flags so
its `r[0u] = (r[0u] + r[0u])` assertion remains valid; without the
flags set, all sources would now resolve as constants.

Bank selection (cf-level relative addressing for the higher banks of
the 512 ALU constants) remains a Phase G+ extension; this change covers
c0..c127 in bank 0, which most Sylpheed shaders use directly.

Verification at -n 100M lockstep:
  swaps:                2 → 2     (unchanged — gated by D2/D3/E for draws)
  draws:                0 → 0
  packets:              ~61M (within noise)
Tests: 552 → 554 (+2 translator tests for the temp/constant decode).

Closes GPUBUG-101 (P0).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 14:10:11 +02:00

226 lines
8.0 KiB
Rust

//! Xenos ALU (vector + scalar) instruction decoder.
//!
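//! An ALU instruction is 96 bits = 3 dwords. As read by `decode_alu`:
//! - word0: source register bytes, `ps` / predicate flags, and the
//!   per-operand temp-vs-constant selectors
//! - word1: not interpreted by this decoder; kept verbatim in `raw`
//!   (NOTE(review): presumably the swizzle / modifier fields — confirm)
//! - word2: opcodes, write masks, destination registers, and the export flag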
//!
//! See `ucode.h:900-1400` for the full field map. This decoder captures the
//! minimal shape the uber-shader needs; flags we don't interpret yet are
//! retained as raw bits in `raw` for downstream inspection.
/// Decoded ALU instruction.
///
/// Plain `Copy` record produced by `decode_alu`. Every field is a
/// shift-and-mask extraction from word0 or word2 of the instruction; the
/// three dwords themselves are kept unchanged in `raw`. Bit positions
/// quoted below are the ones `decode_alu` actually reads.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct AluInstruction {
/// Vector ALU opcode: the low 6 bits of word2 (bits 0..=5).
pub vector_opcode: u8,
/// Scalar ALU opcode: 6 bits of word2 starting at bit 6 (bits 6..=11).
pub scalar_opcode: u8,
/// Destination register index for vector result (7 bits, word2 bits
/// 16..=22).
pub vector_dest: u8,
/// Destination register index for scalar result (7 bits, word2 bits
/// 24..=30).
pub scalar_dest: u8,
/// 4-bit write mask for the vector result (x/y/z/w); word2 bits 12..=15.
pub vector_write_mask: u8,
/// 4-bit write mask for the scalar result; word2 bits 8..=11.
/// NOTE(review): these bits overlap the range read for `scalar_opcode`
/// (6..=11), so the two fields are not independent — confirm the
/// intended layout.
pub scalar_write_mask: u8,
/// Set when the instruction should write to the export bank (position,
/// interpolators, color, etc.) instead of the general register file.
/// Read from word2 bit 23.
pub vector_dest_is_export: bool,
/// Selects `ps` (previous scalar result) as the scalar operand when set.
/// Read from word0 bit 26.
pub scalar_src_is_ps: bool,
/// Source register indices (at most 3 for vector ops). The src bytes
/// are the canary `srcN_reg` fields (8 bits each); for **temp-typed**
/// operands (see `src_a_is_temp` etc.), bit 7 of the byte is the abs
/// flag and bit 6 is the loop-relative flag — bits 5:0 give the temp
/// index. For **constant-typed** operands the full byte is the
/// constant index.
///
/// `src_a` is word0 bits 0..=7. The decoder stores the byte unmasked, so
/// callers must apply the temp/constant interpretation themselves.
pub src_a: u8,
/// Second source byte: word0 bits 8..=15. Same interpretation as `src_a`.
pub src_b: u8,
/// Third source byte: word0 bits 16..=23. Same interpretation as `src_a`.
pub src_c: u8,
/// Per-operand "is temporary" flag — when true, the corresponding
/// `src_X` byte indexes a general register (r#); when false, it
/// indexes an ALU constant (c#). Decoded from word-0 bits 29-31
/// (canary's `src3_sel`/`src2_sel`/`src1_sel`). GPUBUG-101.
///
/// `src_a_is_temp` is word0 bit 29.
pub src_a_is_temp: bool,
/// Selector for `src_b`: word0 bit 30.
pub src_b_is_temp: bool,
/// Selector for `src_c`: word0 bit 31.
pub src_c_is_temp: bool,
/// Set when the instruction is predicated; skipped if the predicate
/// doesn't match `predicate_condition`. Read from word0 bit 27.
pub predicated: bool,
/// Predicate value the instruction is conditioned on; word0 bit 28.
pub predicate_condition: bool,
/// Raw dwords — preserved verbatim so the translator / interpreter can
/// reach into fields we haven't parsed explicitly yet.
pub raw: [u32; 3],
}
/// Decode a 3-dword ALU triple into an [`AluInstruction`].
///
/// Every field is a plain shift-and-mask of `words[0]` or `words[2]`;
/// `words[1]` is not interpreted here. All three dwords are copied
/// unchanged into `raw`, and any bit pattern decodes — the function has no
/// failure path.
///
/// NOTE(review): `scalar_opcode` (word2 bits 6..=11) and
/// `scalar_write_mask` (word2 bits 8..=11) read overlapping bits, so the two
/// fields are not independent. Confirm against the intended encoding before
/// relying on `scalar_write_mask`.
pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
    // Multi-bit field: shift down, then mask. Every mask used below is at
    // most 0xFF, so the narrowing cast never drops set bits.
    fn field(word: u32, shift: u32, mask: u32) -> u8 {
        ((word >> shift) & mask) as u8
    }

    // Single-bit flag at position `bit`.
    fn flag(word: u32, bit: u32) -> bool {
        (word >> bit) & 1 != 0
    }

    // The middle dword is only carried through in `raw`.
    let [src_word, _, op_word] = words;

    AluInstruction {
        // --- fields taken from the third dword ---
        vector_opcode: field(op_word, 0, 0x3F),
        scalar_opcode: field(op_word, 6, 0x3F),
        vector_dest: field(op_word, 16, 0x7F),
        scalar_dest: field(op_word, 24, 0x7F),
        vector_write_mask: field(op_word, 12, 0xF),
        scalar_write_mask: field(op_word, 8, 0xF),
        vector_dest_is_export: flag(op_word, 23),

        // --- fields taken from the first dword ---
        scalar_src_is_ps: flag(src_word, 26),
        src_a: field(src_word, 0, 0xFF),
        src_b: field(src_word, 8, 0xFF),
        src_c: field(src_word, 16, 0xFF),
        // Bits 29-31 are the per-operand temp-vs-constant selectors
        // (canary `src3_sel`/`src2_sel`/`src1_sel`, ucode.h:2078-2086).
        // `src_a` lives in the low byte, which is canary's third operand,
        // hence it pairs with bit 29 rather than bit 31.
        src_a_is_temp: flag(src_word, 29),
        src_b_is_temp: flag(src_word, 30),
        src_c_is_temp: flag(src_word, 31),
        predicated: flag(src_word, 27),
        predicate_condition: flag(src_word, 28),

        raw: words,
    }
}
/// Named vector ALU opcode values.
///
/// The numbers mirror canary's `AluVectorOpcode` enum (`ucode.h:1354`) and
/// run contiguously from 0 through 29. Compare them against
/// `AluInstruction::vector_opcode`.
pub mod vop {
    // 0..=3
    pub const ADD: u8 = 0;
    pub const MUL: u8 = 1;
    pub const MAX: u8 = 2;
    pub const MIN: u8 = 3;

    // 4..=7 — `S{EQ,GT,GE,NE}` family
    pub const SEQ: u8 = 4;
    pub const SGT: u8 = 5;
    pub const SGE: u8 = 6;
    pub const SNE: u8 = 7;

    // 8..=11
    pub const FRC: u8 = 8;
    pub const TRUNC: u8 = 9;
    pub const FLOOR: u8 = 10;
    pub const MAD: u8 = 11;

    // 12..=14 — `CND_*` family
    pub const CND_EQ: u8 = 12;
    pub const CND_GE: u8 = 13;
    pub const CND_GT: u8 = 14;

    // 15..=17 — `DOT*` family
    pub const DOT4: u8 = 15;
    pub const DOT3: u8 = 16;
    pub const DOT2_ADD: u8 = 17;

    // 18..=19
    pub const CUBE: u8 = 18;
    pub const MAX4: u8 = 19;

    // 20..=23 — `SETP_*_PUSH` family
    pub const SETP_EQ_PUSH: u8 = 20;
    pub const SETP_NE_PUSH: u8 = 21;
    pub const SETP_GT_PUSH: u8 = 22;
    pub const SETP_GE_PUSH: u8 = 23;

    // 24..=27 — `KILL_*` family
    pub const KILL_EQ: u8 = 24;
    pub const KILL_GT: u8 = 25;
    pub const KILL_GE: u8 = 26;
    pub const KILL_NE: u8 = 27;

    // 28..=29
    pub const DST: u8 = 28;
    pub const MAX_A: u8 = 29;
}
/// Named scalar ALU opcode values.
///
/// The numbers mirror canary's `AluScalarOpcode` enum (`ucode.h:1001`).
/// Compare them against `AluInstruction::scalar_opcode`. The table spans
/// 0 through 50; value 41 has no named constant here (NOTE(review):
/// presumably unassigned / reserved upstream — confirm).
pub mod sop {
    // 0..=4 — `ADDS*` / `MULS*`
    pub const ADDS: u8 = 0;
    pub const ADDS_PREV: u8 = 1;
    pub const MULS: u8 = 2;
    pub const MULS_PREV: u8 = 3;
    pub const MULS_PREV2: u8 = 4;

    // 5..=6
    pub const MAXS: u8 = 5;
    pub const MINS: u8 = 6;

    // 7..=10 — `S{EQ,GT,GE,NE}S` family
    pub const SEQS: u8 = 7;
    pub const SGTS: u8 = 8;
    pub const SGES: u8 = 9;
    pub const SNES: u8 = 10;

    // 11..=13
    pub const FRCS: u8 = 11;
    pub const TRUNCS: u8 = 12;
    pub const FLOORS: u8 = 13;

    // 14..=16 — `EXP` / `LOG*`
    pub const EXP: u8 = 14;
    pub const LOGC: u8 = 15;
    pub const LOG: u8 = 16;

    // 17..=22 — `RCP*` / `RSQ*`
    pub const RCPC: u8 = 17;
    pub const RCPF: u8 = 18;
    pub const RCP: u8 = 19;
    pub const RSQC: u8 = 20;
    pub const RSQF: u8 = 21;
    pub const RSQ: u8 = 22;

    // 23..=24 — `MAXAS*`
    pub const MAXAS: u8 = 23;
    pub const MAXASF: u8 = 24;

    // 25..=26 — `SUBS*`
    pub const SUBS: u8 = 25;
    pub const SUBS_PREV: u8 = 26;

    // 27..=34 — `SETP_*` family
    pub const SETP_EQ: u8 = 27;
    pub const SETP_NE: u8 = 28;
    pub const SETP_GT: u8 = 29;
    pub const SETP_GE: u8 = 30;
    pub const SETP_INV: u8 = 31;
    pub const SETP_POP: u8 = 32;
    pub const SETP_CLR: u8 = 33;
    pub const SETP_RSTR: u8 = 34;

    // 35..=39 — `KILLS_*` family
    pub const KILLS_EQ: u8 = 35;
    pub const KILLS_GT: u8 = 36;
    pub const KILLS_GE: u8 = 37;
    pub const KILLS_NE: u8 = 38;
    pub const KILLS_ONE: u8 = 39;

    // 40 (41 is intentionally left without a name in this table)
    pub const SQRT: u8 = 40;

    // 42..=47 — `*SC0` / `*SC1` family
    pub const MULSC0: u8 = 42;
    pub const MULSC1: u8 = 43;
    pub const ADDSC0: u8 = 44;
    pub const ADDSC1: u8 = 45;
    pub const SUBSC0: u8 = 46;
    pub const SUBSC1: u8 = 47;

    // 48..=50
    pub const SIN: u8 = 48;
    pub const COS: u8 = 49;
    pub const RETAIN_PREV: u8 = 50;
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Regression: our table previously drifted from canary's values (e.g.
    /// `MAXS=6` when canary says 5, shifting everything through SQRT). Pin
    /// the most-often-used scalar + vector opcodes here.
    #[test]
    fn opcodes_match_canary_values() {
        // Scalar.
        assert_eq!(sop::MAXS, 5);
        assert_eq!(sop::MINS, 6);
        assert_eq!(sop::SEQS, 7);
        assert_eq!(sop::EXP, 14);
        assert_eq!(sop::LOG, 16);
        assert_eq!(sop::RCP, 19);
        assert_eq!(sop::RSQ, 22);
        assert_eq!(sop::SUBS, 25);
        assert_eq!(sop::SETP_EQ, 27);
        assert_eq!(sop::KILLS_EQ, 35);
        assert_eq!(sop::SQRT, 40);
        assert_eq!(sop::SIN, 48);
        assert_eq!(sop::RETAIN_PREV, 50);
        // Vector.
        assert_eq!(vop::SNE, 7);
        assert_eq!(vop::CND_EQ, 12);
        assert_eq!(vop::MAX4, 19);
        assert_eq!(vop::KILL_EQ, 24);
        assert_eq!(vop::DST, 28);
    }

    /// Opcode, destination and vector write-mask fields come out of word2.
    #[test]
    fn decode_extracts_opcodes_and_dests() {
        // Build a minimal ALU word:
        //   vector_opcode = ADD (0), scalar_opcode = RCP (19),
        //   vector_dest = 3, scalar_dest = 7, vector_write_mask = 0xF
        let w2 = (vop::ADD as u32)
            | ((sop::RCP as u32) << 6)
            | (0xF << 12) // vector_write_mask
            | (3u32 << 16) // vector_dest
            | (7u32 << 24); // scalar_dest
        let alu = decode_alu([0, 0, w2]);
        assert_eq!(alu.vector_opcode, vop::ADD);
        assert_eq!(alu.scalar_opcode, sop::RCP);
        assert_eq!(alu.vector_dest, 3);
        assert_eq!(alu.scalar_dest, 7);
        assert_eq!(alu.vector_write_mask, 0xF);
    }

    /// The three source bytes are the low three bytes of word0 and are
    /// stored unmasked — a constant index needs all 8 bits.
    #[test]
    fn decode_extracts_full_source_bytes() {
        let alu = decode_alu([0xAB | (0xCD << 8) | (0xEF << 16), 0, 0]);
        assert_eq!(alu.src_a, 0xAB);
        assert_eq!(alu.src_b, 0xCD);
        assert_eq!(alu.src_c, 0xEF);
        // No selector bit set: every operand is constant-typed.
        assert!(!alu.src_a_is_temp);
        assert!(!alu.src_b_is_temp);
        assert!(!alu.src_c_is_temp);
    }

    /// GPUBUG-101: word0 bits 29/30/31 select temp-vs-constant for
    /// `src_a`/`src_b`/`src_c` respectively (bit 29 pairs with the LOW
    /// byte). Each bit must map to exactly one operand.
    #[test]
    fn decode_maps_selector_bits_to_operands() {
        let only_a = decode_alu([1 << 29, 0, 0]);
        assert!(only_a.src_a_is_temp);
        assert!(!only_a.src_b_is_temp);
        assert!(!only_a.src_c_is_temp);

        let only_b = decode_alu([1 << 30, 0, 0]);
        assert!(!only_b.src_a_is_temp);
        assert!(only_b.src_b_is_temp);
        assert!(!only_b.src_c_is_temp);

        let only_c = decode_alu([1 << 31, 0, 0]);
        assert!(!only_c.src_a_is_temp);
        assert!(!only_c.src_b_is_temp);
        assert!(only_c.src_c_is_temp);

        // Selector bits must not bleed into the source bytes.
        let all = decode_alu([0xE000_0000, 0, 0]);
        assert_eq!(all.src_a, 0);
        assert_eq!(all.src_b, 0);
        assert_eq!(all.src_c, 0);
    }

    /// `ps` select, predication enable and predicate condition are word0
    /// bits 26, 27 and 28.
    #[test]
    fn decode_extracts_predicate_and_ps_flags() {
        let ps = decode_alu([1 << 26, 0, 0]);
        assert!(ps.scalar_src_is_ps);
        assert!(!ps.predicated);
        assert!(!ps.predicate_condition);

        let pred = decode_alu([1 << 27, 0, 0]);
        assert!(!pred.scalar_src_is_ps);
        assert!(pred.predicated);
        assert!(!pred.predicate_condition);

        let cond = decode_alu([1 << 28, 0, 0]);
        assert!(!cond.scalar_src_is_ps);
        assert!(!cond.predicated);
        assert!(cond.predicate_condition);
    }

    /// The export flag is word2 bit 23, directly above the 7-bit vector
    /// destination; neither may leak into the other.
    #[test]
    fn decode_separates_export_flag_from_vector_dest() {
        let plain = decode_alu([0, 0, 0x7F << 16]);
        assert_eq!(plain.vector_dest, 0x7F);
        assert!(!plain.vector_dest_is_export);

        let exported = decode_alu([0, 0, 1 << 23]);
        assert_eq!(exported.vector_dest, 0);
        assert!(exported.vector_dest_is_export);
    }

    /// `raw` carries the input verbatim, and word1 influences nothing else.
    #[test]
    fn decode_preserves_raw_and_ignores_word1() {
        assert_eq!(decode_alu([1, 2, 3]).raw, [1, 2, 3]);

        let a = decode_alu([0xDEAD_BEEF, 0, 0x1234_5678]);
        let mut b = decode_alu([0xDEAD_BEEF, 0xFFFF_FFFF, 0x1234_5678]);
        b.raw = a.raw;
        assert_eq!(a, b);
    }
}