xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)

First real GPU implementation. Ring/PM4 frontend (ring_view,
ring_drain, pm4) drains the command processor; gpu_system owns the
threaded backend (DrainFence RPC + parker/fence helpers from M1) and
the MMIO-mapped register block (mmio_region).

Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode
the Xbox 360 microcode, translator.rs lowers it onto the WGSL
xenos_interp interpreter shader (shaders/xenos_interp.wgsl).
shader_metrics.rs counts decode/translate work.

Render state: draw_state, primitive, render_target_cache,
texture_cache, tiled_address (Xenos's swizzled tiled-memory layout),
xenos_constants (register field constants), edram (the 10 MiB EDRAM
model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve
plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs
owns the typed GPU-resource handles the kernel hands out.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-01 16:29:38 +02:00
parent 5f0d6487ea
commit 79eb52c378
24 changed files with 10984 additions and 18 deletions

View File

@@ -0,0 +1,206 @@
//! Xenos ALU (vector + scalar) instruction decoder.
//!
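//! An ALU instruction is 96 bits = 3 dwords. The three dwords encode:
//! - word0: operand modifier flags + destination info
//! - word1: source register / swizzle fields
//! - word2: opcode + write mask + export target
//!
//! NOTE(review): `decode_alu` in this file does not follow that summary. It
//! reads the source register indices, the `ps` select and the predication
//! flags from word0, reads both opcodes, both write masks, both destinations
//! and the export flag from word2, and does not decode word1 at all. Confirm
//! which of the two descriptions is the intended layout.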
//!
//! See `ucode.h:900-1400` for the full field map. This decoder captures the
//! minimal shape the uber-shader needs; flags we don't interpret yet are
//! retained as raw bits in `raw` for downstream inspection.
/// Decoded view of one 96-bit (3-dword) Xenos ALU instruction.
///
/// Values of this type are produced by `decode_alu`. The bit positions quoted
/// on each field are the ones `decode_alu` actually extracts, written as
/// inclusive ranges into `raw[0]` / `raw[2]`. `raw[1]` is not decoded into any
/// named field.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct AluInstruction {
/// Vector ALU opcode: 6 bits, `raw[2]` bits 0..=5. Compare against the
/// constants in `vop`.
pub vector_opcode: u8,
/// Scalar ALU opcode: 6 bits, `raw[2]` bits 6..=11. Compare against the
/// constants in `sop`.
pub scalar_opcode: u8,
/// Destination register index for vector result (7 bits, `raw[2]` bits
/// 16..=22).
pub vector_dest: u8,
/// Destination register index for scalar result (7 bits, `raw[2]` bits
/// 24..=30).
pub scalar_dest: u8,
/// 4-bit write mask for the vector result (x/y/z/w), `raw[2]` bits 12..=15.
pub vector_write_mask: u8,
/// 4-bit write mask for the scalar result, `raw[2]` bits 8..=11.
///
/// NOTE(review): this range lies inside the bits read for `scalar_opcode`
/// (6..=11), so the two fields are not independent of each other as
/// decoded today — confirm the intended bit positions.
pub scalar_write_mask: u8,
/// Set when the instruction should write to the export bank (position,
/// interpolators, color, etc.) instead of the general register file.
/// Decoded from `raw[2]` bit 23.
pub vector_dest_is_export: bool,
/// Selects `ps` (previous scalar result) as the scalar operand when set.
/// Decoded from `raw[0]` bit 26.
pub scalar_src_is_ps: bool,
/// Source register indices (at most 3 for vector ops).
/// First source: `raw[0]` bits 0..=7.
pub src_a: u8,
/// Second source: `raw[0]` bits 8..=15.
pub src_b: u8,
/// Third source: `raw[0]` bits 16..=23.
pub src_c: u8,
/// Set when the instruction is predicated; skipped if the predicate
/// doesn't match `predicate_condition`. Decoded from `raw[0]` bit 27.
pub predicated: bool,
/// Predicate value the instruction is conditioned on; only meaningful when
/// `predicated` is set. Decoded from `raw[0]` bit 28.
pub predicate_condition: bool,
/// Raw dwords — preserved verbatim so the translator / interpreter can
/// reach into fields we haven't parsed explicitly yet.
pub raw: [u32; 3],
}
/// Decode one 3-dword ALU instruction into its named fields.
///
/// `words[0]` supplies the three source register indices plus the `ps`-select
/// and predication flags; `words[2]` supplies both opcodes, both write masks,
/// both destination indices and the export flag. `words[1]` is not
/// interpreted here. All three dwords are copied unchanged into `raw`.
///
/// Decoding is infallible: every bit pattern yields a value, and opcode
/// numbers are not validated against the `vop` / `sop` tables.
///
/// NOTE(review): the scalar write mask is read from bits 8..=11 of
/// `words[2]`, which are also the upper four bits of the scalar opcode
/// (bits 6..=11). Confirm the intended positions before relying on
/// `scalar_write_mask`.
pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
    // Extract `width` bits of `word` starting at bit `lsb`. Every field read
    // below is at most 8 bits wide, so narrowing to `u8` never loses data.
    fn field(word: u32, lsb: u32, width: u32) -> u8 {
        ((word >> lsb) & ((1u32 << width) - 1)) as u8
    }

    // Test a single bit of `word`.
    fn flag(word: u32, bit: u32) -> bool {
        word & (1u32 << bit) != 0
    }

    // The middle dword is carried only inside `raw`.
    let [operands, _, control] = words;

    AluInstruction {
        // Opcodes, masks and destinations come from the third dword.
        vector_opcode: field(control, 0, 6),
        scalar_opcode: field(control, 6, 6),
        vector_dest: field(control, 16, 7),
        scalar_dest: field(control, 24, 7),
        vector_write_mask: field(control, 12, 4),
        scalar_write_mask: field(control, 8, 4),
        vector_dest_is_export: flag(control, 23),
        // Sources and predication come from the first dword.
        scalar_src_is_ps: flag(operands, 26),
        src_a: field(operands, 0, 8),
        src_b: field(operands, 8, 8),
        src_c: field(operands, 16, 8),
        predicated: flag(operands, 27),
        predicate_condition: flag(operands, 28),
        raw: words,
    }
}
/// Vector ALU opcodes we reference by name. Values match canary's
/// `AluVectorOpcode` enum in `ucode.h:1354`.
///
/// Each constant is a raw opcode number to compare against
/// `AluInstruction::vector_opcode`. The table is contiguous from 0 through
/// 29, so every value fits in the 6-bit field `decode_alu` extracts.
/// A subset of these values is pinned by the `opcodes_match_canary_values`
/// test in this file; keep the two in sync when editing.
pub mod vop {
pub const ADD: u8 = 0;
pub const MUL: u8 = 1;
pub const MAX: u8 = 2;
pub const MIN: u8 = 3;
pub const SEQ: u8 = 4;
pub const SGT: u8 = 5;
pub const SGE: u8 = 6;
pub const SNE: u8 = 7;
pub const FRC: u8 = 8;
pub const TRUNC: u8 = 9;
pub const FLOOR: u8 = 10;
pub const MAD: u8 = 11;
pub const CND_EQ: u8 = 12;
pub const CND_GE: u8 = 13;
pub const CND_GT: u8 = 14;
pub const DOT4: u8 = 15;
pub const DOT3: u8 = 16;
pub const DOT2_ADD: u8 = 17;
pub const CUBE: u8 = 18;
pub const MAX4: u8 = 19;
pub const SETP_EQ_PUSH: u8 = 20;
pub const SETP_NE_PUSH: u8 = 21;
pub const SETP_GT_PUSH: u8 = 22;
pub const SETP_GE_PUSH: u8 = 23;
pub const KILL_EQ: u8 = 24;
pub const KILL_GT: u8 = 25;
pub const KILL_GE: u8 = 26;
pub const KILL_NE: u8 = 27;
pub const DST: u8 = 28;
// Highest vector opcode named in this table.
pub const MAX_A: u8 = 29;
}
/// Scalar ALU opcodes. Values match canary's `AluScalarOpcode` enum in
/// `ucode.h:1001`.
///
/// Each constant is a raw opcode number to compare against
/// `AluInstruction::scalar_opcode`. Named values run from 0 through 50 with a
/// single hole at 41, so every value fits in the 6-bit field `decode_alu`
/// extracts. A subset of these values is pinned by the
/// `opcodes_match_canary_values` test in this file; keep the two in sync when
/// editing.
pub mod sop {
pub const ADDS: u8 = 0;
pub const ADDS_PREV: u8 = 1;
pub const MULS: u8 = 2;
pub const MULS_PREV: u8 = 3;
pub const MULS_PREV2: u8 = 4;
pub const MAXS: u8 = 5;
pub const MINS: u8 = 6;
pub const SEQS: u8 = 7;
pub const SGTS: u8 = 8;
pub const SGES: u8 = 9;
pub const SNES: u8 = 10;
pub const FRCS: u8 = 11;
pub const TRUNCS: u8 = 12;
pub const FLOORS: u8 = 13;
pub const EXP: u8 = 14;
pub const LOGC: u8 = 15;
pub const LOG: u8 = 16;
pub const RCPC: u8 = 17;
pub const RCPF: u8 = 18;
pub const RCP: u8 = 19;
pub const RSQC: u8 = 20;
pub const RSQF: u8 = 21;
pub const RSQ: u8 = 22;
pub const MAXAS: u8 = 23;
pub const MAXASF: u8 = 24;
pub const SUBS: u8 = 25;
pub const SUBS_PREV: u8 = 26;
pub const SETP_EQ: u8 = 27;
pub const SETP_NE: u8 = 28;
pub const SETP_GT: u8 = 29;
pub const SETP_GE: u8 = 30;
pub const SETP_INV: u8 = 31;
pub const SETP_POP: u8 = 32;
pub const SETP_CLR: u8 = 33;
pub const SETP_RSTR: u8 = 34;
pub const KILLS_EQ: u8 = 35;
pub const KILLS_GT: u8 = 36;
pub const KILLS_GE: u8 = 37;
pub const KILLS_NE: u8 = 38;
pub const KILLS_ONE: u8 = 39;
pub const SQRT: u8 = 40;
// Value 41 has no name in this table; numbering resumes at 42.
pub const MULSC0: u8 = 42;
pub const MULSC1: u8 = 43;
pub const ADDSC0: u8 = 44;
pub const ADDSC1: u8 = 45;
pub const SUBSC0: u8 = 46;
pub const SUBSC1: u8 = 47;
pub const SIN: u8 = 48;
pub const COS: u8 = 49;
// Highest scalar opcode named in this table.
pub const RETAIN_PREV: u8 = 50;
}
// Unit tests for the opcode tables and the ALU word decoder.
#[cfg(test)]
mod tests {
use super::*;
/// Regression: our table previously drifted from canary's values (e.g.
/// `MAXS=6` when canary says 5, shifting everything through SQRT). Pin
/// the most-often-used scalar + vector opcodes here.
///
/// Only a subset of each table is checked; constants not listed below are
/// not covered by this test.
#[test]
fn opcodes_match_canary_values() {
// Scalar.
assert_eq!(sop::MAXS, 5);
assert_eq!(sop::MINS, 6);
assert_eq!(sop::SEQS, 7);
assert_eq!(sop::EXP, 14);
assert_eq!(sop::LOG, 16);
assert_eq!(sop::RCP, 19);
assert_eq!(sop::RSQ, 22);
assert_eq!(sop::SUBS, 25);
assert_eq!(sop::SETP_EQ, 27);
assert_eq!(sop::KILLS_EQ, 35);
assert_eq!(sop::SQRT, 40);
assert_eq!(sop::SIN, 48);
assert_eq!(sop::RETAIN_PREV, 50);
// Vector.
assert_eq!(vop::SNE, 7);
assert_eq!(vop::CND_EQ, 12);
assert_eq!(vop::MAX4, 19);
assert_eq!(vop::KILL_EQ, 24);
assert_eq!(vop::DST, 28);
}
/// Round-trips the opcode, destination and vector-write-mask fields of
/// word2 through `decode_alu`, with word0 and word1 left at zero.
///
/// Not covered here: `scalar_write_mask`, the export flag, and every field
/// decoded from word0 (sources, `ps` select, predication).
#[test]
fn decode_extracts_opcodes_and_dests() {
// Build a minimal ALU word:
// vector_opcode = ADD (0), scalar_opcode = RCP (19),
// vector_dest = 3, scalar_dest = 7, vector_write_mask = 0xF
//
// NOTE(review): the scalar opcode occupies bits 6..=11, and bits 8..=11
// are also what `decode_alu` reports as `scalar_write_mask`, so that
// field is nonzero for this word even though no mask is set explicitly.
let w2 = (vop::ADD as u32)
| ((sop::RCP as u32) << 6)
| (0xF << 12) // vector_write_mask
| (3u32 << 16) // vector_dest
| (7u32 << 24); // scalar_dest
let alu = decode_alu([0, 0, w2]);
assert_eq!(alu.vector_opcode, vop::ADD);
assert_eq!(alu.scalar_opcode, sop::RCP);
assert_eq!(alu.vector_dest, 3);
assert_eq!(alu.scalar_dest, 7);
assert_eq!(alu.vector_write_mask, 0xF);
}
}