[iterate-3S] Real splash geometry renders: fix ALU/vfetch decode + per-draw NDC transform
The 3O→3R real-render slice ran the guest's real translated VS/PS on real captured vertices at full boot speed, but the --ui window stayed blank. Bisected with an env-gated frontbuffer readback + per-vertex NDC dump (both removed): the captured splash quads (RectangleList, k_32_32_FLOAT, 3 verts) were non-zero and sane, so this was a transform/decode chain of bugs, not missing geometry. Four coupled root causes: - GPUBUG-106 (ucode/alu.rs): decode_alu read EVERY field out of w2, but canary's AluInstruction lays out dest/write-mask/export/scalar-opcode in w0, the vector opcode + source regs in w2, swizzle/negate/pred in w1. The misread made every *export* ALU decode with vector_write_mask=0 → no oPos/oColor export emitted → the translated VS collapsed every vertex to the clip origin. Rewrote the field map to match ucode.h:2036-2086. - GPUBUG-107 (ucode/fetch.rs + translator.rs): the translator hardcoded R32G32B32A32_FLOAT (4 floats, stride 4); the splash quads are k_32_32_FLOAT (2 floats, stride 2). Over-striding read the next vertex's X into .w → negative W → the rectangle clipped behind the camera. Decode the real VertexFormat + dword stride and emit the matching component read (1/2/3/4 float formats; others reject to the interpreter). - GPUBUG-108 (translator.rs + xenos_interp.wgsl): the vfetch recomputed the buffer base from xenos_consts.fetch[], but that uniform carries the last-published per-frame fetch constant, not this draw's (stale 0x8a000002 vs the real base). The captured window already begins at the fetch base, so index from 0 (vertex i at i*stride) when a real window is present; only the synthetic fallback consults the uniform. - iterate-3S NDC transform (draw_capture.rs + xenos_pipeline.rs + WGSL): the guest VS emits screen-space pixel coords (clip disabled, VTE viewport scale/offset off). 
Added compute_ndc_xy (mirrors canary GetHostViewportInfo): rescales render-target pixels to [-1,1] clip with the Y-flip for wgpu, plumbed per-draw into DrawConstants and applied in both the translated and interpreter VS. Result (env-gated readback, since removed): the real splash geometry now fills ~50% of the frontbuffer in a clean triangular coverage pattern, real positions from real guest vertices through the real translated shaders (textures are the next stage — sampled color is still the magenta/white texture stub, tex-cache=0). Headless-inert: draw_capture is only built when frame_captures is Some (--ui); the changed decoders feed only the UI translator/metrics. Golden byte-identical (check -n50m --gpu-inline --stable-digest exit 0); 679 workspace tests green. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -71,33 +71,50 @@ pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
|
||||
let w0 = words[0];
|
||||
let w1 = words[1];
|
||||
let w2 = words[2];
|
||||
// GPUBUG-106 (iterate-3S): correct the dword field map to match canary's
|
||||
// `AluInstruction` union (ucode.h:2036-2086). Pre-fix this read the
|
||||
// dest/mask/export/scalar-opcode out of `w2`, but they live in `w0`; the
|
||||
// vector opcode + source registers live in `w2`, and swizzle/negate/pred
|
||||
// in `w1`. The misread made every *export* ALU decode with
|
||||
// `vector_write_mask=0` → no oPos/oColor export emitted → the translated VS
|
||||
// collapsed every vertex to the clip origin (degenerate, nothing drawn).
|
||||
//
|
||||
// w0: vector_dest(0:5) vector_dest_rel(6) abs_constants(7)
|
||||
// scalar_dest(8:13) scalar_dest_rel(14) export_data(15)
|
||||
// vector_write_mask(16:19) scalar_write_mask(20:23)
|
||||
// vector_clamp(24) scalar_clamp(25) scalar_opc(26:31)
|
||||
// w1: src3_swiz(0:7) src2_swiz(8:15) src1_swiz(16:23)
|
||||
// src3/2/1_reg_negate(24/25/26) pred_condition(27) is_predicated(28)
|
||||
// w2: src3_reg(0:7) src2_reg(8:15) src1_reg(16:23)
|
||||
// vector_opc(24:28) src3/2/1_sel(29/30/31)
|
||||
//
|
||||
// Our (a,b,c) operands map to canary's (src1,src2,src3).
|
||||
AluInstruction {
|
||||
vector_opcode: (w2 & 0x3F) as u8,
|
||||
scalar_opcode: ((w2 >> 6) & 0x3F) as u8,
|
||||
vector_dest: ((w2 >> 16) & 0x7F) as u8,
|
||||
scalar_dest: ((w2 >> 24) & 0x7F) as u8,
|
||||
vector_write_mask: ((w2 >> 12) & 0xF) as u8,
|
||||
scalar_write_mask: ((w2 >> 8) & 0xF) as u8,
|
||||
vector_dest_is_export: ((w2 >> 23) & 1) != 0,
|
||||
scalar_src_is_ps: ((w0 >> 26) & 1) != 0,
|
||||
src_a: (w0 & 0xFF) as u8,
|
||||
src_b: ((w0 >> 8) & 0xFF) as u8,
|
||||
src_c: ((w0 >> 16) & 0xFF) as u8,
|
||||
// Word-0 bits 29-31 are the per-operand temp-vs-constant
|
||||
// selector (canary `src3_sel`/`src2_sel`/`src1_sel`,
|
||||
// ucode.h:2078-2086). Our `src_a` is canary's third operand
|
||||
// (low byte of w0), so its selector is bit 29.
|
||||
src_a_is_temp: ((w0 >> 29) & 1) != 0,
|
||||
src_b_is_temp: ((w0 >> 30) & 1) != 0,
|
||||
src_c_is_temp: ((w0 >> 31) & 1) != 0,
|
||||
src_a_swiz: (w1 & 0xFF) as u8,
|
||||
vector_opcode: ((w2 >> 24) & 0x1F) as u8,
|
||||
scalar_opcode: ((w0 >> 26) & 0x3F) as u8,
|
||||
vector_dest: (w0 & 0x3F) as u8,
|
||||
scalar_dest: ((w0 >> 8) & 0x3F) as u8,
|
||||
vector_write_mask: ((w0 >> 16) & 0xF) as u8,
|
||||
scalar_write_mask: ((w0 >> 20) & 0xF) as u8,
|
||||
vector_dest_is_export: ((w0 >> 15) & 1) != 0,
|
||||
// Not a real microcode bit — the scalar pipe selects `ps` implicitly
|
||||
// via the *_PREV opcodes, which `scalar_expr` handles by opcode.
|
||||
scalar_src_is_ps: false,
|
||||
src_a: ((w2 >> 16) & 0xFF) as u8,
|
||||
src_b: ((w2 >> 8) & 0xFF) as u8,
|
||||
src_c: (w2 & 0xFF) as u8,
|
||||
// sel==1 → operand is a temp register; sel==0 → ALU constant.
|
||||
src_a_is_temp: ((w2 >> 31) & 1) != 0,
|
||||
src_b_is_temp: ((w2 >> 30) & 1) != 0,
|
||||
src_c_is_temp: ((w2 >> 29) & 1) != 0,
|
||||
src_a_swiz: ((w1 >> 16) & 0xFF) as u8,
|
||||
src_b_swiz: ((w1 >> 8) & 0xFF) as u8,
|
||||
src_c_swiz: ((w1 >> 16) & 0xFF) as u8,
|
||||
src_a_negate: ((w1 >> 24) & 1) != 0,
|
||||
src_c_swiz: (w1 & 0xFF) as u8,
|
||||
src_a_negate: ((w1 >> 26) & 1) != 0,
|
||||
src_b_negate: ((w1 >> 25) & 1) != 0,
|
||||
src_c_negate: ((w1 >> 26) & 1) != 0,
|
||||
predicated: ((w0 >> 27) & 1) != 0,
|
||||
predicate_condition: ((w0 >> 28) & 1) != 0,
|
||||
src_c_negate: ((w1 >> 24) & 1) != 0,
|
||||
predicated: ((w1 >> 28) & 1) != 0,
|
||||
predicate_condition: ((w1 >> 27) & 1) != 0,
|
||||
raw: words,
|
||||
}
|
||||
}
|
||||
@@ -225,19 +242,24 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn decode_extracts_opcodes_and_dests() {
|
||||
// Build a minimal ALU word:
|
||||
// vector_opcode = ADD (0), scalar_opcode = RCP (22),
|
||||
// vector_dest = 3, scalar_dest = 7, vector_write_mask = 0xF
|
||||
let w2 = (vop::ADD as u32)
|
||||
| ((sop::RCP as u32) << 6)
|
||||
| (0xF << 12) // vector_write_mask
|
||||
| (3u32 << 16) // vector_dest
|
||||
| (7u32 << 24); // scalar_dest
|
||||
let alu = decode_alu([0, 0, w2]);
|
||||
// GPUBUG-106: correct canary field map. w0 carries dest/mask/scalar_opc;
|
||||
// w2 carries vector_opc + source regs.
|
||||
// vector_opcode = ADD (0) → w2 bits 24:28
|
||||
// scalar_opcode = RCP (22) → w0 bits 26:31
|
||||
// vector_dest = 3 → w0 bits 0:5, scalar_dest = 7 → w0 bits 8:13
|
||||
// vector_write_mask = 0xF → w0 bits 16:19, export_data → w0 bit 15
|
||||
let w0 = 3u32 // vector_dest
|
||||
| (7u32 << 8) // scalar_dest
|
||||
| (1u32 << 15) // export_data
|
||||
| (0xFu32 << 16) // vector_write_mask
|
||||
| ((sop::RCP as u32) << 26); // scalar_opc
|
||||
let w2 = (vop::ADD as u32) << 24; // vector_opc
|
||||
let alu = decode_alu([w0, 0, w2]);
|
||||
assert_eq!(alu.vector_opcode, vop::ADD);
|
||||
assert_eq!(alu.scalar_opcode, sop::RCP);
|
||||
assert_eq!(alu.vector_dest, 3);
|
||||
assert_eq!(alu.scalar_dest, 7);
|
||||
assert_eq!(alu.vector_write_mask, 0xF);
|
||||
assert!(alu.vector_dest_is_export);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,6 +25,15 @@ pub struct VertexFetch {
|
||||
pub dest_register: u8,
|
||||
/// 4-bit write mask.
|
||||
pub dest_write_mask: u8,
|
||||
/// iterate-3S (GPUBUG-107): `xenos::VertexFormat` (6 bits, dword1[16:21]).
|
||||
/// Determines how many components to read and their packing. Pre-fix the
|
||||
/// translator hardcoded `k_32_32_32_32_FLOAT` (4 floats, stride 4),
|
||||
/// over-striding 2-float UI quads (`k_32_32_FLOAT`) → wrong/clipped
|
||||
/// positions (the next vertex's X bled into .w, giving negative W → the
|
||||
/// whole rectangle was clipped behind the camera).
|
||||
pub format: u8,
|
||||
/// Dword stride between consecutive vertices (dword2[0:7]).
|
||||
pub stride: u8,
|
||||
pub raw: [u32; 3],
|
||||
}
|
||||
|
||||
@@ -72,6 +81,9 @@ pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction {
|
||||
src_register: ((w0 >> 5) & 0x3F) as u8,
|
||||
dest_register: ((w0 >> 12) & 0x3F) as u8,
|
||||
dest_write_mask: (w1 & 0xF) as u8,
|
||||
// dword1[16:21] = VertexFormat; dword2[0:7] = dword stride.
|
||||
format: ((w1 >> 16) & 0x3F) as u8,
|
||||
stride: (w2 & 0xFF) as u8,
|
||||
raw: words,
|
||||
}),
|
||||
op::TEXTURE_FETCH => FetchInstruction::Texture(TextureFetch {
|
||||
|
||||
Reference in New Issue
Block a user