From ec2d955dbdab6dd769e7fe02555632f639f05582 Mon Sep 17 00:00:00 2001
From: MechaCat02
Date: Sun, 3 May 2026 14:18:46 +0200
Subject: [PATCH] =?UTF-8?q?fix(gpu):=20GPUBUG-102=20=E2=80=94=20apply=20pe?=
 =?UTF-8?q?r-format=20endian=20byte-swap=20to=20vertex=20fetch?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The vertex fetch constant (canary `xe_gpu_vertex_fetch_t`,
xenos.h:1158-1172) holds an `endian` field (low 2 bits of dword_1)
selecting kNone/k8in16/k8in32/k16in32 swap patterns per `GpuSwapInline`
(xenos.h:1090-1109). Xbox 360 vertex data is stored big-endian; the
host is little-endian. Pre-fix, every dword was bitcast as-is — vertex
positions decoded as byte-reversed garbage, producing clipped or NaN
positions in any draw that survived to the host.

Mechanical changes:
- crates/xenia-gpu/src/translator.rs: AOT `emit_vfetch` reads
  fetch_const dword 1 (endian) and wraps each lane's load in
  `gpu_swap(value, endian)`. New `gpu_swap` helper added to the
  emitted module header.
- crates/xenia-gpu/src/shaders/xenos_interp.wgsl: matching `gpu_swap`
  helper added to the runtime interpreter shader.
  `interpret_vertex_fetch` reads fc1, computes the endian, and wraps
  every format's per-lane load (including 8_8_8_8 and 16_16_FLOAT
  paths). Mirrors the AOT translator's emission.

Verification at -n 100M lockstep:
  swaps:   2 → 2 (gated by Phase E for draws)
  draws:   0 → 0
  packets: ~60M (within noise)

Tests: +1 (vfetch_emit_includes_gpu_swap_helper_call).

Closes GPUBUG-102 (P0).

Co-Authored-By: Claude Opus 4.7 (1M context) --- .../xenia-gpu/src/shaders/xenos_interp.wgsl | 50 +++++++++++---- crates/xenia-gpu/src/translator.rs | 62 +++++++++++++++++-- 2 files changed, 94 insertions(+), 18 deletions(-) diff --git a/crates/xenia-gpu/src/shaders/xenos_interp.wgsl b/crates/xenia-gpu/src/shaders/xenos_interp.wgsl index 68ae716..6e97d86 100644 --- a/crates/xenia-gpu/src/shaders/xenos_interp.wgsl +++ b/crates/xenia-gpu/src/shaders/xenos_interp.wgsl @@ -152,6 +152,24 @@ fn apply_modifiers(v: vec4, negate: bool, take_abs: bool) -> vec4 { return r; } +// GPUBUG-102: per-format byte-swap matching canary `GpuSwapInline` +// (xenos.h:1090-1109). Xbox 360 vertex data is big-endian; host is +// little-endian. The fetch constant's `endian` field (low 2 bits of +// dword_1) selects the swap pattern. +fn gpu_swap(value: u32, endian: u32) -> u32 { + switch endian { + case 1u: { return ((value << 8u) & 0xFF00FF00u) | ((value >> 8u) & 0x00FF00FFu); } + case 2u: { + return ((value & 0x000000FFu) << 24u) + | ((value & 0x0000FF00u) << 8u) + | ((value & 0x00FF0000u) >> 8u) + | ((value & 0xFF000000u) >> 24u); + } + case 3u: { return ((value >> 16u) & 0xFFFFu) | (value << 16u); } + default: { return value; } + } +} + fn write_reg_masked(idx: u32, mask: u32, value: vec4) { if idx >= 128u { return; @@ -632,7 +650,12 @@ fn interpret_vertex_fetch(t: u32) { // Vertex-fetch constant: dword 0 carries (type[1:0], address[31:2]); // dword 1 carries (endian[1:0], size[25:2]). let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u]; + let fc1 = xenos_consts.fetch[fetch_const * 2u + 1u]; let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u; + // GPUBUG-102: per-format endian byte-swap. Xbox 360 vertex data is + // big-endian; the host is little-endian. Pre-fix every dword was + // bitcast as-is — vertex positions were byte-reversed garbage. + let endian = fc1 & 0x3u; let vidx = u32(registers[src_reg & 0x7Fu].x); // Per-vertex byte offset; stride==0 means 1 element total (non-indexed). 
@@ -646,19 +669,19 @@ fn interpret_vertex_fetch(t: u32) { case VFMT_32_32_32_32_FLOAT: { if addr + 3u < n { result = vec4( - bitcast(vertex_buffer[addr + 0u]), - bitcast(vertex_buffer[addr + 1u]), - bitcast(vertex_buffer[addr + 2u]), - bitcast(vertex_buffer[addr + 3u]), + bitcast(gpu_swap(vertex_buffer[addr + 0u], endian)), + bitcast(gpu_swap(vertex_buffer[addr + 1u], endian)), + bitcast(gpu_swap(vertex_buffer[addr + 2u], endian)), + bitcast(gpu_swap(vertex_buffer[addr + 3u], endian)), ); } } case VFMT_32_32_32_FLOAT: { if addr + 2u < n { result = vec4( - bitcast(vertex_buffer[addr + 0u]), - bitcast(vertex_buffer[addr + 1u]), - bitcast(vertex_buffer[addr + 2u]), + bitcast(gpu_swap(vertex_buffer[addr + 0u], endian)), + bitcast(gpu_swap(vertex_buffer[addr + 1u], endian)), + bitcast(gpu_swap(vertex_buffer[addr + 2u], endian)), 1.0, ); } @@ -666,8 +689,8 @@ fn interpret_vertex_fetch(t: u32) { case VFMT_32_32_FLOAT: { if addr + 1u < n { result = vec4( - bitcast(vertex_buffer[addr + 0u]), - bitcast(vertex_buffer[addr + 1u]), + bitcast(gpu_swap(vertex_buffer[addr + 0u], endian)), + bitcast(gpu_swap(vertex_buffer[addr + 1u], endian)), 0.0, 1.0, ); @@ -675,17 +698,20 @@ fn interpret_vertex_fetch(t: u32) { } case VFMT_32_FLOAT: { if addr < n { - result = vec4(bitcast(vertex_buffer[addr]), 0.0, 0.0, 1.0); + result = vec4( + bitcast(gpu_swap(vertex_buffer[addr], endian)), + 0.0, 0.0, 1.0, + ); } } case VFMT_8_8_8_8: { if addr < n { - result = unpack4x8unorm(vertex_buffer[addr]); + result = unpack4x8unorm(gpu_swap(vertex_buffer[addr], endian)); } } case VFMT_16_16_FLOAT: { if addr < n { - let h = unpack2x16float(vertex_buffer[addr]); + let h = unpack2x16float(gpu_swap(vertex_buffer[addr], endian)); result = vec4(h.x, h.y, 0.0, 1.0); } } diff --git a/crates/xenia-gpu/src/translator.rs b/crates/xenia-gpu/src/translator.rs index cc63c5b..1655100 100644 --- a/crates/xenia-gpu/src/translator.rs +++ b/crates/xenia-gpu/src/translator.rs @@ -126,6 +126,28 @@ struct FsOut { fn xe_rcp(x: 
f32) -> f32 { return select(0.0, 1.0 / x, x != 0.0); } + +// GPUBUG-102: per-format byte-swap matching canary's `GpuSwapInline` +// (xenos.h:1090-1109). Xbox 360 vertex data is big-endian; the host is +// little-endian. The fetch constant's `endian` field (low 2 bits of +// dword_1) selects: +// 0 (kNone) — no swap +// 1 (k8in16) — swap bytes within halfwords +// 2 (k8in32) — full byte reverse +// 3 (k16in32) — swap halfwords +fn gpu_swap(value: u32, endian: u32) -> u32 { + switch endian { + case 1u: { return ((value << 8u) & 0xFF00FF00u) | ((value >> 8u) & 0x00FF00FFu); } + case 2u: { + return ((value & 0x000000FFu) << 24u) + | ((value & 0x0000FF00u) << 8u) + | ((value & 0x00FF0000u) >> 8u) + | ((value & 0xFF000000u) >> 24u); + } + case 3u: { return ((value >> 16u) & 0xFFFFu) | (value << 16u); } + default: { return value; } + } +} "#; struct EmitCtx { @@ -376,23 +398,33 @@ impl EmitCtx { // v1: treat all vertex fetches as R32G32B32A32_FLOAT, stride = 4 // dwords. Matches the interpreter's MVP semantics; unlocks more // formats alongside the CPU texture cache's format expansion. + // + // GPUBUG-102: the fetch constant (xe_gpu_vertex_fetch_t, + // xenos.h:1158-1172) holds the endian field in dword_1's low + // 2 bits. Vertex data on Xbox 360 is big-endian; the host is + // little-endian. Pre-fix, every dword was bitcast as-is → + // vertex positions were byte-reversed garbage and any draw + // that did reach the host produced clipped / NaN positions. 
let fetch_const = (vf.raw[0] >> 5) & 0x1F; let src_reg = vf.src_register & 0x7F; let dst_reg = vf.dest_register & 0x7F; self.push(&format!( - "{{ let fc0 = xenos_consts.fetch[{}u]; \ + "{{ let fc0 = xenos_consts.fetch[{fc0_idx}u]; \ + let fc1 = xenos_consts.fetch[{fc1_idx}u]; \ + let endian = fc1 & 0x3u; \ let base = (fc0 & 0xFFFFFFFCu) >> 2u; \ let vidx = u32(r[{src_reg}u].x); \ let addr = base + vidx * 4u; \ let n = arrayLength(&vertex_buffer); \ if (addr + 3u < n) {{ \ r[{dst_reg}u] = vec4( \ - bitcast(vertex_buffer[addr + 0u]), \ - bitcast(vertex_buffer[addr + 1u]), \ - bitcast(vertex_buffer[addr + 2u]), \ - bitcast(vertex_buffer[addr + 3u])); \ + bitcast(gpu_swap(vertex_buffer[addr + 0u], endian)), \ + bitcast(gpu_swap(vertex_buffer[addr + 1u], endian)), \ + bitcast(gpu_swap(vertex_buffer[addr + 2u], endian)), \ + bitcast(gpu_swap(vertex_buffer[addr + 3u], endian))); \ }} }}", - fetch_const * 2, + fc0_idx = fetch_const * 2, + fc1_idx = fetch_const * 2 + 1, )); Ok(()) } @@ -655,6 +687,24 @@ mod tests { } } + #[test] + fn vfetch_emit_includes_gpu_swap_helper_call() { + // GPUBUG-102: emit_vfetch should reference `gpu_swap(...)` for + // each lane. Ensures the per-format endian byte-swap is wired + // into the AOT path. + let mut ctx = EmitCtx::new(Stage::Vertex); + let vf = crate::ucode::fetch::VertexFetch { + fetch_const: 0, + src_register: 0, + dest_register: 0, + dest_write_mask: 0xF, + raw: [0; 3], + }; + ctx.emit_vfetch(&vf).expect("emit_vfetch"); + let body = ctx.finish(); + assert!(body.contains("gpu_swap("), "emitted vfetch body: {body}"); + } + #[test] fn loop_clause_rejected() { let shader = ParsedShader {