fix(gpu): GPUBUG-102 — apply per-format endian byte-swap to vertex fetch

The vertex fetch constant (canary `xe_gpu_vertex_fetch_t`,
xenos.h:1158-1172) holds an `endian` field (low 2 bits of dword_1)
selecting kNone/k8in16/k8in32/k16in32 swap patterns per
`GpuSwapInline` (xenos.h:1090-1109). Xbox 360 vertex data is stored
big-endian; the host is little-endian. Before this fix, every dword was
bitcast as-is, so vertex positions were decoded from byte-reversed data,
producing clipped or NaN positions in any draw that survived to the
host.

Mechanical changes:
- crates/xenia-gpu/src/translator.rs: AOT `emit_vfetch` reads
  fetch_const dword 1 (endian) and wraps each lane's load in
  `gpu_swap(value, endian)`. New `gpu_swap` helper added to the
  emitted module header.
- crates/xenia-gpu/src/shaders/xenos_interp.wgsl: matching
  `gpu_swap` helper added to the runtime interpreter shader.
  `interpret_vertex_fetch` reads fc1, computes the endian, and wraps
  every format's per-lane load (including 8_8_8_8 and 16_16_FLOAT
  paths). Mirrors the AOT translator's emission.

Verification at -n 100M lockstep:
  swaps:                2 → 2     (gated by Phase E for draws)
  draws:                0 → 0
  packets:              ~60M (within noise)
Tests: +1 (vfetch_emit_includes_gpu_swap_helper_call).

Closes GPUBUG-102 (P0).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-03 14:18:46 +02:00
parent c5c6713419
commit ec2d955dbd
2 changed files with 94 additions and 18 deletions

View File

@@ -152,6 +152,24 @@ fn apply_modifiers(v: vec4<f32>, negate: bool, take_abs: bool) -> vec4<f32> {
return r;
}
// GPUBUG-102: endian fix-up for fetched vertex dwords, following the
// swap patterns of canary `GpuSwapInline` (xenos.h:1090-1109).
//
// `value`  - raw dword as read from the vertex buffer.
// `endian` - swap selector; callers pass the low 2 bits of the fetch
//            constant's dword_1.
//
// Selector meaning:
//   1u (k8in16)  - swap the two bytes inside each 16-bit half.
//   2u (k8in32)  - reverse all four bytes of the dword.
//   3u (k16in32) - swap the two 16-bit halves.
//   anything else (including 0u, kNone) - return `value` unchanged.
fn gpu_swap(value: u32, endian: u32) -> u32 {
    if endian == 1u {
        // Even-position bytes move up by 8, odd-position bytes move down by 8.
        let moved_up = (value & 0x00FF00FFu) << 8u;
        let moved_down = (value >> 8u) & 0x00FF00FFu;
        return moved_up | moved_down;
    }
    if endian == 2u {
        // Full byte reversal: byte 0 <-> byte 3, byte 1 <-> byte 2.
        let outer = (value << 24u) | (value >> 24u);
        let inner = ((value << 8u) & 0x00FF0000u) | ((value >> 8u) & 0x0000FF00u);
        return outer | inner;
    }
    if endian == 3u {
        // Exchange the low and high 16-bit halves.
        return (value << 16u) | (value >> 16u);
    }
    // kNone or an unrecognised selector: pass the dword through untouched.
    return value;
}
fn write_reg_masked(idx: u32, mask: u32, value: vec4<f32>) {
if idx >= 128u {
return;
@@ -632,7 +650,12 @@ fn interpret_vertex_fetch(t: u32) {
// Vertex-fetch constant: dword 0 carries (type[1:0], address[31:2]);
// dword 1 carries (endian[1:0], size[25:2]).
let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
let fc1 = xenos_consts.fetch[fetch_const * 2u + 1u];
let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;
// GPUBUG-102: per-format endian byte-swap. Xbox 360 vertex data is
// big-endian; the host is little-endian. Pre-fix every dword was
// bitcast as-is — vertex positions were byte-reversed garbage.
let endian = fc1 & 0x3u;
let vidx = u32(registers[src_reg & 0x7Fu].x);
// Per-vertex byte offset; stride==0 means 1 element total (non-indexed).
@@ -646,19 +669,19 @@ fn interpret_vertex_fetch(t: u32) {
case VFMT_32_32_32_32_FLOAT: {
if addr + 3u < n {
result = vec4<f32>(
bitcast<f32>(vertex_buffer[addr + 0u]),
bitcast<f32>(vertex_buffer[addr + 1u]),
bitcast<f32>(vertex_buffer[addr + 2u]),
bitcast<f32>(vertex_buffer[addr + 3u]),
bitcast<f32>(gpu_swap(vertex_buffer[addr + 0u], endian)),
bitcast<f32>(gpu_swap(vertex_buffer[addr + 1u], endian)),
bitcast<f32>(gpu_swap(vertex_buffer[addr + 2u], endian)),
bitcast<f32>(gpu_swap(vertex_buffer[addr + 3u], endian)),
);
}
}
case VFMT_32_32_32_FLOAT: {
if addr + 2u < n {
result = vec4<f32>(
bitcast<f32>(vertex_buffer[addr + 0u]),
bitcast<f32>(vertex_buffer[addr + 1u]),
bitcast<f32>(vertex_buffer[addr + 2u]),
bitcast<f32>(gpu_swap(vertex_buffer[addr + 0u], endian)),
bitcast<f32>(gpu_swap(vertex_buffer[addr + 1u], endian)),
bitcast<f32>(gpu_swap(vertex_buffer[addr + 2u], endian)),
1.0,
);
}
@@ -666,8 +689,8 @@ fn interpret_vertex_fetch(t: u32) {
case VFMT_32_32_FLOAT: {
if addr + 1u < n {
result = vec4<f32>(
bitcast<f32>(vertex_buffer[addr + 0u]),
bitcast<f32>(vertex_buffer[addr + 1u]),
bitcast<f32>(gpu_swap(vertex_buffer[addr + 0u], endian)),
bitcast<f32>(gpu_swap(vertex_buffer[addr + 1u], endian)),
0.0,
1.0,
);
@@ -675,17 +698,20 @@ fn interpret_vertex_fetch(t: u32) {
}
case VFMT_32_FLOAT: {
if addr < n {
result = vec4<f32>(bitcast<f32>(vertex_buffer[addr]), 0.0, 0.0, 1.0);
result = vec4<f32>(
bitcast<f32>(gpu_swap(vertex_buffer[addr], endian)),
0.0, 0.0, 1.0,
);
}
}
case VFMT_8_8_8_8: {
if addr < n {
result = unpack4x8unorm(vertex_buffer[addr]);
result = unpack4x8unorm(gpu_swap(vertex_buffer[addr], endian));
}
}
case VFMT_16_16_FLOAT: {
if addr < n {
let h = unpack2x16float(vertex_buffer[addr]);
let h = unpack2x16float(gpu_swap(vertex_buffer[addr], endian));
result = vec4<f32>(h.x, h.y, 0.0, 1.0);
}
}