fix(gpu): GPUBUG-102 — apply per-format endian byte-swap to vertex fetch
The vertex fetch constant (canary `xe_gpu_vertex_fetch_t`, xenos.h:1158-1172) holds an `endian` field (low 2 bits of dword_1) that selects the kNone/k8in16/k8in32/k16in32 swap pattern implemented by `GpuSwapInline` (xenos.h:1090-1109). Xbox 360 vertex data is stored big-endian; the host is little-endian. Before this fix, every dword was bitcast as-is, so vertex positions decoded as byte-reversed garbage, producing clipped or NaN positions in any draw that survived to the host.

Mechanical changes:

- crates/xenia-gpu/src/translator.rs: the AOT `emit_vfetch` now reads fetch-constant dword 1 (endian) and wraps each lane's load in `gpu_swap(value, endian)`. A new `gpu_swap` helper is added to the emitted module header.
- crates/xenia-gpu/src/shaders/xenos_interp.wgsl: a matching `gpu_swap` helper is added to the runtime interpreter shader. `interpret_vertex_fetch` reads fc1, computes the endian mode, and wraps every format's per-lane load (including the 8_8_8_8 and 16_16_FLOAT paths). This mirrors the AOT translator's emission.

Verification at -n 100M lockstep:

- swaps: 2 → 2 (gated by Phase E for draws)
- draws: 0 → 0
- packets: ~60M (within noise)

Tests: +1 (vfetch_emit_includes_gpu_swap_helper_call).

Closes GPUBUG-102 (P0).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -152,6 +152,24 @@ fn apply_modifiers(v: vec4<f32>, negate: bool, take_abs: bool) -> vec4<f32> {
|
|||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GPUBUG-102: per-format byte-swap matching canary `GpuSwapInline`
|
||||||
|
// (xenos.h:1090-1109). Xbox 360 vertex data is big-endian; host is
|
||||||
|
// little-endian. The fetch constant's `endian` field (low 2 bits of
|
||||||
|
// dword_1) selects the swap pattern.
|
||||||
|
fn gpu_swap(value: u32, endian: u32) -> u32 {
|
||||||
|
switch endian {
|
||||||
|
case 1u: { return ((value << 8u) & 0xFF00FF00u) | ((value >> 8u) & 0x00FF00FFu); }
|
||||||
|
case 2u: {
|
||||||
|
return ((value & 0x000000FFu) << 24u)
|
||||||
|
| ((value & 0x0000FF00u) << 8u)
|
||||||
|
| ((value & 0x00FF0000u) >> 8u)
|
||||||
|
| ((value & 0xFF000000u) >> 24u);
|
||||||
|
}
|
||||||
|
case 3u: { return ((value >> 16u) & 0xFFFFu) | (value << 16u); }
|
||||||
|
default: { return value; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn write_reg_masked(idx: u32, mask: u32, value: vec4<f32>) {
|
fn write_reg_masked(idx: u32, mask: u32, value: vec4<f32>) {
|
||||||
if idx >= 128u {
|
if idx >= 128u {
|
||||||
return;
|
return;
|
||||||
@@ -632,7 +650,12 @@ fn interpret_vertex_fetch(t: u32) {
|
|||||||
// Vertex-fetch constant: dword 0 carries (type[1:0], address[31:2]);
|
// Vertex-fetch constant: dword 0 carries (type[1:0], address[31:2]);
|
||||||
// dword 1 carries (endian[1:0], size[25:2]).
|
// dword 1 carries (endian[1:0], size[25:2]).
|
||||||
let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
|
let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
|
||||||
|
let fc1 = xenos_consts.fetch[fetch_const * 2u + 1u];
|
||||||
let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;
|
let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;
|
||||||
|
// GPUBUG-102: per-format endian byte-swap. Xbox 360 vertex data is
|
||||||
|
// big-endian; the host is little-endian. Pre-fix every dword was
|
||||||
|
// bitcast as-is — vertex positions were byte-reversed garbage.
|
||||||
|
let endian = fc1 & 0x3u;
|
||||||
|
|
||||||
let vidx = u32(registers[src_reg & 0x7Fu].x);
|
let vidx = u32(registers[src_reg & 0x7Fu].x);
|
||||||
// Per-vertex byte offset; stride==0 means 1 element total (non-indexed).
|
// Per-vertex byte offset; stride==0 means 1 element total (non-indexed).
|
||||||
@@ -646,19 +669,19 @@ fn interpret_vertex_fetch(t: u32) {
|
|||||||
case VFMT_32_32_32_32_FLOAT: {
|
case VFMT_32_32_32_32_FLOAT: {
|
||||||
if addr + 3u < n {
|
if addr + 3u < n {
|
||||||
result = vec4<f32>(
|
result = vec4<f32>(
|
||||||
bitcast<f32>(vertex_buffer[addr + 0u]),
|
bitcast<f32>(gpu_swap(vertex_buffer[addr + 0u], endian)),
|
||||||
bitcast<f32>(vertex_buffer[addr + 1u]),
|
bitcast<f32>(gpu_swap(vertex_buffer[addr + 1u], endian)),
|
||||||
bitcast<f32>(vertex_buffer[addr + 2u]),
|
bitcast<f32>(gpu_swap(vertex_buffer[addr + 2u], endian)),
|
||||||
bitcast<f32>(vertex_buffer[addr + 3u]),
|
bitcast<f32>(gpu_swap(vertex_buffer[addr + 3u], endian)),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case VFMT_32_32_32_FLOAT: {
|
case VFMT_32_32_32_FLOAT: {
|
||||||
if addr + 2u < n {
|
if addr + 2u < n {
|
||||||
result = vec4<f32>(
|
result = vec4<f32>(
|
||||||
bitcast<f32>(vertex_buffer[addr + 0u]),
|
bitcast<f32>(gpu_swap(vertex_buffer[addr + 0u], endian)),
|
||||||
bitcast<f32>(vertex_buffer[addr + 1u]),
|
bitcast<f32>(gpu_swap(vertex_buffer[addr + 1u], endian)),
|
||||||
bitcast<f32>(vertex_buffer[addr + 2u]),
|
bitcast<f32>(gpu_swap(vertex_buffer[addr + 2u], endian)),
|
||||||
1.0,
|
1.0,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -666,8 +689,8 @@ fn interpret_vertex_fetch(t: u32) {
|
|||||||
case VFMT_32_32_FLOAT: {
|
case VFMT_32_32_FLOAT: {
|
||||||
if addr + 1u < n {
|
if addr + 1u < n {
|
||||||
result = vec4<f32>(
|
result = vec4<f32>(
|
||||||
bitcast<f32>(vertex_buffer[addr + 0u]),
|
bitcast<f32>(gpu_swap(vertex_buffer[addr + 0u], endian)),
|
||||||
bitcast<f32>(vertex_buffer[addr + 1u]),
|
bitcast<f32>(gpu_swap(vertex_buffer[addr + 1u], endian)),
|
||||||
0.0,
|
0.0,
|
||||||
1.0,
|
1.0,
|
||||||
);
|
);
|
||||||
@@ -675,17 +698,20 @@ fn interpret_vertex_fetch(t: u32) {
|
|||||||
}
|
}
|
||||||
case VFMT_32_FLOAT: {
|
case VFMT_32_FLOAT: {
|
||||||
if addr < n {
|
if addr < n {
|
||||||
result = vec4<f32>(bitcast<f32>(vertex_buffer[addr]), 0.0, 0.0, 1.0);
|
result = vec4<f32>(
|
||||||
|
bitcast<f32>(gpu_swap(vertex_buffer[addr], endian)),
|
||||||
|
0.0, 0.0, 1.0,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case VFMT_8_8_8_8: {
|
case VFMT_8_8_8_8: {
|
||||||
if addr < n {
|
if addr < n {
|
||||||
result = unpack4x8unorm(vertex_buffer[addr]);
|
result = unpack4x8unorm(gpu_swap(vertex_buffer[addr], endian));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case VFMT_16_16_FLOAT: {
|
case VFMT_16_16_FLOAT: {
|
||||||
if addr < n {
|
if addr < n {
|
||||||
let h = unpack2x16float(vertex_buffer[addr]);
|
let h = unpack2x16float(gpu_swap(vertex_buffer[addr], endian));
|
||||||
result = vec4<f32>(h.x, h.y, 0.0, 1.0);
|
result = vec4<f32>(h.x, h.y, 0.0, 1.0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -126,6 +126,28 @@ struct FsOut {
|
|||||||
fn xe_rcp(x: f32) -> f32 {
|
fn xe_rcp(x: f32) -> f32 {
|
||||||
return select(0.0, 1.0 / x, x != 0.0);
|
return select(0.0, 1.0 / x, x != 0.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GPUBUG-102: per-format byte-swap matching canary's `GpuSwapInline`
|
||||||
|
// (xenos.h:1090-1109). Xbox 360 vertex data is big-endian; the host is
|
||||||
|
// little-endian. The fetch constant's `endian` field (low 2 bits of
|
||||||
|
// dword_1) selects:
|
||||||
|
// 0 (kNone) — no swap
|
||||||
|
// 1 (k8in16) — swap bytes within halfwords
|
||||||
|
// 2 (k8in32) — full byte reverse
|
||||||
|
// 3 (k16in32) — swap halfwords
|
||||||
|
fn gpu_swap(value: u32, endian: u32) -> u32 {
|
||||||
|
switch endian {
|
||||||
|
case 1u: { return ((value << 8u) & 0xFF00FF00u) | ((value >> 8u) & 0x00FF00FFu); }
|
||||||
|
case 2u: {
|
||||||
|
return ((value & 0x000000FFu) << 24u)
|
||||||
|
| ((value & 0x0000FF00u) << 8u)
|
||||||
|
| ((value & 0x00FF0000u) >> 8u)
|
||||||
|
| ((value & 0xFF000000u) >> 24u);
|
||||||
|
}
|
||||||
|
case 3u: { return ((value >> 16u) & 0xFFFFu) | (value << 16u); }
|
||||||
|
default: { return value; }
|
||||||
|
}
|
||||||
|
}
|
||||||
"#;
|
"#;
|
||||||
|
|
||||||
struct EmitCtx {
|
struct EmitCtx {
|
||||||
@@ -376,23 +398,33 @@ impl EmitCtx {
|
|||||||
// v1: treat all vertex fetches as R32G32B32A32_FLOAT, stride = 4
|
// v1: treat all vertex fetches as R32G32B32A32_FLOAT, stride = 4
|
||||||
// dwords. Matches the interpreter's MVP semantics; unlocks more
|
// dwords. Matches the interpreter's MVP semantics; unlocks more
|
||||||
// formats alongside the CPU texture cache's format expansion.
|
// formats alongside the CPU texture cache's format expansion.
|
||||||
|
//
|
||||||
|
// GPUBUG-102: the fetch constant (xe_gpu_vertex_fetch_t,
|
||||||
|
// xenos.h:1158-1172) holds the endian field in dword_1's low
|
||||||
|
// 2 bits. Vertex data on Xbox 360 is big-endian; the host is
|
||||||
|
// little-endian. Pre-fix, every dword was bitcast as-is →
|
||||||
|
// vertex positions were byte-reversed garbage and any draw
|
||||||
|
// that did reach the host produced clipped / NaN positions.
|
||||||
let fetch_const = (vf.raw[0] >> 5) & 0x1F;
|
let fetch_const = (vf.raw[0] >> 5) & 0x1F;
|
||||||
let src_reg = vf.src_register & 0x7F;
|
let src_reg = vf.src_register & 0x7F;
|
||||||
let dst_reg = vf.dest_register & 0x7F;
|
let dst_reg = vf.dest_register & 0x7F;
|
||||||
self.push(&format!(
|
self.push(&format!(
|
||||||
"{{ let fc0 = xenos_consts.fetch[{}u]; \
|
"{{ let fc0 = xenos_consts.fetch[{fc0_idx}u]; \
|
||||||
|
let fc1 = xenos_consts.fetch[{fc1_idx}u]; \
|
||||||
|
let endian = fc1 & 0x3u; \
|
||||||
let base = (fc0 & 0xFFFFFFFCu) >> 2u; \
|
let base = (fc0 & 0xFFFFFFFCu) >> 2u; \
|
||||||
let vidx = u32(r[{src_reg}u].x); \
|
let vidx = u32(r[{src_reg}u].x); \
|
||||||
let addr = base + vidx * 4u; \
|
let addr = base + vidx * 4u; \
|
||||||
let n = arrayLength(&vertex_buffer); \
|
let n = arrayLength(&vertex_buffer); \
|
||||||
if (addr + 3u < n) {{ \
|
if (addr + 3u < n) {{ \
|
||||||
r[{dst_reg}u] = vec4<f32>( \
|
r[{dst_reg}u] = vec4<f32>( \
|
||||||
bitcast<f32>(vertex_buffer[addr + 0u]), \
|
bitcast<f32>(gpu_swap(vertex_buffer[addr + 0u], endian)), \
|
||||||
bitcast<f32>(vertex_buffer[addr + 1u]), \
|
bitcast<f32>(gpu_swap(vertex_buffer[addr + 1u], endian)), \
|
||||||
bitcast<f32>(vertex_buffer[addr + 2u]), \
|
bitcast<f32>(gpu_swap(vertex_buffer[addr + 2u], endian)), \
|
||||||
bitcast<f32>(vertex_buffer[addr + 3u])); \
|
bitcast<f32>(gpu_swap(vertex_buffer[addr + 3u], endian))); \
|
||||||
}} }}",
|
}} }}",
|
||||||
fetch_const * 2,
|
fc0_idx = fetch_const * 2,
|
||||||
|
fc1_idx = fetch_const * 2 + 1,
|
||||||
));
|
));
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -655,6 +687,24 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn vfetch_emit_includes_gpu_swap_helper_call() {
|
||||||
|
// GPUBUG-102: emit_vfetch should reference `gpu_swap(...)` for
|
||||||
|
// each lane. Ensures the per-format endian byte-swap is wired
|
||||||
|
// into the AOT path.
|
||||||
|
let mut ctx = EmitCtx::new(Stage::Vertex);
|
||||||
|
let vf = crate::ucode::fetch::VertexFetch {
|
||||||
|
fetch_const: 0,
|
||||||
|
src_register: 0,
|
||||||
|
dest_register: 0,
|
||||||
|
dest_write_mask: 0xF,
|
||||||
|
raw: [0; 3],
|
||||||
|
};
|
||||||
|
ctx.emit_vfetch(&vf).expect("emit_vfetch");
|
||||||
|
let body = ctx.finish();
|
||||||
|
assert!(body.contains("gpu_swap("), "emitted vfetch body: {body}");
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn loop_clause_rejected() {
|
fn loop_clause_rejected() {
|
||||||
let shader = ParsedShader {
|
let shader = ParsedShader {
|
||||||
|
|||||||
Reference in New Issue
Block a user