[iterate-3T] Real UV interpolation + per-draw textures: shader/UV/bind chain complete
Build the full texture-sampling chain for the publisher splash so the textured logo CAN sample real artwork at the guest's real UVs. Measured with an env-gated frontbuffer readback (since removed): the chain is correct end-to-end, but the sampled K8888 1280x768 texture is ALL-ZERO in the UI window's reachable boot range — the artwork is produced by an EDRAM resolve (RT->texture copy) that our implementation does not yet perform (resolves=0). So this lands the correct shader/UV/bind work and isolates the remaining blocker to the resolve gap, not the shader path. Translator (xenia-gpu/src/translator.rs), all UI-translator-only: - Real Xenos export-index model (replaces the AllocKind heuristic that collapsed every VS export to one color slot and DROPPED the texcoord). When export_data is set the 6-bit vector_dest IS the export index: VS 62=oPos, 0..15=interps; PS 0=RT0. The logo VS exports oPos(62), interp0(color), interp1(UV) distinctly. - Real interpolator passthrough: VsOut carries 8 interpolator locations; the PS seeds r[i] = in.interp[i] (Xenos PS-input-GPR mapping) so tfetch samples at the real interpolated texcoord (r1) instead of (0,0). - vfetch format 6 (k_16_16) packed-16 unpack + per-attribute dword offset, so the 3 vfetches sharing one fetch-constant (pos/UV/color in a 6-dword vertex) read the right attribute. Previously this rejected the whole logo VS to the interpreter. - QuadList/RectangleList host->guest vertex-index remap in the VS (replay is non-indexed): QuadList 6 host verts -> guest [0,1,2,0,2,3] (full quad). fetch.rs: decode vfetch `offset` (dword2[8:15], dwords), `is_signed`, `is_normalized`. Per-draw textures: DrawCapture carries the decoded texture(s) (keyed off the active PS's tfetch slots, attached in gpu_system after decode); render.rs::dispatch_xenos_captures uploads + binds each capture's first texture via the host texture cache before its draw, instead of one last-draw primary_texture. 
Determinism: all changes feed only the UI translator/capture path; `frame_captures` is `None` when headless. `check -n50m --gpu-inline --stable-digest --expect` is byte-identical (exit 0). 681 tests pass (+2 regression: logo VS now translates with interpolators; PS seeds interps into registers). Temp readback/dump probes removed. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -66,6 +66,13 @@ pub struct DrawCapture {
|
|||||||
/// already carries the render-target → wgpu Y-flip (negated).
|
/// already carries the render-target → wgpu Y-flip (negated).
|
||||||
pub ndc_scale: [f32; 2],
|
pub ndc_scale: [f32; 2],
|
||||||
pub ndc_offset: [f32; 2],
|
pub ndc_offset: [f32; 2],
|
||||||
|
/// iterate-3T: the decoded texture(s) this draw's active pixel shader
|
||||||
|
/// samples, keyed off its real `tfetch` fetch-constant slots (the 3M
|
||||||
|
/// decoder makes these decode). The UI uploads + binds the FIRST entry
|
||||||
|
/// per-draw so the textured logo samples the real artwork instead of the
|
||||||
|
/// magenta stub. Empty for flat (no-tfetch) draws. Populated by
|
||||||
|
/// `gpu_system` after decode (left empty by `build`).
|
||||||
|
pub textures: Vec<(crate::texture_cache::TextureKey, Vec<u8>)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// iterate-3S: compute the guest→host NDC XY transform for a draw, mirroring
|
/// iterate-3S: compute the guest→host NDC XY transform for a draw, mirroring
|
||||||
@@ -287,5 +294,6 @@ pub fn build(
|
|||||||
has_real_vertices: has_real,
|
has_real_vertices: has_real,
|
||||||
ndc_scale,
|
ndc_scale,
|
||||||
ndc_offset,
|
ndc_offset,
|
||||||
|
textures: Vec::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1413,6 +1413,18 @@ impl GpuSystem {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// iterate-3T: attach this draw's decoded textures to the just-
|
||||||
|
// captured draw so the UI can bind the real artwork per-draw
|
||||||
|
// (keyed off the active PS's real tfetch slots) instead of a
|
||||||
|
// single last-draw `primary_texture`. UI-only (`frame_captures`
|
||||||
|
// is `None` headless); does not touch the deterministic core.
|
||||||
|
if !self.last_draw_textures.is_empty()
|
||||||
|
&& let Some(caps) = self.frame_captures.as_mut()
|
||||||
|
&& let Some(last) = caps.last_mut()
|
||||||
|
{
|
||||||
|
last.textures = self.last_draw_textures.clone();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => {
|
pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => {
|
||||||
// payload[0] = offset_type — bits[10:0] index, bits[23:16] type
|
// payload[0] = offset_type — bits[10:0] index, bits[23:16] type
|
||||||
|
|||||||
@@ -115,9 +115,21 @@ struct XenosConstants {
|
|||||||
@group(1) @binding(0) var xenos_tex : texture_2d<f32>;
|
@group(1) @binding(0) var xenos_tex : texture_2d<f32>;
|
||||||
@group(1) @binding(1) var xenos_samp : sampler;
|
@group(1) @binding(1) var xenos_samp : sampler;
|
||||||
|
|
||||||
|
// iterate-3T: real interpolator passthrough. The Xenos VS exports up to 16
|
||||||
|
// interpolators (export index 0..15); the PS reads interpolator i from its
|
||||||
|
// general register r[i]. We carry 8 interpolator vec4s (covers Sylpheed's
|
||||||
|
// splash: r0=color, r1=texcoord). Location 0, formerly named `color`, is now
|
||||||
|
// `interp0`; single-color paths read it through that name.
|
||||||
struct VsOut {
|
struct VsOut {
|
||||||
@builtin(position) position: vec4<f32>,
|
@builtin(position) position: vec4<f32>,
|
||||||
@location(0) color: vec4<f32>,
|
@location(0) interp0: vec4<f32>,
|
||||||
|
@location(1) interp1: vec4<f32>,
|
||||||
|
@location(2) interp2: vec4<f32>,
|
||||||
|
@location(3) interp3: vec4<f32>,
|
||||||
|
@location(4) interp4: vec4<f32>,
|
||||||
|
@location(5) interp5: vec4<f32>,
|
||||||
|
@location(6) interp6: vec4<f32>,
|
||||||
|
@location(7) interp7: vec4<f32>,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct FsOut {
|
struct FsOut {
|
||||||
@@ -200,19 +212,56 @@ impl EmitCtx {
|
|||||||
self.push("var ps: f32 = 0.0;");
|
self.push("var ps: f32 = 0.0;");
|
||||||
match self.stage {
|
match self.stage {
|
||||||
Stage::Vertex => {
|
Stage::Vertex => {
|
||||||
|
// iterate-3T: host→guest vertex-index remap for primitives the
|
||||||
|
// replay draws non-indexed as a flat triangle list. wgpu has no
|
||||||
|
// QuadList/RectangleList topology, so the host issues 6 vertices
|
||||||
|
// per quad/rect and we map them back to the guest's 4/3 source
|
||||||
|
// vertices here (mirrors `primitive.rs` index rewrite, but in the
|
||||||
|
// VS since the replay path is non-indexed):
|
||||||
|
// QuadList(13): 6 host verts → guest [0,1,2, 0,2,3]
|
||||||
|
// RectangleList(8): drawn as one triangle [0,1,2] (the 4th
|
||||||
|
// corner needs cross-vertex synthesis — TODO), so host
|
||||||
|
// indices >=3 fold onto the existing triangle.
|
||||||
|
// Other prims pass through unchanged.
|
||||||
|
self.push("var gvidx: u32 = vidx;");
|
||||||
|
self.push("if (draw_ctx.prim_kind == 13u) {");
|
||||||
|
self.indent += 1;
|
||||||
|
self.push("let q = vidx % 6u; let qbase = (vidx / 6u) * 4u;");
|
||||||
|
self.push("var lut = array<u32, 6>(0u, 1u, 2u, 0u, 2u, 3u);");
|
||||||
|
self.push("gvidx = qbase + lut[q];");
|
||||||
|
self.indent -= 1;
|
||||||
|
self.push("} else if (draw_ctx.prim_kind == 8u) {");
|
||||||
|
self.indent += 1;
|
||||||
|
self.push("let t = vidx % 3u; let rbase = (vidx / 3u) * 3u;");
|
||||||
|
self.push("gvidx = rbase + t;");
|
||||||
|
self.indent -= 1;
|
||||||
|
self.push("}");
|
||||||
// Seed r0 with vertex index for simple shaders that read it.
|
// Seed r0 with vertex index for simple shaders that read it.
|
||||||
self.push("r[0] = vec4<f32>(f32(vidx), 0.0, 0.0, 1.0);");
|
self.push("r[0] = vec4<f32>(f32(gvidx), 0.0, 0.0, 1.0);");
|
||||||
// Synthetic export slots — match the interpreter's layout so
|
// iterate-3T: real export model. Xenos export index 62 = oPos;
|
||||||
// the fallback path and translator path produce the same
|
// indices 0..15 = interpolators. We hold position + 8
|
||||||
// visual output on shaders both support.
|
// interpolator vec4s; `emit_export` writes the right slot keyed
|
||||||
|
// on the export index. Seed interp0 to white so a VS that only
|
||||||
|
// exports position still yields a visible (non-zero) color.
|
||||||
self.push("var opos: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);");
|
self.push("var opos: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);");
|
||||||
self.push("var ocolor: vec4<f32> = vec4<f32>(1.0, 1.0, 1.0, 1.0);");
|
self.push("var ointerp: array<vec4<f32>, 8>;");
|
||||||
|
self.push("for (var i = 0u; i < 8u; i = i + 1u) { ointerp[i] = vec4<f32>(0.0, 0.0, 0.0, 1.0); }");
|
||||||
|
self.push("ointerp[0] = vec4<f32>(1.0, 1.0, 1.0, 1.0);");
|
||||||
}
|
}
|
||||||
Stage::Pixel => {
|
Stage::Pixel => {
|
||||||
// Seed r0.xy with interpolated color lane so trivial shaders
|
// iterate-3T: the PS reads interpolator i from general register
|
||||||
// that read r0 still produce something.
|
// r[i] (Xenos PS input GPR mapping). Seed r0..r7 from the VS's
|
||||||
self.push("r[0] = in.color;");
|
// interpolators so e.g. the logo PS's texcoord (r1) and color
|
||||||
self.push("var ocolor0: vec4<f32> = in.color;");
|
// (r0) arrive correctly; tfetch then samples at the real UV.
|
||||||
|
self.push("r[0] = in.interp0;");
|
||||||
|
self.push("r[1] = in.interp1;");
|
||||||
|
self.push("r[2] = in.interp2;");
|
||||||
|
self.push("r[3] = in.interp3;");
|
||||||
|
self.push("r[4] = in.interp4;");
|
||||||
|
self.push("r[5] = in.interp5;");
|
||||||
|
self.push("r[6] = in.interp6;");
|
||||||
|
self.push("r[7] = in.interp7;");
|
||||||
|
self.push("var ocolor0: vec4<f32> = in.interp0;");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -269,7 +318,14 @@ impl EmitCtx {
|
|||||||
self.indent -= 1;
|
self.indent -= 1;
|
||||||
self.push("}");
|
self.push("}");
|
||||||
self.push("out.position = opos;");
|
self.push("out.position = opos;");
|
||||||
self.push("out.color = ocolor;");
|
self.push("out.interp0 = ointerp[0];");
|
||||||
|
self.push("out.interp1 = ointerp[1];");
|
||||||
|
self.push("out.interp2 = ointerp[2];");
|
||||||
|
self.push("out.interp3 = ointerp[3];");
|
||||||
|
self.push("out.interp4 = ointerp[4];");
|
||||||
|
self.push("out.interp5 = ointerp[5];");
|
||||||
|
self.push("out.interp6 = ointerp[6];");
|
||||||
|
self.push("out.interp7 = ointerp[7];");
|
||||||
self.push("return out;");
|
self.push("return out;");
|
||||||
}
|
}
|
||||||
Stage::Pixel => {
|
Stage::Pixel => {
|
||||||
@@ -398,20 +454,35 @@ impl EmitCtx {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn emit_export(&mut self, dst_reg: u8, alloc: AllocKind, expr: &str, mask: u8) {
|
fn emit_export(&mut self, dst_reg: u8, alloc: AllocKind, expr: &str, mask: u8) {
|
||||||
// Xenos's export "register" indexing within an alloc range is
|
// iterate-3T: real Xenos export-index model (replaces the `AllocKind`
|
||||||
// normally (alloc_base + offset). Since our CF stream doesn't
|
// heuristic, which collapsed every VS export to a single color slot and
|
||||||
// carry per-export slot offsets cleanly, use `alloc` to pick the
|
// dropped the texcoord interpolator → tfetch sampled (0,0) → flat).
|
||||||
// target.
|
// When `export_data` is set the 6-bit vector_dest IS the export index:
|
||||||
let lhs = match (self.stage, alloc) {
|
// VS: 62 = oPos, 63 = oPointSize/edge (ignored), 0..15 = interpolators.
|
||||||
(Stage::Vertex, AllocKind::Position) => "opos",
|
// PS: 0..3 = color render targets (we honor RT0).
|
||||||
(Stage::Vertex, AllocKind::Interpolators) => "ocolor",
|
let _ = alloc;
|
||||||
(Stage::Vertex, AllocKind::Colors) => "ocolor",
|
match self.stage {
|
||||||
(Stage::Vertex, _) => "ocolor", // fall through — any other alloc
|
Stage::Vertex => {
|
||||||
(Stage::Pixel, AllocKind::Colors) => "ocolor0",
|
let lhs = if dst_reg == 62 {
|
||||||
(Stage::Pixel, _) => "ocolor0",
|
"opos".to_string()
|
||||||
};
|
} else if dst_reg <= 15 {
|
||||||
let _ = dst_reg; // per-slot export indexing reserved for a richer v2
|
// Clamp to the 8 interpolator slots we carry; higher slots
|
||||||
self.emit_masked_write(lhs, expr, mask);
|
// are unused by Sylpheed's splash.
|
||||||
|
let i = (dst_reg as usize).min(7);
|
||||||
|
format!("ointerp[{i}u]")
|
||||||
|
} else {
|
||||||
|
// oPointSize (63) / unknown export slot — discard.
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
self.emit_masked_write(&lhs, expr, mask);
|
||||||
|
}
|
||||||
|
Stage::Pixel => {
|
||||||
|
// Only RT0 (export index 0) is wired to the single host target.
|
||||||
|
if dst_reg == 0 {
|
||||||
|
self.emit_masked_write("ocolor0", expr, mask);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn emit_vfetch(&mut self, vf: &crate::ucode::fetch::VertexFetch) -> Result<(), &'static str> {
|
fn emit_vfetch(&mut self, vf: &crate::ucode::fetch::VertexFetch) -> Result<(), &'static str> {
|
||||||
@@ -426,31 +497,70 @@ impl EmitCtx {
|
|||||||
// GPUBUG-102: the fetch constant holds the endian field in dword_1's
|
// GPUBUG-102: the fetch constant holds the endian field in dword_1's
|
||||||
// low 2 bits; Xbox 360 vertex data is big-endian, so `gpu_swap` undoes
|
// low 2 bits; Xbox 360 vertex data is big-endian, so `gpu_swap` undoes
|
||||||
// it per component.
|
// it per component.
|
||||||
let (comps, stride): (u32, u32) = match vf.format {
|
// (comps, dwords_read) per format. Float formats are 1 dword/component;
|
||||||
36 => (1, 1), // k_32_FLOAT
|
// iterate-3T adds the packed-16 `k_16_16` (format 6) used for the logo
|
||||||
37 => (2, 2), // k_32_32_FLOAT
|
// UV interpolator — 2 components packed into ONE dword.
|
||||||
57 => (3, 3), // k_32_32_32_FLOAT
|
#[derive(PartialEq)]
|
||||||
38 => (4, 4), // k_32_32_32_32_FLOAT
|
enum Pack {
|
||||||
|
Float, // N f32 lanes, N dwords
|
||||||
|
Norm16x2, // 2× u16 normalized into [0,1], 1 dword (k_16_16)
|
||||||
|
}
|
||||||
|
let (comps, dwords_read, pack): (u32, u32, Pack) = match vf.format {
|
||||||
|
36 => (1, 1, Pack::Float), // k_32_FLOAT
|
||||||
|
37 => (2, 2, Pack::Float), // k_32_32_FLOAT
|
||||||
|
57 => (3, 3, Pack::Float), // k_32_32_32_FLOAT
|
||||||
|
38 => (4, 4, Pack::Float), // k_32_32_32_32_FLOAT
|
||||||
|
6 => (2, 1, Pack::Norm16x2), // k_16_16 (UV)
|
||||||
_ => return Err(reject::VFETCH_FMT),
|
_ => return Err(reject::VFETCH_FMT),
|
||||||
};
|
};
|
||||||
// A stride of 0 in the instruction means "use the fetch-constant
|
// A stride of 0 in the instruction means "use the fetch-constant
|
||||||
// stride"; fall back to the tightly packed component count.
|
// stride"; fall back to the tightly packed dword count.
|
||||||
let stride = if vf.stride != 0 { vf.stride as u32 } else { stride };
|
let stride = if vf.stride != 0 { vf.stride as u32 } else { dwords_read };
|
||||||
|
// iterate-3T: per-attribute dword offset within the vertex (vfetches
|
||||||
|
// sharing one fetch constant read different attributes).
|
||||||
|
let attr_off = vf.offset;
|
||||||
let fetch_const = (vf.raw[0] >> 5) & 0x1F;
|
let fetch_const = (vf.raw[0] >> 5) & 0x1F;
|
||||||
let src_reg = vf.src_register & 0x7F;
|
let src_reg = vf.src_register & 0x7F;
|
||||||
let dst_reg = vf.dest_register & 0x7F;
|
let dst_reg = vf.dest_register & 0x7F;
|
||||||
|
// is_signed selects [-1,1] vs [0,1] for normalized integer formats.
|
||||||
|
let signed = vf.is_signed;
|
||||||
// Build the per-component reads; unread lanes default to 0/0/0/1 so an
|
// Build the per-component reads; unread lanes default to 0/0/0/1 so an
|
||||||
// XY-only position keeps W=1 (and Z=0).
|
// XY-only position keeps W=1 (and Z=0).
|
||||||
let lane = |i: u32| -> String {
|
let lane = |i: u32| -> String {
|
||||||
if i < comps {
|
match pack {
|
||||||
format!("bitcast<f32>(gpu_swap(vertex_buffer[addr + {i}u], endian))")
|
Pack::Float => {
|
||||||
} else if i == 3 {
|
if i < comps {
|
||||||
"1.0".to_string()
|
format!("bitcast<f32>(gpu_swap(vertex_buffer[addr + {i}u], endian))")
|
||||||
} else {
|
} else if i == 3 {
|
||||||
"0.0".to_string()
|
"1.0".to_string()
|
||||||
|
} else {
|
||||||
|
"0.0".to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Pack::Norm16x2 => {
|
||||||
|
// One dword holds [u16 lo | u16 hi] after the endian swap.
|
||||||
|
// Component 0 = low halfword, component 1 = high halfword.
|
||||||
|
if i == 0 {
|
||||||
|
if signed {
|
||||||
|
"(max(f32(i32(w16 << 16u) >> 16u) / 32767.0, -1.0))".to_string()
|
||||||
|
} else {
|
||||||
|
"(f32(w16 & 0xFFFFu) / 65535.0)".to_string()
|
||||||
|
}
|
||||||
|
} else if i == 1 {
|
||||||
|
if signed {
|
||||||
|
"(max(f32(i32(w16) >> 16u) / 32767.0, -1.0))".to_string()
|
||||||
|
} else {
|
||||||
|
"(f32(w16 >> 16u) / 65535.0)".to_string()
|
||||||
|
}
|
||||||
|
} else if i == 3 {
|
||||||
|
"1.0".to_string()
|
||||||
|
} else {
|
||||||
|
"0.0".to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
let read_bound = comps - 1;
|
let read_bound = dwords_read - 1;
|
||||||
// GPUBUG-108 (iterate-3S): for the captured-geometry path the CPU
|
// GPUBUG-108 (iterate-3S): for the captured-geometry path the CPU
|
||||||
// uploads a vertex window that begins EXACTLY at the fetch base, so the
|
// uploads a vertex window that begins EXACTLY at the fetch base, so the
|
||||||
// base within `vertex_buffer` is 0 and vertex i sits at `i * stride`.
|
// base within `vertex_buffer` is 0 and vertex i sits at `i * stride`.
|
||||||
@@ -464,6 +574,13 @@ impl EmitCtx {
|
|||||||
// real window is present (`vertex_base_dwords != 0`); only the
|
// real window is present (`vertex_base_dwords != 0`); only the
|
||||||
// synthetic/no-window fallback consults the uniform fetch constant.
|
// synthetic/no-window fallback consults the uniform fetch constant.
|
||||||
let endian_term = format!("xenos_consts.fetch[{}u] & 0x3u", fetch_const * 2 + 1);
|
let endian_term = format!("xenos_consts.fetch[{}u] & 0x3u", fetch_const * 2 + 1);
|
||||||
|
// For packed-16 we read one dword into `w16` (post endian-swap) and the
|
||||||
|
// `lane()` exprs above unpack the two halfwords.
|
||||||
|
let w16_decl = if pack == Pack::Norm16x2 {
|
||||||
|
"let w16 = gpu_swap(vertex_buffer[addr], endian); "
|
||||||
|
} else {
|
||||||
|
""
|
||||||
|
};
|
||||||
self.push(&format!(
|
self.push(&format!(
|
||||||
"{{ let endian = {endian_term}; \
|
"{{ let endian = {endian_term}; \
|
||||||
let vidx = u32(r[{src_reg}u].x); \
|
let vidx = u32(r[{src_reg}u].x); \
|
||||||
@@ -471,9 +588,10 @@ impl EmitCtx {
|
|||||||
if (draw_ctx.vertex_base_dwords == 0u) {{ \
|
if (draw_ctx.vertex_base_dwords == 0u) {{ \
|
||||||
base = (xenos_consts.fetch[{fc0_idx}u] & 0xFFFFFFFCu) >> 2u; \
|
base = (xenos_consts.fetch[{fc0_idx}u] & 0xFFFFFFFCu) >> 2u; \
|
||||||
}} \
|
}} \
|
||||||
let addr = base + vidx * {stride}u; \
|
let addr = base + vidx * {stride}u + {attr_off}u; \
|
||||||
let n = arrayLength(&vertex_buffer); \
|
let n = arrayLength(&vertex_buffer); \
|
||||||
if (addr + {read_bound}u < n) {{ \
|
if (addr + {read_bound}u < n) {{ \
|
||||||
|
{w16_decl}\
|
||||||
r[{dst_reg}u] = vec4<f32>({l0}, {l1}, {l2}, {l3}); \
|
r[{dst_reg}u] = vec4<f32>({l0}, {l1}, {l2}, {l3}); \
|
||||||
}} }}",
|
}} }}",
|
||||||
fc0_idx = fetch_const * 2,
|
fc0_idx = fetch_const * 2,
|
||||||
@@ -626,6 +744,54 @@ mod tests {
|
|||||||
use crate::ucode::alu::{sop, vop};
|
use crate::ucode::alu::{sop, vop};
|
||||||
use crate::ucode::control_flow::ControlFlowInstruction;
|
use crate::ucode::control_flow::ControlFlowInstruction;
|
||||||
|
|
||||||
|
/// iterate-3T: the real publisher-logo VS (`vs_key 0x03b7b020`, captured
|
||||||
|
/// from the live boot) must now TRANSLATE — pre-3T it rejected with
|
||||||
|
/// `vfetch_fmt` because (a) the `k_16_16` color stream (format 6) was
|
||||||
|
/// unsupported and (b) the export-index model (62=oPos, 0/1=interpolators)
|
||||||
|
/// was a wrong AllocKind heuristic. This locks in the format-6 + per-
|
||||||
|
/// attribute-offset + export-index work so the UV interpolator reaches the
|
||||||
|
/// pixel shader (texcoord in r1) instead of collapsing to a single color.
|
||||||
|
#[test]
|
||||||
|
fn real_logo_vs_translates_with_interpolators() {
|
||||||
|
let ucode: [u32; 30] = [
|
||||||
|
0x70153003, 0x00001200, 0xC2000000, 0x00001006, 0x00001200, 0xC4000000,
|
||||||
|
0x00002007, 0x00002200, 0x00000000, 0x2DF82000, 0x00393A88, 0x00000006,
|
||||||
|
0x05F81000, 0x4006060A, 0x00000306, 0x05F80000, 0x40253FC8, 0x00000406,
|
||||||
|
0xC80F803E, 0x00000000, 0xC2020200, 0xC8038001, 0x00B0B000, 0xC2000000,
|
||||||
|
0xC80F8000, 0x00000000, 0xC2010100, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
];
|
||||||
|
let p = crate::ucode::parse_shader(&ucode);
|
||||||
|
let body = match translate(&p, Stage::Vertex) {
|
||||||
|
Translation::Ok(b) => b,
|
||||||
|
Translation::Reject(r) => panic!("logo VS rejected: {r}"),
|
||||||
|
};
|
||||||
|
// Position must come from the export-index-62 path (`opos`) and the
|
||||||
|
// UV/color interpolators must be exported as distinct slots.
|
||||||
|
assert!(body.contains("opos ="), "no position export: {body}");
|
||||||
|
assert!(body.contains("ointerp[0u]"), "no interp0 export: {body}");
|
||||||
|
assert!(body.contains("ointerp[1u]"), "no interp1 export: {body}");
|
||||||
|
// The k_16_16 attribute must unpack via the packed-16 helper.
|
||||||
|
assert!(body.contains("w16"), "no packed-16 unpack for k_16_16: {body}");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The logo pixel shader (`ps_key 0x03b79001`) samples its texture at the
|
||||||
|
/// interpolated texcoord register r1 — which the PS now seeds from the VS
|
||||||
|
/// interpolator `in.interp1` (Xenos PS-input-GPR mapping). Verifies the UV
|
||||||
|
/// chain so tfetch samples the real UV instead of (0,0).
|
||||||
|
#[test]
|
||||||
|
fn ps_seeds_interpolators_into_registers() {
|
||||||
|
// A trivial PS that just exports — we only assert the preamble wiring.
|
||||||
|
let p = crate::ucode::ParsedShader {
|
||||||
|
cf: vec![ControlFlowInstruction::Exit],
|
||||||
|
instructions: vec![],
|
||||||
|
};
|
||||||
|
let body = match translate(&p, Stage::Pixel) {
|
||||||
|
Translation::Ok(b) => b,
|
||||||
|
Translation::Reject(r) => panic!("trivial PS rejected: {r}"),
|
||||||
|
};
|
||||||
|
assert!(body.contains("r[1] = in.interp1;"), "PS must seed r1 from interp1: {body}");
|
||||||
|
}
|
||||||
|
|
||||||
fn synthetic_trivial_shader() -> ParsedShader {
|
fn synthetic_trivial_shader() -> ParsedShader {
|
||||||
// Single Exec clause: ALU add r0 = r0 + r0; scalar_op = RETAIN_PREV
|
// Single Exec clause: ALU add r0 = r0 + r0; scalar_op = RETAIN_PREV
|
||||||
// with full write-mask on vector, zero on scalar. Alloc(Position)
|
// with full write-mask on vector, zero on scalar. Alloc(Position)
|
||||||
@@ -799,6 +965,9 @@ mod tests {
|
|||||||
dest_write_mask: 0xF,
|
dest_write_mask: 0xF,
|
||||||
format: 38, // k_32_32_32_32_FLOAT (4 floats)
|
format: 38, // k_32_32_32_32_FLOAT (4 floats)
|
||||||
stride: 4,
|
stride: 4,
|
||||||
|
offset: 0,
|
||||||
|
is_signed: false,
|
||||||
|
is_normalized: true,
|
||||||
raw: [0; 3],
|
raw: [0; 3],
|
||||||
};
|
};
|
||||||
ctx.emit_vfetch(&vf).expect("emit_vfetch");
|
ctx.emit_vfetch(&vf).expect("emit_vfetch");
|
||||||
|
|||||||
@@ -34,6 +34,19 @@ pub struct VertexFetch {
|
|||||||
pub format: u8,
|
pub format: u8,
|
||||||
/// Dword stride between consecutive vertices (dword2[0:7]).
|
/// Dword stride between consecutive vertices (dword2[0:7]).
|
||||||
pub stride: u8,
|
pub stride: u8,
|
||||||
|
/// iterate-3T: dword offset of THIS attribute within the vertex stride
|
||||||
|
/// (decoded here from dword2[8:15]; canary's `VertexFetchInstruction` reportedly declares a 23-bit field — TODO confirm).
|
||||||
|
/// A 6-dword vertex with position@0 + UV@2 + extra@3 needs this so the
|
||||||
|
/// three vfetches sharing one fetch-constant read different attributes
|
||||||
|
/// instead of all reading offset 0.
|
||||||
|
pub offset: u32,
|
||||||
|
/// iterate-3T: `is_signed` (dword1 bit 24 as decoded here; TODO confirm vs canary) — selects signed vs
|
||||||
|
/// unsigned interpretation of packed integer formats.
|
||||||
|
pub is_signed: bool,
|
||||||
|
/// iterate-3T: `is_normalized` — canary inverts it: dword1 bit 25 (as decoded here; TODO confirm) set means
|
||||||
|
/// the value is taken as an *integer* (un-normalized); clear means
|
||||||
|
/// normalized to [0,1] / [-1,1]. We store the normalized sense directly.
|
||||||
|
pub is_normalized: bool,
|
||||||
pub raw: [u32; 3],
|
pub raw: [u32; 3],
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -81,9 +94,15 @@ pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction {
|
|||||||
src_register: ((w0 >> 5) & 0x3F) as u8,
|
src_register: ((w0 >> 5) & 0x3F) as u8,
|
||||||
dest_register: ((w0 >> 12) & 0x3F) as u8,
|
dest_register: ((w0 >> 12) & 0x3F) as u8,
|
||||||
dest_write_mask: (w1 & 0xF) as u8,
|
dest_write_mask: (w1 & 0xF) as u8,
|
||||||
// dword1[16:21] = VertexFormat; dword2[0:7] = dword stride.
|
// dword1[16:21] = VertexFormat. dword2: stride[0:7],
|
||||||
|
// offset (in dwords) [8:?] — empirically the attribute offset of
|
||||||
|
// the textured logo VS lands in dword2[8:15] (pos@4, UV@3,
|
||||||
|
// 3-float@0 in a 6-dword vertex). signed/normalized live higher.
|
||||||
format: ((w1 >> 16) & 0x3F) as u8,
|
format: ((w1 >> 16) & 0x3F) as u8,
|
||||||
stride: (w2 & 0xFF) as u8,
|
stride: (w2 & 0xFF) as u8,
|
||||||
|
offset: (w2 >> 8) & 0xFF,
|
||||||
|
is_signed: ((w1 >> 24) & 1) != 0,
|
||||||
|
is_normalized: ((w1 >> 25) & 1) == 0,
|
||||||
raw: words,
|
raw: words,
|
||||||
}),
|
}),
|
||||||
op::TEXTURE_FETCH => FetchInstruction::Texture(TextureFetch {
|
op::TEXTURE_FETCH => FetchInstruction::Texture(TextureFetch {
|
||||||
|
|||||||
@@ -748,6 +748,39 @@ impl RenderState {
|
|||||||
label: Some("xenos capture replay"),
|
label: Some("xenos capture replay"),
|
||||||
});
|
});
|
||||||
for cap in captures {
|
for cap in captures {
|
||||||
|
// iterate-3T: bind this draw's REAL decoded texture (keyed off the
|
||||||
|
// active PS's tfetch slot, attached in `gpu_system`) so the textured
|
||||||
|
// logo samples the artwork. `None` reverts to the magenta stub for
|
||||||
|
// flat draws. Each `set_texture_view` rebuilds the tex bind group;
|
||||||
|
// the subsequent `render_one*` reads it, so per-draw binding works
|
||||||
|
// even though all draws share one encoder.
|
||||||
|
{
|
||||||
|
let Self {
|
||||||
|
device,
|
||||||
|
queue,
|
||||||
|
xenos_pipeline,
|
||||||
|
host_texture_cache,
|
||||||
|
..
|
||||||
|
} = self;
|
||||||
|
match cap.textures.first() {
|
||||||
|
Some((key, bytes)) => {
|
||||||
|
// Stable version: identical (key,bytes) across draws
|
||||||
|
// reuse the uploaded wgpu texture (the splash artwork is
|
||||||
|
// static). A genuine content change arrives as a new key
|
||||||
|
// (base_address/dims) from the decoder.
|
||||||
|
let cached = xenia_gpu::texture_cache::CachedTexture {
|
||||||
|
key: *key,
|
||||||
|
version_when_uploaded: 1,
|
||||||
|
bytes: bytes.clone(),
|
||||||
|
};
|
||||||
|
host_texture_cache.upload(device, queue, &cached);
|
||||||
|
if let Some(view) = host_texture_cache.view_for(key) {
|
||||||
|
xenos_pipeline.set_texture_view(device, Some(view));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => xenos_pipeline.set_texture_view(device, None),
|
||||||
|
}
|
||||||
|
}
|
||||||
let raw_vs = shader_blobs.get(&cap.vs_key).cloned().unwrap_or_default();
|
let raw_vs = shader_blobs.get(&cap.vs_key).cloned().unwrap_or_default();
|
||||||
let raw_ps = shader_blobs.get(&cap.ps_key).cloned().unwrap_or_default();
|
let raw_ps = shader_blobs.get(&cap.ps_key).cloned().unwrap_or_default();
|
||||||
let parsed_vs = xenia_gpu::ucode::parse_shader(&raw_vs);
|
let parsed_vs = xenia_gpu::ucode::parse_shader(&raw_vs);
|
||||||
|
|||||||
Reference in New Issue
Block a user