Files
xenia-rs/crates/xenia-gpu/src/shader_metrics.rs
MechaCat02 2f55d1fd7d [iterate-2X] Texture pipeline: un-stub RectangleList + draw-time texture decode
Two faithful, deterministic GPU-backend changes that make the texture path
correct for whatever textured draw the splash eventually dispatches. Both are
currently inert on Sylpheed (the textured logo draw is still gated downstream
— see below), but neither shifts the stable-digest golden, so they land safely.

1. Un-stub RectangleList primitive expansion (primitive.rs). The splash submits
   2819 RectangleList draws at 200M, all of which were REJECTED by the P3 stub
   (`gpu.primitive.rejected{rectangle_list}`) → only ~592 flat point/quad draws
   rasterized. Mirror canary's intent (primitive_processor.cc:389-456
   kRectangleListAsTriangleStrip) within our CPU index-rewrite idiom: emit each
   rect's 3 real vertices as one TriangleList triangle (v0,v1,v2), rejected=false,
   faithful host_vertex_count. The full quad (synthesized 4th corner v3=v0+v2-v1)
   needs real vertex fetch in vs_main — left as a documented TODO. Rejection
   warnings drop 2819→0.

2. Draw-time texture decode keyed off the active PS's real tfetch slots
   (gpu_system.rs + exports.rs vd_swap). Previously vd_swap decoded a hardcoded
   fetch-constant slot 0 at swap time. Now the DRAW handler parses the bound
   pixel shader (ucode::parse_shader), collects its tfetch fetch_const slots via
   new shader_metrics::tfetch_slots, reads each 6-dword fetch constant, and
   decode+caches it into GpuSystem::last_draw_textures. vd_swap publishes the
   first of these (UI binds one texture today), falling back to the legacy slot-0
   probe on flat-only frames. New span_max_version helper walks page_version over
   the trait (draw-time &dyn MemoryAccess lacks the heap's inherent
   max_page_version). Pure function of guest writes — deterministic.

Status: texture_decodes stays 0 on Sylpheed because all 6 live shaders are flat
(no tfetch); canary's textured logo shaders E59B2B3D/F7B1457 are not yet
dispatched by our implementation (a downstream title-state gate, the next
frontier). The full P5 decode→publish→upload→sample path is already wired; this
makes the decode side key off the real shader instead of a guess.

Validation: stable-digest golden sylpheed_n50m unchanged (draws=718 swaps=147
tex=0), regenerated twice byte-identical; 200M run shows 0 RectangleList
rejections. cargo test --workspace green (677, +2: rectangle_list_expansion,
tfetch_slots_extracts_texture_fetch_constants). No temp hooks. Branch only;
not pushed/merged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-14 21:34:43 +02:00

434 lines
16 KiB
Rust

//! Host-side static analysis over a [`ParsedShader`], emitted once per unique
//! shader blob. Produces the observability the plan's P3b/P3c sections call
//! for (`gpu.shader.interpret{stage,kind}` + `gpu.shader.reject{reason}`), so
//! the HUD can show when a game is reaching ops the WGSL interpreter falls
//! back on.
//!
//! Analysis is intentionally cheap: it scans each exec clause's instruction
//! triples, classifies them as ALU / vertex-fetch / texture-fetch using the
//! owning clause's sequence bitmap, and bumps counters accordingly. No GPU
//! readback is required — `reject` reasons are inferred from opcode values
//! alone.
use metrics::counter;
use crate::ucode::alu::{decode_alu, sop, vop};
use crate::ucode::control_flow::ControlFlowInstruction;
use crate::ucode::fetch::{FetchInstruction, decode_fetch};
use crate::ucode::ParsedShader;
/// Walk `parsed` once and emit `gpu.shader.interpret` + `gpu.shader.reject`
/// counters. `stage` should be `"vs"` or `"ps"`.
pub fn emit_for(parsed: &ParsedShader, stage: &'static str) {
let mut alu_count: u64 = 0;
let mut vfetch_count: u64 = 0;
let mut tfetch_count: u64 = 0;
let mut rejects: Vec<(&'static str, u64)> = Vec::new();
let mut features: Vec<&'static str> = Vec::new();
for clause in &parsed.cf {
match clause {
ControlFlowInstruction::Exec {
address,
count,
sequence,
..
} => {
for i in 0..(*count as usize) {
let triple_idx = *address as usize + i;
let base = triple_idx * 3;
if base + 2 >= parsed.instructions.len() {
break;
}
let words = [
parsed.instructions[base],
parsed.instructions[base + 1],
parsed.instructions[base + 2],
];
// sequence bit layout: 2 bits per triple, hi bit = is-fetch.
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
if is_fetch {
match decode_fetch(words) {
FetchInstruction::Vertex(_) => vfetch_count += 1,
FetchInstruction::Texture(tf) => {
tfetch_count += 1;
match tf.dimension {
0 => mark_feature(&mut features, "tfetch_1d"),
2 => mark_feature(&mut features, "tfetch_3d"),
3 => mark_feature(&mut features, "tfetch_cube"),
_ => {}
}
if tf.dimension != 1 {
bump(&mut rejects, "texfetch_dimension");
}
}
FetchInstruction::Unknown { .. } => {
bump(&mut rejects, "fetch_unknown");
}
}
} else {
alu_count += 1;
let alu = decode_alu(words);
if !vec_op_supported(alu.vector_opcode) {
bump(&mut rejects, "alu_vec_unsupported");
}
if !scl_op_supported(alu.scalar_opcode) {
bump(&mut rejects, "alu_scl_unsupported");
}
// Feature-of-interest detection for future phases.
// Transcendentals + kill + setp + cube/max4 are the
// high-value signals: they tell us which of the
// deferred capabilities Sylpheed actually exercises.
match alu.vector_opcode {
v if v == vop::CUBE => mark_feature(&mut features, "vec_cube"),
v if v == vop::MAX4 => mark_feature(&mut features, "vec_max4"),
v if v == vop::KILL_EQ
|| v == vop::KILL_GT
|| v == vop::KILL_GE
|| v == vop::KILL_NE =>
{
mark_feature(&mut features, "vec_kill");
}
v if v == vop::CND_EQ
|| v == vop::CND_GE
|| v == vop::CND_GT =>
{
mark_feature(&mut features, "vec_cnd");
}
_ => {}
}
match alu.scalar_opcode {
s if s == sop::EXP
|| s == sop::LOG
|| s == sop::LOGC
|| s == sop::SIN
|| s == sop::COS =>
{
mark_feature(&mut features, "scl_transcendental");
}
s if s == sop::RSQ
|| s == sop::RSQC
|| s == sop::RSQF
|| s == sop::SQRT =>
{
mark_feature(&mut features, "scl_sqrt_family");
}
s if s == sop::SETP_EQ
|| s == sop::SETP_NE
|| s == sop::SETP_GT
|| s == sop::SETP_GE
|| s == sop::SETP_INV
|| s == sop::SETP_POP
|| s == sop::SETP_CLR
|| s == sop::SETP_RSTR =>
{
mark_feature(&mut features, "scl_setp");
}
s if s == sop::KILLS_EQ
|| s == sop::KILLS_GT
|| s == sop::KILLS_GE
|| s == sop::KILLS_NE
|| s == sop::KILLS_ONE =>
{
mark_feature(&mut features, "scl_kills");
}
_ => {}
}
if alu.predicated {
mark_feature(&mut features, "alu_predicated");
}
}
}
}
ControlFlowInstruction::LoopStart { .. }
| ControlFlowInstruction::LoopEnd { .. } => {
mark_feature(&mut features, "cf_loop");
bump(&mut rejects, "cf_loop");
}
ControlFlowInstruction::CondJmp { .. } => {
mark_feature(&mut features, "cf_cond_jmp");
bump(&mut rejects, "cf_cond_jmp");
}
ControlFlowInstruction::CondCall { .. } | ControlFlowInstruction::Return => {
mark_feature(&mut features, "cf_call_return");
bump(&mut rejects, "cf_call_return");
}
ControlFlowInstruction::Unknown { .. } => {
bump(&mut rejects, "cf_unknown");
}
_ => {}
}
}
counter!("gpu.shader.interpret", "stage" => stage, "kind" => "alu")
.increment(alu_count);
counter!("gpu.shader.interpret", "stage" => stage, "kind" => "vfetch")
.increment(vfetch_count);
counter!("gpu.shader.interpret", "stage" => stage, "kind" => "tfetch")
.increment(tfetch_count);
for (reason, n) in rejects {
counter!("gpu.shader.reject", "stage" => stage, "reason" => reason).increment(n);
}
for name in features {
counter!("gpu.feature.used", "stage" => stage, "name" => name).increment(1);
}
}
/// Return the distinct texture-fetch-constant slots referenced by `parsed`.
///
/// Only `Exec` clauses are inspected. Within each clause the triples flagged
/// as fetches by the sequence bitmap are decoded, and the `fetch_const` of
/// every texture fetch is recorded. The result contains each slot once, in the
/// order it was first encountered; a shader without texture fetches yields an
/// empty vector. Scanning of a clause stops at the first triple that does not
/// lie fully inside `parsed.instructions`.
///
/// The GPU draw handler uses this to pick which fetch constants to decode and
/// cache at draw time, based on the *active* pixel shader's real `tfetch`
/// instructions instead of a hardcoded slot.
pub fn tfetch_slots(parsed: &ParsedShader) -> Vec<u8> {
    let mut unique: Vec<u8> = Vec::new();

    for clause in &parsed.cf {
        let (first_triple, triple_count, sequence) = match clause {
            ControlFlowInstruction::Exec {
                address,
                count,
                sequence,
                ..
            } => (*address as usize, *count as usize, sequence),
            _ => continue,
        };

        for slot in 0..triple_count {
            let base = (first_triple + slot) * 3;
            let words = match parsed.instructions.get(base..base + 3) {
                Some(&[w0, w1, w2]) => [w0, w1, w2],
                _ => break,
            };

            // Two sequence bits per triple; the upper bit of each pair is
            // read as the "is fetch" flag.
            // NOTE(review): other a2xx/Xenos decoders (Xenia, Mesa freedreno)
            // appear to use the LOW bit for fetch — confirm the parser's
            // `sequence` layout matches what is assumed here.
            let is_fetch = ((sequence >> (slot * 2 + 1)) & 1) != 0;
            if is_fetch {
                if let FetchInstruction::Texture(tf) = decode_fetch(words) {
                    let already_listed = unique.iter().any(|seen| *seen == tf.fetch_const);
                    if !already_listed {
                        unique.push(tf.fetch_const);
                    }
                }
            }
        }
    }

    unique
}
/// Append `name` to `buf` unless it is already present, so `buf` keeps each
/// feature label once, in first-seen order.
fn mark_feature(buf: &mut Vec<&'static str>, name: &'static str) {
    let already_present = buf.iter().any(|seen| *seen == name);
    if already_present {
        return;
    }
    buf.push(name);
}
/// Increment the tally stored for `reason` in `buf`, appending a new
/// `(reason, 1)` entry when the reason has not been recorded yet.
fn bump(buf: &mut Vec<(&'static str, u64)>, reason: &'static str) {
    match buf.iter_mut().find(|entry| entry.0 == reason) {
        Some(entry) => entry.1 += 1,
        None => buf.push((reason, 1)),
    }
}
/// Report whether vector-pipe opcode `op` is in the supported set.
///
/// `emit_for` counts an `alu_vec_unsupported` reject for every ALU triple
/// whose vector opcode is not listed here.
fn vec_op_supported(op: u8) -> bool {
    const SUPPORTED: &[u8] = &[
        // Arithmetic and min/max.
        vop::ADD,
        vop::MUL,
        vop::MAX,
        vop::MIN,
        // Set-on-compare.
        vop::SEQ,
        vop::SGT,
        vop::SGE,
        vop::SNE,
        // Rounding.
        vop::FRC,
        vop::TRUNC,
        vop::FLOOR,
        // Multiply-add and conditional select.
        vop::MAD,
        vop::CND_EQ,
        vop::CND_GE,
        vop::CND_GT,
        // Dot products and max4.
        vop::DOT4,
        vop::DOT3,
        vop::DOT2_ADD,
        vop::MAX4,
        // Kill variants.
        vop::KILL_EQ,
        vop::KILL_GT,
        vop::KILL_GE,
        vop::KILL_NE,
        vop::DST,
    ];
    SUPPORTED.contains(&op)
}
fn scl_op_supported(op: u8) -> bool {
matches!(
op,
sop::ADDS
| sop::ADDS_PREV
| sop::MULS
| sop::MULS_PREV
| sop::MAXS
| sop::MINS
| sop::SEQS
| sop::SGTS
| sop::SGES
| sop::SNES
| sop::FRCS
| sop::TRUNCS
| sop::FLOORS
| sop::EXP
| sop::LOG
| sop::LOGC
| sop::RCP
| sop::RCPC
| sop::RCPF
| sop::RSQ
| sop::RSQC
| sop::RSQF
| sop::SQRT
| sop::SUBS
| sop::SUBS_PREV
| sop::SETP_EQ
| sop::SETP_NE
| sop::SETP_GT
| sop::SETP_GE
| sop::SETP_INV
| sop::SETP_POP
| sop::SETP_CLR
| sop::SETP_RSTR
| sop::KILLS_EQ
| sop::KILLS_GT
| sop::KILLS_GE
| sop::KILLS_NE
| sop::KILLS_ONE
| sop::SIN
| sop::COS
| sop::RETAIN_PREV
)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ucode::alu::{sop, vop};
    use crate::ucode::control_flow::ControlFlowInstruction;

    /// Build a minimal `ParsedShader` with one `Exec` clause containing two
    /// ALU triples and run it through `emit_for`.
    ///
    /// Smoke test only: it checks the walk does not panic. The emitted
    /// counter values are not inspected here.
    #[test]
    fn emit_for_runs_on_synthetic_shader() {
        let alu_w2 = (vop::ADD as u32) | ((sop::ADDS as u32) << 6) | (0xF << 12);
        let shader = ParsedShader {
            cf: vec![
                ControlFlowInstruction::Exec {
                    address: 0,
                    count: 2,
                    sequence: 0, // all ALU (no is-fetch bits)
                    is_end: false,
                    predicated: false,
                    predicate_condition: false,
                },
                ControlFlowInstruction::Exit,
            ],
            instructions: vec![0, 0, alu_w2, 0, 0, alu_w2],
        };
        // Counters are validated via metrics exporters elsewhere; we only
        // assert this doesn't panic on a well-formed ParsedShader.
        emit_for(&shader, "vs");
    }

    /// `tfetch_slots` should extract the fetch-constant slot of a texture
    /// fetch, report a repeated slot only once, and return empty for a flat
    /// ALU-only shader.
    #[test]
    fn tfetch_slots_extracts_texture_fetch_constants() {
        // word0: opcode TEXTURE_FETCH (0x01) in low 5 bits, fetch_const=3 in
        // bits[9:5] → 0x01 | (3 << 5) = 0x61.
        let tfetch_w0: u32 = 0x01 | (3u32 << 5);
        let shader = ParsedShader {
            cf: vec![
                ControlFlowInstruction::Exec {
                    address: 0,
                    count: 2,
                    // triple 0 is a fetch (hi bit of its 2-bit field set),
                    // triple 1 is ALU. is_fetch = (sequence >> (i*2+1)) & 1.
                    sequence: 0b00_10,
                    is_end: false,
                    predicated: false,
                    predicate_condition: false,
                },
                ControlFlowInstruction::Exit,
            ],
            instructions: vec![tfetch_w0, 0, 0, /* ALU triple */ 0, 0, 0],
        };
        assert_eq!(tfetch_slots(&shader), vec![3]);

        // Dedup: two texture fetches of the same slot collapse to one entry.
        let repeated = ParsedShader {
            cf: vec![ControlFlowInstruction::Exec {
                address: 0,
                count: 2,
                // both triples flagged as fetches.
                sequence: 0b10_10,
                is_end: false,
                predicated: false,
                predicate_condition: false,
            }],
            instructions: vec![tfetch_w0, 0, 0, tfetch_w0, 0, 0],
        };
        assert_eq!(tfetch_slots(&repeated), vec![3]);

        // Flat shader: no fetch bits → no slots.
        let flat = ParsedShader {
            cf: vec![ControlFlowInstruction::Exec {
                address: 0,
                count: 1,
                sequence: 0,
                is_end: false,
                predicated: false,
                predicate_condition: false,
            }],
            instructions: vec![0, 0, 0],
        };
        assert!(tfetch_slots(&flat).is_empty());
    }

    /// An `Exec` clause whose `count` runs past the end of the instruction
    /// buffer must not panic: the walk stops at the first incomplete triple
    /// and keeps what it collected before that point.
    #[test]
    fn tfetch_slots_stops_at_truncated_instruction_stream() {
        let tfetch_w0: u32 = 0x01 | (3u32 << 5);
        let truncated = ParsedShader {
            cf: vec![ControlFlowInstruction::Exec {
                address: 0,
                // claims two triples, but only one is present below.
                count: 2,
                sequence: 0b10_10,
                is_end: true,
                predicated: false,
                predicate_condition: false,
            }],
            instructions: vec![tfetch_w0, 0, 0],
        };
        assert_eq!(tfetch_slots(&truncated), vec![3]);
    }

    /// P8: drive `emit_for` over a shader that contains a `LoopStart` clause
    /// and a vector `KILL_EQ` op, i.e. the paths that tag `cf_loop` and
    /// `vec_kill` for the HUD.
    ///
    /// Smoke test only: the recorded features are not inspected here.
    #[test]
    fn feature_detection_flags_loops_and_kills() {
        let kill_alu_w2 =
            (vop::KILL_EQ as u32) | ((sop::RETAIN_PREV as u32) << 6) | (0xF << 12);
        let shader = ParsedShader {
            cf: vec![
                ControlFlowInstruction::LoopStart {
                    address: 0,
                    loop_id: 0,
                },
                ControlFlowInstruction::Exec {
                    address: 0,
                    count: 1,
                    sequence: 0,
                    is_end: true,
                    predicated: false,
                    predicate_condition: false,
                },
            ],
            instructions: vec![0, 0, kill_alu_w2],
        };
        // Smoke: emits cleanly.
        emit_for(&shader, "ps");
    }

    /// Opcode 63 must be rejected by both static support tables, and a shader
    /// using it must still pass through `emit_for` without panicking.
    #[test]
    fn unsupported_ops_classified_as_rejects() {
        // Opcode 63 is outside our supported sets for both pipes.
        let alu_w2 = 63u32 | (63u32 << 6) | (0xF << 12);
        let shader = ParsedShader {
            cf: vec![ControlFlowInstruction::Exec {
                address: 0,
                count: 1,
                sequence: 0,
                is_end: true,
                predicated: false,
                predicate_condition: false,
            }],
            instructions: vec![0, 0, alu_w2],
        };
        // Again: smoke — but also confirm our static tables reject op 63.
        assert!(!vec_op_supported(63));
        assert!(!scl_op_supported(63));
        emit_for(&shader, "ps");
    }
}