Compare commits
1 Commits
iterate-2W
...
iterate-2X
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2f55d1fd7d |
@@ -78,6 +78,30 @@ pub fn physical_to_backing(addr: u32) -> u32 {
|
||||
}
|
||||
}
|
||||
|
||||
/// Max guest page-version over the `[base, base+len)` span, walking 4 KiB
|
||||
/// pages via the `MemoryAccess` trait's `page_version`.
|
||||
///
|
||||
/// The concrete heap exposes an inherent `max_page_version(base, len)`, but
|
||||
/// the draw handler only holds `&dyn MemoryAccess` (which carries the coarser
|
||||
/// `page_version(addr)` accessor). This is byte-equivalent to
|
||||
/// `heap::max_page_version` and stays a pure function of the per-page write
|
||||
/// counters (no wall-clock), so texture-decode timing remains deterministic.
|
||||
fn span_max_version(mem: &dyn MemoryAccess, base: u32, len: u32) -> u64 {
|
||||
const PAGE: u32 = 0x1000;
|
||||
let last = base.saturating_add(len.saturating_sub(1));
|
||||
let mut page = base & !(PAGE - 1);
|
||||
let last_page = last & !(PAGE - 1);
|
||||
let mut max = 0u64;
|
||||
loop {
|
||||
max = max.max(mem.page_version(page));
|
||||
if page >= last_page {
|
||||
break;
|
||||
}
|
||||
page = page.wrapping_add(PAGE);
|
||||
}
|
||||
max
|
||||
}
|
||||
|
||||
/// Cached Xenos microcode blob, produced by `PM4_IM_LOAD*` packets.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ShaderBlob {
|
||||
@@ -400,6 +424,12 @@ pub struct GpuSystem {
|
||||
/// on every texture-fetch resolution; the UI thread sees the decoded
|
||||
/// bytes via `UiBridge::publish_texture`.
|
||||
pub texture_cache: crate::texture_cache::TextureCache,
|
||||
/// P5b: textures decoded at the most recent `PM4_DRAW_INDX*`, keyed off
|
||||
/// the *active* pixel shader's real `tfetch` fetch-constant slots (not a
|
||||
/// hardcoded slot). `vd_swap` publishes the first of these to the UI so
|
||||
/// the replay binds the texture the draw actually samples. Cleared and
|
||||
/// repopulated each draw; empty when the active PS issues no `tfetch`.
|
||||
pub last_draw_textures: Vec<(crate::texture_cache::TextureKey, Vec<u8>)>,
|
||||
/// 10 MiB shadow of the Xenos EDRAM. Written by clear-resolves and
|
||||
/// (future) host-render-target readback; read by the resolve byte-copy
|
||||
/// path that writes tiled pixels into guest memory. Allocated once at
|
||||
@@ -431,6 +461,7 @@ impl GpuSystem {
|
||||
rt_cache: crate::render_target_cache::RenderTargetCache::new(),
|
||||
last_resolve: None,
|
||||
texture_cache: crate::texture_cache::TextureCache::new(),
|
||||
last_draw_textures: Vec::new(),
|
||||
edram: crate::edram::ShadowEdram::new(),
|
||||
}
|
||||
}
|
||||
@@ -1265,6 +1296,60 @@ impl GpuSystem {
|
||||
);
|
||||
self.last_draw = Some(ds);
|
||||
self.last_primitive = Some(processed);
|
||||
|
||||
// P5b: decode the textures the *active pixel shader* actually
|
||||
// samples. Parse the bound PS, collect its `tfetch`
|
||||
// fetch-constant slots, read each 6-dword fetch constant from
|
||||
// the register file, and decode+cache it. `vd_swap` publishes
|
||||
// the result. Empty for flat (no-tfetch) shaders — the
|
||||
// dominant case on Sylpheed's current splash, where this stays
|
||||
// inert until the textured logo draw is reached.
|
||||
self.last_draw_textures.clear();
|
||||
if let Some(ps_key) = self.active_ps_key {
|
||||
// Collect slots under an immutable borrow of `shader_blobs`,
|
||||
// then drop it before mutating `texture_cache`.
|
||||
let slots: Vec<u8> = match self.shader_blobs.get(&ps_key) {
|
||||
Some(blob) => {
|
||||
let parsed = crate::ucode::parse_shader(&blob.dwords);
|
||||
crate::shader_metrics::tfetch_slots(&parsed)
|
||||
}
|
||||
None => Vec::new(),
|
||||
};
|
||||
for slot in slots {
|
||||
let mut fetch6 = [0u32; 6];
|
||||
for (k, w) in fetch6.iter_mut().enumerate() {
|
||||
*w = self
|
||||
.register_file
|
||||
.read(CONST_BASE_FETCH + slot as u32 * 6 + k as u32);
|
||||
}
|
||||
let Some(key) = crate::texture_cache::decode_fetch_constant(fetch6) else {
|
||||
continue;
|
||||
};
|
||||
let bi = key.format.block_info();
|
||||
let span_bytes = (key.pitch_texels as u32)
|
||||
* (key.height as u32)
|
||||
* (bi.bytes_per_block as u32)
|
||||
/ (bi.block_w as u32);
|
||||
let version = span_max_version(mem, key.base_address, span_bytes.max(4));
|
||||
match self.texture_cache.ensure_cached(key, version, mem) {
|
||||
Ok(entry) => {
|
||||
self.last_draw_textures.push((entry.key, entry.bytes.clone()));
|
||||
metrics::counter!(
|
||||
"gpu.texture.decode",
|
||||
"fmt" => format!("{:?}", key.format),
|
||||
)
|
||||
.increment(1);
|
||||
}
|
||||
Err(e) => {
|
||||
metrics::counter!(
|
||||
"gpu.texture.reject",
|
||||
"reason" => format!("{e:?}"),
|
||||
)
|
||||
.increment(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => {
|
||||
// payload[0] = offset_type — bits[10:0] index, bits[23:16] type
|
||||
|
||||
@@ -5,9 +5,8 @@
|
||||
//! rectangles) we rewrite indices on the CPU side so the host just sees a
|
||||
//! triangle list. Ground truth: `xenia-canary/src/xenia/gpu/primitive_processor.h/cc`.
|
||||
//!
|
||||
//! P3 scope: only the shapes Sylpheed's UI + early gameplay paths need
|
||||
//! (list, strip, fan). Rectangle + quad expansions are stubs logged via
|
||||
//! `tracing::warn!` for later.
|
||||
//! Scope: list, strip, fan, quad, and rectangle expansions are all handled
|
||||
//! (rectangles via CPU triangle-list rewrite — see `expand_rectangles`).
|
||||
|
||||
use crate::draw_state::{IndexSize, PrimitiveType};
|
||||
|
||||
@@ -138,18 +137,43 @@ fn expand_quads(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitiv
|
||||
}
|
||||
|
||||
/// Rectangle lists: a Xenos-specific primitive where each group of 3
|
||||
/// vertices defines a right-angle rectangle by its three non-repeated
|
||||
/// corners (the 4th is derived). The uber-shader doesn't support this yet;
|
||||
/// the ucode translator will emulate it as a geometry-stage fake. For P3
|
||||
/// we emit an empty draw.
|
||||
fn expand_rectangles(_indices: Option<&[u32]>, _vertex_count: u32) -> ProcessedPrimitive {
|
||||
tracing::warn!("gpu: rectangle list primitive not yet implemented (P3 stub)");
|
||||
metrics::counter!("gpu.primitive.rejected", "reason" => "rectangle_list").increment(1);
|
||||
/// vertices defines a rectangle; the 4th corner is extrapolated as
|
||||
/// `v3 = v0 + v2 - v1` (parallelogram completion). Canary expands this in a
|
||||
/// host vertex-shader variant (`kRectangleListAsTriangleStrip`,
|
||||
/// `primitive_processor.cc:389-456`): a 4-vertex triangle strip per rect with
|
||||
/// the 4th corner synthesized *in the VS* from the host-vertex index.
|
||||
///
|
||||
/// Our replay pipeline has no host-VS corner synthesis (and the procedural
|
||||
/// `vs_main` does not consume `rewritten_indices` yet), so we mirror the
|
||||
/// `expand_quads`/`expand_fan` CPU idiom and emit the 3 real vertices of each
|
||||
/// rect as one triangle list `(v0,v1,v2)` — the visible lower half of the
|
||||
/// rect. This un-rejects the draw and gives a faithful `host_vertex_count`.
|
||||
///
|
||||
/// TODO: once `vs_main` does real vertex fetch + interpolation, upgrade to the
|
||||
/// full quad — 6 indices `[v0,v1,v2, v2,v1,v3]` with a synthesized `v3` corner
|
||||
/// — mirroring canary's `kRectangleListAsTriangleStrip`.
|
||||
fn expand_rectangles(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive {
|
||||
let rect_count = vertex_count / 3;
|
||||
let mut out = Vec::with_capacity(3 * rect_count as usize);
|
||||
let get = |i: u32| -> u32 {
|
||||
match indices {
|
||||
Some(buf) => buf[i as usize],
|
||||
None => i,
|
||||
}
|
||||
};
|
||||
for r in 0..rect_count {
|
||||
let base = r * 3;
|
||||
out.push(get(base));
|
||||
out.push(get(base + 1));
|
||||
out.push(get(base + 2));
|
||||
}
|
||||
let host_vertex_count = out.len() as u32;
|
||||
metrics::counter!("gpu.primitive.expanded", "shape" => "rectangle_list").increment(1);
|
||||
ProcessedPrimitive {
|
||||
topology: HostTopology::TriangleList,
|
||||
rewritten_indices: Some(Vec::new()),
|
||||
host_vertex_count: 0,
|
||||
rejected: true,
|
||||
rewritten_indices: Some(out),
|
||||
host_vertex_count,
|
||||
rejected: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -213,6 +237,17 @@ mod tests {
|
||||
assert_eq!(idx, vec![0, 1, 2, 0, 2, 3, 4, 5, 6, 4, 6, 7]);
|
||||
}
|
||||
|
||||
#[test]
fn rectangle_list_expansion() {
    // Two non-indexed rectangles (6 guest vertices) must come back as an
    // accepted triangle list carrying one (v0, v1, v2) triangle per rect.
    let processed = process(PrimitiveType::RectangleList, 6, None);

    assert!(!processed.rejected);
    assert_eq!(processed.topology, HostTopology::TriangleList);
    assert_eq!(processed.host_vertex_count, 6);

    let rewritten = processed.rewritten_indices.unwrap();
    assert_eq!(rewritten, vec![0, 1, 2, 3, 4, 5]);
}
|
||||
|
||||
#[test]
|
||||
fn widen_u16_indices_big_endian() {
|
||||
// 3 indices [1, 2, 0x1234] in BE u16.
|
||||
|
||||
@@ -174,6 +174,49 @@ pub fn emit_for(parsed: &ParsedShader, stage: &'static str) {
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect the unique texture-fetch-constant slot indices a shader samples.
|
||||
///
|
||||
/// Walks the same exec-clause / sequence-bitmap path as [`emit_for`] but only
|
||||
/// extracts `TextureFetch.fetch_const` slots, deduplicated and in first-seen
|
||||
/// order. The GPU draw handler uses this to decide which fetch constants to
|
||||
/// decode + cache at draw time (keyed off the *active* pixel shader's real
|
||||
/// `tfetch` instructions rather than a hardcoded slot).
|
||||
pub fn tfetch_slots(parsed: &ParsedShader) -> Vec<u8> {
|
||||
let mut slots: Vec<u8> = Vec::new();
|
||||
for clause in &parsed.cf {
|
||||
if let ControlFlowInstruction::Exec {
|
||||
address,
|
||||
count,
|
||||
sequence,
|
||||
..
|
||||
} = clause
|
||||
{
|
||||
for i in 0..(*count as usize) {
|
||||
let base = (*address as usize + i) * 3;
|
||||
if base + 2 >= parsed.instructions.len() {
|
||||
break;
|
||||
}
|
||||
// sequence bit layout: 2 bits per triple, hi bit = is-fetch.
|
||||
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
|
||||
if !is_fetch {
|
||||
continue;
|
||||
}
|
||||
let words = [
|
||||
parsed.instructions[base],
|
||||
parsed.instructions[base + 1],
|
||||
parsed.instructions[base + 2],
|
||||
];
|
||||
if let FetchInstruction::Texture(tf) = decode_fetch(words) {
|
||||
if !slots.contains(&tf.fetch_const) {
|
||||
slots.push(tf.fetch_const);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
slots
|
||||
}
|
||||
|
||||
fn mark_feature(buf: &mut Vec<&'static str>, name: &'static str) {
|
||||
if !buf.contains(&name) {
|
||||
buf.push(name);
|
||||
@@ -298,6 +341,46 @@ mod tests {
|
||||
emit_for(&shader, "vs");
|
||||
}
|
||||
|
||||
/// `tfetch_slots` must report the fetch-constant slot used by a texture
/// fetch, and must report nothing for a flat, ALU-only shader.
#[test]
fn tfetch_slots_extracts_texture_fetch_constants() {
    // Texture-fetch word0 as laid out by the original test: opcode
    // TEXTURE_FETCH (0x01) in the low 5 bits and fetch_const = 3 in
    // bits[9:5], i.e. 0x01 | (3 << 5) = 0x61.
    let fetch_word0: u32 = 0x01 | (3u32 << 5);

    // One exec clause covering two triples: triple 0 is flagged as a fetch
    // (hi bit of its 2-bit sequence field), triple 1 is plain ALU.
    let textured_cf = vec![
        ControlFlowInstruction::Exec {
            address: 0,
            count: 2,
            sequence: 0b00_10,
            is_end: false,
            predicated: false,
            predicate_condition: false,
        },
        ControlFlowInstruction::Exit,
    ];
    let textured_words = vec![
        // fetch triple
        fetch_word0,
        0,
        0,
        // ALU triple
        0,
        0,
        0,
    ];
    let textured = ParsedShader {
        cf: textured_cf,
        instructions: textured_words,
    };
    assert_eq!(tfetch_slots(&textured), vec![3]);

    // Flat shader: a single ALU triple with no fetch bits set → no slots.
    let flat_cf = vec![ControlFlowInstruction::Exec {
        address: 0,
        count: 1,
        sequence: 0,
        is_end: false,
        predicated: false,
        predicate_condition: false,
    }];
    let flat = ParsedShader {
        cf: flat_cf,
        instructions: vec![0, 0, 0],
    };
    assert!(tfetch_slots(&flat).is_empty());
}
|
||||
|
||||
/// P8: a shader containing `LoopStart` should mark `cf_loop` as used
|
||||
/// so the HUD can surface which deferred feature a game triggers.
|
||||
#[test]
|
||||
|
||||
@@ -3116,16 +3116,17 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
);
|
||||
ui.publish_assets(blobs, constants);
|
||||
|
||||
// P5: try to decode the primary texture (fetch constant slot 0).
|
||||
// Slot 0 is the convention most games use for their main bound
|
||||
// texture at draw time; full N-slot binding waits for P6+. If the
|
||||
// slot is unset or the format isn't supported (magenta stub kicks
|
||||
// in host-side), we skip.
|
||||
//
|
||||
// Texture fetch constants live at `CONST_BASE_FETCH + slot*6` in
|
||||
// the register file; we read the 6 dwords, decode the key, hit
|
||||
// the CPU cache (with page-version freshness), and clone the
|
||||
// decoded bytes across the bridge.
|
||||
// P5b: publish the texture the last draw's *active pixel shader*
|
||||
// actually sampled. The GPU draw handler decodes the PS's real
|
||||
// `tfetch` fetch-constant slots into `last_draw_textures`; we publish
|
||||
// the first (the UI binds a single texture today). When the last draw
|
||||
// used a flat (no-tfetch) shader the list is empty, so we fall back to
|
||||
// the legacy slot-0 probe to preserve behavior on flat-only frames.
|
||||
let published = gpu_inline.last_draw_textures.first().cloned().or_else(|| {
|
||||
// Fallback: probe fetch constant slot 0 directly. Texture fetch
|
||||
// constants live at `CONST_BASE_FETCH + slot*6` in the register
|
||||
// file; read 6 dwords, decode the key, hit the CPU cache with
|
||||
// page-version freshness, clone the bytes across the bridge.
|
||||
const TEX_SLOT: u32 = 0;
|
||||
let mut fetch6 = [0u32; 6];
|
||||
for (i, slot) in fetch6.iter_mut().enumerate() {
|
||||
@@ -3133,10 +3134,9 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
.register_file
|
||||
.read(xenia_gpu::gpu_system::CONST_BASE_FETCH + TEX_SLOT * 6 + i as u32);
|
||||
}
|
||||
let published = if let Some(key) = xenia_gpu::texture_cache::decode_fetch_constant(fetch6)
|
||||
{
|
||||
// Span over the entire tiled texture footprint to pick the
|
||||
// max page version covering it.
|
||||
let key = xenia_gpu::texture_cache::decode_fetch_constant(fetch6)?;
|
||||
// Span over the entire tiled texture footprint to pick the max
|
||||
// page version covering it.
|
||||
let bi = key.format.block_info();
|
||||
let span_bytes = (key.pitch_texels as u32)
|
||||
* (key.height as u32)
|
||||
@@ -3154,9 +3154,7 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
});
|
||||
metrics::gauge!("gpu.texture_cache.entries")
|
||||
.set(gpu_inline.texture_cache.len() as f64);
|
||||
ui.publish_texture(published);
|
||||
|
||||
Reference in New Issue
Block a user