Compare commits
18 Commits
iterate-2Z
...
iterate-3O
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
acb29db444 | ||
|
|
dc1320cd4b | ||
|
|
9d24dd0eaa | ||
|
|
c62a355418 | ||
|
|
3f8d3b6f1c | ||
|
|
c0c6088e4d | ||
|
|
f6f3aac673 | ||
|
|
2a992db47b | ||
|
|
89b5c39d8a | ||
|
|
39723dfe37 | ||
|
|
da7c29b6d2 | ||
|
|
1b9918450f | ||
|
|
80fbff8bd1 | ||
|
|
6d8a2817a3 | ||
|
|
a3aa3cc7d6 | ||
|
|
6ff184694d | ||
|
|
504592ac13 | ||
|
|
6bb4355e3d |
@@ -2151,7 +2151,13 @@ fn coord_pre_round(
|
|||||||
let fired = if kernel.parallel_active {
|
let fired = if kernel.parallel_active {
|
||||||
kernel.interrupts.tick_vsync_wallclock()
|
kernel.interrupts.tick_vsync_wallclock()
|
||||||
} else {
|
} else {
|
||||||
kernel.interrupts.tick_vsync_instr(stats.instruction_count)
|
// iterate-3AJ: present-anchored — pass the guest's live present
|
||||||
|
// (`VdSwap`) count so vsync tracks the real present rate once the
|
||||||
|
// guest is presenting (≈1 vblank/present), instead of firing a
|
||||||
|
// fixed instruction quantum that over-fires ~66× during one heavy
|
||||||
|
// splash asset-load frame and collapsed the logo fade-in.
|
||||||
|
let presents = kernel.gpu.swaps_seen();
|
||||||
|
kernel.interrupts.tick_vsync_instr(stats.instruction_count, presents)
|
||||||
};
|
};
|
||||||
if fired {
|
if fired {
|
||||||
use std::sync::atomic::Ordering;
|
use std::sync::atomic::Ordering;
|
||||||
@@ -2320,8 +2326,19 @@ fn coord_post_round(
|
|||||||
let mut gpu_runs = (executed_this_round
|
let mut gpu_runs = (executed_this_round
|
||||||
/ xenia_cpu::scheduler::HW_THREAD_COUNT as u64)
|
/ xenia_cpu::scheduler::HW_THREAD_COUNT as u64)
|
||||||
.max(1);
|
.max(1);
|
||||||
if gpu_runs > 64 {
|
// Fairness cap on GPU commands drained per round. Must scale with the
|
||||||
gpu_runs = 64;
|
// per-round instruction volume: with the superblock runner a single
|
||||||
|
// round legitimately retires up to ~SUPERBLOCK_INSTR_BUDGET per slot
|
||||||
|
// (vs ~6 for the old one-block path), so the rate `executed/6` is much
|
||||||
|
// higher and a flat cap of 64 throttled GPU command processing ~17×
|
||||||
|
// (packets 50279→1861 @50M) — collapsing the present loop / splash.
|
||||||
|
// Cap at the budget so the GPU keeps pace with the CPU at the same
|
||||||
|
// per-instruction rate the one-block path had. The inner loop already
|
||||||
|
// early-breaks on `!gpu.is_ready`, so this only bounds a pathological
|
||||||
|
// backlog, never busy-spins.
|
||||||
|
let gpu_cap = superblock_budget().max(64);
|
||||||
|
if gpu_runs > gpu_cap {
|
||||||
|
gpu_runs = gpu_cap;
|
||||||
}
|
}
|
||||||
if let Some(gpu) = kernel.gpu.as_inline_mut() {
|
if let Some(gpu) = kernel.gpu.as_inline_mut() {
|
||||||
gpu.sync_with_mmio();
|
gpu.sync_with_mmio();
|
||||||
@@ -2453,10 +2470,19 @@ fn worker_prologue(
|
|||||||
// and println one record. Read-only; lockstep digest unaffected.
|
// and println one record. Read-only; lockstep digest unaffected.
|
||||||
// Empty set is the common case → single `is_empty()` test inside
|
// Empty set is the common case → single `is_empty()` test inside
|
||||||
// the helper, no overhead on the hot path.
|
// the helper, no overhead on the hot path.
|
||||||
|
// Perf (Tier-A #3): all four `fire_*_if_match` helpers early-return
|
||||||
|
// on an empty registry, but paying 4× call overhead per slot-visit
|
||||||
|
// (~3.2M visits boot-to-splash) is itself measurable. Gate the whole
|
||||||
|
// group behind a single `any_probe_active()` predicted branch so the
|
||||||
|
// common (no-probe) headless path never even makes the calls. When a
|
||||||
|
// probe IS configured each helper still re-checks its own set, so
|
||||||
|
// behaviour is identical either way.
|
||||||
|
if kernel.any_probe_active() {
|
||||||
kernel.fire_ctor_probe_if_match(hw_id, mem);
|
kernel.fire_ctor_probe_if_match(hw_id, mem);
|
||||||
kernel.fire_branch_probe_if_match(hw_id);
|
kernel.fire_branch_probe_if_match(hw_id);
|
||||||
kernel.fire_audit_pc_probe_if_match(hw_id, mem);
|
kernel.fire_audit_pc_probe_if_match(hw_id, mem);
|
||||||
kernel.fire_lr_trace_if_match(hw_id);
|
kernel.fire_lr_trace_if_match(hw_id);
|
||||||
|
}
|
||||||
|
|
||||||
if mem.has_mem_watch() {
|
if mem.has_mem_watch() {
|
||||||
let ctx = kernel.scheduler.ctx(hw_id);
|
let ctx = kernel.scheduler.ctx(hw_id);
|
||||||
@@ -2522,8 +2548,15 @@ fn worker_prologue(
|
|||||||
return PrologueOutcome::Continue;
|
return PrologueOutcome::Continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2) Import thunk intercept.
|
// 2) Import thunk intercept. Perf (Tier-A #4): import thunks occupy a
|
||||||
if let Some((module, ordinal, name)) = thunk_map.get(&pc) {
|
// small contiguous address band; the overwhelming majority of executing
|
||||||
|
// PCs are ordinary guest code outside it. Range-reject against the band
|
||||||
|
// (two integer compares) before paying the `thunk_map` hash. Faithful
|
||||||
|
// no-op — any in-band PC still goes through the exact map lookup, and an
|
||||||
|
// out-of-band PC can never be a registered thunk.
|
||||||
|
if kernel.pc_in_thunk_band(pc)
|
||||||
|
&& let Some((module, ordinal, name)) = thunk_map.get(&pc)
|
||||||
|
{
|
||||||
let module = *module;
|
let module = *module;
|
||||||
let ordinal_u32 = *ordinal as u32;
|
let ordinal_u32 = *ordinal as u32;
|
||||||
let thunk_pc = pc;
|
let thunk_pc = pc;
|
||||||
@@ -2790,6 +2823,160 @@ fn worker_epilogue(
|
|||||||
SlotOutcome::Continue
|
SlotOutcome::Continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Hard cap on the number of guest instructions a single superblock
|
||||||
|
/// runner invocation executes before returning to the round-robin
|
||||||
|
/// scheduler. Bounds how coarse the lockstep interleaving can get: a
|
||||||
|
/// larger budget amortizes more per-round/per-slot tax (faster) but
|
||||||
|
/// runs one HW thread for longer between scheduler returns (coarser
|
||||||
|
/// cross-thread interleaving). 1024 keeps a slot-visit ~170× longer
|
||||||
|
/// than the old single-block (~6 instr) granularity while still
|
||||||
|
/// returning to the round well inside a single 50k quantum. Purely an
|
||||||
|
/// instruction count → deterministic, schedule reproduces byte-identically.
|
||||||
|
///
|
||||||
|
/// Tuned empirically on the Sylpheed boot-to-splash workload (iterate-3AL):
|
||||||
|
/// budgets up to 256 keep boot progression byte-for-byte healthy (draws /
|
||||||
|
/// swaps / packets track the one-block baseline), then a sharp cliff at
|
||||||
|
/// ~384 collapses the present loop (a producer/consumer boot handoff
|
||||||
|
/// starves when one slot runs too long without returning to the round).
|
||||||
|
/// 128 sits 3× below that cliff with ~1.65× boot-to-splash speedup — a
|
||||||
|
/// deliberately conservative pick (correctness over the last few %). The
|
||||||
|
/// `XENIA_SUPERBLOCK_BUDGET` env var overrides it for further tuning.
|
||||||
|
const SUPERBLOCK_INSTR_BUDGET: u64 = 128;
|
||||||
|
|
||||||
|
/// Effective superblock budget. Defaults to [`SUPERBLOCK_INSTR_BUDGET`];
|
||||||
|
/// `XENIA_SUPERBLOCK_BUDGET` overrides it (A/B tuning without a rebuild).
|
||||||
|
/// A budget of 1 reproduces the old one-block-per-slot-visit behaviour
|
||||||
|
/// (the chain always stops after the first block). Read once and cached.
|
||||||
|
fn superblock_budget() -> u64 {
|
||||||
|
use std::sync::OnceLock;
|
||||||
|
static BUDGET: OnceLock<u64> = OnceLock::new();
|
||||||
|
*BUDGET.get_or_init(|| {
|
||||||
|
std::env::var("XENIA_SUPERBLOCK_BUDGET")
|
||||||
|
.ok()
|
||||||
|
.and_then(|v| v.parse::<u64>().ok())
|
||||||
|
.filter(|&v| v >= 1)
|
||||||
|
.unwrap_or(SUPERBLOCK_INSTR_BUDGET)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Superblock runner (iterate-3AL). Executes a *chain* of basic blocks
|
||||||
|
/// for one slot-visit — following each block's terminating branch into
|
||||||
|
/// the next block — instead of a single block, amortizing the per-round
|
||||||
|
/// (timebase / coord / `round_schedule`) and per-slot (`worker_prologue`)
|
||||||
|
/// dispatch tax over up to [`SUPERBLOCK_INSTR_BUDGET`] guest instructions.
|
||||||
|
///
|
||||||
|
/// Determinism + cross-thread correctness: the chain ENDS (returns to the
|
||||||
|
/// round) at exactly the points where lockstep granularity matters, all
|
||||||
|
/// pure functions of guest state (never wall-clock):
|
||||||
|
/// - a non-`Continue` step result (Yield / SystemCall / Trap / Unimpl /
|
||||||
|
/// Halted) — `step_block` already bails on these; `Yield` in
|
||||||
|
/// particular is the db16cyc spin-wait hand-off that prevents a
|
||||||
|
/// spinner from starving its producer.
|
||||||
|
/// - the just-run block was `sync_sensitive` (reserved load/store or a
|
||||||
|
/// memory barrier) — the guest's own ordering points.
|
||||||
|
/// - the block touched MMIO (the `mem.mmio_access_count()` watermark
|
||||||
|
/// advanced) — GPU/register ordering vs other HW threads stays at the
|
||||||
|
/// same fine granularity as the old one-block path.
|
||||||
|
/// - the next PC leaves ordinary guest code: an import thunk, the halt
|
||||||
|
/// sentinel, or unmapped memory — those need the full `worker_prologue`
|
||||||
|
/// dispatch, so we stop and let the next round's prologue handle them.
|
||||||
|
/// - the instruction budget is reached.
|
||||||
|
///
|
||||||
|
/// Instruction-count / clock accounting stays exact: `executed` is summed
|
||||||
|
/// from the per-block `cycle_count` delta across every chained block and
|
||||||
|
/// handed to `worker_epilogue` once, which advances `stats.instruction_count`
|
||||||
|
/// and `decrement_quantum` by precisely the retired count — identical to
|
||||||
|
/// dispatching each block separately.
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
fn run_superblock(
|
||||||
|
wc: &mut WorkerCtx,
|
||||||
|
kernel: &mut xenia_kernel::KernelState,
|
||||||
|
mem: &xenia_memory::GuestMemory,
|
||||||
|
debugger: &mut xenia_debugger::Debugger,
|
||||||
|
thunk_map: &HashMap<u32, (ModuleId, u16, String)>,
|
||||||
|
stats: &mut ExecStats,
|
||||||
|
tid: Option<u32>,
|
||||||
|
thread_ref: xenia_cpu::ThreadRef,
|
||||||
|
first_block_ptr: *const xenia_cpu::block_cache::DecodedBlock,
|
||||||
|
first_pc_before: u32,
|
||||||
|
) -> SlotOutcome {
|
||||||
|
use xenia_cpu::interpreter::{step_block, StepResult};
|
||||||
|
const LR_HALT: u32 = xenia_cpu::context::LR_HALT_SENTINEL as u32;
|
||||||
|
|
||||||
|
let budget = superblock_budget();
|
||||||
|
|
||||||
|
// Probe / mem-watch / debugger-hook modes need per-block-entry
|
||||||
|
// observability; in those modes never chain (run exactly one block,
|
||||||
|
// identical to the pre-superblock behaviour). The block-cache fast
|
||||||
|
// path is only entered when hooks/DB are off anyway, but a probe or
|
||||||
|
// mem-watch can be armed alongside it.
|
||||||
|
let chain_allowed = !kernel.any_probe_active() && !mem.has_mem_watch();
|
||||||
|
|
||||||
|
let mut block_ptr = first_block_ptr;
|
||||||
|
let mut pc_before = first_pc_before;
|
||||||
|
let mut total_executed: u64 = 0;
|
||||||
|
|
||||||
|
let (result, last_block_ptr, last_pc_before) = loop {
|
||||||
|
let cycle_before = kernel.scheduler.ctx_mut_ref(thread_ref).cycle_count;
|
||||||
|
let mmio_before = mem.mmio_access_count();
|
||||||
|
let block = unsafe { &*block_ptr };
|
||||||
|
let result = {
|
||||||
|
let ctx = kernel.scheduler.ctx_mut_ref(thread_ref);
|
||||||
|
step_block(ctx, mem, block)
|
||||||
|
};
|
||||||
|
let executed = kernel
|
||||||
|
.scheduler
|
||||||
|
.ctx_mut_ref(thread_ref)
|
||||||
|
.cycle_count
|
||||||
|
.saturating_sub(cycle_before);
|
||||||
|
total_executed = total_executed.saturating_add(executed);
|
||||||
|
|
||||||
|
// STOP conditions (any → end the superblock, hand to epilogue):
|
||||||
|
// non-Continue result (let the epilogue apply it), chaining
|
||||||
|
// disabled, a sync-sensitive block just ran, MMIO was touched,
|
||||||
|
// or the budget is spent.
|
||||||
|
if !chain_allowed
|
||||||
|
|| !matches!(result, StepResult::Continue)
|
||||||
|
|| block.sync_sensitive
|
||||||
|
|| mem.mmio_access_count() != mmio_before
|
||||||
|
|| total_executed >= budget
|
||||||
|
{
|
||||||
|
break (result, block_ptr, pc_before);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decide whether the NEXT PC is an ordinary guest block we can
|
||||||
|
// chain into. Anything else (thunk / halt sentinel / unmapped)
|
||||||
|
// needs the full prologue dispatch next round.
|
||||||
|
let next_pc = kernel.scheduler.ctx(wc.hw_id).pc;
|
||||||
|
if next_pc == LR_HALT
|
||||||
|
|| (kernel.pc_in_thunk_band(next_pc) && thunk_map.contains_key(&next_pc))
|
||||||
|
|| !mem.is_mapped(next_pc)
|
||||||
|
{
|
||||||
|
break (result, block_ptr, pc_before);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Chain: build/fetch the next block. Re-borrows `wc.block_cache`,
|
||||||
|
// which invalidates the previous `block_ptr` — but we've already
|
||||||
|
// finished using it (only `sync_sensitive`/diagnostics were read,
|
||||||
|
// above), so the raw-pointer aliasing rule is respected.
|
||||||
|
pc_before = next_pc;
|
||||||
|
block_ptr = wc.block_cache.lookup_or_build(next_pc, mem) as *const _;
|
||||||
|
};
|
||||||
|
|
||||||
|
worker_epilogue(
|
||||||
|
wc,
|
||||||
|
kernel,
|
||||||
|
debugger,
|
||||||
|
stats,
|
||||||
|
tid,
|
||||||
|
thread_ref,
|
||||||
|
last_block_ptr,
|
||||||
|
last_pc_before,
|
||||||
|
result,
|
||||||
|
total_executed,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
#[instrument(skip_all, fields(max = ?max_instructions, ips = ?ips_limit))]
|
#[instrument(skip_all, fields(max = ?max_instructions, ips = ?ips_limit))]
|
||||||
fn run_execution(
|
fn run_execution(
|
||||||
mem: &xenia_memory::GuestMemory,
|
mem: &xenia_memory::GuestMemory,
|
||||||
@@ -2803,8 +2990,6 @@ fn run_execution(
|
|||||||
halt_on_deadlock: bool,
|
halt_on_deadlock: bool,
|
||||||
shutdown: Option<std::sync::Arc<std::sync::atomic::AtomicBool>>,
|
shutdown: Option<std::sync::Arc<std::sync::atomic::AtomicBool>>,
|
||||||
) -> ExecStats {
|
) -> ExecStats {
|
||||||
use xenia_cpu::interpreter::step_block;
|
|
||||||
|
|
||||||
let mut stats = ExecStats::default();
|
let mut stats = ExecStats::default();
|
||||||
let _ = quiet; // retained for future per-kind suppression
|
let _ = quiet; // retained for future per-kind suppression
|
||||||
|
|
||||||
@@ -2848,6 +3033,10 @@ fn run_execution(
|
|||||||
// re-decoding the same handful of pages 60×/s.
|
// re-decoding the same handful of pages 60×/s.
|
||||||
let mut isr_decode_cache = xenia_cpu::decoder::DecodeCache::new();
|
let mut isr_decode_cache = xenia_cpu::decoder::DecodeCache::new();
|
||||||
|
|
||||||
|
// Tier-A perf #2: reusable buffer for `round_schedule_into` so the round
|
||||||
|
// loop doesn't heap-allocate a `Vec<u8>` every iteration.
|
||||||
|
let mut order_buf = [0u8; xenia_cpu::scheduler::HW_THREAD_COUNT];
|
||||||
|
|
||||||
'outer: loop {
|
'outer: loop {
|
||||||
// Per-round prologue: budget / shutdown / heartbeat / vsync /
|
// Per-round prologue: budget / shutdown / heartbeat / vsync /
|
||||||
// timers / audio-interrupt injection. Carved into
|
// timers / audio-interrupt injection. Carved into
|
||||||
@@ -2902,10 +3091,12 @@ fn run_execution(
|
|||||||
thunk_map,
|
thunk_map,
|
||||||
);
|
);
|
||||||
|
|
||||||
// Snapshot round schedule. `round_schedule` also advances rng state
|
// Snapshot round schedule. `round_schedule_into` also advances rng
|
||||||
// when seeded; mutation is intentional.
|
// state when seeded; mutation is intentional. Perf (Tier-A #2): fill
|
||||||
|
// a reusable stack array instead of allocating a fresh Vec per round.
|
||||||
kernel.scheduler.begin_round();
|
kernel.scheduler.begin_round();
|
||||||
let order = kernel.scheduler.round_schedule();
|
let order_n = kernel.scheduler.round_schedule_into(&mut order_buf);
|
||||||
|
let order = &order_buf[..order_n];
|
||||||
|
|
||||||
if order.is_empty() {
|
if order.is_empty() {
|
||||||
// No Ready threads — advance time to the earliest pending
|
// No Ready threads — advance time to the earliest pending
|
||||||
@@ -2927,7 +3118,7 @@ fn run_execution(
|
|||||||
// GPU when block dispatch engages.
|
// GPU when block dispatch engages.
|
||||||
let instrs_at_round_start = stats.instruction_count;
|
let instrs_at_round_start = stats.instruction_count;
|
||||||
|
|
||||||
for hw_id in order {
|
for &hw_id in order {
|
||||||
let wc = &mut workers[hw_id as usize];
|
let wc = &mut workers[hw_id as usize];
|
||||||
match worker_prologue(
|
match worker_prologue(
|
||||||
wc,
|
wc,
|
||||||
@@ -2946,34 +3137,25 @@ fn run_execution(
|
|||||||
block_ptr,
|
block_ptr,
|
||||||
pc_before,
|
pc_before,
|
||||||
} => {
|
} => {
|
||||||
// Block-cache step. The lockstep path keeps the
|
// SUPERBLOCK runner (iterate-3AL). Instead of one
|
||||||
// kernel state borrowed straight through (single
|
// basic block per slot-visit, chain straight-line
|
||||||
// host thread, no contention). Step 03 of the
|
// blocks through their branches up to a deterministic
|
||||||
// M3 real-parallelism plan introduces a
|
// instruction budget, yielding back to the round only
|
||||||
// drop-and-reacquire window around `step_block`
|
// at cross-thread synchronization points. Amortizes
|
||||||
// for the parallel branch.
|
// the per-round (timebase / coord / round_schedule)
|
||||||
let cycle_before = kernel.scheduler.ctx_mut_ref(thread_ref).cycle_count;
|
// and per-slot (prologue) tax over hundreds of
|
||||||
let block = unsafe { &*block_ptr };
|
// instructions instead of ~6. See `run_superblock`.
|
||||||
let result = {
|
match run_superblock(
|
||||||
let ctx = kernel.scheduler.ctx_mut_ref(thread_ref);
|
|
||||||
step_block(ctx, mem, block)
|
|
||||||
};
|
|
||||||
let executed = kernel
|
|
||||||
.scheduler
|
|
||||||
.ctx_mut_ref(thread_ref)
|
|
||||||
.cycle_count
|
|
||||||
.saturating_sub(cycle_before);
|
|
||||||
match worker_epilogue(
|
|
||||||
wc,
|
wc,
|
||||||
kernel,
|
kernel,
|
||||||
|
mem,
|
||||||
debugger,
|
debugger,
|
||||||
|
thunk_map,
|
||||||
&mut stats,
|
&mut stats,
|
||||||
tid,
|
tid,
|
||||||
thread_ref,
|
thread_ref,
|
||||||
block_ptr,
|
block_ptr,
|
||||||
pc_before,
|
pc_before,
|
||||||
result,
|
|
||||||
executed,
|
|
||||||
) {
|
) {
|
||||||
SlotOutcome::Continue => continue,
|
SlotOutcome::Continue => continue,
|
||||||
SlotOutcome::BreakOuter => break 'outer,
|
SlotOutcome::BreakOuter => break 'outer,
|
||||||
@@ -4423,6 +4605,12 @@ fn run_with_ui(
|
|||||||
.map_err(|e| anyhow::anyhow!("winit event loop build failed: {e}"))?;
|
.map_err(|e| anyhow::anyhow!("winit event loop build failed: {e}"))?;
|
||||||
let (ui_handles, kernel_bridge) = xenia_ui::build(event_loop.create_proxy());
|
let (ui_handles, kernel_bridge) = xenia_ui::build(event_loop.create_proxy());
|
||||||
kernel.ui = Some(kernel_bridge);
|
kernel.ui = Some(kernel_bridge);
|
||||||
|
// iterate-3O: enable per-draw geometry capture so the UI can replay real
|
||||||
|
// guest draws. Only on the `--ui` path; headless `check` never gets here,
|
||||||
|
// so the deterministic core/golden stays untouched.
|
||||||
|
if let Some(gpu) = kernel.gpu.as_inline_mut() {
|
||||||
|
gpu.enable_frame_capture();
|
||||||
|
}
|
||||||
|
|
||||||
let shutdown = std::sync::Arc::clone(&ui_handles.shutdown);
|
let shutdown = std::sync::Arc::clone(&ui_handles.shutdown);
|
||||||
let title_owned = std::path::Path::new(title)
|
let title_owned = std::path::Path::new(title)
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
{
|
{
|
||||||
"instructions": 2000005,
|
"instructions": 2000073,
|
||||||
"imports": 5635,
|
"imports": 5635,
|
||||||
"unimpl": 0,
|
"unimpl": 0,
|
||||||
"draws": 0,
|
"draws": 0,
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
{
|
{
|
||||||
"instructions": 50000014,
|
"instructions": 50000110,
|
||||||
"imports": 352251,
|
"imports": 243387,
|
||||||
"unimpl": 0,
|
"unimpl": 0,
|
||||||
"draws": 718,
|
"draws": 1279,
|
||||||
"swaps": 147,
|
"swaps": 260,
|
||||||
"unique_render_targets": 2,
|
"unique_render_targets": 2,
|
||||||
"shader_blobs_live": 6,
|
"shader_blobs_live": 6,
|
||||||
"texture_cache_entries": 0
|
"texture_cache_entries": 1
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -79,6 +79,14 @@ pub struct DecodedBlock {
|
|||||||
/// a successful build (`MAX_BLOCK_INSTRS >= 1` and the build walk
|
/// a successful build (`MAX_BLOCK_INSTRS >= 1` and the build walk
|
||||||
/// pushes the first decoded word unconditionally).
|
/// pushes the first decoded word unconditionally).
|
||||||
pub instrs: Vec<DecodedInstr>,
|
pub instrs: Vec<DecodedInstr>,
|
||||||
|
/// True if this block contains a cross-thread synchronization point
|
||||||
|
/// (`PpcOpcode::is_sync_sensitive`: reserved load/store or a memory
|
||||||
|
/// barrier). Computed once at build time. The superblock runner ends
|
||||||
|
/// the run after executing a sync-sensitive block so the lockstep
|
||||||
|
/// interleaving stays fine-grained at exactly those points (preserving
|
||||||
|
/// the cross-thread ordering the 2E/2F/2J boot work depends on),
|
||||||
|
/// while chaining freely through ordinary straight-line blocks.
|
||||||
|
pub sync_sensitive: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Per-slot status from a `lookup_or_build` probe. Internal only.
|
/// Per-slot status from a `lookup_or_build` probe. Internal only.
|
||||||
@@ -187,11 +195,13 @@ fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> Deco
|
|||||||
let mut instrs: Vec<DecodedInstr> = Vec::with_capacity(8);
|
let mut instrs: Vec<DecodedInstr> = Vec::with_capacity(8);
|
||||||
let page_base = start_pc & GUEST_PAGE_MASK;
|
let page_base = start_pc & GUEST_PAGE_MASK;
|
||||||
let mut cur = start_pc;
|
let mut cur = start_pc;
|
||||||
|
let mut sync_sensitive = false;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let raw = mem.read_u32(cur);
|
let raw = mem.read_u32(cur);
|
||||||
let decoded = decode(raw, cur);
|
let decoded = decode(raw, cur);
|
||||||
let terminates = decoded.opcode.terminates_block();
|
let terminates = decoded.opcode.terminates_block();
|
||||||
|
sync_sensitive |= decoded.opcode.is_sync_sensitive();
|
||||||
instrs.push(decoded);
|
instrs.push(decoded);
|
||||||
|
|
||||||
if terminates {
|
if terminates {
|
||||||
@@ -215,6 +225,7 @@ fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> Deco
|
|||||||
end_pc,
|
end_pc,
|
||||||
page_version,
|
page_version,
|
||||||
instrs,
|
instrs,
|
||||||
|
sync_sensitive,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -335,6 +346,40 @@ mod tests {
|
|||||||
assert_eq!(b.end_pc, 0x110);
|
assert_eq!(b.end_pc, 0x110);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn sync_sensitive_flag_set_for_barrier_block() {
|
||||||
|
// A block containing `sync` (0x7C0004AC) must flag sync_sensitive
|
||||||
|
// so the superblock runner ends the chain there (cross-thread
|
||||||
|
// ordering point). `sync` does NOT terminate a block, so it sits
|
||||||
|
// mid-block followed by straight-line code up to a terminator.
|
||||||
|
let mem = BlockTestMem::new();
|
||||||
|
mem.put(0x100, enc_addi(3, 3, 1));
|
||||||
|
mem.put(0x104, 0x7C00_04AC); // sync
|
||||||
|
mem.put(0x108, enc_addi(3, 3, 1));
|
||||||
|
mem.put(0x10C, enc_b_self()); // terminator
|
||||||
|
let mut bc = BlockCache::new();
|
||||||
|
let b = bc.lookup_or_build(0x100, &mem);
|
||||||
|
assert!(
|
||||||
|
b.sync_sensitive,
|
||||||
|
"block containing `sync` must flag sync_sensitive; decoded last={:?}",
|
||||||
|
b.instrs.iter().map(|i| i.opcode).collect::<Vec<_>>()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn sync_sensitive_flag_clear_for_plain_block() {
|
||||||
|
// A straight-line ALU block with no reserved-op / barrier must
|
||||||
|
// NOT flag sync_sensitive (so the superblock runner is free to
|
||||||
|
// chain through it).
|
||||||
|
let mem = BlockTestMem::new();
|
||||||
|
mem.put(0x100, enc_addi(3, 3, 1));
|
||||||
|
mem.put(0x104, enc_addi(3, 3, 1));
|
||||||
|
mem.put(0x108, enc_b_self());
|
||||||
|
let mut bc = BlockCache::new();
|
||||||
|
let b = bc.lookup_or_build(0x100, &mem);
|
||||||
|
assert!(!b.sync_sensitive, "plain ALU block must not flag sync_sensitive");
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn block_stops_at_page_boundary() {
|
fn block_stops_at_page_boundary() {
|
||||||
// Build from 0x1FFC. The next PC (0x2000) is in a different
|
// Build from 0x1FFC. The next PC (0x2000) is in a different
|
||||||
|
|||||||
@@ -204,6 +204,34 @@ impl PpcOpcode {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns true if this opcode is a cross-thread synchronization
|
||||||
|
/// point at which the superblock runner MUST yield back to the
|
||||||
|
/// round-robin scheduler so the lockstep interleaving stays
|
||||||
|
/// fine-grained enough to preserve correct cross-thread ordering:
|
||||||
|
///
|
||||||
|
/// - reserved load/store (`lwarx`/`ldarx`/`stwcx.`/`stdcx.`): the
|
||||||
|
/// atomic primitive other threads race on. Running past one
|
||||||
|
/// without returning to the scheduler would let a single slot
|
||||||
|
/// win/lose a reservation across many blocks before any peer
|
||||||
|
/// observes it.
|
||||||
|
/// - memory barriers (`sync`/`eieio`/`isync`): the guest explicitly
|
||||||
|
/// demands a global ordering point here; honour it by ending the
|
||||||
|
/// superblock so the scheduler re-interleaves.
|
||||||
|
///
|
||||||
|
/// Purely a function of the opcode (no guest data), so the yield
|
||||||
|
/// decision is deterministic and the schedule reproduces byte-identically.
|
||||||
|
/// Note: `sc` (syscall) and traps already `terminates_block`, and
|
||||||
|
/// import-thunk / halt-sentinel PCs are handled by the per-block
|
||||||
|
/// prologue re-check in the superblock loop — they are not listed here.
|
||||||
|
#[inline]
|
||||||
|
pub fn is_sync_sensitive(&self) -> bool {
|
||||||
|
matches!(
|
||||||
|
self,
|
||||||
|
Self::lwarx | Self::ldarx | Self::stwcx | Self::stdcx
|
||||||
|
| Self::sync | Self::eieio | Self::isync
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn name(&self) -> &'static str {
|
pub fn name(&self) -> &'static str {
|
||||||
match self {
|
match self {
|
||||||
Self::Invalid => "invalid",
|
Self::Invalid => "invalid",
|
||||||
|
|||||||
@@ -795,31 +795,46 @@ impl Scheduler {
|
|||||||
/// the fast path — zero bits mean no slot has work and the caller
|
/// the fast path — zero bits mean no slot has work and the caller
|
||||||
/// falls through to `advance_to_next_wake`.
|
/// falls through to `advance_to_next_wake`.
|
||||||
pub fn round_schedule(&mut self) -> Vec<u8> {
|
pub fn round_schedule(&mut self) -> Vec<u8> {
|
||||||
|
let mut buf = [0u8; HW_THREAD_COUNT];
|
||||||
|
let n = self.round_schedule_into(&mut buf);
|
||||||
|
buf[..n].to_vec()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Allocation-free variant of [`Self::round_schedule`] (Tier-A perf #2).
|
||||||
|
/// Fills `buf` with the runnable slot ids and returns the count `n`; the
|
||||||
|
/// valid range is `buf[..n]`. The hot scheduler loop (lockstep +
|
||||||
|
/// parallel) calls this with a reusable stack array so it does not
|
||||||
|
/// `__rust_alloc`/`__rust_dealloc` a fresh `Vec` every round (~7 instr
|
||||||
|
/// apart at boot-to-splash → millions of churned allocations). Identical
|
||||||
|
/// ordering / RNG-advance semantics to `round_schedule`, so the schedule
|
||||||
|
/// — and thus the lockstep digest — is byte-for-byte unchanged.
|
||||||
|
pub fn round_schedule_into(&mut self, buf: &mut [u8; HW_THREAD_COUNT]) -> usize {
|
||||||
if self.non_empty_runnable == 0 {
|
if self.non_empty_runnable == 0 {
|
||||||
return Vec::new();
|
return 0;
|
||||||
}
|
}
|
||||||
let start = self.rotation_cursor as usize;
|
let start = self.rotation_cursor as usize;
|
||||||
let mut out: Vec<u8> = Vec::with_capacity(HW_THREAD_COUNT);
|
let mut n = 0usize;
|
||||||
for off in 0..HW_THREAD_COUNT {
|
for off in 0..HW_THREAD_COUNT {
|
||||||
let i = (start + off) % HW_THREAD_COUNT;
|
let i = (start + off) % HW_THREAD_COUNT;
|
||||||
if self.non_empty_runnable & (1 << i) != 0 {
|
if self.non_empty_runnable & (1 << i) != 0 {
|
||||||
out.push(i as u8);
|
buf[n] = i as u8;
|
||||||
|
n += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Seeded mode layers a deterministic shuffle on top of the
|
// Seeded mode layers a deterministic shuffle on top of the
|
||||||
// already-filtered list. Same spawn/wake sequence + same seed ⇒
|
// already-filtered list. Same spawn/wake sequence + same seed ⇒
|
||||||
// same schedule (invariant preserved from pre-Axis-1).
|
// same schedule (invariant preserved from pre-Axis-1).
|
||||||
if let OrderMode::Seeded { .. } = self.order {
|
if let OrderMode::Seeded { .. } = self.order {
|
||||||
for i in (1..out.len()).rev() {
|
for i in (1..n).rev() {
|
||||||
self.rng_state ^= self.rng_state << 13;
|
self.rng_state ^= self.rng_state << 13;
|
||||||
self.rng_state ^= self.rng_state >> 7;
|
self.rng_state ^= self.rng_state >> 7;
|
||||||
self.rng_state ^= self.rng_state << 17;
|
self.rng_state ^= self.rng_state << 17;
|
||||||
let j = (self.rng_state as usize) % (i + 1);
|
let j = (self.rng_state as usize) % (i + 1);
|
||||||
out.swap(i, j);
|
buf.swap(i, j);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
self.rotation_cursor = ((start + 1) % HW_THREAD_COUNT) as u8;
|
self.rotation_cursor = ((start + 1) % HW_THREAD_COUNT) as u8;
|
||||||
out
|
n
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn begin_round(&mut self) {
|
pub fn begin_round(&mut self) {
|
||||||
@@ -1293,7 +1308,15 @@ impl Scheduler {
|
|||||||
};
|
};
|
||||||
t.quantum_remaining = QUANTUM_DEFAULT;
|
t.quantum_remaining = QUANTUM_DEFAULT;
|
||||||
self.recompute_slot_runnable(r.hw_id);
|
self.recompute_slot_runnable(r.hw_id);
|
||||||
tracing::info!(
|
// DEBUG, not INFO: this fires once per timed-wait deadline-wake, which
|
||||||
|
// during the boot idle-spin happens hundreds of thousands of times. At
|
||||||
|
// INFO it floods the console/log file and throttles the interactive
|
||||||
|
// `exec --ui` path so hard (≈286K lines flushed to disk) that the guest
|
||||||
|
// crawls and never reaches the ~30–150M-instruction splash window —
|
||||||
|
// which masqueraded as a "--ui early termination" (iterate-3R). The
|
||||||
|
// headless `check` path runs `--quiet` (WARN) so it was never throttled.
|
||||||
|
// No execution-semantics change; deterministic golden is unaffected.
|
||||||
|
tracing::debug!(
|
||||||
"scheduler: advanced to deadline {} waking hw={} idx={}",
|
"scheduler: advanced to deadline {} waking hw={} idx={}",
|
||||||
deadline,
|
deadline,
|
||||||
r.hw_id,
|
r.hw_id,
|
||||||
|
|||||||
372
crates/xenia-gpu/src/draw_capture.rs
Normal file
372
crates/xenia-gpu/src/draw_capture.rs
Normal file
@@ -0,0 +1,372 @@
|
|||||||
|
//! Per-draw geometry capture for the host UI's faithful-render path.
|
||||||
|
//!
|
||||||
|
//! The deterministic headless core (`check --gpu-inline`) never touches this
|
||||||
|
//! module — it is populated only when a UI bridge is installed and consumed
|
||||||
|
//! only by `crates/xenia-ui`. The goal is to hand the UI the *real* guest
|
||||||
|
//! geometry behind each `PM4_DRAW_INDX*` packet so it can rasterize the
|
||||||
|
//! actual splash vertices instead of synthetic placeholder shapes.
|
||||||
|
//!
|
||||||
|
//! What the WGSL pipeline needs to reconstruct one draw (see
|
||||||
|
//! `shaders/xenos_interp.wgsl` `vs_main` / `interpret_vertex_fetch`):
|
||||||
|
//! * the active VS/PS blob keys (already published as assets),
|
||||||
|
//! * the primitive type + the host vertex count to issue,
|
||||||
|
//! * the raw guest vertex-buffer bytes for the fetched window, and
|
||||||
|
//! * the *dword base* of that window so the shader can rebase the absolute
|
||||||
|
//! fetch-constant address into the uploaded buffer.
|
||||||
|
//!
|
||||||
|
//! The hard part is sourcing the vertex window: the VS reads a vertex-fetch
|
||||||
|
//! constant (`xe_gpu_vertex_fetch_t`) whose dword-0 carries the absolute
|
||||||
|
//! guest dword address. We parse the active VS, find its first vertex fetch,
|
||||||
|
//! read that fetch constant out of the register file, then copy a bounded
|
||||||
|
//! window of guest memory starting at the fetch base.
|
||||||
|
|
||||||
|
use xenia_memory::access::MemoryAccess;
|
||||||
|
|
||||||
|
use crate::draw_state::{IndexSize, IndexSource, PrimitiveType};
|
||||||
|
use crate::register_file::RegisterFile;
|
||||||
|
|
||||||
|
/// Texture-fetch / vertex-fetch constant region base, in register indices.
|
||||||
|
/// Each fetch constant is 6 dwords (`xe_gpu_*_fetch_t`).
|
||||||
|
const CONST_BASE_FETCH: u32 = 0x4800;
|
||||||
|
|
||||||
|
/// Upper bound (in dwords) on the vertex window we copy per draw. The splash
|
||||||
|
/// UI draws are tiny (3–4 verts × ≤4 dwords); 64 KiB of dwords is generous
|
||||||
|
/// slack while bounding the per-frame copy cost and the 16 MiB host buffer.
|
||||||
|
const MAX_WINDOW_DWORDS: u32 = 16 * 1024;
|
||||||
|
|
||||||
|
/// One captured draw, with enough real state for the UI to replay it through
|
||||||
|
/// the existing wgpu Xenos pipeline.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct DrawCapture {
|
||||||
|
/// Monotonic global draw index (matches `GpuStats::draws_seen` at capture).
|
||||||
|
pub draw_index: u32,
|
||||||
|
/// Xenos primitive-type code (see `SwapInfo::last_draw_prim` encoding).
|
||||||
|
pub prim_code: u32,
|
||||||
|
/// Host vertex count to issue (post primitive-processor rewrite).
|
||||||
|
pub host_vertex_count: u32,
|
||||||
|
/// Active VS blob key at draw time (0 = none).
|
||||||
|
pub vs_key: u32,
|
||||||
|
/// Active PS blob key at draw time (0 = none).
|
||||||
|
pub ps_key: u32,
|
||||||
|
/// Raw guest dwords of the fetched vertex window (host-endian as stored in
|
||||||
|
/// guest memory — the WGSL applies the per-format endian swap). `addr 0`
|
||||||
|
/// of this buffer corresponds to guest dword `window_base_dwords`.
|
||||||
|
pub vertex_dwords: Vec<u32>,
|
||||||
|
/// Guest dword address that maps to index 0 of `vertex_dwords`. The shader
|
||||||
|
/// subtracts this from the fetch-constant base to index `vertex_dwords`.
|
||||||
|
pub window_base_dwords: u32,
|
||||||
|
/// `true` when we successfully resolved a real vertex window. When `false`
|
||||||
|
/// the UI falls back to its procedural geometry for this draw (honest:
|
||||||
|
/// nothing faked, just "couldn't source real vertices").
|
||||||
|
pub has_real_vertices: bool,
|
||||||
|
/// iterate-3S: per-draw NDC transform derived from the guest viewport /
|
||||||
|
/// clip / VTE registers (mirrors canary `GetHostViewportInfo`). The host VS
|
||||||
|
/// converts the guest-VS position to wgpu clip space via
|
||||||
|
/// `clip.xy = pos.xy * ndc_scale + ndc_offset * pos.w`. The Y component
|
||||||
|
/// already carries the render-target → wgpu Y-flip (negated).
|
||||||
|
pub ndc_scale: [f32; 2],
|
||||||
|
pub ndc_offset: [f32; 2],
|
||||||
|
/// iterate-3T: the decoded texture(s) this draw's active pixel shader
|
||||||
|
/// samples, keyed off its real `tfetch` fetch-constant slots (the 3M
|
||||||
|
/// decoder makes these decode). The UI uploads + binds the FIRST entry
|
||||||
|
/// per-draw so the textured logo samples the real artwork instead of the
|
||||||
|
/// magenta stub. Empty for flat (no-tfetch) draws. Populated by
|
||||||
|
/// `gpu_system` after decode (left empty by `build`).
|
||||||
|
///
|
||||||
|
/// Each entry is `(key, content_version, bytes)`. iterate-3AD: the
|
||||||
|
/// `content_version` (from `span_max_version` over the texel span) lets the
|
||||||
|
/// UI host texture cache RE-UPLOAD when the guest fills more of an evolving
|
||||||
|
/// atlas. The publisher and the 2nd splash logo share one K8888 surface
|
||||||
|
/// (base `0x4dbee000`); the 2nd logo's texels are CPU-written *after* the
|
||||||
|
/// publisher's first upload. Without the real version the host cache (which
|
||||||
|
/// previously pinned `version_when_uploaded = 1`) kept the first partial
|
||||||
|
/// upload, so the 2nd logo sampled its still-zero atlas region as black.
|
||||||
|
pub textures: Vec<(crate::texture_cache::TextureKey, u64, Vec<u8>)>,
|
||||||
|
/// iterate-3Y: per-draw color/blend render state captured from the
|
||||||
|
/// register file so the host pipeline composites the way the guest
|
||||||
|
/// intends (instead of one fixed alpha-blend state). Mirrors the fields
|
||||||
|
/// canary feeds into `GetCurrentStateDescription` (D3D12
|
||||||
|
/// `pipeline_cache.cc`):
|
||||||
|
/// * `blend_control` = `RB_BLENDCONTROL0` (RT0 src/dst factors + op,
|
||||||
|
/// color and alpha). The Xbox 360 has no separate "blend enable" bit;
|
||||||
|
/// `One,Zero,Add` *is* the opaque case.
|
||||||
|
/// * `color_mask` = RT0 nibble of `RB_COLOR_MASK` (per-channel write
|
||||||
|
/// enable). When 0, canary forces `One,Zero` (no blend).
|
||||||
|
/// * `color_control` = `RB_COLORCONTROL` (alpha-test enable/func).
|
||||||
|
/// * `depth_control` = `RB_DEPTHCONTROL` (z-test enable/func/write).
|
||||||
|
pub blend_control: u32,
|
||||||
|
pub color_mask: u8,
|
||||||
|
pub color_control: u32,
|
||||||
|
pub depth_control: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// iterate-3S: compute the guest→host NDC XY transform for a draw, mirroring
|
||||||
|
/// canary's `draw_util.cc::GetHostViewportInfo` (the XY half). The Xbox 360 VS
|
||||||
|
/// emits a clip-space position which the HW then scales/offsets by the viewport
|
||||||
|
/// (`PA_CL_VPORT_*`, gated by `PA_CL_VTE_CNTL`) into render-target pixels, OR,
|
||||||
|
/// when clipping is disabled (`PA_CL_CLIP_CNTL.clip_disable`), the VS emits
|
||||||
|
/// render-target-pixel coordinates directly (the screen-space UI / clear case —
|
||||||
|
/// this is what Sylpheed's splash quads do). Either way we must rescale into the
|
||||||
|
/// host's [-1,1] clip space and flip Y (render-target Y-down → wgpu Y-up).
|
||||||
|
///
|
||||||
|
/// Returns `(ndc_scale[2], ndc_offset[2])` such that
|
||||||
|
/// `host_clip.xy = guest_pos.xy * ndc_scale + ndc_offset * guest_pos.w`.
|
||||||
|
/// The Y entries are pre-negated to flip into wgpu's Y-up clip space.
|
||||||
|
pub fn compute_ndc_xy(rf: &RegisterFile) -> ([f32; 2], [f32; 2]) {
|
||||||
|
const PA_CL_CLIP_CNTL: u32 = 0x2204;
|
||||||
|
const PA_SU_SC_MODE_CNTL: u32 = 0x2205;
|
||||||
|
const PA_CL_VTE_CNTL: u32 = 0x2206;
|
||||||
|
const PA_SU_VTX_CNTL: u32 = 0x2302;
|
||||||
|
const PA_CL_VPORT_XSCALE: u32 = 0x210F;
|
||||||
|
const PA_CL_VPORT_XOFFSET: u32 = 0x2110;
|
||||||
|
const PA_CL_VPORT_YSCALE: u32 = 0x2111;
|
||||||
|
const PA_CL_VPORT_YOFFSET: u32 = 0x2112;
|
||||||
|
const PA_SC_WINDOW_OFFSET: u32 = 0x2080;
|
||||||
|
const PA_SC_WINDOW_SCISSOR_BR: u32 = 0x2082;
|
||||||
|
const RB_SURFACE_INFO: u32 = 0x2000;
|
||||||
|
|
||||||
|
let clip_cntl = rf.read(PA_CL_CLIP_CNTL);
|
||||||
|
let vte = rf.read(PA_CL_VTE_CNTL);
|
||||||
|
let su_sc_mode = rf.read(PA_SU_SC_MODE_CNTL);
|
||||||
|
let su_vtx = rf.read(PA_SU_VTX_CNTL);
|
||||||
|
let fbits = |r: u32| f32::from_bits(rf.read(r));
|
||||||
|
|
||||||
|
// VTE enable bits (xenos.h PA_CL_VTE_CNTL): bit0 vport_x_scale_ena,
|
||||||
|
// bit1 vport_x_offset_ena, bit2 vport_y_scale_ena, bit3 vport_y_offset_ena.
|
||||||
|
let scale_x = if vte & (1 << 0) != 0 { fbits(PA_CL_VPORT_XSCALE) } else { 1.0 };
|
||||||
|
let off_x = if vte & (1 << 1) != 0 { fbits(PA_CL_VPORT_XOFFSET) } else { 0.0 };
|
||||||
|
let scale_y = if vte & (1 << 2) != 0 { fbits(PA_CL_VPORT_YSCALE) } else { 1.0 };
|
||||||
|
let off_y = if vte & (1 << 3) != 0 { fbits(PA_CL_VPORT_YOFFSET) } else { 0.0 };
|
||||||
|
|
||||||
|
// Render-target extent in guest pixels: clamp to the texture max (2048),
|
||||||
|
// sourced from the window scissor BR (matches canary `x_max`/`y_max`).
|
||||||
|
let br = rf.read(PA_SC_WINDOW_SCISSOR_BR);
|
||||||
|
let x_max = ((br & 0x7FFF).max(1)).min(2048) as f32;
|
||||||
|
let y_max = (((br >> 16) & 0x7FFF).max(1)).min(2048) as f32;
|
||||||
|
let _ = RB_SURFACE_INFO;
|
||||||
|
|
||||||
|
// Half-pixel + window offsets added in render-target pixels.
|
||||||
|
let mut add_x = 0.0f32;
|
||||||
|
let mut add_y = 0.0f32;
|
||||||
|
if su_sc_mode & (1 << 16) != 0 {
|
||||||
|
let wo = rf.read(PA_SC_WINDOW_OFFSET);
|
||||||
|
// 15-bit signed each (x: [14:0], y: [30:16]).
|
||||||
|
let sx = (((wo & 0x7FFF) << 1) as i32) >> 1;
|
||||||
|
let sy = ((((wo >> 16) & 0x7FFF) << 1) as i32) >> 1;
|
||||||
|
add_x += sx as f32;
|
||||||
|
add_y += sy as f32;
|
||||||
|
}
|
||||||
|
if su_vtx & 1 == 0 {
|
||||||
|
// pix_center == kD3DZero → +0.5 half-pixel offset.
|
||||||
|
add_x += 0.5;
|
||||||
|
add_y += 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
let (s, o);
|
||||||
|
if clip_cntl & (1 << 16) != 0 {
|
||||||
|
// clip_disable: VS outputs render-target-*pixel* coords (Y-DOWN: pixel
|
||||||
|
// y=0 is the top row of the render target). Rescale the whole RT extent
|
||||||
|
// to [-1,1] and FLIP Y so pixel-top → wgpu clip-top (canary's
|
||||||
|
// huge-host-viewport path; the framebuffer→clip flip is real here).
|
||||||
|
let px2ndc_x = 2.0 / x_max;
|
||||||
|
let px2ndc_y = 2.0 / y_max;
|
||||||
|
let sx = scale_x * px2ndc_x;
|
||||||
|
let ox = (off_x - x_max * 0.5 + add_x) * px2ndc_x;
|
||||||
|
let sy = scale_y * px2ndc_y;
|
||||||
|
let oy = (off_y - y_max * 0.5 + add_y) * px2ndc_y;
|
||||||
|
// Flip Y: pixel-Y-down → wgpu clip-Y-up.
|
||||||
|
s = [sx, -sy];
|
||||||
|
o = [ox, -oy];
|
||||||
|
} else {
|
||||||
|
// iterate-3AA (DEFECT 1 ROOT): clipping enabled → the VS already emits
|
||||||
|
// *clip-space* coordinates (Y-UP: +Y is the top of the screen), exactly
|
||||||
|
// the convention the Xbox 360's D3D9 and wgpu BOTH use for clip space
|
||||||
|
// (NDC +Y → framebuffer top in each API; the framebuffer Y-direction is
|
||||||
|
// an internal viewport detail handled identically by both). A clip-space
|
||||||
|
// position is therefore portable to wgpu with NO Y-flip. The previous
|
||||||
|
// code unconditionally negated Y (the same flip the screen-space pixel
|
||||||
|
// path needs), which mirrored the publisher logo vertically: its quad is
|
||||||
|
// centered (±0.085 around 0) so the *position* stayed centered, but the
|
||||||
|
// negation swapped top↔bottom vertices while the texture V was unchanged
|
||||||
|
// → the sampled sub-rect (UV v 0.001→0.090) read bottom-up → "SQUARE
|
||||||
|
// ENIX" rendered upside down in place. Measured (readback): the red dots
|
||||||
|
// sit at 43% from the texture top but rendered at 58% from the top
|
||||||
|
// (= a clean vertical mirror); removing the flip restores them to 43%.
|
||||||
|
// Identity XY (no flip) maps guest clip-Y-up straight to wgpu clip-Y-up.
|
||||||
|
s = [1.0, 1.0];
|
||||||
|
o = [0.0, 0.0];
|
||||||
|
return (s, o);
|
||||||
|
}
|
||||||
|
(s, o)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Encode a [`PrimitiveType`] as the raw Xenos code used across the bridge.
|
||||||
|
pub fn prim_code(p: PrimitiveType) -> u32 {
|
||||||
|
match p {
|
||||||
|
PrimitiveType::None => 0,
|
||||||
|
PrimitiveType::PointList => 1,
|
||||||
|
PrimitiveType::LineList => 2,
|
||||||
|
PrimitiveType::LineStrip => 3,
|
||||||
|
PrimitiveType::TriangleList => 4,
|
||||||
|
PrimitiveType::TriangleFan => 5,
|
||||||
|
PrimitiveType::TriangleStrip => 6,
|
||||||
|
PrimitiveType::RectangleList => 8,
|
||||||
|
PrimitiveType::QuadList => 13,
|
||||||
|
PrimitiveType::Unknown(x) => x as u32,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Resolve the first vertex-fetch window referenced by the parsed VS.
|
||||||
|
///
|
||||||
|
/// Walks the VS instruction stream for the first `vfetch` (mini) instruction,
|
||||||
|
/// reads its fetch constant from `rf`, and copies a bounded window of guest
|
||||||
|
/// memory starting at the fetch base. Returns `(dwords, window_base_dwords)`
|
||||||
|
/// or `None` if the VS has no vertex fetch or the constant is malformed.
|
||||||
|
fn resolve_vertex_window(
|
||||||
|
parsed_vs: &crate::ucode::ParsedShader,
|
||||||
|
rf: &RegisterFile,
|
||||||
|
mem: &dyn MemoryAccess,
|
||||||
|
) -> Option<(Vec<u32>, u32)> {
|
||||||
|
// iterate-3W (GPUBUG-109): the instruction block packs ALU and fetch
|
||||||
|
// instructions identically (96 bits / 3 dwords each); ONLY the owning
|
||||||
|
// `Exec` control-flow clause's `sequence` bitmap (2 bits per instruction,
|
||||||
|
// bit[2*i]=fetch/ALU) tells them apart. The previous blind triple-walk
|
||||||
|
// decoded ALU triples as fetches → garbage fetch-constant indices and a
|
||||||
|
// bogus `type==3` guard, never reaching the real vertex fetch. Walk the CF
|
||||||
|
// exec clauses exactly as the translator does (`translator.rs::emit_exec`)
|
||||||
|
// and take the FIRST sequence-flagged *vertex* fetch.
|
||||||
|
let instrs = &parsed_vs.instructions;
|
||||||
|
let mut const_off: Option<u32> = None;
|
||||||
|
'clauses: for clause in &parsed_vs.cf {
|
||||||
|
let crate::ucode::control_flow::ControlFlowInstruction::Exec {
|
||||||
|
address,
|
||||||
|
count,
|
||||||
|
sequence,
|
||||||
|
..
|
||||||
|
} = *clause
|
||||||
|
else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
for i in 0..(count as usize) {
|
||||||
|
// bit[2*i] of the sequence bitmap: 1 = fetch, 0 = ALU.
|
||||||
|
if (sequence >> (i * 2)) & 1 == 0 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let base = (address as usize + i) * 3;
|
||||||
|
if base + 2 >= instrs.len() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if let crate::ucode::fetch::FetchInstruction::Vertex(vf) =
|
||||||
|
crate::ucode::fetch::decode_fetch([instrs[base], instrs[base + 1], instrs[base + 2]])
|
||||||
|
{
|
||||||
|
const_off = Some(vf.const_reg_offset());
|
||||||
|
break 'clauses;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// iterate-3X (GPUBUG-110): vertex fetch constants are addressed by
|
||||||
|
// `const_index * 3 + const_index_sel` (canary `ucode.h:700` —
|
||||||
|
// `VertexFetchInstruction::fetch_constant_index`), NOT by `const_index`
|
||||||
|
// alone. The register region packs 3 two-dword vertex-fetch constants per
|
||||||
|
// 6-dword group, so the constant lives at
|
||||||
|
// `0x4800 + const_index*6 + const_index_sel*2`. The previous decode dropped
|
||||||
|
// `const_index_sel` and read sub-slot 0 (`fc*6`), which for the publisher
|
||||||
|
// logo (`const_index=31, sel=2`) held `0x00000001` (an unused slot) instead
|
||||||
|
// of the real vertex-buffer base at sub-slot 2 (`0x48BE`). That made
|
||||||
|
// `has_real_vertices=false` → the logo fell to the procedural fullscreen
|
||||||
|
// magenta fallback. (Refutes iterate-3W's "geometry is auto-generated from
|
||||||
|
// vertex_id" — measured: the real fetch constant is a 4-vertex QuadList
|
||||||
|
// buffer at `0x0adf60f0`.)
|
||||||
|
let const_reg = CONST_BASE_FETCH + const_off?;
|
||||||
|
let dword0 = rf.read(const_reg);
|
||||||
|
let dword1 = rf.read(const_reg + 1);
|
||||||
|
// address:30 at bits[31:2] of dword0 (in bytes once masked). The fetch
|
||||||
|
// constant carries a guest *physical* dword address — canary reads the
|
||||||
|
// vertex buffer via `Memory::TranslatePhysical(fetch.address * 4)`
|
||||||
|
// (`draw_util.cc:961`). On the Xbox 360 the physical range is mirrored at
|
||||||
|
// several virtual windows; ours only maps the cached-physical window at
|
||||||
|
// `0x4000_0000` (`gpu_system::physical_to_backing`). Reading the bare low
|
||||||
|
// address (`0x0adf_xxxx`) hits an unmapped VA and returns zeros, so rebase
|
||||||
|
// a low physical base onto the mapped `0x4000_0000` alias when the raw VA
|
||||||
|
// is not itself mapped. `window_base_dwords` keeps the *original* base so
|
||||||
|
// the shader's rebase against the (unmodified) fetch-constant address still
|
||||||
|
// indexes the uploaded window correctly.
|
||||||
|
let base_bytes = dword0 & 0xFFFF_FFFC;
|
||||||
|
if base_bytes == 0 {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let read_base = if mem.translate(base_bytes).is_some() {
|
||||||
|
base_bytes
|
||||||
|
} else if base_bytes < 0x2000_0000 && mem.translate(base_bytes | 0x4000_0000).is_some() {
|
||||||
|
base_bytes | 0x4000_0000
|
||||||
|
} else {
|
||||||
|
base_bytes
|
||||||
|
};
|
||||||
|
// size:24 at bits[25:2] of dword1, in dwords. Clamp to our window cap.
|
||||||
|
let size_dwords = ((dword1 >> 2) & 0x00FF_FFFF).clamp(1, MAX_WINDOW_DWORDS);
|
||||||
|
let window_base_dwords = base_bytes >> 2;
|
||||||
|
let mut dwords = Vec::with_capacity(size_dwords as usize);
|
||||||
|
for i in 0..size_dwords {
|
||||||
|
let addr = read_base.wrapping_add(i * 4);
|
||||||
|
if addr < read_base {
|
||||||
|
break; // wrap guard
|
||||||
|
}
|
||||||
|
// `read_u32` composes big-endian bytes into the u32 value; the WGSL's
|
||||||
|
// `gpu_swap` expects the *raw little-endian dword* as it sits in guest
|
||||||
|
// memory, so undo the BE composition with `swap_bytes`.
|
||||||
|
dwords.push(mem.read_u32(addr).swap_bytes());
|
||||||
|
}
|
||||||
|
if dwords.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
Some((dwords, window_base_dwords))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build a [`DrawCapture`] for one draw. Best-effort: when the vertex window
|
||||||
|
/// can't be resolved, `has_real_vertices` is `false` and the UI falls back to
|
||||||
|
/// procedural geometry (never fabricated pixels).
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
pub fn build(
|
||||||
|
draw_index: u32,
|
||||||
|
primitive: PrimitiveType,
|
||||||
|
host_vertex_count: u32,
|
||||||
|
_index_source: IndexSource,
|
||||||
|
_index_size: IndexSize,
|
||||||
|
vs_key: u32,
|
||||||
|
ps_key: u32,
|
||||||
|
parsed_vs: Option<&crate::ucode::ParsedShader>,
|
||||||
|
rf: &RegisterFile,
|
||||||
|
mem: &dyn MemoryAccess,
|
||||||
|
) -> DrawCapture {
|
||||||
|
let (vertex_dwords, window_base_dwords, has_real) = match parsed_vs
|
||||||
|
.and_then(|vs| resolve_vertex_window(vs, rf, mem))
|
||||||
|
{
|
||||||
|
Some((d, base)) => (d, base, true),
|
||||||
|
None => (Vec::new(), 0, false),
|
||||||
|
};
|
||||||
|
let (ndc_scale, ndc_offset) = compute_ndc_xy(rf);
|
||||||
|
// iterate-3Y: capture RT0 color/blend/depth render state. Registers per
|
||||||
|
// canary `registers.h`: RB_BLENDCONTROL0=0x2201, RB_COLOR_MASK=0x2104
|
||||||
|
// (RT0 = bits[3:0]), RB_COLORCONTROL=0x2202, RB_DEPTHCONTROL=0x2200.
|
||||||
|
const RB_BLENDCONTROL_0: u32 = 0x2201;
|
||||||
|
const RB_COLOR_MASK: u32 = 0x2104;
|
||||||
|
const RB_COLORCONTROL: u32 = 0x2202;
|
||||||
|
const RB_DEPTHCONTROL: u32 = 0x2200;
|
||||||
|
DrawCapture {
|
||||||
|
draw_index,
|
||||||
|
prim_code: prim_code(primitive),
|
||||||
|
host_vertex_count,
|
||||||
|
vs_key,
|
||||||
|
ps_key,
|
||||||
|
vertex_dwords,
|
||||||
|
window_base_dwords,
|
||||||
|
has_real_vertices: has_real,
|
||||||
|
ndc_scale,
|
||||||
|
ndc_offset,
|
||||||
|
textures: Vec::new(),
|
||||||
|
blend_control: rf.read(RB_BLENDCONTROL_0),
|
||||||
|
color_mask: (rf.read(RB_COLOR_MASK) & 0xF) as u8,
|
||||||
|
color_control: rf.read(RB_COLORCONTROL),
|
||||||
|
depth_control: rf.read(RB_DEPTHCONTROL),
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -429,13 +429,19 @@ pub struct GpuSystem {
|
|||||||
/// hardcoded slot). `vd_swap` publishes the first of these to the UI so
|
/// hardcoded slot). `vd_swap` publishes the first of these to the UI so
|
||||||
/// the replay binds the texture the draw actually samples. Cleared and
|
/// the replay binds the texture the draw actually samples. Cleared and
|
||||||
/// repopulated each draw; empty when the active PS issues no `tfetch`.
|
/// repopulated each draw; empty when the active PS issues no `tfetch`.
|
||||||
pub last_draw_textures: Vec<(crate::texture_cache::TextureKey, Vec<u8>)>,
|
pub last_draw_textures: Vec<(crate::texture_cache::TextureKey, u64, Vec<u8>)>,
|
||||||
/// 10 MiB shadow of the Xenos EDRAM. Written by clear-resolves and
|
/// 10 MiB shadow of the Xenos EDRAM. Written by clear-resolves and
|
||||||
/// (future) host-render-target readback; read by the resolve byte-copy
|
/// (future) host-render-target readback; read by the resolve byte-copy
|
||||||
/// path that writes tiled pixels into guest memory. Allocated once at
|
/// path that writes tiled pixels into guest memory. Allocated once at
|
||||||
/// `GpuSystem::new` and lives for the whole GPU lifetime — no
|
/// `GpuSystem::new` and lives for the whole GPU lifetime — no
|
||||||
/// per-frame churn.
|
/// per-frame churn.
|
||||||
pub edram: crate::edram::ShadowEdram,
|
pub edram: crate::edram::ShadowEdram,
|
||||||
|
/// UI-only: when `Some`, every `PM4_DRAW_INDX*` appends a
|
||||||
|
/// [`crate::draw_capture::DrawCapture`] here so the host UI can replay the
|
||||||
|
/// real guest geometry. `None` in headless/deterministic mode — the
|
||||||
|
/// `--gpu-inline` golden never enables this, so capture is entirely inert
|
||||||
|
/// for `check`. Drained (taken) by `vd_swap` at each present.
|
||||||
|
pub frame_captures: Option<Vec<crate::draw_capture::DrawCapture>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl GpuSystem {
|
impl GpuSystem {
|
||||||
@@ -463,6 +469,15 @@ impl GpuSystem {
|
|||||||
texture_cache: crate::texture_cache::TextureCache::new(),
|
texture_cache: crate::texture_cache::TextureCache::new(),
|
||||||
last_draw_textures: Vec::new(),
|
last_draw_textures: Vec::new(),
|
||||||
edram: crate::edram::ShadowEdram::new(),
|
edram: crate::edram::ShadowEdram::new(),
|
||||||
|
frame_captures: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Enable per-draw geometry capture for the host UI. Inert (and never
|
||||||
|
/// called) in headless/deterministic mode. Idempotent.
|
||||||
|
pub fn enable_frame_capture(&mut self) {
|
||||||
|
if self.frame_captures.is_none() {
|
||||||
|
self.frame_captures = Some(Vec::new());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1295,8 +1310,56 @@ impl GpuSystem {
|
|||||||
"gpu: DRAW_INDX captured"
|
"gpu: DRAW_INDX captured"
|
||||||
);
|
);
|
||||||
self.last_draw = Some(ds);
|
self.last_draw = Some(ds);
|
||||||
|
let host_vertex_count = processed.host_vertex_count;
|
||||||
self.last_primitive = Some(processed);
|
self.last_primitive = Some(processed);
|
||||||
|
|
||||||
|
// iterate-3O: UI-only per-draw geometry capture. Resolves the
|
||||||
|
// real guest vertex window behind this draw (from the active
|
||||||
|
// VS's vertex-fetch constant) so the host UI can replay the
|
||||||
|
// actual splash geometry instead of synthetic shapes. Entirely
|
||||||
|
// inert in headless/deterministic mode (`frame_captures` is
|
||||||
|
// `None`), so the `--gpu-inline` golden is unaffected.
|
||||||
|
if self.frame_captures.is_some() {
|
||||||
|
let vs_key = self.active_vs_key.unwrap_or(0);
|
||||||
|
let ps_key = self.active_ps_key.unwrap_or(0);
|
||||||
|
let parsed_vs = self
|
||||||
|
.active_vs_key
|
||||||
|
.and_then(|k| self.shader_blobs.get(&k))
|
||||||
|
.map(|b| crate::ucode::parse_shader(&b.dwords));
|
||||||
|
let (idx_src, idx_size) = match ds.index_source {
|
||||||
|
crate::draw_state::IndexSource::Dma { index_size, .. } => {
|
||||||
|
(ds.index_source, index_size)
|
||||||
|
}
|
||||||
|
crate::draw_state::IndexSource::Immediate { index_size } => {
|
||||||
|
(ds.index_source, index_size)
|
||||||
|
}
|
||||||
|
crate::draw_state::IndexSource::AutoIndex => {
|
||||||
|
(ds.index_source, crate::draw_state::IndexSize::Sixteen)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let cap = crate::draw_capture::build(
|
||||||
|
self.stats.draws_seen as u32,
|
||||||
|
ds.primitive,
|
||||||
|
host_vertex_count,
|
||||||
|
idx_src,
|
||||||
|
idx_size,
|
||||||
|
vs_key,
|
||||||
|
ps_key,
|
||||||
|
parsed_vs.as_ref(),
|
||||||
|
&self.register_file,
|
||||||
|
mem,
|
||||||
|
);
|
||||||
|
if let Some(caps) = self.frame_captures.as_mut() {
|
||||||
|
// Bound the per-frame list so a runaway frame can't grow
|
||||||
|
// host memory without limit; keep the most recent.
|
||||||
|
const MAX_CAPS: usize = 4096;
|
||||||
|
if caps.len() >= MAX_CAPS {
|
||||||
|
caps.remove(0);
|
||||||
|
}
|
||||||
|
caps.push(cap);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// P5b: decode the textures the *active pixel shader* actually
|
// P5b: decode the textures the *active pixel shader* actually
|
||||||
// samples. Parse the bound PS, collect its `tfetch`
|
// samples. Parse the bound PS, collect its `tfetch`
|
||||||
// fetch-constant slots, read each 6-dword fetch constant from
|
// fetch-constant slots, read each 6-dword fetch constant from
|
||||||
@@ -1322,9 +1385,24 @@ impl GpuSystem {
|
|||||||
.register_file
|
.register_file
|
||||||
.read(CONST_BASE_FETCH + slot as u32 * 6 + k as u32);
|
.read(CONST_BASE_FETCH + slot as u32 * 6 + k as u32);
|
||||||
}
|
}
|
||||||
let Some(key) = crate::texture_cache::decode_fetch_constant(fetch6) else {
|
let Some(mut key) = crate::texture_cache::decode_fetch_constant(fetch6) else {
|
||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
|
// The Xenos texture fetch constant carries a guest
|
||||||
|
// *physical* base address (`base >> 12`). On the Xbox
|
||||||
|
// 360 the GPU reads the unified physical memory; the
|
||||||
|
// CPU writes the (decompressed) texels through its
|
||||||
|
// cached-physical aperture, which ours backs at the
|
||||||
|
// committed `0x4000_0000` window. Map the physical
|
||||||
|
// base onto that backing window so the GPU samples the
|
||||||
|
// bytes the guest actually wrote — exactly as the
|
||||||
|
// vertex-fetch path does (`draw_capture.rs`) and as
|
||||||
|
// canary reads textures through its GPU shared memory
|
||||||
|
// (= physical). Without this the decode reads the
|
||||||
|
// low VA `0x0dbee000` (always zero) instead of the
|
||||||
|
// filled `0x4dbee000`, flattening every disk-asset
|
||||||
|
// texture (e.g. the publisher logo `E59B2B3D`).
|
||||||
|
key.base_address = physical_to_backing(key.base_address);
|
||||||
let bi = key.format.block_info();
|
let bi = key.format.block_info();
|
||||||
let span_bytes = (key.pitch_texels as u32)
|
let span_bytes = (key.pitch_texels as u32)
|
||||||
* (key.height as u32)
|
* (key.height as u32)
|
||||||
@@ -1333,7 +1411,17 @@ impl GpuSystem {
|
|||||||
let version = span_max_version(mem, key.base_address, span_bytes.max(4));
|
let version = span_max_version(mem, key.base_address, span_bytes.max(4));
|
||||||
match self.texture_cache.ensure_cached(key, version, mem) {
|
match self.texture_cache.ensure_cached(key, version, mem) {
|
||||||
Ok(entry) => {
|
Ok(entry) => {
|
||||||
self.last_draw_textures.push((entry.key, entry.bytes.clone()));
|
// iterate-3AD: carry the real content `version`
|
||||||
|
// (from `span_max_version`) so the UI host
|
||||||
|
// texture cache re-uploads when the guest fills
|
||||||
|
// more of an evolving atlas (e.g. the 2nd splash
|
||||||
|
// logo's texels land after the publisher's, in
|
||||||
|
// the SAME K8888 surface). Previously the UI
|
||||||
|
// pinned `version_when_uploaded = 1`, so the
|
||||||
|
// first (partial) upload stuck and later draws
|
||||||
|
// sampled the not-yet-filled region as black.
|
||||||
|
self.last_draw_textures
|
||||||
|
.push((entry.key, version, entry.bytes.clone()));
|
||||||
metrics::counter!(
|
metrics::counter!(
|
||||||
"gpu.texture.decode",
|
"gpu.texture.decode",
|
||||||
"fmt" => format!("{:?}", key.format),
|
"fmt" => format!("{:?}", key.format),
|
||||||
@@ -1350,6 +1438,18 @@ impl GpuSystem {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// iterate-3T: attach this draw's decoded textures to the just-
|
||||||
|
// captured draw so the UI can bind the real artwork per-draw
|
||||||
|
// (keyed off the active PS's real tfetch slots) instead of a
|
||||||
|
// single last-draw `primary_texture`. UI-only (`frame_captures`
|
||||||
|
// is `None` headless); does not touch the deterministic core.
|
||||||
|
if !self.last_draw_textures.is_empty()
|
||||||
|
&& let Some(caps) = self.frame_captures.as_mut()
|
||||||
|
&& let Some(last) = caps.last_mut()
|
||||||
|
{
|
||||||
|
last.textures = self.last_draw_textures.clone();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => {
|
pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => {
|
||||||
// payload[0] = offset_type — bits[10:0] index, bits[23:16] type
|
// payload[0] = offset_type — bits[10:0] index, bits[23:16] type
|
||||||
|
|||||||
@@ -444,6 +444,23 @@ impl GpuBackend {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Current guest present (`VdSwap`) count. Cheap single-field read used
|
||||||
|
/// by the present-anchored vsync ticker (iterate-3AJ) every scheduler
|
||||||
|
/// round. Inline mode reads the live counter directly; threaded mode
|
||||||
|
/// reads the last-published digest mirror under a brief lock (the
|
||||||
|
/// `--parallel` path uses the wall-clock vsync ticker anyway, so the
|
||||||
|
/// exact freshness here is not load-bearing).
|
||||||
|
pub fn swaps_seen(&self) -> u64 {
|
||||||
|
match self {
|
||||||
|
GpuBackend::Inline(s) => s.stats.swaps_seen,
|
||||||
|
GpuBackend::Threaded(h) => h
|
||||||
|
.digest
|
||||||
|
.lock()
|
||||||
|
.map(|d| d.stats.swaps_seen)
|
||||||
|
.unwrap_or(0),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Forward [`GpuSystem::has_pending_interrupts`] under inline mode;
|
/// Forward [`GpuSystem::has_pending_interrupts`] under inline mode;
|
||||||
/// under threaded mode peek the `int_rx` channel.
|
/// under threaded mode peek the `int_rx` channel.
|
||||||
pub fn has_pending_interrupts(&self) -> bool {
|
pub fn has_pending_interrupts(&self) -> bool {
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
//! [`gpu_system::GpuSystem`].
|
//! [`gpu_system::GpuSystem`].
|
||||||
|
|
||||||
pub mod command_processor;
|
pub mod command_processor;
|
||||||
|
pub mod draw_capture;
|
||||||
pub mod draw_state;
|
pub mod draw_state;
|
||||||
pub mod edram;
|
pub mod edram;
|
||||||
pub mod gpu_system;
|
pub mod gpu_system;
|
||||||
|
|||||||
@@ -45,8 +45,9 @@ pub fn emit_for(parsed: &ParsedShader, stage: &'static str) {
|
|||||||
parsed.instructions[base + 1],
|
parsed.instructions[base + 1],
|
||||||
parsed.instructions[base + 2],
|
parsed.instructions[base + 2],
|
||||||
];
|
];
|
||||||
// sequence bit layout: 2 bits per triple, hi bit = is-fetch.
|
// sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0),
|
||||||
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
|
// bit[1]=serialize (Xenos `ucode.h:226`).
|
||||||
|
let is_fetch = ((sequence >> (i * 2)) & 1) != 0;
|
||||||
if is_fetch {
|
if is_fetch {
|
||||||
match decode_fetch(words) {
|
match decode_fetch(words) {
|
||||||
FetchInstruction::Vertex(_) => vfetch_count += 1,
|
FetchInstruction::Vertex(_) => vfetch_count += 1,
|
||||||
@@ -196,8 +197,9 @@ pub fn tfetch_slots(parsed: &ParsedShader) -> Vec<u8> {
|
|||||||
if base + 2 >= parsed.instructions.len() {
|
if base + 2 >= parsed.instructions.len() {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// sequence bit layout: 2 bits per triple, hi bit = is-fetch.
|
// sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0),
|
||||||
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
|
// bit[1]=serialize (Xenos `ucode.h:226`).
|
||||||
|
let is_fetch = ((sequence >> (i * 2)) & 1) != 0;
|
||||||
if !is_fetch {
|
if !is_fetch {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -345,17 +347,17 @@ mod tests {
|
|||||||
/// fetch (and dedup), and return empty for a flat ALU-only shader.
|
/// fetch (and dedup), and return empty for a flat ALU-only shader.
|
||||||
#[test]
|
#[test]
|
||||||
fn tfetch_slots_extracts_texture_fetch_constants() {
|
fn tfetch_slots_extracts_texture_fetch_constants() {
|
||||||
// word0: opcode TEXTURE_FETCH (0x01) in low 5 bits, fetch_const=3 in
|
// word0: opcode TEXTURE_FETCH (0x01) in low 5 bits, const_index=3 in
|
||||||
// bits[9:5] → 0x01 | (3 << 5) = 0x61.
|
// bits[24:20] (Xenos `ucode.h:844`) → 0x01 | (3 << 20).
|
||||||
let tfetch_w0: u32 = 0x01 | (3u32 << 5);
|
let tfetch_w0: u32 = 0x01 | (3u32 << 20);
|
||||||
let shader = ParsedShader {
|
let shader = ParsedShader {
|
||||||
cf: vec![
|
cf: vec![
|
||||||
ControlFlowInstruction::Exec {
|
ControlFlowInstruction::Exec {
|
||||||
address: 0,
|
address: 0,
|
||||||
count: 2,
|
count: 2,
|
||||||
// triple 0 is a fetch (hi bit of its 2-bit field set),
|
// instruction 0 is a fetch (bit[0] of its 2-bit field set),
|
||||||
// triple 1 is ALU. is_fetch = (sequence >> (i*2+1)) & 1.
|
// instruction 1 is ALU. is_fetch = (sequence >> (i*2)) & 1.
|
||||||
sequence: 0b00_10,
|
sequence: 0b00_01,
|
||||||
is_end: false,
|
is_end: false,
|
||||||
predicated: false,
|
predicated: false,
|
||||||
predicate_condition: false,
|
predicate_condition: false,
|
||||||
|
|||||||
@@ -20,7 +20,15 @@ struct XenosDrawConstants {
|
|||||||
draw_index: u32,
|
draw_index: u32,
|
||||||
vertex_count: u32,
|
vertex_count: u32,
|
||||||
prim_kind: u32,
|
prim_kind: u32,
|
||||||
_pad: u32,
|
// iterate-3O: guest dword address that maps to index 0 of `vertex_buffer`.
|
||||||
|
// The CPU uploads a bounded guest-memory window starting at the active
|
||||||
|
// vertex-fetch base; the shader subtracts this base from the absolute
|
||||||
|
// fetch-constant address so it indexes the uploaded window. 0 means "no
|
||||||
|
// real vertex window" (procedural fallback path).
|
||||||
|
vertex_base_dwords: u32,
|
||||||
|
// iterate-3S: guest viewport → host NDC XY transform (Y pre-flipped).
|
||||||
|
ndc_scale: vec2<f32>,
|
||||||
|
ndc_offset: vec2<f32>,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct XenosConstants {
|
struct XenosConstants {
|
||||||
@@ -56,6 +64,7 @@ const CF_KIND_LOOP_END: u32 = 5u;
|
|||||||
const CF_KIND_COND_JMP: u32 = 6u;
|
const CF_KIND_COND_JMP: u32 = 6u;
|
||||||
const CF_KIND_COND_CALL: u32 = 7u;
|
const CF_KIND_COND_CALL: u32 = 7u;
|
||||||
const CF_KIND_RETURN: u32 = 8u;
|
const CF_KIND_RETURN: u32 = 8u;
|
||||||
|
const CF_KIND_NOP: u32 = 9u;
|
||||||
const CF_KIND_UNKNOWN: u32 = 15u;
|
const CF_KIND_UNKNOWN: u32 = 15u;
|
||||||
|
|
||||||
// ── Alloc-kind codes (mirrors `xenia_gpu::ucode::cf_alloc_kind`). ──────
|
// ── Alloc-kind codes (mirrors `xenia_gpu::ucode::cf_alloc_kind`). ──────
|
||||||
@@ -628,8 +637,8 @@ const VFMT_32_32_32_FLOAT: u32 = 57u;
|
|||||||
// layout in `ucode.h:690`):
|
// layout in `ucode.h:690`):
|
||||||
// w0 [4:0] opcode
|
// w0 [4:0] opcode
|
||||||
// w0 [10:5] src_reg[5:0]
|
// w0 [10:5] src_reg[5:0]
|
||||||
// w0 [17:11] dst_reg[6:0] + must-be-one
|
// w0 [17:12] dst_reg[5:0]
|
||||||
// w0 [21:17] const_index[4:0], [23:22] const_index_sel[1:0]
|
// w0 [24:20] const_index[4:0], [26:25] const_index_sel[1:0]
|
||||||
// w1 [21:16] format[5:0]
|
// w1 [21:16] format[5:0]
|
||||||
// w2 [7:0] stride (in dwords)
|
// w2 [7:0] stride (in dwords)
|
||||||
// w2 [30:8] offset (signed, in dwords)
|
// w2 [30:8] offset (signed, in dwords)
|
||||||
@@ -641,9 +650,9 @@ fn interpret_vertex_fetch(t: u32) {
|
|||||||
let w0 = vs_instr_dword(t, 0u);
|
let w0 = vs_instr_dword(t, 0u);
|
||||||
let w1 = vs_instr_dword(t, 1u);
|
let w1 = vs_instr_dword(t, 1u);
|
||||||
let w2 = vs_instr_dword(t, 2u);
|
let w2 = vs_instr_dword(t, 2u);
|
||||||
let fetch_const = (w0 >> 5u) & 0x1Fu;
|
let fetch_const = (w0 >> 20u) & 0x1Fu;
|
||||||
let dst_reg = (w0 >> 10u) & 0x7Fu;
|
let dst_reg = (w0 >> 12u) & 0x3Fu;
|
||||||
let src_reg = (w0 >> 17u) & 0x7Fu;
|
let src_reg = (w0 >> 5u) & 0x3Fu;
|
||||||
let format = (w1 >> 16u) & 0x3Fu;
|
let format = (w1 >> 16u) & 0x3Fu;
|
||||||
let stride = w2 & 0xFFu;
|
let stride = w2 & 0xFFu;
|
||||||
|
|
||||||
@@ -651,7 +660,20 @@ fn interpret_vertex_fetch(t: u32) {
|
|||||||
// dword 1 carries (endian[1:0], size[25:2]).
|
// dword 1 carries (endian[1:0], size[25:2]).
|
||||||
let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
|
let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
|
||||||
let fc1 = xenos_consts.fetch[fetch_const * 2u + 1u];
|
let fc1 = xenos_consts.fetch[fetch_const * 2u + 1u];
|
||||||
let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;
|
// iterate-3O: the fetch constant holds an *absolute* guest dword address.
|
||||||
|
// The CPU uploaded a window of guest memory starting at
|
||||||
|
// `draw_ctx.vertex_base_dwords`, so rebase the absolute address into that
|
||||||
|
// window. When no real window was published (`vertex_base_dwords == 0`)
|
||||||
|
// keep the absolute value (the `addr < n` guards below then skip the read
|
||||||
|
// and the procedural fallback position is used).
|
||||||
|
// GPUBUG-108 (iterate-3S): the captured window begins exactly at the fetch
|
||||||
|
// base, so index from 0 (vertex i at i*stride). The uniform `fetch[]` holds
|
||||||
|
// the last-published per-frame constant, not this draw's — recomputing
|
||||||
|
// `abs_base` from it produced a stale out-of-window address (the splash
|
||||||
|
// collapsed to one pixel). Only consult the uniform for the no-window
|
||||||
|
// synthetic fallback.
|
||||||
|
let abs_base = (fc0 & 0xFFFFFFFCu) >> 2u;
|
||||||
|
let base_dwords = select(abs_base, 0u, draw_ctx.vertex_base_dwords != 0u);
|
||||||
// GPUBUG-102: per-format endian byte-swap. Xbox 360 vertex data is
|
// GPUBUG-102: per-format endian byte-swap. Xbox 360 vertex data is
|
||||||
// big-endian; the host is little-endian. Pre-fix every dword was
|
// big-endian; the host is little-endian. Pre-fix every dword was
|
||||||
// bitcast as-is — vertex positions were byte-reversed garbage.
|
// bitcast as-is — vertex positions were byte-reversed garbage.
|
||||||
@@ -773,20 +795,20 @@ fn interpret_texture_fetch(t: u32, is_vertex: bool) {
|
|||||||
} else {
|
} else {
|
||||||
w0 = ps_instr_dword(t, 0u);
|
w0 = ps_instr_dword(t, 0u);
|
||||||
}
|
}
|
||||||
let dst_reg = (w0 >> 10u) & 0x7Fu;
|
let dst_reg = (w0 >> 12u) & 0x3Fu;
|
||||||
let src_reg = (w0 >> 17u) & 0x7Fu;
|
let src_reg = (w0 >> 5u) & 0x3Fu;
|
||||||
let uv = registers[src_reg & 0x7Fu].xy;
|
let uv = registers[src_reg & 0x3Fu].xy;
|
||||||
let sample = textureSampleLevel(xenos_tex, xenos_samp, uv, 0.0);
|
let sample = textureSampleLevel(xenos_tex, xenos_samp, uv, 0.0);
|
||||||
registers[dst_reg & 0x7Fu] = sample;
|
registers[dst_reg & 0x3Fu] = sample;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Walk an Exec clause's instruction triples.
|
// Walk an Exec clause's instruction triples.
|
||||||
// sequence: 2-bit-per-triple bitmap. Bit 0 of a pair = serialize flag
|
// sequence: 2-bit-per-instruction bitmap. Bit 0 of a pair = fetch(1)/ALU(0);
|
||||||
// (we ignore in MVP); bit 1 = is-fetch.
|
// bit 1 = serialize (ignored). (Xenos `ucode.h:226`.)
|
||||||
fn exec_vs(address: u32, count: u32, sequence: u32) {
|
fn exec_vs(address: u32, count: u32, sequence: u32) {
|
||||||
for (var i: u32 = 0u; i < count; i = i + 1u) {
|
for (var i: u32 = 0u; i < count; i = i + 1u) {
|
||||||
let t = address + i;
|
let t = address + i;
|
||||||
let is_fetch = ((sequence >> (i * 2u + 1u)) & 1u) != 0u;
|
let is_fetch = ((sequence >> (i * 2u)) & 1u) != 0u;
|
||||||
if is_fetch {
|
if is_fetch {
|
||||||
let opcode = vs_instr_dword(t, 0u) & 0x1Fu;
|
let opcode = vs_instr_dword(t, 0u) & 0x1Fu;
|
||||||
// 0x00 = vertex fetch, 0x01 = texture fetch.
|
// 0x00 = vertex fetch, 0x01 = texture fetch.
|
||||||
@@ -803,7 +825,7 @@ fn exec_vs(address: u32, count: u32, sequence: u32) {
|
|||||||
fn exec_ps(address: u32, count: u32, sequence: u32) {
|
fn exec_ps(address: u32, count: u32, sequence: u32) {
|
||||||
for (var i: u32 = 0u; i < count; i = i + 1u) {
|
for (var i: u32 = 0u; i < count; i = i + 1u) {
|
||||||
let t = address + i;
|
let t = address + i;
|
||||||
let is_fetch = ((sequence >> (i * 2u + 1u)) & 1u) != 0u;
|
let is_fetch = ((sequence >> (i * 2u)) & 1u) != 0u;
|
||||||
if is_fetch {
|
if is_fetch {
|
||||||
interpret_texture_fetch(t, false);
|
interpret_texture_fetch(t, false);
|
||||||
} else {
|
} else {
|
||||||
@@ -871,7 +893,13 @@ fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {
|
|||||||
// Use registers[OPOS_REG] as position; the procedural fallback above
|
// Use registers[OPOS_REG] as position; the procedural fallback above
|
||||||
// seeded it so an un-interpreted shader still draws a recognisable
|
// seeded it so an un-interpreted shader still draws a recognisable
|
||||||
// circle.
|
// circle.
|
||||||
out.position = vec4<f32>(registers[OPOS_REG].xyz, registers[OPOS_REG].w);
|
var opos = vec4<f32>(registers[OPOS_REG].xyz, registers[OPOS_REG].w);
|
||||||
|
// iterate-3S: guest VS position → host clip space (see translator.rs). When
|
||||||
|
// the transform is unset (procedural fallback) pass through unchanged.
|
||||||
|
if (draw_ctx.ndc_scale.x != 0.0 || draw_ctx.ndc_scale.y != 0.0) {
|
||||||
|
opos = vec4<f32>(opos.xy * draw_ctx.ndc_scale + draw_ctx.ndc_offset * opos.w, opos.z, opos.w);
|
||||||
|
}
|
||||||
|
out.position = opos;
|
||||||
out.color = vec4<f32>(registers[OCOLOR_REG].rgb + vec3<f32>(vb_live), registers[OCOLOR_REG].a);
|
out.color = vec4<f32>(registers[OCOLOR_REG].rgb + vec3<f32>(vb_live), registers[OCOLOR_REG].a);
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
@@ -962,6 +990,9 @@ fn walk_cf_vs() {
|
|||||||
// No call stack — mark and continue.
|
// No call stack — mark and continue.
|
||||||
reject_mask |= REJECT_CF_CALL;
|
reject_mask |= REJECT_CF_CALL;
|
||||||
}
|
}
|
||||||
|
case CF_KIND_NOP: {
|
||||||
|
// kNop padding / kMarkVsFetchDone hint — no-op, just advance.
|
||||||
|
}
|
||||||
default: { reject_mask |= REJECT_CF_JUMP; }
|
default: { reject_mask |= REJECT_CF_JUMP; }
|
||||||
}
|
}
|
||||||
if stop { break; }
|
if stop { break; }
|
||||||
|
|||||||
@@ -94,7 +94,9 @@ struct XenosDrawConstants {
|
|||||||
draw_index: u32,
|
draw_index: u32,
|
||||||
vertex_count: u32,
|
vertex_count: u32,
|
||||||
prim_kind: u32,
|
prim_kind: u32,
|
||||||
_pad: u32,
|
vertex_base_dwords: u32,
|
||||||
|
ndc_scale: vec2<f32>,
|
||||||
|
ndc_offset: vec2<f32>,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct XenosConstants {
|
struct XenosConstants {
|
||||||
@@ -113,9 +115,21 @@ struct XenosConstants {
|
|||||||
@group(1) @binding(0) var xenos_tex : texture_2d<f32>;
|
@group(1) @binding(0) var xenos_tex : texture_2d<f32>;
|
||||||
@group(1) @binding(1) var xenos_samp : sampler;
|
@group(1) @binding(1) var xenos_samp : sampler;
|
||||||
|
|
||||||
|
// iterate-3T: real interpolator passthrough. The Xenos VS exports up to 16
|
||||||
|
// interpolators (export index 0..15); the PS reads interpolator i from its
|
||||||
|
// general register r[i]. We carry 8 interpolator vec4s (covers Sylpheed's
|
||||||
|
// splash: r0=color, r1=texcoord). `color` retained as an alias of interp0 so
|
||||||
|
// older single-color paths keep working.
|
||||||
struct VsOut {
|
struct VsOut {
|
||||||
@builtin(position) position: vec4<f32>,
|
@builtin(position) position: vec4<f32>,
|
||||||
@location(0) color: vec4<f32>,
|
@location(0) interp0: vec4<f32>,
|
||||||
|
@location(1) interp1: vec4<f32>,
|
||||||
|
@location(2) interp2: vec4<f32>,
|
||||||
|
@location(3) interp3: vec4<f32>,
|
||||||
|
@location(4) interp4: vec4<f32>,
|
||||||
|
@location(5) interp5: vec4<f32>,
|
||||||
|
@location(6) interp6: vec4<f32>,
|
||||||
|
@location(7) interp7: vec4<f32>,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct FsOut {
|
struct FsOut {
|
||||||
@@ -154,6 +168,14 @@ struct EmitCtx {
|
|||||||
stage: Stage,
|
stage: Stage,
|
||||||
out: String,
|
out: String,
|
||||||
indent: usize,
|
indent: usize,
|
||||||
|
/// GPUBUG-114: dword stride of the most recent *full* vfetch, keyed by
|
||||||
|
/// fetch-const register offset. A vfetch_mini carries stride=0 and reuses
|
||||||
|
/// the address + stride of the preceding full vfetch of the same stream
|
||||||
|
/// (canary ucode.h:733). Without this a mini color attribute indexes by its
|
||||||
|
/// tight dword count instead of the real vertex stride → reads the wrong
|
||||||
|
/// vertex's data (Sylpheed's background fill `0x36660986` read garbage →
|
||||||
|
/// white instead of the intended color).
|
||||||
|
last_full_stride: std::collections::HashMap<u32, u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl EmitCtx {
|
impl EmitCtx {
|
||||||
@@ -162,6 +184,7 @@ impl EmitCtx {
|
|||||||
stage,
|
stage,
|
||||||
out: String::with_capacity(2048),
|
out: String::with_capacity(2048),
|
||||||
indent: 0,
|
indent: 0,
|
||||||
|
last_full_stride: std::collections::HashMap::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -198,19 +221,74 @@ impl EmitCtx {
|
|||||||
self.push("var ps: f32 = 0.0;");
|
self.push("var ps: f32 = 0.0;");
|
||||||
match self.stage {
|
match self.stage {
|
||||||
Stage::Vertex => {
|
Stage::Vertex => {
|
||||||
|
// iterate-3T: host→guest vertex-index remap for primitives the
|
||||||
|
// replay draws non-indexed as a flat triangle list. wgpu has no
|
||||||
|
// QuadList/RectangleList topology, so the host issues 6 vertices
|
||||||
|
// per quad/rect and we map them back to the guest's 4/3 source
|
||||||
|
// vertices here (mirrors `primitive.rs` index rewrite, but in the
|
||||||
|
// VS since the replay path is non-indexed):
|
||||||
|
// QuadList(13): 6 host verts → guest [0,1,2, 0,2,3]
|
||||||
|
// RectangleList(8): drawn as one triangle [0,1,2] (the 4th
|
||||||
|
// corner needs cross-vertex synthesis — TODO), so host
|
||||||
|
// indices >=3 fold onto the existing triangle.
|
||||||
|
// Other prims pass through unchanged.
|
||||||
|
self.push("var gvidx: u32 = vidx;");
|
||||||
|
self.push("if (draw_ctx.prim_kind == 13u) {");
|
||||||
|
self.indent += 1;
|
||||||
|
self.push("let q = vidx % 6u; let qbase = (vidx / 6u) * 4u;");
|
||||||
|
self.push("var lut = array<u32, 6>(0u, 1u, 2u, 0u, 2u, 3u);");
|
||||||
|
self.push("gvidx = qbase + lut[q];");
|
||||||
|
self.indent -= 1;
|
||||||
|
self.push("} else if (draw_ctx.prim_kind == 8u) {");
|
||||||
|
self.indent += 1;
|
||||||
|
self.push("let t = vidx % 3u; let rbase = (vidx / 3u) * 3u;");
|
||||||
|
self.push("gvidx = rbase + t;");
|
||||||
|
self.indent -= 1;
|
||||||
|
self.push("}");
|
||||||
// Seed r0 with vertex index for simple shaders that read it.
|
// Seed r0 with vertex index for simple shaders that read it.
|
||||||
self.push("r[0] = vec4<f32>(f32(vidx), 0.0, 0.0, 1.0);");
|
self.push("r[0] = vec4<f32>(f32(gvidx), 0.0, 0.0, 1.0);");
|
||||||
// Synthetic export slots — match the interpreter's layout so
|
// iterate-3T: real export model. Xenos export index 62 = oPos;
|
||||||
// the fallback path and translator path produce the same
|
// indices 0..15 = interpolators. We hold position + 8
|
||||||
// visual output on shaders both support.
|
// interpolator vec4s; `emit_export` writes the right slot keyed
|
||||||
|
// on the export index.
|
||||||
|
//
|
||||||
|
// iterate-3AE (WHITE-TRIANGLE ROOT): interpolators a VS does NOT
|
||||||
|
// export must default to ZERO, not white. The old `ointerp[0] =
|
||||||
|
// (1,1,1,1)` was an iterate-3T debug convenience ("so a VS that
|
||||||
|
// only exports position still yields a visible non-zero color")
|
||||||
|
// — but it is a FAKE: it injects white that no guest value backs.
|
||||||
|
// The transition/background draws use the position-only VS
|
||||||
|
// `0xd4c14f46` (one vfetch → oPos; it exports NO color) paired
|
||||||
|
// with PS `0xed732b5a` (`ocolor0 = interp0`). With the white
|
||||||
|
// seed, interp0 stayed (1,1,1,1) → the fullscreen fill rendered
|
||||||
|
// OPAQUE WHITE (the diagonal half-triangle artifact that flashed
|
||||||
|
// before each splash logo and persisted across the dev-logo
|
||||||
|
// transition). Canary shows a black background there because the
|
||||||
|
// un-exported interpolator carries no white. Default to
|
||||||
|
// (0,0,0,0): a position-only VS now contributes nothing visible
|
||||||
|
// under its real (opaque or premultiplied) blend, matching
|
||||||
|
// canary, while every VS that really exports interp0 (the logo
|
||||||
|
// `0x03b7b020`, the `0x36660986` color fill) overwrites this seed
|
||||||
|
// and is unaffected. RGB=0 → black fill; A=0 → premultiplied
|
||||||
|
// overlays stay transparent.
|
||||||
self.push("var opos: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);");
|
self.push("var opos: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);");
|
||||||
self.push("var ocolor: vec4<f32> = vec4<f32>(1.0, 1.0, 1.0, 1.0);");
|
self.push("var ointerp: array<vec4<f32>, 8>;");
|
||||||
|
self.push("for (var i = 0u; i < 8u; i = i + 1u) { ointerp[i] = vec4<f32>(0.0, 0.0, 0.0, 0.0); }");
|
||||||
}
|
}
|
||||||
Stage::Pixel => {
|
Stage::Pixel => {
|
||||||
// Seed r0.xy with interpolated color lane so trivial shaders
|
// iterate-3T: the PS reads interpolator i from general register
|
||||||
// that read r0 still produce something.
|
// r[i] (Xenos PS input GPR mapping). Seed r0..r7 from the VS's
|
||||||
self.push("r[0] = in.color;");
|
// interpolators so e.g. the logo PS's texcoord (r1) and color
|
||||||
self.push("var ocolor0: vec4<f32> = in.color;");
|
// (r0) arrive correctly; tfetch then samples at the real UV.
|
||||||
|
self.push("r[0] = in.interp0;");
|
||||||
|
self.push("r[1] = in.interp1;");
|
||||||
|
self.push("r[2] = in.interp2;");
|
||||||
|
self.push("r[3] = in.interp3;");
|
||||||
|
self.push("r[4] = in.interp4;");
|
||||||
|
self.push("r[5] = in.interp5;");
|
||||||
|
self.push("r[6] = in.interp6;");
|
||||||
|
self.push("r[7] = in.interp7;");
|
||||||
|
self.push("var ocolor0: vec4<f32> = in.interp0;");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -237,6 +315,10 @@ impl EmitCtx {
|
|||||||
current_alloc = *kind;
|
current_alloc = *kind;
|
||||||
}
|
}
|
||||||
ControlFlowInstruction::Exit => break,
|
ControlFlowInstruction::Exit => break,
|
||||||
|
// Non-executing CF clauses: padding (`kNop`) and the
|
||||||
|
// vertex-fetch-done hint (`kMarkVsFetchDone`). Skip them.
|
||||||
|
ControlFlowInstruction::Nop
|
||||||
|
| ControlFlowInstruction::MarkVsFetchDone => {}
|
||||||
ControlFlowInstruction::LoopStart { .. }
|
ControlFlowInstruction::LoopStart { .. }
|
||||||
| ControlFlowInstruction::LoopEnd { .. } => return Err(reject::CF_LOOP),
|
| ControlFlowInstruction::LoopEnd { .. } => return Err(reject::CF_LOOP),
|
||||||
ControlFlowInstruction::CondJmp { .. } => return Err(reject::CF_COND),
|
ControlFlowInstruction::CondJmp { .. } => return Err(reject::CF_COND),
|
||||||
@@ -250,13 +332,41 @@ impl EmitCtx {
|
|||||||
match self.stage {
|
match self.stage {
|
||||||
Stage::Vertex => {
|
Stage::Vertex => {
|
||||||
self.push("var out: VsOut;");
|
self.push("var out: VsOut;");
|
||||||
|
// iterate-3S: guest VS position → host clip space. The guest
|
||||||
|
// emits either clip-space or (screen-space, clip disabled)
|
||||||
|
// render-target-pixel coords; `ndc_scale`/`ndc_offset` (from
|
||||||
|
// canary's GetHostViewportInfo, computed CPU-side per draw)
|
||||||
|
// rescale XY into wgpu clip space with Y already flipped. When
|
||||||
|
// the transform is unset (all-zero scale, procedural fallback)
|
||||||
|
// pass the position through unchanged.
|
||||||
|
self.push("if (draw_ctx.ndc_scale.x != 0.0 || draw_ctx.ndc_scale.y != 0.0) {");
|
||||||
|
self.indent += 1;
|
||||||
|
self.push("opos = vec4<f32>(opos.xy * draw_ctx.ndc_scale + draw_ctx.ndc_offset * opos.w, opos.z, opos.w);");
|
||||||
|
self.indent -= 1;
|
||||||
|
self.push("}");
|
||||||
self.push("out.position = opos;");
|
self.push("out.position = opos;");
|
||||||
self.push("out.color = ocolor;");
|
self.push("out.interp0 = ointerp[0];");
|
||||||
|
self.push("out.interp1 = ointerp[1];");
|
||||||
|
self.push("out.interp2 = ointerp[2];");
|
||||||
|
self.push("out.interp3 = ointerp[3];");
|
||||||
|
self.push("out.interp4 = ointerp[4];");
|
||||||
|
self.push("out.interp5 = ointerp[5];");
|
||||||
|
self.push("out.interp6 = ointerp[6];");
|
||||||
|
self.push("out.interp7 = ointerp[7];");
|
||||||
self.push("return out;");
|
self.push("return out;");
|
||||||
}
|
}
|
||||||
Stage::Pixel => {
|
Stage::Pixel => {
|
||||||
self.push("var out: FsOut;");
|
self.push("var out: FsOut;");
|
||||||
self.push("out.color0 = ocolor0;");
|
// GPUBUG-115: saturate the color export to [0,1], flushing NaN
|
||||||
|
// to 0 — exactly what canary does before writing a UNORM render
|
||||||
|
// target (spirv_shader_translator.cc:3607 "Saturate, flushing
|
||||||
|
// NaN to 0"). The Xenos RB clamps PS output for UNORM targets;
|
||||||
|
// without this an out-of-range guest color (Sylpheed's
|
||||||
|
// background fill exports a huge negative float `-32896.5` as a
|
||||||
|
// fullscreen-clear value) writes garbage/NaN to the sRGB target
|
||||||
|
// → renders white instead of the clamped black canary shows.
|
||||||
|
// `clamp(x,0,1)` returns 0 for NaN under WGSL's clamp semantics.
|
||||||
|
self.push("out.color0 = clamp(ocolor0, vec4<f32>(0.0), vec4<f32>(1.0));");
|
||||||
self.push("return out;");
|
self.push("return out;");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -284,7 +394,9 @@ impl EmitCtx {
|
|||||||
parsed.instructions[base + 1],
|
parsed.instructions[base + 1],
|
||||||
parsed.instructions[base + 2],
|
parsed.instructions[base + 2],
|
||||||
];
|
];
|
||||||
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
|
// sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0),
|
||||||
|
// bit[1]=serialize (Xenos `ucode.h:226`).
|
||||||
|
let is_fetch = ((sequence >> (i * 2)) & 1) != 0;
|
||||||
if is_fetch {
|
if is_fetch {
|
||||||
match decode_fetch(words) {
|
match decode_fetch(words) {
|
||||||
FetchInstruction::Vertex(vf) => self.emit_vfetch(&vf)?,
|
FetchInstruction::Vertex(vf) => self.emit_vfetch(&vf)?,
|
||||||
@@ -378,53 +490,185 @@ impl EmitCtx {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn emit_export(&mut self, dst_reg: u8, alloc: AllocKind, expr: &str, mask: u8) {
|
fn emit_export(&mut self, dst_reg: u8, alloc: AllocKind, expr: &str, mask: u8) {
|
||||||
// Xenos's export "register" indexing within an alloc range is
|
// iterate-3T: real Xenos export-index model (replaces the `AllocKind`
|
||||||
// normally (alloc_base + offset). Since our CF stream doesn't
|
// heuristic, which collapsed every VS export to a single color slot and
|
||||||
// carry per-export slot offsets cleanly, use `alloc` to pick the
|
// dropped the texcoord interpolator → tfetch sampled (0,0) → flat).
|
||||||
// target.
|
// When `export_data` is set the 6-bit vector_dest IS the export index:
|
||||||
let lhs = match (self.stage, alloc) {
|
// VS: 62 = oPos, 63 = oPointSize/edge (ignored), 0..15 = interpolators.
|
||||||
(Stage::Vertex, AllocKind::Position) => "opos",
|
// PS: 0..3 = color render targets (we honor RT0).
|
||||||
(Stage::Vertex, AllocKind::Interpolators) => "ocolor",
|
let _ = alloc;
|
||||||
(Stage::Vertex, AllocKind::Colors) => "ocolor",
|
match self.stage {
|
||||||
(Stage::Vertex, _) => "ocolor", // fall through — any other alloc
|
Stage::Vertex => {
|
||||||
(Stage::Pixel, AllocKind::Colors) => "ocolor0",
|
let lhs = if dst_reg == 62 {
|
||||||
(Stage::Pixel, _) => "ocolor0",
|
"opos".to_string()
|
||||||
|
} else if dst_reg <= 15 {
|
||||||
|
// Clamp to the 8 interpolator slots we carry; higher slots
|
||||||
|
// are unused by Sylpheed's splash.
|
||||||
|
let i = (dst_reg as usize).min(7);
|
||||||
|
format!("ointerp[{i}u]")
|
||||||
|
} else {
|
||||||
|
// oPointSize (63) / unknown export slot — discard.
|
||||||
|
return;
|
||||||
};
|
};
|
||||||
let _ = dst_reg; // per-slot export indexing reserved for a richer v2
|
self.emit_masked_write(&lhs, expr, mask);
|
||||||
self.emit_masked_write(lhs, expr, mask);
|
}
|
||||||
|
Stage::Pixel => {
|
||||||
|
// Only RT0 (export index 0) is wired to the single host target.
|
||||||
|
if dst_reg == 0 {
|
||||||
|
self.emit_masked_write("ocolor0", expr, mask);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn emit_vfetch(&mut self, vf: &crate::ucode::fetch::VertexFetch) -> Result<(), &'static str> {
|
fn emit_vfetch(&mut self, vf: &crate::ucode::fetch::VertexFetch) -> Result<(), &'static str> {
|
||||||
// v1: treat all vertex fetches as R32G32B32A32_FLOAT, stride = 4
|
// GPUBUG-107 (iterate-3S): decode the vertex FORMAT + dword STRIDE from
|
||||||
// dwords. Matches the interpreter's MVP semantics; unlocks more
|
// the vfetch instruction instead of hardcoding R32G32B32A32 (4 floats,
|
||||||
// formats alongside the CPU texture cache's format expansion.
|
// stride 4). Sylpheed's splash quads are `k_32_32_FLOAT` (2 floats,
|
||||||
|
// stride 2); over-reading them put the next vertex's X into .w → a
|
||||||
|
// negative W → the whole rectangle clipped behind the camera. We cover
|
||||||
|
// the float vertex formats (the UI / screen-space draws); other formats
|
||||||
|
// reject to the interpreter.
|
||||||
//
|
//
|
||||||
// GPUBUG-102: the fetch constant (xe_gpu_vertex_fetch_t,
|
// GPUBUG-102: the fetch constant holds the endian field in dword_1's
|
||||||
// xenos.h:1158-1172) holds the endian field in dword_1's low
|
// low 2 bits; Xbox 360 vertex data is big-endian, so `gpu_swap` undoes
|
||||||
// 2 bits. Vertex data on Xbox 360 is big-endian; the host is
|
// it per component.
|
||||||
// little-endian. Pre-fix, every dword was bitcast as-is →
|
// (comps, dwords_read) per format. Float formats are 1 dword/component;
|
||||||
// vertex positions were byte-reversed garbage and any draw
|
// iterate-3T adds the packed-16 `k_16_16` (format 6) used for the logo
|
||||||
// that did reach the host produced clipped / NaN positions.
|
// UV interpolator — 2 components packed into ONE dword.
|
||||||
let fetch_const = (vf.raw[0] >> 5) & 0x1F;
|
#[derive(PartialEq)]
|
||||||
|
enum Pack {
|
||||||
|
Float, // N f32 lanes, N dwords
|
||||||
|
Norm16x2, // 2× u16 normalized into [0,1], 1 dword (k_16_16)
|
||||||
|
Norm8x4, // 4× u8 normalized into [0,1], 1 dword (k_8_8_8_8)
|
||||||
|
}
|
||||||
|
let (comps, dwords_read, pack): (u32, u32, Pack) = match vf.format {
|
||||||
|
36 => (1, 1, Pack::Float), // k_32_FLOAT
|
||||||
|
37 => (2, 2, Pack::Float), // k_32_32_FLOAT
|
||||||
|
57 => (3, 3, Pack::Float), // k_32_32_32_FLOAT
|
||||||
|
38 => (4, 4, Pack::Float), // k_32_32_32_32_FLOAT
|
||||||
|
6 => (4, 1, Pack::Norm8x4), // k_8_8_8_8 (packed RGBA8 — GPUBUG-112)
|
||||||
|
25 => (2, 1, Pack::Norm16x2), // k_16_16
|
||||||
|
_ => return Err(reject::VFETCH_FMT),
|
||||||
|
};
|
||||||
|
// iterate-3X (GPUBUG-110): index the fetch-constant region by the full
|
||||||
|
// `const_index*3 + const_index_sel` mapping (canary `ucode.h:700`),
|
||||||
|
// packed as `const_index*6 + sel*2` dwords. The previous expression
|
||||||
|
// `(vf.raw[0] >> 5) & 0x1F` read the *src_reg* bits, not the const
|
||||||
|
// index — wrong for the endian term and the no-window fallback base.
|
||||||
|
let const_off = vf.const_reg_offset();
|
||||||
|
// GPUBUG-114: a full vfetch carries the real vertex dword stride; a
|
||||||
|
// vfetch_mini reuses the address + stride of the preceding full vfetch
|
||||||
|
// of the same stream (canary ucode.h:733). Track the last full stride
|
||||||
|
// per fetch-const and inherit it for mini-fetches (stride field == 0).
|
||||||
|
let stride = if vf.is_mini_fetch || vf.stride == 0 {
|
||||||
|
*self
|
||||||
|
.last_full_stride
|
||||||
|
.get(&const_off)
|
||||||
|
.unwrap_or(&dwords_read)
|
||||||
|
} else {
|
||||||
|
self.last_full_stride.insert(const_off, vf.stride as u32);
|
||||||
|
vf.stride as u32
|
||||||
|
};
|
||||||
|
// iterate-3T: per-attribute dword offset within the vertex (vfetches
|
||||||
|
// sharing one fetch constant read different attributes).
|
||||||
|
let attr_off = vf.offset;
|
||||||
let src_reg = vf.src_register & 0x7F;
|
let src_reg = vf.src_register & 0x7F;
|
||||||
let dst_reg = vf.dest_register & 0x7F;
|
let dst_reg = vf.dest_register & 0x7F;
|
||||||
|
// is_signed selects [-1,1] vs [0,1] for normalized integer formats.
|
||||||
|
let signed = vf.is_signed;
|
||||||
|
// Build the per-component reads; unread lanes default to 0/0/0/1 so an
|
||||||
|
// XY-only position keeps W=1 (and Z=0).
|
||||||
|
let lane = |i: u32| -> String {
|
||||||
|
match pack {
|
||||||
|
Pack::Float => {
|
||||||
|
if i < comps {
|
||||||
|
format!("bitcast<f32>(gpu_swap(vertex_buffer[addr + {i}u], endian))")
|
||||||
|
} else if i == 3 {
|
||||||
|
"1.0".to_string()
|
||||||
|
} else {
|
||||||
|
"0.0".to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Pack::Norm16x2 => {
|
||||||
|
// One dword holds [u16 lo | u16 hi] after the endian swap.
|
||||||
|
// Component 0 = low halfword, component 1 = high halfword.
|
||||||
|
if i == 0 {
|
||||||
|
if signed {
|
||||||
|
"(max(f32(i32(w16 << 16u) >> 16u) / 32767.0, -1.0))".to_string()
|
||||||
|
} else {
|
||||||
|
"(f32(w16 & 0xFFFFu) / 65535.0)".to_string()
|
||||||
|
}
|
||||||
|
} else if i == 1 {
|
||||||
|
if signed {
|
||||||
|
"(max(f32(i32(w16) >> 16u) / 32767.0, -1.0))".to_string()
|
||||||
|
} else {
|
||||||
|
"(f32(w16 >> 16u) / 65535.0)".to_string()
|
||||||
|
}
|
||||||
|
} else if i == 3 {
|
||||||
|
"1.0".to_string()
|
||||||
|
} else {
|
||||||
|
"0.0".to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Pack::Norm8x4 => {
|
||||||
|
// One dword holds 4× u8 (canary spirv_shader_translator_fetch
|
||||||
|
// k_8_8_8_8: comp0@bit0, comp1@bit8, comp2@bit16, comp3@bit24)
|
||||||
|
// after the endian swap. All four channels present → normalize
|
||||||
|
// to [0,1]. GPUBUG-112: this is the logo/background vertex
|
||||||
|
// COLOR (RGBA8), previously misdecoded as k_16_16 (2 chans,
|
||||||
|
// B forced 0) → white texture × (R,G,0) = yellow.
|
||||||
|
let sh = i * 8;
|
||||||
|
if signed {
|
||||||
|
format!(
|
||||||
|
"(max(f32(i32(w16 << {l}u) >> 24u) / 127.0, -1.0))",
|
||||||
|
l = 24 - sh
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
format!("(f32((w16 >> {sh}u) & 0xFFu) / 255.0)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let read_bound = dwords_read - 1;
|
||||||
|
// GPUBUG-108 (iterate-3S): for the captured-geometry path the CPU
|
||||||
|
// uploads a vertex window that begins EXACTLY at the fetch base, so the
|
||||||
|
// base within `vertex_buffer` is 0 and vertex i sits at `i * stride`.
|
||||||
|
// The previous `abs_base - vertex_base_dwords` rebase recomputed the
|
||||||
|
// base from `xenos_consts.fetch[]`, but that uniform carries the
|
||||||
|
// *last-published* (per-frame) fetch constant, not this draw's — for
|
||||||
|
// the splash it was stale (0x8a000002 vs the real 0x0adf… base), so the
|
||||||
|
// rebase produced a huge out-of-window address, the bounds guard
|
||||||
|
// failed, and every vertex kept its seed (vertex_index, 0, 0, 1) →
|
||||||
|
// every quad collapsed to ~one pixel at the origin. Index from 0 when a
|
||||||
|
// real window is present (`vertex_base_dwords != 0`); only the
|
||||||
|
// synthetic/no-window fallback consults the uniform fetch constant.
|
||||||
|
let endian_term = format!("xenos_consts.fetch[{}u] & 0x3u", const_off + 1);
|
||||||
|
// For packed formats (k_16_16, k_8_8_8_8) we read one dword into `w16`
|
||||||
|
// (post endian-swap) and the `lane()` exprs above unpack the channels.
|
||||||
|
let w16_decl = if pack == Pack::Norm16x2 || pack == Pack::Norm8x4 {
|
||||||
|
"let w16 = gpu_swap(vertex_buffer[addr], endian); "
|
||||||
|
} else {
|
||||||
|
""
|
||||||
|
};
|
||||||
self.push(&format!(
|
self.push(&format!(
|
||||||
"{{ let fc0 = xenos_consts.fetch[{fc0_idx}u]; \
|
"{{ let endian = {endian_term}; \
|
||||||
let fc1 = xenos_consts.fetch[{fc1_idx}u]; \
|
|
||||||
let endian = fc1 & 0x3u; \
|
|
||||||
let base = (fc0 & 0xFFFFFFFCu) >> 2u; \
|
|
||||||
let vidx = u32(r[{src_reg}u].x); \
|
let vidx = u32(r[{src_reg}u].x); \
|
||||||
let addr = base + vidx * 4u; \
|
var base = 0u; \
|
||||||
|
if (draw_ctx.vertex_base_dwords == 0u) {{ \
|
||||||
|
base = (xenos_consts.fetch[{fc0_idx}u] & 0xFFFFFFFCu) >> 2u; \
|
||||||
|
}} \
|
||||||
|
let addr = base + vidx * {stride}u + {attr_off}u; \
|
||||||
let n = arrayLength(&vertex_buffer); \
|
let n = arrayLength(&vertex_buffer); \
|
||||||
if (addr + 3u < n) {{ \
|
if (addr + {read_bound}u < n) {{ \
|
||||||
r[{dst_reg}u] = vec4<f32>( \
|
{w16_decl}\
|
||||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 0u], endian)), \
|
r[{dst_reg}u] = vec4<f32>({l0}, {l1}, {l2}, {l3}); \
|
||||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 1u], endian)), \
|
|
||||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 2u], endian)), \
|
|
||||||
bitcast<f32>(gpu_swap(vertex_buffer[addr + 3u], endian))); \
|
|
||||||
}} }}",
|
}} }}",
|
||||||
fc0_idx = fetch_const * 2,
|
fc0_idx = const_off,
|
||||||
fc1_idx = fetch_const * 2 + 1,
|
l0 = lane(0),
|
||||||
|
l1 = lane(1),
|
||||||
|
l2 = lane(2),
|
||||||
|
l3 = lane(3),
|
||||||
));
|
));
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -477,6 +721,22 @@ fn src_operand(src_byte: u8, is_temp: bool, swizzle: u8, negate: bool) -> String
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option<String> {
|
fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option<String> {
|
||||||
|
// Semantics mirror the runtime interpreter's `exec_vector_op`
|
||||||
|
// (`shaders/xenos_interp.wgsl`), which in turn mirrors canary's
|
||||||
|
// `AluVectorOpcode` (ucode.h:1001+). Side-effecting ops (kill*, setp_push)
|
||||||
|
// need per-invocation state the AOT emitter doesn't track yet → still
|
||||||
|
// `None` (interpreter fallback).
|
||||||
|
let cmp4 = |op: &str| {
|
||||||
|
format!(
|
||||||
|
"vec4<f32>(select(0.0,1.0,{a}.x{op}{b}.x), select(0.0,1.0,{a}.y{op}{b}.y), select(0.0,1.0,{a}.z{op}{b}.z), select(0.0,1.0,{a}.w{op}{b}.w))"
|
||||||
|
)
|
||||||
|
};
|
||||||
|
// CND* : per-lane select(c, b, a <cmp> 0).
|
||||||
|
let cnd4 = |op: &str| {
|
||||||
|
format!(
|
||||||
|
"vec4<f32>(select({c}.x,{b}.x,{a}.x{op}0.0), select({c}.y,{b}.y,{a}.y{op}0.0), select({c}.z,{b}.z,{a}.z{op}0.0), select({c}.w,{b}.w,{a}.w{op}0.0))"
|
||||||
|
)
|
||||||
|
};
|
||||||
let s = match op {
|
let s = match op {
|
||||||
vop::ADD => format!("({a} + {b})"),
|
vop::ADD => format!("({a} + {b})"),
|
||||||
vop::MUL => format!("({a} * {b})"),
|
vop::MUL => format!("({a} * {b})"),
|
||||||
@@ -485,37 +745,63 @@ fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option<String> {
|
|||||||
vop::MAD => format!("({a} * {b} + {c})"),
|
vop::MAD => format!("({a} * {b} + {c})"),
|
||||||
vop::DOT4 => format!("vec4<f32>(dot({a}, {b}))"),
|
vop::DOT4 => format!("vec4<f32>(dot({a}, {b}))"),
|
||||||
vop::DOT3 => format!("vec4<f32>(dot({a}.xyz, {b}.xyz))"),
|
vop::DOT3 => format!("vec4<f32>(dot({a}.xyz, {b}.xyz))"),
|
||||||
vop::DOT2_ADD => format!(
|
vop::DOT2_ADD => format!("vec4<f32>({a}.x * {b}.x + {a}.y * {b}.y + {c}.x)"),
|
||||||
"vec4<f32>({a}.x * {b}.x + {a}.y * {b}.y + {c}.x)"
|
vop::SEQ => cmp4("=="),
|
||||||
),
|
vop::SGT => cmp4(">"),
|
||||||
vop::SEQ => format!(
|
vop::SGE => cmp4(">="),
|
||||||
"vec4<f32>(select(0.0,1.0,{a}.x=={b}.x), select(0.0,1.0,{a}.y=={b}.y), select(0.0,1.0,{a}.z=={b}.z), select(0.0,1.0,{a}.w=={b}.w))"
|
vop::SNE => cmp4("!="),
|
||||||
),
|
vop::CND_EQ => cnd4("=="),
|
||||||
vop::SGT => format!(
|
vop::CND_GE => cnd4(">="),
|
||||||
"vec4<f32>(select(0.0,1.0,{a}.x>{b}.x), select(0.0,1.0,{a}.y>{b}.y), select(0.0,1.0,{a}.z>{b}.z), select(0.0,1.0,{a}.w>{b}.w))"
|
vop::CND_GT => cnd4(">"),
|
||||||
),
|
|
||||||
vop::SGE => format!(
|
|
||||||
"vec4<f32>(select(0.0,1.0,{a}.x>={b}.x), select(0.0,1.0,{a}.y>={b}.y), select(0.0,1.0,{a}.z>={b}.z), select(0.0,1.0,{a}.w>={b}.w))"
|
|
||||||
),
|
|
||||||
vop::SNE => format!(
|
|
||||||
"vec4<f32>(select(0.0,1.0,{a}.x!={b}.x), select(0.0,1.0,{a}.y!={b}.y), select(0.0,1.0,{a}.z!={b}.z), select(0.0,1.0,{a}.w!={b}.w))"
|
|
||||||
),
|
|
||||||
vop::FRC => format!("fract({a})"),
|
vop::FRC => format!("fract({a})"),
|
||||||
|
vop::TRUNC => format!("trunc({a})"),
|
||||||
vop::FLOOR => format!("floor({a})"),
|
vop::FLOOR => format!("floor({a})"),
|
||||||
|
vop::MAX4 => format!("vec4<f32>(max(max({a}.x,{a}.y), max({a}.z,{a}.w)))"),
|
||||||
|
// dst = (1, src0.y*src1.y, src0.z, src1.w) (canary kDst)
|
||||||
|
vop::DST => format!("vec4<f32>(1.0, {a}.y * {b}.y, {a}.z, {b}.w)"),
|
||||||
_ => return None,
|
_ => return None,
|
||||||
};
|
};
|
||||||
Some(s)
|
Some(s)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn scalar_expr(op: u8, a: &str, b: &str, prev: &str) -> Option<String> {
|
fn scalar_expr(op: u8, a: &str, b: &str, prev: &str) -> Option<String> {
|
||||||
|
// Semantics mirror the runtime interpreter's `exec_scalar_op`
|
||||||
|
// (`shaders/xenos_interp.wgsl`) / canary's `AluScalarOpcode`
|
||||||
|
// (ucode.h:1001+). Side-effecting ops (setp*, kills*, maxas*) need
|
||||||
|
// per-invocation predicate/kill/address state the AOT emitter doesn't
|
||||||
|
// track yet → still `None` (interpreter fallback).
|
||||||
let s = match op {
|
let s = match op {
|
||||||
sop::ADDS => format!("({a} + {b})"),
|
sop::ADDS => format!("({a} + {b})"),
|
||||||
sop::ADDS_PREV => format!("({a} + {prev})"),
|
sop::ADDS_PREV => format!("({a} + {prev})"),
|
||||||
sop::MULS => format!("({a} * {b})"),
|
sop::MULS => format!("({a} * {b})"),
|
||||||
sop::MULS_PREV => format!("({a} * {prev})"),
|
sop::MULS_PREV => format!("({a} * {prev})"),
|
||||||
|
// muls_prev2 / LIT emulation (canary kMulsPrev2): guard against
|
||||||
|
// -FLT_MAX / non-finite ps & b, and b <= 0.
|
||||||
|
sop::MULS_PREV2 => format!(
|
||||||
|
"select({a} * {prev}, -3.4028235e38, {prev} == -3.4028235e38 || !(\
|
||||||
|
{prev} == {prev}) || abs({prev}) > 3.4028235e38 || !({b} == {b}) || \
|
||||||
|
abs({b}) > 3.4028235e38 || {b} <= 0.0)"
|
||||||
|
),
|
||||||
sop::MAXS => format!("max({a}, {b})"),
|
sop::MAXS => format!("max({a}, {b})"),
|
||||||
sop::MINS => format!("min({a}, {b})"),
|
sop::MINS => format!("min({a}, {b})"),
|
||||||
sop::RCP => format!("xe_rcp({a})"),
|
sop::SEQS => format!("select(0.0, 1.0, {a} == 0.0)"),
|
||||||
|
sop::SGTS => format!("select(0.0, 1.0, {a} > 0.0)"),
|
||||||
|
sop::SGES => format!("select(0.0, 1.0, {a} >= 0.0)"),
|
||||||
|
sop::SNES => format!("select(0.0, 1.0, {a} != 0.0)"),
|
||||||
|
sop::FRCS => format!("fract({a})"),
|
||||||
|
sop::TRUNCS => format!("trunc({a})"),
|
||||||
|
sop::FLOORS => format!("floor({a})"),
|
||||||
|
sop::SUBS => format!("({a} - {b})"),
|
||||||
|
sop::SUBS_PREV => format!("({a} - {prev})"),
|
||||||
|
sop::EXP => format!("exp2({a})"),
|
||||||
|
sop::LOG | sop::LOGC => format!("select(log2({a}), 0.0, {a} == 1.0)"),
|
||||||
|
sop::RCP | sop::RCPC | sop::RCPF => format!("xe_rcp({a})"),
|
||||||
|
sop::RSQ | sop::RSQC | sop::RSQF => {
|
||||||
|
format!("select(0.0, inverseSqrt({a}), {a} > 0.0)")
|
||||||
|
}
|
||||||
|
sop::SQRT => format!("select(0.0, sqrt({a}), {a} >= 0.0)"),
|
||||||
|
sop::SIN => format!("sin({a})"),
|
||||||
|
sop::COS => format!("cos({a})"),
|
||||||
sop::RETAIN_PREV => prev.to_string(),
|
sop::RETAIN_PREV => prev.to_string(),
|
||||||
_ => return None,
|
_ => return None,
|
||||||
};
|
};
|
||||||
@@ -528,17 +814,68 @@ mod tests {
|
|||||||
use crate::ucode::alu::{sop, vop};
|
use crate::ucode::alu::{sop, vop};
|
||||||
use crate::ucode::control_flow::ControlFlowInstruction;
|
use crate::ucode::control_flow::ControlFlowInstruction;
|
||||||
|
|
||||||
|
/// iterate-3T: the real publisher-logo VS (`vs_key 0x03b7b020`, captured
|
||||||
|
/// from the live boot) must now TRANSLATE — pre-3T it rejected with
|
||||||
|
/// `vfetch_fmt` because (a) the `k_16_16` color stream (format 6) was
|
||||||
|
/// unsupported and (b) the export-index model (62=oPos, 0/1=interpolators)
|
||||||
|
/// was a wrong AllocKind heuristic. This locks in the format-6 + per-
|
||||||
|
/// attribute-offset + export-index work so the UV interpolator reaches the
|
||||||
|
/// pixel shader (texcoord in r1) instead of collapsing to a single color.
|
||||||
|
#[test]
|
||||||
|
fn real_logo_vs_translates_with_interpolators() {
|
||||||
|
let ucode: [u32; 30] = [
|
||||||
|
0x70153003, 0x00001200, 0xC2000000, 0x00001006, 0x00001200, 0xC4000000,
|
||||||
|
0x00002007, 0x00002200, 0x00000000, 0x2DF82000, 0x00393A88, 0x00000006,
|
||||||
|
0x05F81000, 0x4006060A, 0x00000306, 0x05F80000, 0x40253FC8, 0x00000406,
|
||||||
|
0xC80F803E, 0x00000000, 0xC2020200, 0xC8038001, 0x00B0B000, 0xC2000000,
|
||||||
|
0xC80F8000, 0x00000000, 0xC2010100, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
];
|
||||||
|
let p = crate::ucode::parse_shader(&ucode);
|
||||||
|
let body = match translate(&p, Stage::Vertex) {
|
||||||
|
Translation::Ok(b) => b,
|
||||||
|
Translation::Reject(r) => panic!("logo VS rejected: {r}"),
|
||||||
|
};
|
||||||
|
// Position must come from the export-index-62 path (`opos`) and the
|
||||||
|
// UV/color interpolators must be exported as distinct slots.
|
||||||
|
assert!(body.contains("opos ="), "no position export: {body}");
|
||||||
|
assert!(body.contains("ointerp[0u]"), "no interp0 export: {body}");
|
||||||
|
assert!(body.contains("ointerp[1u]"), "no interp1 export: {body}");
|
||||||
|
// The k_16_16 attribute must unpack via the packed-16 helper.
|
||||||
|
assert!(body.contains("w16"), "no packed-16 unpack for k_16_16: {body}");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The logo pixel shader (`ps_key 0x03b79001`) samples its texture at the
|
||||||
|
/// interpolated texcoord register r1 — which the PS now seeds from the VS
|
||||||
|
/// interpolator `in.interp1` (Xenos PS-input-GPR mapping). Verifies the UV
|
||||||
|
/// chain so tfetch samples the real UV instead of (0,0).
|
||||||
|
#[test]
|
||||||
|
fn ps_seeds_interpolators_into_registers() {
|
||||||
|
// A trivial PS that just exports — we only assert the preamble wiring.
|
||||||
|
let p = crate::ucode::ParsedShader {
|
||||||
|
cf: vec![ControlFlowInstruction::Exit],
|
||||||
|
instructions: vec![],
|
||||||
|
};
|
||||||
|
let body = match translate(&p, Stage::Pixel) {
|
||||||
|
Translation::Ok(b) => b,
|
||||||
|
Translation::Reject(r) => panic!("trivial PS rejected: {r}"),
|
||||||
|
};
|
||||||
|
assert!(body.contains("r[1] = in.interp1;"), "PS must seed r1 from interp1: {body}");
|
||||||
|
}
|
||||||
|
|
||||||
fn synthetic_trivial_shader() -> ParsedShader {
|
fn synthetic_trivial_shader() -> ParsedShader {
|
||||||
// Single Exec clause: ALU add r0 = r0 + r0; scalar_op = RETAIN_PREV
|
// Single Exec clause: ALU add r0 = r0 + r0; scalar_op = RETAIN_PREV
|
||||||
// with full write-mask on vector, zero on scalar. Alloc(Position)
|
// with full write-mask on vector, zero on scalar. Alloc(Position)
|
||||||
// precedes so the ALU's export (if it were one) would target oPos.
|
// precedes so the ALU's export (if it were one) would target oPos.
|
||||||
// Word-0 bits 29-31 set so all three operands resolve as temps —
|
// GPUBUG-106 canary layout: dest/mask/scalar_opc in w0; vector_opc +
|
||||||
// matches the prior assertion `r[0u] = (r[0u] + r[0u])`.
|
// src_sel in w2. All three operands temps → r0.
|
||||||
let w0 = (1u32 << 29) | (1u32 << 30) | (1u32 << 31);
|
let w0 = (0u32) // vector_dest = 0
|
||||||
let w2 = (vop::ADD as u32)
|
| (0xFu32 << 16) // vector_write_mask = 0xF
|
||||||
| ((sop::RETAIN_PREV as u32) << 6)
|
| ((sop::RETAIN_PREV as u32) << 26); // scalar_opc
|
||||||
| (0xF << 12) // vector_write_mask
|
let w1 = 0u32;
|
||||||
| (0u32 << 16); // vector_dest = 0
|
let w2 = ((vop::ADD as u32) << 24) // vector_opc
|
||||||
|
| (1u32 << 31) // src1_sel = temp
|
||||||
|
| (1u32 << 30) // src2_sel = temp
|
||||||
|
| (1u32 << 29); // src3_sel = temp
|
||||||
ParsedShader {
|
ParsedShader {
|
||||||
cf: vec![
|
cf: vec![
|
||||||
ControlFlowInstruction::Alloc {
|
ControlFlowInstruction::Alloc {
|
||||||
@@ -554,7 +891,7 @@ mod tests {
|
|||||||
predicate_condition: false,
|
predicate_condition: false,
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
instructions: vec![w0, 0, w2],
|
instructions: vec![w0, w1, w2],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -642,19 +979,17 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn shader_using_c0_emits_xenos_consts_read() {
|
fn shader_using_c0_emits_xenos_consts_read() {
|
||||||
// ALU: r0 = c0 + r0. src_a (low byte) is constant index 0;
|
// ALU: r0 = c0 + r0. GPUBUG-106 canary layout. src_a = src1 (w2
|
||||||
// src_b (next byte) is temp index 0. src_a_is_temp=false →
|
// 16:23), src_b = src2 (w2 8:15). src1_sel (w2 bit31) = 0 → c0;
|
||||||
// src1_sel-style bit at w0 bit 29 = 0; src_b_is_temp=true →
|
// src2_sel (w2 bit30) = 1 → r0.
|
||||||
// bit 30 = 1. (src_c left as 0/temp; unused.)
|
let w0 = (0u32) // vector_dest = 0
|
||||||
let w0 = 0x00u32 // src_a = c0
|
| (0xFu32 << 16) // vector_write_mask
|
||||||
| (0x00u32 << 8) // src_b = r0
|
| ((sop::RETAIN_PREV as u32) << 26); // scalar_opc
|
||||||
| (0x00u32 << 16) // src_c
|
let w2 = ((vop::ADD as u32) << 24) // vector_opc
|
||||||
| (0u32 << 29) // src_a_is_temp = false (constant)
|
| (0u32 << 16) // src1_reg = 0 → c0
|
||||||
| (1u32 << 30); // src_b_is_temp = true (register)
|
| (0u32 << 8) // src2_reg = 0 → r0
|
||||||
let w2 = (vop::ADD as u32)
|
| (0u32 << 31) // src1_sel = 0 (constant)
|
||||||
| ((sop::RETAIN_PREV as u32) << 6)
|
| (1u32 << 30); // src2_sel = 1 (temp)
|
||||||
| (0xF << 12)
|
|
||||||
| (0u32 << 16);
|
|
||||||
let shader = ParsedShader {
|
let shader = ParsedShader {
|
||||||
cf: vec![
|
cf: vec![
|
||||||
ControlFlowInstruction::Alloc {
|
ControlFlowInstruction::Alloc {
|
||||||
@@ -695,9 +1030,16 @@ mod tests {
|
|||||||
let mut ctx = EmitCtx::new(Stage::Vertex);
|
let mut ctx = EmitCtx::new(Stage::Vertex);
|
||||||
let vf = crate::ucode::fetch::VertexFetch {
|
let vf = crate::ucode::fetch::VertexFetch {
|
||||||
fetch_const: 0,
|
fetch_const: 0,
|
||||||
|
const_index_sel: 0,
|
||||||
src_register: 0,
|
src_register: 0,
|
||||||
dest_register: 0,
|
dest_register: 0,
|
||||||
dest_write_mask: 0xF,
|
dest_write_mask: 0xF,
|
||||||
|
format: 38, // k_32_32_32_32_FLOAT (4 floats)
|
||||||
|
stride: 4,
|
||||||
|
offset: 0,
|
||||||
|
is_signed: false,
|
||||||
|
is_normalized: true,
|
||||||
|
is_mini_fetch: false,
|
||||||
raw: [0; 3],
|
raw: [0; 3],
|
||||||
};
|
};
|
||||||
ctx.emit_vfetch(&vf).expect("emit_vfetch");
|
ctx.emit_vfetch(&vf).expect("emit_vfetch");
|
||||||
@@ -705,6 +1047,70 @@ mod tests {
|
|||||||
assert!(body.contains("gpu_swap("), "emitted vfetch body: {body}");
|
assert!(body.contains("gpu_swap("), "emitted vfetch body: {body}");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn vf(format: u8, stride: u8, offset: u32, mini: bool) -> crate::ucode::fetch::VertexFetch {
|
||||||
|
crate::ucode::fetch::VertexFetch {
|
||||||
|
fetch_const: 0,
|
||||||
|
const_index_sel: 0,
|
||||||
|
src_register: 0,
|
||||||
|
dest_register: 0,
|
||||||
|
dest_write_mask: 0xF,
|
||||||
|
format,
|
||||||
|
stride,
|
||||||
|
offset,
|
||||||
|
is_signed: false,
|
||||||
|
is_normalized: true,
|
||||||
|
is_mini_fetch: mini,
|
||||||
|
raw: [0; 3],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn vfetch_k8888_unpacks_four_channels() {
|
||||||
|
// GPUBUG-112: VertexFormat 6 = k_8_8_8_8 (4× u8 normalized, 1 dword),
|
||||||
|
// NOT k_16_16. All four channels (R,G,B,A) must be unpacked so a
|
||||||
|
// vertex COLOR keeps its blue channel (white texture × white color =
|
||||||
|
// white, not yellow).
|
||||||
|
let mut ctx = EmitCtx::new(Stage::Vertex);
|
||||||
|
ctx.emit_vfetch(&vf(6, 6, 3, false)).expect("emit");
|
||||||
|
let body = ctx.finish();
|
||||||
|
// Four /255.0 channel reads from one packed dword `w16`.
|
||||||
|
assert!(body.contains("let w16 ="), "needs packed dword: {body}");
|
||||||
|
assert_eq!(body.matches("/ 255.0").count(), 4, "four 8-bit channels: {body}");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn vfetch_mini_inherits_full_stride() {
|
||||||
|
// GPUBUG-114: a vfetch_mini (stride field 0) inherits the stride of the
|
||||||
|
// preceding full vfetch of the same stream (canary ucode.h:733). Emit a
|
||||||
|
// full fetch (stride 7) then a mini fetch and assert the mini indexes by
|
||||||
|
// stride 7, not its tight dword count.
|
||||||
|
let mut ctx = EmitCtx::new(Stage::Vertex);
|
||||||
|
ctx.emit_vfetch(&vf(57, 7, 0, false)).expect("full"); // k_32_32_32_FLOAT
|
||||||
|
ctx.emit_vfetch(&vf(38, 0, 3, true)).expect("mini"); // k_32_32_32_32_FLOAT, mini
|
||||||
|
let body = ctx.finish();
|
||||||
|
assert!(body.contains("vidx * 7u + 3u"), "mini must inherit stride 7: {body}");
|
||||||
|
assert!(!body.contains("vidx * 4u"), "mini must not use tight stride 4: {body}");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn ps_color_export_is_saturated() {
|
||||||
|
// GPUBUG-115: the PS color export must be clamped to [0,1] (canary
|
||||||
|
// saturates before UNORM RT write) so an out-of-range guest color
|
||||||
|
// doesn't write garbage/white to the sRGB target.
|
||||||
|
let p = crate::ucode::ParsedShader {
|
||||||
|
cf: vec![ControlFlowInstruction::Exit],
|
||||||
|
instructions: vec![],
|
||||||
|
};
|
||||||
|
let body = match translate(&p, Stage::Pixel) {
|
||||||
|
Translation::Ok(b) => b,
|
||||||
|
Translation::Reject(r) => panic!("PS rejected: {r}"),
|
||||||
|
};
|
||||||
|
assert!(
|
||||||
|
body.contains("clamp(ocolor0, vec4<f32>(0.0), vec4<f32>(1.0))"),
|
||||||
|
"PS must saturate color export: {body}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn loop_clause_rejected() {
|
fn loop_clause_rejected() {
|
||||||
let shader = ParsedShader {
|
let shader = ParsedShader {
|
||||||
@@ -722,9 +1128,10 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn unsupported_op_rejected() {
|
fn unsupported_op_rejected() {
|
||||||
let w2 = (29u32) // VOP_MAX_A, not in v1 subset
|
// GPUBUG-106 layout: vector_write_mask in w0 (16:19), vector_opc in
|
||||||
| ((sop::RETAIN_PREV as u32) << 6)
|
// w2 (24:28). MAX_A (29) is outside the supported subset → reject.
|
||||||
| (0xF << 12);
|
let w0 = (0xFu32 << 16) | ((sop::RETAIN_PREV as u32) << 26);
|
||||||
|
let w2 = (29u32) << 24; // VOP_MAX_A
|
||||||
let shader = ParsedShader {
|
let shader = ParsedShader {
|
||||||
cf: vec![ControlFlowInstruction::Exec {
|
cf: vec![ControlFlowInstruction::Exec {
|
||||||
address: 0,
|
address: 0,
|
||||||
@@ -734,7 +1141,7 @@ mod tests {
|
|||||||
predicated: false,
|
predicated: false,
|
||||||
predicate_condition: false,
|
predicate_condition: false,
|
||||||
}],
|
}],
|
||||||
instructions: vec![0, 0, w2],
|
instructions: vec![w0, 0, w2],
|
||||||
};
|
};
|
||||||
assert!(matches!(
|
assert!(matches!(
|
||||||
translate(&shader, Stage::Vertex),
|
translate(&shader, Stage::Vertex),
|
||||||
|
|||||||
@@ -71,33 +71,50 @@ pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
|
|||||||
let w0 = words[0];
|
let w0 = words[0];
|
||||||
let w1 = words[1];
|
let w1 = words[1];
|
||||||
let w2 = words[2];
|
let w2 = words[2];
|
||||||
|
// GPUBUG-106 (iterate-3S): correct the dword field map to match canary's
|
||||||
|
// `AluInstruction` union (ucode.h:2036-2086). Pre-fix this read the
|
||||||
|
// dest/mask/export/scalar-opcode out of `w2`, but they live in `w0`; the
|
||||||
|
// vector opcode + source registers live in `w2`, and swizzle/negate/pred
|
||||||
|
// in `w1`. The misread made every *export* ALU decode with
|
||||||
|
// `vector_write_mask=0` → no oPos/oColor export emitted → the translated VS
|
||||||
|
// collapsed every vertex to the clip origin (degenerate, nothing drawn).
|
||||||
|
//
|
||||||
|
// w0: vector_dest(0:5) vector_dest_rel(6) abs_constants(7)
|
||||||
|
// scalar_dest(8:13) scalar_dest_rel(14) export_data(15)
|
||||||
|
// vector_write_mask(16:19) scalar_write_mask(20:23)
|
||||||
|
// vector_clamp(24) scalar_clamp(25) scalar_opc(26:31)
|
||||||
|
// w1: src3_swiz(0:7) src2_swiz(8:15) src1_swiz(16:23)
|
||||||
|
// src3/2/1_reg_negate(24/25/26) pred_condition(27) is_predicated(28)
|
||||||
|
// w2: src3_reg(0:7) src2_reg(8:15) src1_reg(16:23)
|
||||||
|
// vector_opc(24:28) src3/2/1_sel(29/30/31)
|
||||||
|
//
|
||||||
|
// Our (a,b,c) operands map to canary's (src1,src2,src3).
|
||||||
AluInstruction {
|
AluInstruction {
|
||||||
vector_opcode: (w2 & 0x3F) as u8,
|
vector_opcode: ((w2 >> 24) & 0x1F) as u8,
|
||||||
scalar_opcode: ((w2 >> 6) & 0x3F) as u8,
|
scalar_opcode: ((w0 >> 26) & 0x3F) as u8,
|
||||||
vector_dest: ((w2 >> 16) & 0x7F) as u8,
|
vector_dest: (w0 & 0x3F) as u8,
|
||||||
scalar_dest: ((w2 >> 24) & 0x7F) as u8,
|
scalar_dest: ((w0 >> 8) & 0x3F) as u8,
|
||||||
vector_write_mask: ((w2 >> 12) & 0xF) as u8,
|
vector_write_mask: ((w0 >> 16) & 0xF) as u8,
|
||||||
scalar_write_mask: ((w2 >> 8) & 0xF) as u8,
|
scalar_write_mask: ((w0 >> 20) & 0xF) as u8,
|
||||||
vector_dest_is_export: ((w2 >> 23) & 1) != 0,
|
vector_dest_is_export: ((w0 >> 15) & 1) != 0,
|
||||||
scalar_src_is_ps: ((w0 >> 26) & 1) != 0,
|
// Not a real microcode bit — the scalar pipe selects `ps` implicitly
|
||||||
src_a: (w0 & 0xFF) as u8,
|
// via the *_PREV opcodes, which `scalar_expr` handles by opcode.
|
||||||
src_b: ((w0 >> 8) & 0xFF) as u8,
|
scalar_src_is_ps: false,
|
||||||
src_c: ((w0 >> 16) & 0xFF) as u8,
|
src_a: ((w2 >> 16) & 0xFF) as u8,
|
||||||
// Word-0 bits 29-31 are the per-operand temp-vs-constant
|
src_b: ((w2 >> 8) & 0xFF) as u8,
|
||||||
// selector (canary `src3_sel`/`src2_sel`/`src1_sel`,
|
src_c: (w2 & 0xFF) as u8,
|
||||||
// ucode.h:2078-2086). Our `src_a` is canary's third operand
|
// sel==1 → operand is a temp register; sel==0 → ALU constant.
|
||||||
// (low byte of w0), so its selector is bit 29.
|
src_a_is_temp: ((w2 >> 31) & 1) != 0,
|
||||||
src_a_is_temp: ((w0 >> 29) & 1) != 0,
|
src_b_is_temp: ((w2 >> 30) & 1) != 0,
|
||||||
src_b_is_temp: ((w0 >> 30) & 1) != 0,
|
src_c_is_temp: ((w2 >> 29) & 1) != 0,
|
||||||
src_c_is_temp: ((w0 >> 31) & 1) != 0,
|
src_a_swiz: ((w1 >> 16) & 0xFF) as u8,
|
||||||
src_a_swiz: (w1 & 0xFF) as u8,
|
|
||||||
src_b_swiz: ((w1 >> 8) & 0xFF) as u8,
|
src_b_swiz: ((w1 >> 8) & 0xFF) as u8,
|
||||||
src_c_swiz: ((w1 >> 16) & 0xFF) as u8,
|
src_c_swiz: (w1 & 0xFF) as u8,
|
||||||
src_a_negate: ((w1 >> 24) & 1) != 0,
|
src_a_negate: ((w1 >> 26) & 1) != 0,
|
||||||
src_b_negate: ((w1 >> 25) & 1) != 0,
|
src_b_negate: ((w1 >> 25) & 1) != 0,
|
||||||
src_c_negate: ((w1 >> 26) & 1) != 0,
|
src_c_negate: ((w1 >> 24) & 1) != 0,
|
||||||
predicated: ((w0 >> 27) & 1) != 0,
|
predicated: ((w1 >> 28) & 1) != 0,
|
||||||
predicate_condition: ((w0 >> 28) & 1) != 0,
|
predicate_condition: ((w1 >> 27) & 1) != 0,
|
||||||
raw: words,
|
raw: words,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -225,19 +242,24 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn decode_extracts_opcodes_and_dests() {
|
fn decode_extracts_opcodes_and_dests() {
|
||||||
// Build a minimal ALU word:
|
// GPUBUG-106: correct canary field map. w0 carries dest/mask/scalar_opc;
|
||||||
// vector_opcode = ADD (0), scalar_opcode = RCP (22),
|
// w2 carries vector_opc + source regs.
|
||||||
// vector_dest = 3, scalar_dest = 7, vector_write_mask = 0xF
|
// vector_opcode = ADD (0) → w2 bits 24:28
|
||||||
let w2 = (vop::ADD as u32)
|
// scalar_opcode = RCP (22) → w0 bits 26:31
|
||||||
| ((sop::RCP as u32) << 6)
|
// vector_dest = 3 → w0 bits 0:5, scalar_dest = 7 → w0 bits 8:13
|
||||||
| (0xF << 12) // vector_write_mask
|
// vector_write_mask = 0xF → w0 bits 16:19, export_data → w0 bit 15
|
||||||
| (3u32 << 16) // vector_dest
|
let w0 = 3u32 // vector_dest
|
||||||
| (7u32 << 24); // scalar_dest
|
| (7u32 << 8) // scalar_dest
|
||||||
let alu = decode_alu([0, 0, w2]);
|
| (1u32 << 15) // export_data
|
||||||
|
| (0xFu32 << 16) // vector_write_mask
|
||||||
|
| ((sop::RCP as u32) << 26); // scalar_opc
|
||||||
|
let w2 = (vop::ADD as u32) << 24; // vector_opc
|
||||||
|
let alu = decode_alu([w0, 0, w2]);
|
||||||
assert_eq!(alu.vector_opcode, vop::ADD);
|
assert_eq!(alu.vector_opcode, vop::ADD);
|
||||||
assert_eq!(alu.scalar_opcode, sop::RCP);
|
assert_eq!(alu.scalar_opcode, sop::RCP);
|
||||||
assert_eq!(alu.vector_dest, 3);
|
assert_eq!(alu.vector_dest, 3);
|
||||||
assert_eq!(alu.scalar_dest, 7);
|
assert_eq!(alu.scalar_dest, 7);
|
||||||
assert_eq!(alu.vector_write_mask, 0xF);
|
assert_eq!(alu.vector_write_mask, 0xF);
|
||||||
|
assert!(alu.vector_dest_is_export);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -43,7 +43,15 @@ pub enum ControlFlowInstruction {
|
|||||||
Return,
|
Return,
|
||||||
/// `kAlloc` — pre-allocate export registers (position, interpolators, colors).
|
/// `kAlloc` — pre-allocate export registers (position, interpolators, colors).
|
||||||
Alloc { size: u32, kind: AllocKind },
|
Alloc { size: u32, kind: AllocKind },
|
||||||
/// Exit the shader (terminal).
|
/// `kNop` — fills space in the CF block; executes nothing, does not end
|
||||||
|
/// the shader. (Xenos opcode 0.)
|
||||||
|
Nop,
|
||||||
|
/// `kMarkVsFetchDone` — hint that no more vertex fetches will be performed.
|
||||||
|
/// (Xenos opcode 15.) Non-terminating.
|
||||||
|
MarkVsFetchDone,
|
||||||
|
/// Exit the shader (terminal). Synthesized — Xenos has no dedicated exit
|
||||||
|
/// opcode; the shader ends after an `Exec`/`CondExec` clause with the
|
||||||
|
/// END bit set (`is_end`). Retained for callers/tests that reference it.
|
||||||
Exit,
|
Exit,
|
||||||
/// Unknown / unhandled opcode.
|
/// Unknown / unhandled opcode.
|
||||||
Unknown { opcode: u8 },
|
Unknown { opcode: u8 },
|
||||||
@@ -88,42 +96,66 @@ pub fn decode_cf_pair(word0: u32, word1: u32, word2: u32) -> (ControlFlowInstruc
|
|||||||
fn decode_single(payload: u64) -> ControlFlowInstruction {
|
fn decode_single(payload: u64) -> ControlFlowInstruction {
|
||||||
// Top 4 bits of the 48-bit payload.
|
// Top 4 bits of the 48-bit payload.
|
||||||
let opcode = ((payload >> 44) & 0xF) as u8;
|
let opcode = ((payload >> 44) & 0xF) as u8;
|
||||||
// Predicate bit + condition live at the 28..30 range for exec/jmp. Rough
|
|
||||||
// extraction — good enough for the interpreter, which logs unknowns.
|
|
||||||
let predicated = ((payload >> 28) & 1) != 0;
|
|
||||||
let predicate_condition = ((payload >> 29) & 1) != 0;
|
|
||||||
|
|
||||||
|
// GPUBUG-103 (iterate-3P): clause-level predication is determined by the
|
||||||
|
// *opcode*, not by free bits. The 48-bit CF payload is word0 = bits 0..31,
|
||||||
|
// word1 = bits 32..47. Per canary `ucode.h`:
|
||||||
|
// * `ControlFlowExecInstruction` (kExec/kExecEnd, opcodes 1/2): NOT
|
||||||
|
// predicate-gated — it runs unconditionally.
|
||||||
|
// * `ControlFlowCondExecInstruction` (kCondExec/kCondExecEnd, 3/4): gated
|
||||||
|
// by a *bool constant*, `condition_` at word1 bit 10 = payload bit 42.
|
||||||
|
// We don't model bool-constant gating in the WGSL paths (the bool is
|
||||||
|
// virtually always set for these), so treat as unconditional.
|
||||||
|
// * `ControlFlowCondExecPredInstruction` (kCondExecPred/...End/Clean...,
|
||||||
|
// 5/6/13/14): gated by the *predicate register*; `condition_` at word1
|
||||||
|
// bit 9 = payload bit 41.
|
||||||
|
// The prior code read bits 28/29 (which fall inside `sequence_`/`vc_hi_`)
|
||||||
|
// and stamped `predicated=true` on plenty of plain `kExec` clauses — which
|
||||||
|
// made the P7 translator reject EVERY splash VS as `cf_cond`, forcing the
|
||||||
|
// interpreter (placeholder geometry) for all draws.
|
||||||
|
let is_pred_gated = matches!(opcode, 5 | 6 | 13 | 14);
|
||||||
|
let predicated = is_pred_gated;
|
||||||
|
let predicate_condition = is_pred_gated && ((payload >> 41) & 1) != 0;
|
||||||
|
|
||||||
|
// Xenos `ControlFlowOpcode` (canary `ucode.h:86-160`):
|
||||||
|
// 0 kNop, 1 kExec, 2 kExecEnd, 3 kCondExec, 4 kCondExecEnd,
|
||||||
|
// 5 kCondExecPred, 6 kCondExecPredEnd, 7 kLoopStart, 8 kLoopEnd,
|
||||||
|
// 9 kCondCall, 10 kReturn, 11 kCondJmp, 12 kAlloc,
|
||||||
|
// 13 kCondExecPredClean, 14 kCondExecPredCleanEnd, 15 kMarkVsFetchDone.
|
||||||
|
// All exec variants share the address(12)/count(3)/sequence(12) layout
|
||||||
|
// of `ControlFlowExecInstruction`; the `*End` variants terminate the
|
||||||
|
// shader. (Prior table was off-by-one — it mapped 0→Exec and 1→Exit,
|
||||||
|
// so a real `kExec` clause was misread as a terminal `Exit`, truncating
|
||||||
|
// the CF block and dropping every `tfetch` in it.)
|
||||||
|
let exec = |is_end: bool| ControlFlowInstruction::Exec {
|
||||||
|
address: (payload & 0xFFF) as u32,
|
||||||
|
count: ((payload >> 12) & 0x7) as u32,
|
||||||
|
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||||
|
is_end,
|
||||||
|
predicated,
|
||||||
|
predicate_condition,
|
||||||
|
};
|
||||||
match opcode {
|
match opcode {
|
||||||
0 => ControlFlowInstruction::Exec {
|
0 => ControlFlowInstruction::Nop,
|
||||||
address: (payload & 0xFFF) as u32,
|
1 => exec(false),
|
||||||
count: ((payload >> 12) & 0x7) as u32,
|
2 => exec(true),
|
||||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
3 => exec(false),
|
||||||
is_end: false,
|
4 => exec(true),
|
||||||
predicated,
|
5 => exec(false),
|
||||||
predicate_condition,
|
6 => exec(true),
|
||||||
},
|
7 => ControlFlowInstruction::LoopStart {
|
||||||
1 => ControlFlowInstruction::Exit,
|
|
||||||
2 => ControlFlowInstruction::Exec {
|
|
||||||
address: (payload & 0xFFF) as u32,
|
|
||||||
count: ((payload >> 12) & 0x7) as u32,
|
|
||||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
|
||||||
is_end: true,
|
|
||||||
predicated,
|
|
||||||
predicate_condition,
|
|
||||||
},
|
|
||||||
6 => ControlFlowInstruction::LoopStart {
|
|
||||||
address: (payload & 0x3FF) as u32,
|
address: (payload & 0x3FF) as u32,
|
||||||
loop_id: ((payload >> 16) & 0x1F) as u32,
|
loop_id: ((payload >> 16) & 0x1F) as u32,
|
||||||
},
|
},
|
||||||
7 => ControlFlowInstruction::LoopEnd {
|
8 => ControlFlowInstruction::LoopEnd {
|
||||||
address: (payload & 0x3FF) as u32,
|
address: (payload & 0x3FF) as u32,
|
||||||
loop_id: ((payload >> 16) & 0x1F) as u32,
|
loop_id: ((payload >> 16) & 0x1F) as u32,
|
||||||
},
|
},
|
||||||
8 => ControlFlowInstruction::CondCall {
|
9 => ControlFlowInstruction::CondCall {
|
||||||
target: (payload & 0x3FF) as u32,
|
target: (payload & 0x3FF) as u32,
|
||||||
},
|
},
|
||||||
9 => ControlFlowInstruction::Return,
|
10 => ControlFlowInstruction::Return,
|
||||||
10 => ControlFlowInstruction::CondJmp {
|
11 => ControlFlowInstruction::CondJmp {
|
||||||
target: (payload & 0x3FF) as u32,
|
target: (payload & 0x3FF) as u32,
|
||||||
predicated,
|
predicated,
|
||||||
predicate_condition,
|
predicate_condition,
|
||||||
@@ -132,6 +164,9 @@ fn decode_single(payload: u64) -> ControlFlowInstruction {
|
|||||||
size: (payload & 0x7) as u32,
|
size: (payload & 0x7) as u32,
|
||||||
kind: AllocKind::from_bits(((payload >> 4) & 0x7) as u32),
|
kind: AllocKind::from_bits(((payload >> 4) & 0x7) as u32),
|
||||||
},
|
},
|
||||||
|
13 => exec(false),
|
||||||
|
14 => exec(true),
|
||||||
|
15 => ControlFlowInstruction::MarkVsFetchDone,
|
||||||
other => ControlFlowInstruction::Unknown { opcode: other },
|
other => ControlFlowInstruction::Unknown { opcode: other },
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -141,12 +176,49 @@ mod tests {
|
|||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn opcode_exit_decodes() {
|
fn opcode_nop_and_exec_decode() {
|
||||||
// opcode 1 (Exit) in bits 44..47 of A's 48-bit payload.
|
// Xenos opcode 0 = kNop (non-terminating padding).
|
||||||
|
let payload: u64 = 0u64 << 44;
|
||||||
|
let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32);
|
||||||
|
assert_eq!(decode_cf_pair(hi, lo, 0).0, ControlFlowInstruction::Nop);
|
||||||
|
// Xenos opcode 1 = kExec (executes instructions; NOT a terminal exit).
|
||||||
let payload: u64 = 1u64 << 44;
|
let payload: u64 = 1u64 << 44;
|
||||||
let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32);
|
let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32);
|
||||||
let cf = decode_cf_pair(hi, lo, 0).0;
|
match decode_cf_pair(hi, lo, 0).0 {
|
||||||
assert_eq!(cf, ControlFlowInstruction::Exit);
|
ControlFlowInstruction::Exec { is_end, .. } => assert!(!is_end),
|
||||||
|
other => panic!("opcode 1 should be non-end Exec, got {other:?}"),
|
||||||
|
}
|
||||||
|
// Xenos opcode 15 = kMarkVsFetchDone (non-terminating hint).
|
||||||
|
let payload: u64 = 15u64 << 44;
|
||||||
|
let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32);
|
||||||
|
assert_eq!(
|
||||||
|
decode_cf_pair(hi, lo, 0).0,
|
||||||
|
ControlFlowInstruction::MarkVsFetchDone
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn real_logo_shader_has_tfetch_clauses() {
|
||||||
|
// The publisher-logo pixel shader E59B2B3DA4AA9008 (captured from the
|
||||||
|
// canary oracle, byte-identical to the microcode our guest IM_LOADs).
|
||||||
|
// Regression for iterate-3M: the old off-by-one opcode table decoded
|
||||||
|
// its leading `kExec` (opcode 1) as a terminal `Exit`, truncating the
|
||||||
|
// CF block so the `tfetch2D` never appeared → flat splash.
|
||||||
|
let ucode: [u32; 24] = [
|
||||||
|
0x00011002, 0x00001200, 0xC4000000, 0x00004003, 0x00002200, 0x00000000,
|
||||||
|
0x10082021, 0x1F1FF688, 0x00004000, 0xC8080001, 0x001B1B00, 0xC1020000,
|
||||||
|
0xC8070000, 0x00C0C000, 0xC1020000, 0xC8070001, 0x00C01B00, 0xC1000100,
|
||||||
|
0xC80F8000, 0x00000000, 0xC2010100, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
];
|
||||||
|
let p = crate::ucode::parse_shader(&ucode);
|
||||||
|
let exec_clauses = p
|
||||||
|
.cf
|
||||||
|
.iter()
|
||||||
|
.filter(|c| matches!(c, ControlFlowInstruction::Exec { .. }))
|
||||||
|
.count();
|
||||||
|
assert!(exec_clauses >= 1, "expected >=1 Exec clause, cf={:?}", p.cf);
|
||||||
|
let slots = crate::shader_metrics::tfetch_slots(&p);
|
||||||
|
assert!(!slots.is_empty(), "expected tfetch slots, got none; cf={:?}", p.cf);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -17,17 +17,64 @@ pub enum FetchInstruction {
|
|||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
pub struct VertexFetch {
|
pub struct VertexFetch {
|
||||||
/// Vertex fetch constant index (0..=95).
|
/// Vertex fetch *const_index* (5 bits, w0[20:24]). The full fetch-constant
|
||||||
|
/// index is `const_index * 3 + const_index_sel` (canary `ucode.h:700`); use
|
||||||
|
/// [`VertexFetch::const_reg_offset`] for the register-region dword offset.
|
||||||
pub fetch_const: u8,
|
pub fetch_const: u8,
|
||||||
|
/// iterate-3X (GPUBUG-110): `const_index_sel` (2 bits, w0[25:26]) — selects
|
||||||
|
/// one of the 3 two-dword vertex-fetch constants packed in each 6-dword
|
||||||
|
/// register group. Dropping this read sub-slot 0 of the group, missing the
|
||||||
|
/// real vertex-buffer base for shaders that use sub-slot 1/2 (the publisher
|
||||||
|
/// logo uses `const_index=31, sel=2`).
|
||||||
|
pub const_index_sel: u8,
|
||||||
/// Source register index (vertex index in r#).
|
/// Source register index (vertex index in r#).
|
||||||
pub src_register: u8,
|
pub src_register: u8,
|
||||||
/// Destination register for the fetched value.
|
/// Destination register for the fetched value.
|
||||||
pub dest_register: u8,
|
pub dest_register: u8,
|
||||||
/// 4-bit write mask.
|
/// 4-bit write mask.
|
||||||
pub dest_write_mask: u8,
|
pub dest_write_mask: u8,
|
||||||
|
/// iterate-3S (GPUBUG-107): `xenos::VertexFormat` (6 bits, dword1[16:21]).
|
||||||
|
/// Determines how many components to read and their packing. Pre-fix the
|
||||||
|
/// translator hardcoded `k_32_32_32_32_FLOAT` (4 floats, stride 4),
|
||||||
|
/// over-striding 2-float UI quads (`k_32_32_FLOAT`) → wrong/clipped
|
||||||
|
/// positions (the next vertex's X bled into .w, giving negative W → the
|
||||||
|
/// whole rectangle was clipped behind the camera).
|
||||||
|
pub format: u8,
|
||||||
|
/// Dword stride between consecutive vertices (dword2[0:7]).
|
||||||
|
pub stride: u8,
|
||||||
|
/// iterate-3T: dword offset of THIS attribute within the vertex stride
|
||||||
|
/// (dword2[16:38] in canary's `VertexFetchInstruction`; the low 23 bits).
|
||||||
|
/// A 6-dword vertex with position@0 + UV@2 + extra@3 needs this so the
|
||||||
|
/// three vfetches sharing one fetch-constant read different attributes
|
||||||
|
/// instead of all reading offset 0.
|
||||||
|
pub offset: u32,
|
||||||
|
/// `is_signed` = canary `fomat_comp_all`, word1 bit 12 (ucode.h:757) —
|
||||||
|
/// selects signed vs unsigned interpretation of packed integer formats.
|
||||||
|
/// (GPUBUG-113: was read from word1 bit 24, which is inside `exp_adjust`.)
|
||||||
|
pub is_signed: bool,
|
||||||
|
/// `is_normalized` = canary `num_format_all == 0`, word1 bit 13
|
||||||
|
/// (ucode.h:758). Set bit ⇒ integer (un-normalized); clear ⇒ normalized.
|
||||||
|
/// We store the normalized sense directly. (GPUBUG-113: was word1 bit 25.)
|
||||||
|
pub is_normalized: bool,
|
||||||
|
/// `is_mini_fetch` = canary word1 bit 30 (ucode.h:764). A mini-fetch reuses
|
||||||
|
/// the address AND STRIDE of the preceding full vfetch of the same stream;
|
||||||
|
/// its own `stride` field is 0. Required so a vfetch_mini color attribute
|
||||||
|
/// indexes by the real vertex stride instead of its tight dword count.
|
||||||
|
pub is_mini_fetch: bool,
|
||||||
pub raw: [u32; 3],
|
pub raw: [u32; 3],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl VertexFetch {
|
||||||
|
/// Dword offset of this fetch's 2-dword constant within the fetch-constant
|
||||||
|
/// register region (`CONST_BASE_FETCH`). Vertex fetch constants are packed
|
||||||
|
/// 3 per 6-dword group: `const_index * 6 + const_index_sel * 2`
|
||||||
|
/// (canary `ucode.h:700` `fetch_constant_index = const_index*3 + sel`,
|
||||||
|
/// each constant 2 dwords).
|
||||||
|
pub fn const_reg_offset(&self) -> u32 {
|
||||||
|
self.fetch_const as u32 * 6 + self.const_index_sel as u32 * 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
pub struct TextureFetch {
|
pub struct TextureFetch {
|
||||||
/// Texture fetch constant index (0..=31).
|
/// Texture fetch constant index (0..=31).
|
||||||
@@ -54,23 +101,47 @@ pub mod op {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction {
|
pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction {
|
||||||
|
// Fetch dword0 bitfields (Xenos `ucode.h:740-749` vfetch / `844-845`
|
||||||
|
// tfetch): opcode_value:5, src_reg:6, src_reg_am:1, dst_reg:6,
|
||||||
|
// dst_reg_am:1, (fetch_valid_only|must_be_one):1, const_index:5 @ bit20,
|
||||||
|
// ... The prior decoder read `const_index` from bit 5 (which is actually
|
||||||
|
// `src_reg`), so every fetch reported the wrong fetch-constant slot — the
|
||||||
|
// logo `tfetch2D ..., tf0` was read as `tf1`, and slot 1's empty constant
|
||||||
|
// failed to decode → no texture. The texture-fetch `dimension` lives in
|
||||||
|
// dword2 bits 14..15, not dword1.
|
||||||
let w0 = words[0];
|
let w0 = words[0];
|
||||||
let w1 = words[1];
|
let w1 = words[1];
|
||||||
|
let w2 = words[2];
|
||||||
let opcode = (w0 & 0x1F) as u8;
|
let opcode = (w0 & 0x1F) as u8;
|
||||||
match opcode {
|
match opcode {
|
||||||
op::VERTEX_FETCH => FetchInstruction::Vertex(VertexFetch {
|
op::VERTEX_FETCH => FetchInstruction::Vertex(VertexFetch {
|
||||||
fetch_const: ((w0 >> 5) & 0x1F) as u8,
|
fetch_const: ((w0 >> 20) & 0x1F) as u8,
|
||||||
src_register: ((w0 >> 17) & 0x7F) as u8,
|
const_index_sel: ((w0 >> 25) & 0x3) as u8,
|
||||||
dest_register: ((w0 >> 10) & 0x7F) as u8,
|
src_register: ((w0 >> 5) & 0x3F) as u8,
|
||||||
dest_write_mask: ((w1 >> 23) & 0xF) as u8,
|
dest_register: ((w0 >> 12) & 0x3F) as u8,
|
||||||
|
dest_write_mask: (w1 & 0xF) as u8,
|
||||||
|
// dword1[16:21] = VertexFormat. dword2: stride[0:7],
|
||||||
|
// offset (in dwords) [8:?] — empirically the attribute offset of
|
||||||
|
// the textured logo VS lands in dword2[8:15] (pos@4, UV@3,
|
||||||
|
// 3-float@0 in a 6-dword vertex). signed/normalized live higher.
|
||||||
|
format: ((w1 >> 16) & 0x3F) as u8,
|
||||||
|
stride: (w2 & 0xFF) as u8,
|
||||||
|
offset: (w2 >> 8) & 0xFF,
|
||||||
|
// GPUBUG-113: canary ucode.h:757-758,764 — signed=fomat_comp_all
|
||||||
|
// (w1 bit12), normalized=(num_format_all==0) (w1 bit13),
|
||||||
|
// mini-fetch=(w1 bit30). The previous bit24/25 reads landed inside
|
||||||
|
// `exp_adjust`, so signedness/normalization were effectively random.
|
||||||
|
is_signed: ((w1 >> 12) & 1) != 0,
|
||||||
|
is_normalized: ((w1 >> 13) & 1) == 0,
|
||||||
|
is_mini_fetch: ((w1 >> 30) & 1) != 0,
|
||||||
raw: words,
|
raw: words,
|
||||||
}),
|
}),
|
||||||
op::TEXTURE_FETCH => FetchInstruction::Texture(TextureFetch {
|
op::TEXTURE_FETCH => FetchInstruction::Texture(TextureFetch {
|
||||||
fetch_const: ((w0 >> 5) & 0x1F) as u8,
|
fetch_const: ((w0 >> 20) & 0x1F) as u8,
|
||||||
src_register: ((w0 >> 17) & 0x7F) as u8,
|
src_register: ((w0 >> 5) & 0x3F) as u8,
|
||||||
dest_register: ((w0 >> 10) & 0x7F) as u8,
|
dest_register: ((w0 >> 12) & 0x3F) as u8,
|
||||||
dest_write_mask: ((w1 >> 23) & 0xF) as u8,
|
dest_write_mask: (w1 & 0xF) as u8,
|
||||||
dimension: ((w1 >> 29) & 0x3) as u8,
|
dimension: ((w2 >> 14) & 0x3) as u8,
|
||||||
raw: words,
|
raw: words,
|
||||||
}),
|
}),
|
||||||
_ => FetchInstruction::Unknown { opcode, raw: words },
|
_ => FetchInstruction::Unknown { opcode, raw: words },
|
||||||
@@ -83,8 +154,9 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn decode_vertex_fetch() {
|
fn decode_vertex_fetch() {
|
||||||
// opcode=0 (vertex), fetch_const=5, src=2, dest=7.
|
// opcode=0 (vertex). Xenos dword0: src_reg@bit5, dst_reg@bit12,
|
||||||
let w0 = 0u32 | (5 << 5) | (7 << 10) | (2 << 17);
|
// const_index@bit20. fetch_const=5, src=2, dest=7.
|
||||||
|
let w0 = 0u32 | (2 << 5) | (7 << 12) | (5 << 20);
|
||||||
let v = decode_fetch([w0, 0, 0]);
|
let v = decode_fetch([w0, 0, 0]);
|
||||||
match v {
|
match v {
|
||||||
FetchInstruction::Vertex(vf) => {
|
FetchInstruction::Vertex(vf) => {
|
||||||
@@ -96,13 +168,69 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn vertex_fetch_const_index_sel_and_reg_offset() {
|
||||||
|
// iterate-3X (GPUBUG-110): the real publisher-logo vfetch (w0 =
|
||||||
|
// 0x2DF82000) encodes const_index=31, const_index_sel=2. Its fetch
|
||||||
|
// constant lives at dword offset `31*6 + 2*2 = 190` (reg 0x48BE), not
|
||||||
|
// `31*6 = 186` (reg 0x48BA, which held the unused 0x1 slot). Dropping
|
||||||
|
// the sel field made the logo geometry resolve as "no vertex buffer".
|
||||||
|
let v = decode_fetch([0x2DF8_2000, 0, 0]);
|
||||||
|
match v {
|
||||||
|
FetchInstruction::Vertex(vf) => {
|
||||||
|
assert_eq!(vf.fetch_const, 31, "const_index");
|
||||||
|
assert_eq!(vf.const_index_sel, 2, "const_index_sel");
|
||||||
|
assert_eq!(vf.const_reg_offset(), 190, "reg offset = 31*6 + 2*2");
|
||||||
|
}
|
||||||
|
other => panic!("expected Vertex, got {other:?}"),
|
||||||
|
}
|
||||||
|
// sel=0 collapses to the legacy `fetch_const*6` offset (back-compat).
|
||||||
|
let v0 = decode_fetch([0u32 | (5 << 20), 0, 0]);
|
||||||
|
if let FetchInstruction::Vertex(vf) = v0 {
|
||||||
|
assert_eq!(vf.const_index_sel, 0);
|
||||||
|
assert_eq!(vf.const_reg_offset(), 30);
|
||||||
|
} else {
|
||||||
|
panic!("expected Vertex");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn vertex_fetch_signed_normalized_mini_bits() {
|
||||||
|
// GPUBUG-113: canary ucode.h:757-758,764 — is_signed=fomat_comp_all
|
||||||
|
// (w1 bit12), is_normalized=(num_format_all==0) (w1 bit13),
|
||||||
|
// is_mini_fetch=(w1 bit30). Validate each bit independently.
|
||||||
|
let mk = |w1: u32| match decode_fetch([0, w1, 0]) {
|
||||||
|
FetchInstruction::Vertex(vf) => vf,
|
||||||
|
_ => panic!("vertex"),
|
||||||
|
};
|
||||||
|
// No bits: unsigned, normalized, full fetch.
|
||||||
|
let v = mk(0);
|
||||||
|
assert!(!v.is_signed);
|
||||||
|
assert!(v.is_normalized);
|
||||||
|
assert!(!v.is_mini_fetch);
|
||||||
|
// bit12 → signed.
|
||||||
|
assert!(mk(1 << 12).is_signed);
|
||||||
|
// bit13 (num_format_all=1) → NOT normalized.
|
||||||
|
assert!(!mk(1 << 13).is_normalized);
|
||||||
|
// bit30 → mini fetch.
|
||||||
|
assert!(mk(1 << 30).is_mini_fetch);
|
||||||
|
// The old (wrong) bits 24/25 must NOT affect signed/normalized.
|
||||||
|
assert!(!mk(1 << 24).is_signed);
|
||||||
|
assert!(mk(1 << 25).is_normalized);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn decode_texture_fetch() {
|
fn decode_texture_fetch() {
|
||||||
let w0 = 1u32 | (3 << 5) | (4 << 10) | (1 << 17);
|
// opcode=1 (texture). const_index@bit20=3, src@bit5=1, dst@bit12=4.
|
||||||
let t = decode_fetch([w0, (2u32 << 29), 0]);
|
// dimension lives in dword2 bits 14..15.
|
||||||
|
let w0 = 1u32 | (1 << 5) | (4 << 12) | (3 << 20);
|
||||||
|
let w2 = 2u32 << 14;
|
||||||
|
let t = decode_fetch([w0, 0, w2]);
|
||||||
match t {
|
match t {
|
||||||
FetchInstruction::Texture(tf) => {
|
FetchInstruction::Texture(tf) => {
|
||||||
assert_eq!(tf.fetch_const, 3);
|
assert_eq!(tf.fetch_const, 3);
|
||||||
|
assert_eq!(tf.src_register, 1);
|
||||||
|
assert_eq!(tf.dest_register, 4);
|
||||||
assert_eq!(tf.dimension, 2);
|
assert_eq!(tf.dimension, 2);
|
||||||
}
|
}
|
||||||
other => panic!("expected Texture, got {other:?}"),
|
other => panic!("expected Texture, got {other:?}"),
|
||||||
|
|||||||
@@ -48,6 +48,9 @@ pub mod cf_kind {
|
|||||||
pub const COND_JMP: u32 = 6;
|
pub const COND_JMP: u32 = 6;
|
||||||
pub const COND_CALL: u32 = 7;
|
pub const COND_CALL: u32 = 7;
|
||||||
pub const RETURN: u32 = 8;
|
pub const RETURN: u32 = 8;
|
||||||
|
/// Non-executing CF clause: `kNop` padding or `kMarkVsFetchDone` hint.
|
||||||
|
/// The WGSL CF walker treats this as a no-op (advance, do not reject).
|
||||||
|
pub const NOP: u32 = 9;
|
||||||
pub const UNKNOWN: u32 = 15;
|
pub const UNKNOWN: u32 = 15;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -136,6 +139,7 @@ fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) {
|
|||||||
}
|
}
|
||||||
CondCall { target } => (cf_kind::COND_CALL, target, 0),
|
CondCall { target } => (cf_kind::COND_CALL, target, 0),
|
||||||
Return => (cf_kind::RETURN, 0, 0),
|
Return => (cf_kind::RETURN, 0, 0),
|
||||||
|
Nop | MarkVsFetchDone => (cf_kind::NOP, 0, 0),
|
||||||
Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0),
|
Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -164,9 +168,11 @@ pub struct ParsedShader {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire
|
/// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire
|
||||||
/// microcode buffer (control flow + instructions). Heuristic: CF dword count
|
/// microcode buffer (control flow + instructions). The CF block is implicitly
|
||||||
/// is encoded in the first word's low 12 bits of the last exec clause —
|
/// bounded: we walk clause-pair rows until one terminates the shader (an
|
||||||
/// canary iterates until it hits a clause of kind `Exit`. We do the same.
|
/// `Exec`/`CondExec` clause with the END bit set, per Xenos). Everything after
|
||||||
|
/// that row is the instruction block; exec/loop addresses are then rebased to
|
||||||
|
/// be relative to it.
|
||||||
pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
|
pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
|
||||||
let mut cf = Vec::new();
|
let mut cf = Vec::new();
|
||||||
// CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's
|
// CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's
|
||||||
@@ -175,22 +181,50 @@ pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
|
|||||||
while i + 2 < raw_dwords.len() {
|
while i + 2 < raw_dwords.len() {
|
||||||
let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]);
|
let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]);
|
||||||
let (first, second) = a;
|
let (first, second) = a;
|
||||||
let seen_exit = matches!(
|
// The CF block ends after the clause that terminates the shader: an
|
||||||
first,
|
// `Exec` with the END bit set (Xenos `kExecEnd`/`kCondExec*End`), a
|
||||||
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
// synthetic `Exit`, or an `Unknown` opcode (decode ran off the CF
|
||||||
) || matches!(
|
// block into instruction data — stop defensively). `Nop` padding
|
||||||
second,
|
// does NOT terminate. (Previously this stopped on the first `Exit`,
|
||||||
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
// but with the corrected opcode table opcode 1 is `kExec`, not exit,
|
||||||
);
|
// so real exec clauses kept the parse going as intended.)
|
||||||
|
let terminates = |cf: &ControlFlowInstruction| {
|
||||||
|
matches!(
|
||||||
|
cf,
|
||||||
|
ControlFlowInstruction::Exec { is_end: true, .. }
|
||||||
|
| ControlFlowInstruction::Exit
|
||||||
|
| ControlFlowInstruction::Unknown { .. }
|
||||||
|
)
|
||||||
|
};
|
||||||
|
let seen_end = terminates(&first) || terminates(&second);
|
||||||
cf.push(first);
|
cf.push(first);
|
||||||
cf.push(second);
|
cf.push(second);
|
||||||
i += 3;
|
i += 3;
|
||||||
if seen_exit {
|
if seen_end {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Everything after `i` dwords is the instruction block.
|
// Everything after `i` dwords is the instruction block.
|
||||||
let instructions = raw_dwords[i..].to_vec();
|
let instructions = raw_dwords[i..].to_vec();
|
||||||
|
// Xenos exec/loop `address` fields are absolute instruction-triple indices
|
||||||
|
// counted from shader dword 0, but `instructions` here begins *after* the
|
||||||
|
// CF block. Rebase those addresses to be relative to the instruction block
|
||||||
|
// (subtract the CF triple count) so `address * 3` indexes `instructions`
|
||||||
|
// directly. (Without this, every exec read 3 dwords too far per CF triple —
|
||||||
|
// the publisher-logo `tfetch` triple was skipped → flat splash.)
|
||||||
|
let cf_triples = (i / 3) as u32;
|
||||||
|
for clause in cf.iter_mut() {
|
||||||
|
match clause {
|
||||||
|
ControlFlowInstruction::Exec { address, .. } => {
|
||||||
|
*address = address.saturating_sub(cf_triples);
|
||||||
|
}
|
||||||
|
ControlFlowInstruction::LoopStart { address, .. }
|
||||||
|
| ControlFlowInstruction::LoopEnd { address, .. } => {
|
||||||
|
*address = address.saturating_sub(cf_triples);
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
ParsedShader { cf, instructions }
|
ParsedShader { cf, instructions }
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -235,15 +269,19 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn trivial_exit_clause_stops_parsing() {
|
fn exec_end_clause_stops_parsing() {
|
||||||
// Two clauses: [NOP (kind=0), EXIT (kind=1)] encoded per canary.
|
// Row: clause B = kExecEnd (opcode 2) terminates the CF block.
|
||||||
// Exit clause is opcode 1 in the top 4 bits of the upper 16 bits.
|
// 48-bit payload of B occupies hi16(word1) + word2; opcode lives in
|
||||||
let w0 = 0u32; // clause A body
|
// bits 44..47 of that payload. Put opcode 2 there: payload bit 44 set
|
||||||
let w1 = (1u32 << 12) << 16; // upper 16 bits = 0x1000 → opcode=1 (EXIT) for clause A
|
// for the `2` → (2 << 44). In B's framing, bits 16..47 come from
|
||||||
let w2 = 0u32;
|
// word2, so word2 bit (44-16)=28 region holds the opcode nibble.
|
||||||
let p = parse_shader(&[w0, w1, w2, 0xDEAD_BEEF]);
|
let b_payload: u64 = 2u64 << 44; // kExecEnd
|
||||||
|
// B = lo16 from hi16(word1), hi from word2. Reconstruct word1/word2.
|
||||||
|
let word1 = ((b_payload & 0xFFFF) as u32) << 16; // B's low 16 bits → hi16(word1)
|
||||||
|
let word2 = ((b_payload >> 16) & 0xFFFF_FFFF) as u32;
|
||||||
|
let p = parse_shader(&[0, word1, word2, 0xDEAD_BEEF]);
|
||||||
assert!(!p.cf.is_empty());
|
assert!(!p.cf.is_empty());
|
||||||
// Exit detected → remaining dword is instruction data.
|
// ExecEnd detected in the first row → remaining dword is instruction data.
|
||||||
assert_eq!(p.instructions, vec![0xDEAD_BEEF]);
|
assert_eq!(p.instructions, vec![0xDEAD_BEEF]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3195,7 +3195,14 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
|||||||
// the first (the UI binds a single texture today). When the last draw
|
// the first (the UI binds a single texture today). When the last draw
|
||||||
// used a flat (no-tfetch) shader the list is empty, so we fall back to
|
// used a flat (no-tfetch) shader the list is empty, so we fall back to
|
||||||
// the legacy slot-0 probe to preserve behavior on flat-only frames.
|
// the legacy slot-0 probe to preserve behavior on flat-only frames.
|
||||||
let published = gpu_inline.last_draw_textures.first().cloned().or_else(|| {
|
// The legacy single-texture `publish_texture` bridge wants
|
||||||
|
// `(TextureKey, bytes)`; `last_draw_textures` now also carries the
|
||||||
|
// content version (for the per-draw host-cache re-upload). Drop it here.
|
||||||
|
let published = gpu_inline
|
||||||
|
.last_draw_textures
|
||||||
|
.first()
|
||||||
|
.map(|(k, _v, b)| (*k, b.clone()))
|
||||||
|
.or_else(|| {
|
||||||
// Fallback: probe fetch constant slot 0 directly. Texture fetch
|
// Fallback: probe fetch constant slot 0 directly. Texture fetch
|
||||||
// constants live at `CONST_BASE_FETCH + slot*6` in the register
|
// constants live at `CONST_BASE_FETCH + slot*6` in the register
|
||||||
// file; read 6 dwords, decode the key, hit the CPU cache with
|
// file; read 6 dwords, decode the key, hit the CPU cache with
|
||||||
@@ -3231,6 +3238,16 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
|||||||
metrics::gauge!("gpu.texture_cache.entries")
|
metrics::gauge!("gpu.texture_cache.entries")
|
||||||
.set(gpu_inline.texture_cache.len() as f64);
|
.set(gpu_inline.texture_cache.len() as f64);
|
||||||
ui.publish_texture(published);
|
ui.publish_texture(published);
|
||||||
|
|
||||||
|
// iterate-3O: publish this frame's captured per-draw geometry and
|
||||||
|
// reset the accumulator for the next frame. The UI replays these as
|
||||||
|
// real guest draws (real vertices + prim type) instead of synthetic
|
||||||
|
// placeholder shapes. `frame_captures` is `Some` only under `--ui`.
|
||||||
|
if let Some(caps) = gpu_inline.frame_captures.as_mut() {
|
||||||
|
let drained = std::mem::take(caps);
|
||||||
|
metrics::counter!("gpu.geometry.published").increment(drained.len() as u64);
|
||||||
|
ui.publish_geometry(drained);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Notify the UI.
|
// Notify the UI.
|
||||||
if let Some(ui) = state.ui.clone() {
|
if let Some(ui) = state.ui.clone() {
|
||||||
|
|||||||
@@ -183,6 +183,28 @@ pub struct InterruptState {
|
|||||||
/// ticker. `tick_vsync_instr` diffs against this to advance
|
/// ticker. `tick_vsync_instr` diffs against this to advance
|
||||||
/// `vsync_accumulator`.
|
/// `vsync_accumulator`.
|
||||||
pub last_instr_count: u64,
|
pub last_instr_count: u64,
|
||||||
|
/// **iterate-3AJ — present-anchored vsync.** Set `true` once the guest
|
||||||
|
/// has presented at least one frame (a `VdSwap`). Before this, the
|
||||||
|
/// vsync ticker uses the legacy fixed instruction-quantum cadence so
|
||||||
|
/// the boot present-loop bootstrap (iterate-2W) still gets the vsyncs
|
||||||
|
/// it needs *before* the first present. After this, vsync is anchored
|
||||||
|
/// to the guest's real present rate (≈1 vblank per present, as on real
|
||||||
|
/// hardware where the title double-buffers at vblank), with only a
|
||||||
|
/// small capped instruction-quantum *fallback* for frames where the
|
||||||
|
/// guest genuinely stops presenting (heavy asset load). This stops the
|
||||||
|
/// proxy from firing ~66 vsyncs during one heavy load frame, which
|
||||||
|
/// collapsed the splash-logo intro fade-in (the guest's vsync counter
|
||||||
|
/// jumped 0→66 in one frame instead of ramping smoothly).
|
||||||
|
pub vsync_present_anchored: bool,
|
||||||
|
/// Last observed guest present (`VdSwap`) count. `tick_vsync_instr`
|
||||||
|
/// diffs the live count against this each call to emit one vblank per
|
||||||
|
/// new present once `vsync_present_anchored` is set.
|
||||||
|
pub last_present_count: u64,
|
||||||
|
/// How many *fallback* (non-present-driven) vsyncs have fired in the
|
||||||
|
/// current dry (no-present) window. Reset to 0 whenever a present
|
||||||
|
/// occurs. Capped at [`DRY_FALLBACK_CAP`] so one heavy non-presenting
|
||||||
|
/// frame cannot fire a long burst of vsyncs (the fade-in regression).
|
||||||
|
pub dry_fallback_fired: u32,
|
||||||
/// Wall-clock anchor for the production v-sync ticker. `None` until
|
/// Wall-clock anchor for the production v-sync ticker. `None` until
|
||||||
/// the first `tick_vsync_wallclock` call (lazy init so unit tests
|
/// the first `tick_vsync_wallclock` call (lazy init so unit tests
|
||||||
/// that never invoke that function don't construct an Instant).
|
/// that never invoke that function don't construct an Instant).
|
||||||
@@ -208,6 +230,21 @@ pub struct InterruptState {
|
|||||||
/// determinism.
|
/// determinism.
|
||||||
pub const VSYNC_INSTR_PERIOD: u64 = 150_000;
|
pub const VSYNC_INSTR_PERIOD: u64 = 150_000;
|
||||||
|
|
||||||
|
/// **iterate-3AJ — present-anchored vsync fallback.**
|
||||||
|
///
|
||||||
|
/// Once the guest is in its present loop (`vsync_present_anchored`), each
|
||||||
|
/// guest present emits exactly one vblank — vsync *is* the present cadence,
|
||||||
|
/// as on real Xbox 360 hardware where the title double-buffers at vblank.
|
||||||
|
/// For a frame where the guest stops presenting (e.g. the ~1.1 s splash
|
||||||
|
/// asset-load), we still need *some* vsyncs to keep timers / the present
|
||||||
|
/// loop alive, but firing one per [`VSYNC_INSTR_PERIOD`] would reproduce the
|
||||||
|
/// ~66-vsync spike that collapsed the fade-in. So the fallback fires one
|
||||||
|
/// vblank per `VSYNC_INSTR_PERIOD` of *non-presenting* instructions, but at
|
||||||
|
/// most [`DRY_FALLBACK_CAP`] per dry window (the counter resets on each
|
||||||
|
/// present). A heavy load frame therefore advances the guest vsync counter
|
||||||
|
/// by ≤ `DRY_FALLBACK_CAP` (a small ramp like canary's 0/5/10/2/1…), not 66.
|
||||||
|
pub const DRY_FALLBACK_CAP: u32 = 4;
|
||||||
|
|
||||||
/// Wall-clock period for the **production** v-sync ticker. 16.667 ms
|
/// Wall-clock period for the **production** v-sync ticker. 16.667 ms
|
||||||
/// targets exactly 60 Hz. KRNBUG-D08 — converting from the
|
/// targets exactly 60 Hz. KRNBUG-D08 — converting from the
|
||||||
/// instruction-count proxy fixes the `--parallel` rate drop while
|
/// instruction-count proxy fixes the `--parallel` rate drop while
|
||||||
@@ -254,14 +291,44 @@ impl InterruptState {
|
|||||||
self.pending.pop_front().map(|(source, _)| source)
|
self.pending.pop_front().map(|(source, _)| source)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// **Legacy** — instruction-count v-sync ticker. Kept for unit tests
|
/// **Present-anchored** instruction-paced v-sync ticker (the lockstep
|
||||||
/// that need a deterministic clock source. Production code calls
|
/// production path; also used by unit tests for a deterministic clock).
|
||||||
/// `tick_vsync_wallclock` instead. Returns `true` if at least one
|
///
|
||||||
/// v-sync was queued.
|
/// `current_instr_count` is the running retired-instruction count.
|
||||||
pub fn tick_vsync_instr(&mut self, current_instr_count: u64) -> bool {
|
/// `present_count` is the guest's running `VdSwap` count (monotonic).
|
||||||
|
///
|
||||||
|
/// Two regimes:
|
||||||
|
///
|
||||||
|
/// 1. **Bootstrap** (`!vsync_present_anchored`, i.e. before the guest's
|
||||||
|
/// first present): legacy fixed-quantum cadence — one vsync per
|
||||||
|
/// [`VSYNC_INSTR_PERIOD`] retired instructions. The boot present loop
|
||||||
|
/// (iterate-2W) needs vsyncs delivered *before* it can present, so
|
||||||
|
/// this regime is unchanged from the original ticker. The first
|
||||||
|
/// observed present flips `vsync_present_anchored`.
|
||||||
|
///
|
||||||
|
/// 2. **Present-anchored** (after the first present): one vblank per
|
||||||
|
/// guest present (vsync *is* the present cadence on real hardware),
|
||||||
|
/// plus a small capped instruction-quantum fallback ([`DRY_FALLBACK_CAP`]
|
||||||
|
/// per dry window) so a frame where the guest stops presenting (heavy
|
||||||
|
/// asset load) still ticks a *few* vsyncs — not ~66, which collapsed
|
||||||
|
/// the splash fade-in.
|
||||||
|
///
|
||||||
|
/// Returns `true` if at least one v-sync was queued.
|
||||||
|
pub fn tick_vsync_instr(&mut self, current_instr_count: u64, present_count: u64) -> bool {
|
||||||
let delta = current_instr_count.saturating_sub(self.last_instr_count);
|
let delta = current_instr_count.saturating_sub(self.last_instr_count);
|
||||||
self.last_instr_count = current_instr_count;
|
self.last_instr_count = current_instr_count;
|
||||||
self.vsync_accumulator = self.vsync_accumulator.saturating_add(delta);
|
self.vsync_accumulator = self.vsync_accumulator.saturating_add(delta);
|
||||||
|
|
||||||
|
let new_presents = present_count.saturating_sub(self.last_present_count);
|
||||||
|
self.last_present_count = present_count;
|
||||||
|
if new_presents > 0 {
|
||||||
|
self.vsync_present_anchored = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Regime 1 — bootstrap: legacy fixed instruction quantum. Preserves
|
||||||
|
// the iterate-2W present-loop bootstrap exactly (vsyncs must fire
|
||||||
|
// before the guest can present).
|
||||||
|
if !self.vsync_present_anchored {
|
||||||
if self.vsync_accumulator < VSYNC_INSTR_PERIOD {
|
if self.vsync_accumulator < VSYNC_INSTR_PERIOD {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -270,7 +337,37 @@ impl InterruptState {
|
|||||||
for _ in 0..periods {
|
for _ in 0..periods {
|
||||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||||
}
|
}
|
||||||
true
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Regime 2 — present-anchored.
|
||||||
|
let mut queued = false;
|
||||||
|
|
||||||
|
if new_presents > 0 {
|
||||||
|
// One vblank per guest present. `queue_interrupt` caps the FIFO,
|
||||||
|
// so a burst of presents in one round can't flood. A fresh
|
||||||
|
// present resets the dry-window state.
|
||||||
|
for _ in 0..new_presents {
|
||||||
|
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||||
|
}
|
||||||
|
self.vsync_accumulator = 0;
|
||||||
|
self.dry_fallback_fired = 0;
|
||||||
|
queued = true;
|
||||||
|
} else if self.vsync_accumulator >= VSYNC_INSTR_PERIOD
|
||||||
|
&& self.dry_fallback_fired < DRY_FALLBACK_CAP
|
||||||
|
{
|
||||||
|
// Dry frame (no present this tick): the guest stopped presenting
|
||||||
|
// (heavy load). Tick a *capped* number of fallback vsyncs so
|
||||||
|
// timers/the present loop stay alive without re-introducing the
|
||||||
|
// ~66-vsync spike. Consume one period per fired vsync so the
|
||||||
|
// accumulator paces the few fallbacks.
|
||||||
|
self.vsync_accumulator -= VSYNC_INSTR_PERIOD;
|
||||||
|
self.dry_fallback_fired += 1;
|
||||||
|
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||||
|
queued = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
queued
|
||||||
}
|
}
|
||||||
|
|
||||||
/// **Production** — wall-clock v-sync ticker. Fires
|
/// **Production** — wall-clock v-sync ticker. Fires
|
||||||
@@ -364,9 +461,10 @@ mod tests {
|
|||||||
let mut s = InterruptState::default();
|
let mut s = InterruptState::default();
|
||||||
s.set_callback(0x1000, 0xAB);
|
s.set_callback(0x1000, 0xAB);
|
||||||
assert_eq!(VSYNC_INSTR_PERIOD, 150_000);
|
assert_eq!(VSYNC_INSTR_PERIOD, 150_000);
|
||||||
assert!(!s.tick_vsync_instr(VSYNC_INSTR_PERIOD - 1));
|
// present_count = 0 → bootstrap regime (legacy fixed quantum).
|
||||||
|
assert!(!s.tick_vsync_instr(VSYNC_INSTR_PERIOD - 1, 0));
|
||||||
assert!(s.pending.is_empty());
|
assert!(s.pending.is_empty());
|
||||||
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD));
|
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD, 0));
|
||||||
assert_eq!(s.peek_next(), Some(INTERRUPT_SOURCE_VSYNC));
|
assert_eq!(s.peek_next(), Some(INTERRUPT_SOURCE_VSYNC));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -376,10 +474,59 @@ mod tests {
|
|||||||
// be delivered, not lost.
|
// be delivered, not lost.
|
||||||
let mut s = InterruptState::default();
|
let mut s = InterruptState::default();
|
||||||
s.set_callback(0x1000, 0xAB);
|
s.set_callback(0x1000, 0xAB);
|
||||||
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD * 3 + 10));
|
// present_count = 0 → bootstrap regime drains all 3 periods at once.
|
||||||
|
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD * 3 + 10, 0));
|
||||||
assert_eq!(s.pending.len(), 3);
|
assert_eq!(s.pending.len(), 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn tick_vsync_instr_present_anchors_after_first_present() {
|
||||||
|
// iterate-3AJ: once the guest presents, vsync tracks presents (one
|
||||||
|
// vblank per present), NOT the fixed instruction quantum.
|
||||||
|
let mut s = InterruptState::default();
|
||||||
|
s.set_callback(0x1000, 0xAB);
|
||||||
|
// Bootstrap: instruction quantum fires (present_count still 0).
|
||||||
|
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD, 0));
|
||||||
|
assert_eq!(s.pending.len(), 1);
|
||||||
|
let _ = s.take_next();
|
||||||
|
// First present flips to anchored: exactly one vblank for the present.
|
||||||
|
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD * 2, 1));
|
||||||
|
assert!(s.vsync_present_anchored);
|
||||||
|
assert_eq!(s.pending.len(), 1);
|
||||||
|
let _ = s.take_next();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn tick_vsync_instr_heavy_dry_frame_capped_not_spiking() {
|
||||||
|
// iterate-3AJ: the regression. A heavy non-presenting frame retires
|
||||||
|
// ~10M instructions; the OLD ticker fired ~66 vsyncs (10M/150k) in
|
||||||
|
// that single frame, jumping the guest vsync counter 0→66 and
|
||||||
|
// skipping the fade-in. The present-anchored ticker caps the dry
|
||||||
|
// window at DRY_FALLBACK_CAP.
|
||||||
|
let mut s = InterruptState::default();
|
||||||
|
s.set_callback(0x1000, 0xAB);
|
||||||
|
// Enter anchored mode via one present.
|
||||||
|
let mut instr: u64 = VSYNC_INSTR_PERIOD;
|
||||||
|
assert!(s.tick_vsync_instr(instr, 1));
|
||||||
|
while s.take_next().is_some() {}
|
||||||
|
// Simulate a 10M-instruction frame with NO new present, ticked in
|
||||||
|
// chunks (as coord_pre_round would). Count fallback vsyncs queued.
|
||||||
|
let mut fallback = 0usize;
|
||||||
|
for _ in 0..100 {
|
||||||
|
instr += 100_000; // 100 chunks × 100k = 10M instructions
|
||||||
|
if s.tick_vsync_instr(instr, 1) {
|
||||||
|
while s.take_next().is_some() {
|
||||||
|
fallback += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert_eq!(
|
||||||
|
fallback, DRY_FALLBACK_CAP as usize,
|
||||||
|
"a heavy dry frame must cap fallback vsyncs at DRY_FALLBACK_CAP, \
|
||||||
|
not fire ~66"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tick_vsync_wallclock_first_call_sets_anchor() {
|
fn tick_vsync_wallclock_first_call_sets_anchor() {
|
||||||
// First call seeds the anchor and never fires. KRNBUG-D08:
|
// First call seeds the anchor and never fires. KRNBUG-D08:
|
||||||
|
|||||||
@@ -219,6 +219,17 @@ pub struct KernelState {
|
|||||||
/// only). Used by `xex_get_procedure_address` to resolve ordinals back
|
/// only). Used by `xex_get_procedure_address` to resolve ordinals back
|
||||||
/// to callable thunks.
|
/// to callable thunks.
|
||||||
thunks_by_ordinal: HashMap<(ModuleId, u16), u32>,
|
thunks_by_ordinal: HashMap<(ModuleId, u16), u32>,
|
||||||
|
|
||||||
|
/// Perf (Tier-A #4): inclusive [min, max] guest-address band that
|
||||||
|
/// contains every registered import thunk. Import thunks sit in a
|
||||||
|
/// small contiguous region of the XEX; almost every executing PC is
|
||||||
|
/// ordinary guest code OUTSIDE this band. The per-slot-visit prologue
|
||||||
|
/// looks up `thunk_map.get(&pc)` (a `HashMap<u32,…>` → `hash_one` per
|
||||||
|
/// call, ~3.2M visits boot-to-splash). Range-rejecting against this
|
||||||
|
/// band first turns the common (non-thunk) case into a pair of integer
|
||||||
|
/// compares and skips the hash entirely. `None` until the first thunk
|
||||||
|
/// is registered (no band → reject everything, matching an empty map).
|
||||||
|
thunk_addr_band: Option<(u32, u32)>,
|
||||||
/// First-Pixels diagnostic latch. Set the first time
|
/// First-Pixels diagnostic latch. Set the first time
|
||||||
/// `RtlRaiseException` fires with code `0xE06D7363` (MSVC C++ throw)
|
/// `RtlRaiseException` fires with code `0xE06D7363` (MSVC C++ throw)
|
||||||
/// so the deep stack-walk + `runtime_error` decode in
|
/// so the deep stack-walk + `runtime_error` decode in
|
||||||
@@ -374,6 +385,15 @@ pub struct KernelState {
|
|||||||
/// block every round from the deterministic `global_clock` via
|
/// block every round from the deterministic `global_clock` via
|
||||||
/// [`Self::update_timestamp_bundle`].
|
/// [`Self::update_timestamp_bundle`].
|
||||||
pub timestamp_bundle_addr: u32,
|
pub timestamp_bundle_addr: u32,
|
||||||
|
|
||||||
|
/// Perf (Tier-B #5) throttle state for [`Self::update_timestamp_bundle`].
|
||||||
|
/// Holds the `clock` value at which the bundle was last actually written;
|
||||||
|
/// `u64::MAX` is the "never written" sentinel (forces the first write).
|
||||||
|
/// `AtomicU64` (not `Cell`) so the `&self` update path stays `Sync` for
|
||||||
|
/// the parallel `Arc<Mutex<KernelState>>` usage. Only ever advanced
|
||||||
|
/// forward under the kernel lock, so `Relaxed` ordering is sufficient and
|
||||||
|
/// the sequence is deterministic.
|
||||||
|
timestamp_bundle_last_clock: std::sync::atomic::AtomicU64,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// ITERATE-2C Phase D — one queued auto-signal. `deadline_cycle` is
|
/// ITERATE-2C Phase D — one queued auto-signal. `deadline_cycle` is
|
||||||
@@ -439,6 +459,7 @@ impl KernelState {
|
|||||||
audit: HandleAudit::default(),
|
audit: HandleAudit::default(),
|
||||||
reservations,
|
reservations,
|
||||||
thunks_by_ordinal: HashMap::new(),
|
thunks_by_ordinal: HashMap::new(),
|
||||||
|
thunk_addr_band: None,
|
||||||
cxx_throw_logged: false,
|
cxx_throw_logged: false,
|
||||||
ring_base: 0,
|
ring_base: 0,
|
||||||
ring_size_dwords: 0,
|
ring_size_dwords: 0,
|
||||||
@@ -465,6 +486,7 @@ impl KernelState {
|
|||||||
last_cycle_hint: 0,
|
last_cycle_hint: 0,
|
||||||
silph_autosignal_diag_logged: false,
|
silph_autosignal_diag_logged: false,
|
||||||
timestamp_bundle_addr: 0,
|
timestamp_bundle_addr: 0,
|
||||||
|
timestamp_bundle_last_clock: std::sync::atomic::AtomicU64::new(u64::MAX),
|
||||||
};
|
};
|
||||||
crate::exports::register_exports(&mut state);
|
crate::exports::register_exports(&mut state);
|
||||||
crate::xam::register_exports(&mut state);
|
crate::xam::register_exports(&mut state);
|
||||||
@@ -584,6 +606,25 @@ impl KernelState {
|
|||||||
/// emits each ordinal once per module).
|
/// emits each ordinal once per module).
|
||||||
pub fn register_thunk(&mut self, module: ModuleId, ordinal: u16, address: u32) {
|
pub fn register_thunk(&mut self, module: ModuleId, ordinal: u16, address: u32) {
|
||||||
self.thunks_by_ordinal.insert((module, ordinal), address);
|
self.thunks_by_ordinal.insert((module, ordinal), address);
|
||||||
|
// Widen the thunk address band (Tier-A #4) so the hot prologue can
|
||||||
|
// range-reject non-thunk PCs before hashing the thunk map.
|
||||||
|
self.thunk_addr_band = Some(match self.thunk_addr_band {
|
||||||
|
Some((lo, hi)) => (lo.min(address), hi.max(address)),
|
||||||
|
None => (address, address),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Perf (Tier-A #4). Cheap pre-filter for the per-slot-visit import-thunk
|
||||||
|
/// dispatch: `false` guarantees `pc` is NOT a registered thunk (so the
|
||||||
|
/// caller can skip the `thunk_map.get(&pc)` hash). `true` means `pc` lies
|
||||||
|
/// within the registered thunk address band and the map must be consulted
|
||||||
|
/// for an exact match. Conservative — never a false negative.
|
||||||
|
#[inline]
|
||||||
|
pub fn pc_in_thunk_band(&self, pc: u32) -> bool {
|
||||||
|
match self.thunk_addr_band {
|
||||||
|
Some((lo, hi)) => pc >= lo && pc <= hi,
|
||||||
|
None => false,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Resolve a `(module, ordinal)` to its registered thunk address.
|
/// Resolve a `(module, ordinal)` to its registered thunk address.
|
||||||
@@ -919,6 +960,31 @@ impl KernelState {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const INSTRUCTIONS_PER_MS: u64 = 10_000;
|
const INSTRUCTIONS_PER_MS: u64 = 10_000;
|
||||||
|
// Perf (Tier-B #5): the bundle is updated once per scheduler round
|
||||||
|
// (~every 7 retired instructions), but the four guest BE memory
|
||||||
|
// writes are ~8.6% of boot-to-splash. `clock` is the retired-
|
||||||
|
// instruction count, so consecutive rounds rewrite essentially the
|
||||||
|
// same staircase. Throttle to a 0.25 ms quantum: only re-write when
|
||||||
|
// `clock` advanced by >= INSTRUCTIONS_PER_MS/4 (2500 units) since the
|
||||||
|
// last write. This keeps `tick_count` (ms, changes every 10_000
|
||||||
|
// units) ALWAYS fresh and `interrupt_time`/`system_time` monotone at
|
||||||
|
// 0.25 ms granularity — finer than any guest deadline math needs
|
||||||
|
// (`parse_timeout` works in whole ms; the hub gate is `+66 ms`). The
|
||||||
|
// fade-in (3AH-proven vsync-counter driven, NOT this bundle) is
|
||||||
|
// untouched. Throttle threshold is well below 1 ms so no guest-
|
||||||
|
// visible ms boundary is ever skipped.
|
||||||
|
const BUNDLE_QUANTUM: u64 = INSTRUCTIONS_PER_MS / 4; // 2500 units = 0.25 ms
|
||||||
|
{
|
||||||
|
use std::sync::atomic::Ordering;
|
||||||
|
let last = self.timestamp_bundle_last_clock.load(Ordering::Relaxed);
|
||||||
|
// Always allow the first write (last == u64::MAX sentinel) and any
|
||||||
|
// write that crosses the quantum. Never go backwards.
|
||||||
|
if last != u64::MAX && clock < last.saturating_add(BUNDLE_QUANTUM) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
self.timestamp_bundle_last_clock
|
||||||
|
.store(clock, Ordering::Relaxed);
|
||||||
|
}
|
||||||
// FILETIME epoch base (~2021) so `system_time` is a plausible
|
// FILETIME epoch base (~2021) so `system_time` is a plausible
|
||||||
// absolute wall-clock; matches the constant used by
|
// absolute wall-clock; matches the constant used by
|
||||||
// `ke_query_system_time`. interrupt_time is "since boot" so it
|
// `ke_query_system_time`. interrupt_time is "since boot" so it
|
||||||
@@ -1042,6 +1108,24 @@ impl KernelState {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Perf gate (Tier-A quick-win #3). `true` iff any of the four
|
||||||
|
/// per-slot-visit diagnostic probe registries
|
||||||
|
/// (`ctor_probe_pcs` / `branch_probe_pcs` / `audit_pc_probe_pcs`
|
||||||
|
/// / `lr_trace_pcs`) holds at least one PC. The common headless
|
||||||
|
/// run leaves all four empty, so the prologue can skip the four
|
||||||
|
/// `fire_*_if_match` calls entirely with this single predicted
|
||||||
|
/// branch — avoiding 4× call overhead per slot-visit (~3.2M
|
||||||
|
/// visits over boot-to-splash) when no probe is configured.
|
||||||
|
/// Purely a fast-path guard; each `fire_*` still re-checks its own
|
||||||
|
/// set, so behaviour is identical whether or not the caller gates.
|
||||||
|
#[inline]
|
||||||
|
pub fn any_probe_active(&self) -> bool {
|
||||||
|
!self.ctor_probe_pcs.is_empty()
|
||||||
|
|| !self.branch_probe_pcs.is_empty()
|
||||||
|
|| !self.audit_pc_probe_pcs.is_empty()
|
||||||
|
|| !self.lr_trace_pcs.is_empty()
|
||||||
|
}
|
||||||
|
|
||||||
/// Diagnostic. If the live PC for HW slot `hw_id` is in
|
/// Diagnostic. If the live PC for HW slot `hw_id` is in
|
||||||
/// `self.ctor_probe_pcs`, emit a single `CTOR-PROBE` line with
|
/// `self.ctor_probe_pcs`, emit a single `CTOR-PROBE` line with
|
||||||
/// the current cycle, tid, hw_id, sp, r3, lr, plus an 8-frame
|
/// the current cycle, tid, hw_id, sp, r3, lr, plus an 8-frame
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ use std::collections::HashMap;
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::sync::atomic::{AtomicBool, AtomicU64};
|
use std::sync::atomic::{AtomicBool, AtomicU64};
|
||||||
|
|
||||||
|
use xenia_gpu::draw_capture::DrawCapture;
|
||||||
use xenia_gpu::texture_cache::TextureKey;
|
use xenia_gpu::texture_cache::TextureKey;
|
||||||
use xenia_gpu::xenos_constants::XenosConstantsBlock;
|
use xenia_gpu::xenos_constants::XenosConstantsBlock;
|
||||||
use xenia_hid::GamepadState;
|
use xenia_hid::GamepadState;
|
||||||
@@ -133,6 +134,14 @@ pub struct UiBridge {
|
|||||||
/// reverts to its magenta stub.
|
/// reverts to its magenta stub.
|
||||||
pub publish_texture:
|
pub publish_texture:
|
||||||
Arc<dyn Fn(Option<(TextureKey, Vec<u8>)>) + Send + Sync>,
|
Arc<dyn Fn(Option<(TextureKey, Vec<u8>)>) + Send + Sync>,
|
||||||
|
/// iterate-3O real-render slice: at each `VdSwap`, the kernel hands the
|
||||||
|
/// UI the per-draw geometry captured this frame (one [`DrawCapture`] per
|
||||||
|
/// `PM4_DRAW_INDX*`), including the real guest vertex window. The UI
|
||||||
|
/// replays them through the Xenos wgpu pipeline so the splash renders its
|
||||||
|
/// actual geometry instead of synthetic placeholder shapes. Empty in the
|
||||||
|
/// degenerate case (no draws or capture disabled).
|
||||||
|
pub publish_geometry:
|
||||||
|
Arc<dyn Fn(Vec<DrawCapture>) + Send + Sync>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl UiBridge {
|
impl UiBridge {
|
||||||
@@ -182,4 +191,9 @@ impl UiBridge {
|
|||||||
pub fn publish_texture(&self, tex: Option<(TextureKey, Vec<u8>)>) {
|
pub fn publish_texture(&self, tex: Option<(TextureKey, Vec<u8>)>) {
|
||||||
(self.publish_texture)(tex);
|
(self.publish_texture)(tex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Hand this frame's captured per-draw geometry to the UI.
|
||||||
|
pub fn publish_geometry(&self, caps: Vec<DrawCapture>) {
|
||||||
|
(self.publish_geometry)(caps);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -89,6 +89,14 @@ pub struct GuestMemory {
|
|||||||
mem_watch_addrs: Vec<u32>,
|
mem_watch_addrs: Vec<u32>,
|
||||||
/// Count of fires observed (for tests / hand-off telemetry).
|
/// Count of fires observed (for tests / hand-off telemetry).
|
||||||
mem_watch_count: AtomicU64,
|
mem_watch_count: AtomicU64,
|
||||||
|
/// Monotonic count of MMIO accesses (every scalar load/store that
|
||||||
|
/// resolves to a registered MMIO region bumps this by 1). A pure,
|
||||||
|
/// deterministic function of guest execution — the superblock runner
|
||||||
|
/// samples it before/after each block to detect an MMIO touch and
|
||||||
|
/// end the run there (so MMIO ordering vs other HW threads stays at
|
||||||
|
/// the same fine lockstep granularity as before). Relaxed because the
|
||||||
|
/// lockstep path is single-threaded and only needs monotonicity.
|
||||||
|
mmio_access_count: AtomicU64,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Greatest common bit-mask such that `(a & m) == (b & m)` for every bit
|
/// Greatest common bit-mask such that `(a & m) == (b & m)` for every bit
|
||||||
@@ -133,9 +141,26 @@ impl GuestMemory {
|
|||||||
writes_total: AtomicU64::new(0),
|
writes_total: AtomicU64::new(0),
|
||||||
mem_watch_addrs: Vec::new(),
|
mem_watch_addrs: Vec::new(),
|
||||||
mem_watch_count: AtomicU64::new(0),
|
mem_watch_count: AtomicU64::new(0),
|
||||||
|
mmio_access_count: AtomicU64::new(0),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Monotonic count of MMIO accesses since boot. Used by the superblock
|
||||||
|
/// runner to detect that a just-executed block touched MMIO (so it can
|
||||||
|
/// end the superblock there and keep MMIO ordering at lockstep
|
||||||
|
/// granularity). Deterministic function of guest execution.
|
||||||
|
#[inline]
|
||||||
|
pub fn mmio_access_count(&self) -> u64 {
|
||||||
|
self.mmio_access_count
|
||||||
|
.load(std::sync::atomic::Ordering::Relaxed)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn bump_mmio_access(&self) {
|
||||||
|
self.mmio_access_count
|
||||||
|
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
|
||||||
/// Current version watermark for the page containing `addr`. Bumped by
|
/// Current version watermark for the page containing `addr`. Bumped by
|
||||||
/// any write through `write_u8/16/32/64`. Not affected by MMIO writes
|
/// any write through `write_u8/16/32/64`. Not affected by MMIO writes
|
||||||
/// (those don't touch the backing texture memory).
|
/// (those don't touch the backing texture memory).
|
||||||
@@ -357,7 +382,8 @@ impl GuestMemory {
|
|||||||
/// from `GuestMemory` without a wider plumbing change).
|
/// from `GuestMemory` without a wider plumbing change).
|
||||||
pub fn write_bulk(&self, addr: u32, buf: &[u8]) {
|
pub fn write_bulk(&self, addr: u32, buf: &[u8]) {
|
||||||
let len = buf.len() as u32;
|
let len = buf.len() as u32;
|
||||||
let old_lane = self.capture_mem_watch_old(addr, len);
|
let watch = self.has_mem_watch();
|
||||||
|
let old_lane = if watch { self.capture_mem_watch_old(addr, len) } else { None };
|
||||||
let ptr = self.translate_virtual_mut(addr);
|
let ptr = self.translate_virtual_mut(addr);
|
||||||
unsafe {
|
unsafe {
|
||||||
std::ptr::copy_nonoverlapping(buf.as_ptr(), ptr, buf.len());
|
std::ptr::copy_nonoverlapping(buf.as_ptr(), ptr, buf.len());
|
||||||
@@ -374,7 +400,7 @@ impl GuestMemory {
|
|||||||
// the page works.
|
// the page works.
|
||||||
self.bump_page_version(page * PAGE_SIZE);
|
self.bump_page_version(page * PAGE_SIZE);
|
||||||
}
|
}
|
||||||
self.check_mem_watch(addr, len, old_lane);
|
if watch { self.check_mem_watch(addr, len, old_lane); }
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check if a guest address has been allocated/committed. Acquire load
|
/// Check if a guest address has been allocated/committed. Acquire load
|
||||||
@@ -487,6 +513,7 @@ impl MemoryAccess for GuestMemory {
|
|||||||
// MMIO dispatch must come first — a byte read at an MMIO-mapped
|
// MMIO dispatch must come first — a byte read at an MMIO-mapped
|
||||||
// address should invoke the callback, not the backing memory.
|
// address should invoke the callback, not the backing memory.
|
||||||
if let Some(mmio) = self.find_mmio(addr) {
|
if let Some(mmio) = self.find_mmio(addr) {
|
||||||
|
self.bump_mmio_access();
|
||||||
return (mmio.read_callback)(addr) as u8;
|
return (mmio.read_callback)(addr) as u8;
|
||||||
}
|
}
|
||||||
if !self.is_mapped(addr) { return 0; }
|
if !self.is_mapped(addr) { return 0; }
|
||||||
@@ -497,6 +524,7 @@ impl MemoryAccess for GuestMemory {
|
|||||||
#[inline]
|
#[inline]
|
||||||
fn read_u16(&self, addr: u32) -> u16 {
|
fn read_u16(&self, addr: u32) -> u16 {
|
||||||
if let Some(mmio) = self.find_mmio(addr) {
|
if let Some(mmio) = self.find_mmio(addr) {
|
||||||
|
self.bump_mmio_access();
|
||||||
(mmio.read_callback)(addr) as u16
|
(mmio.read_callback)(addr) as u16
|
||||||
} else if !self.is_mapped(addr) {
|
} else if !self.is_mapped(addr) {
|
||||||
0
|
0
|
||||||
@@ -509,6 +537,7 @@ impl MemoryAccess for GuestMemory {
|
|||||||
#[inline]
|
#[inline]
|
||||||
fn read_u32(&self, addr: u32) -> u32 {
|
fn read_u32(&self, addr: u32) -> u32 {
|
||||||
if let Some(mmio) = self.find_mmio(addr) {
|
if let Some(mmio) = self.find_mmio(addr) {
|
||||||
|
self.bump_mmio_access();
|
||||||
(mmio.read_callback)(addr)
|
(mmio.read_callback)(addr)
|
||||||
} else if !self.is_mapped(addr) {
|
} else if !self.is_mapped(addr) {
|
||||||
0
|
0
|
||||||
@@ -521,6 +550,7 @@ impl MemoryAccess for GuestMemory {
|
|||||||
#[inline]
|
#[inline]
|
||||||
fn read_u64(&self, addr: u32) -> u64 {
|
fn read_u64(&self, addr: u32) -> u64 {
|
||||||
if let Some(mmio) = self.find_mmio(addr) {
|
if let Some(mmio) = self.find_mmio(addr) {
|
||||||
|
self.bump_mmio_access();
|
||||||
let hi = (mmio.read_callback)(addr) as u64;
|
let hi = (mmio.read_callback)(addr) as u64;
|
||||||
let lo = (mmio.read_callback)(addr.wrapping_add(4)) as u64;
|
let lo = (mmio.read_callback)(addr.wrapping_add(4)) as u64;
|
||||||
(hi << 32) | lo
|
(hi << 32) | lo
|
||||||
@@ -536,23 +566,31 @@ impl MemoryAccess for GuestMemory {
|
|||||||
// MMIO dispatch first — a byte write at an MMIO-mapped address
|
// MMIO dispatch first — a byte write at an MMIO-mapped address
|
||||||
// must invoke the callback, not the backing memory.
|
// must invoke the callback, not the backing memory.
|
||||||
if let Some(mmio) = self.find_mmio(addr) {
|
if let Some(mmio) = self.find_mmio(addr) {
|
||||||
|
self.bump_mmio_access();
|
||||||
(mmio.write_callback)(addr, val as u32);
|
(mmio.write_callback)(addr, val as u32);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if !self.is_mapped(addr) { return; }
|
if !self.is_mapped(addr) { return; }
|
||||||
let old_lane = self.capture_mem_watch_old(addr, 1);
|
// Perf (Tier-A #1): the mem-watch capture/report pair are out-of-line
|
||||||
|
// calls; on the common (no-watch) path each was a real call that
|
||||||
|
// immediately returned. Gate both behind one predicted branch so the
|
||||||
|
// hot store does no call work unless a watch is actually armed.
|
||||||
|
let watch = self.has_mem_watch();
|
||||||
|
let old_lane = if watch { self.capture_mem_watch_old(addr, 1) } else { None };
|
||||||
let ptr = self.translate_virtual_mut(addr);
|
let ptr = self.translate_virtual_mut(addr);
|
||||||
unsafe { *ptr = val };
|
unsafe { *ptr = val };
|
||||||
self.bump_page_version(addr);
|
self.bump_page_version(addr);
|
||||||
self.check_mem_watch(addr, 1, old_lane);
|
if watch { self.check_mem_watch(addr, 1, old_lane); }
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_u16(&self, addr: u32, val: u16) {
|
fn write_u16(&self, addr: u32, val: u16) {
|
||||||
if let Some(mmio) = self.find_mmio(addr) {
|
if let Some(mmio) = self.find_mmio(addr) {
|
||||||
|
self.bump_mmio_access();
|
||||||
(mmio.write_callback)(addr, val as u32);
|
(mmio.write_callback)(addr, val as u32);
|
||||||
} else if !self.is_mapped(addr) {
|
} else if !self.is_mapped(addr) {
|
||||||
} else {
|
} else {
|
||||||
let old_lane = self.capture_mem_watch_old(addr, 2);
|
let watch = self.has_mem_watch();
|
||||||
|
let old_lane = if watch { self.capture_mem_watch_old(addr, 2) } else { None };
|
||||||
let ptr = self.translate_virtual_mut(addr);
|
let ptr = self.translate_virtual_mut(addr);
|
||||||
unsafe {
|
unsafe {
|
||||||
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 2);
|
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 2);
|
||||||
@@ -564,16 +602,18 @@ impl MemoryAccess for GuestMemory {
|
|||||||
if (addr & 0xFFF) >= (PAGE_SIZE - 1) {
|
if (addr & 0xFFF) >= (PAGE_SIZE - 1) {
|
||||||
self.bump_page_version(addr.wrapping_add(1));
|
self.bump_page_version(addr.wrapping_add(1));
|
||||||
}
|
}
|
||||||
self.check_mem_watch(addr, 2, old_lane);
|
if watch { self.check_mem_watch(addr, 2, old_lane); }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_u32(&self, addr: u32, val: u32) {
|
fn write_u32(&self, addr: u32, val: u32) {
|
||||||
if let Some(mmio) = self.find_mmio(addr) {
|
if let Some(mmio) = self.find_mmio(addr) {
|
||||||
|
self.bump_mmio_access();
|
||||||
(mmio.write_callback)(addr, val);
|
(mmio.write_callback)(addr, val);
|
||||||
} else if !self.is_mapped(addr) {
|
} else if !self.is_mapped(addr) {
|
||||||
} else {
|
} else {
|
||||||
let old_lane = self.capture_mem_watch_old(addr, 4);
|
let watch = self.has_mem_watch();
|
||||||
|
let old_lane = if watch { self.capture_mem_watch_old(addr, 4) } else { None };
|
||||||
let ptr = self.translate_virtual_mut(addr);
|
let ptr = self.translate_virtual_mut(addr);
|
||||||
unsafe {
|
unsafe {
|
||||||
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 4);
|
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 4);
|
||||||
@@ -582,17 +622,19 @@ impl MemoryAccess for GuestMemory {
|
|||||||
if (addr & 0xFFF) >= (PAGE_SIZE - 3) {
|
if (addr & 0xFFF) >= (PAGE_SIZE - 3) {
|
||||||
self.bump_page_version(addr.wrapping_add(3));
|
self.bump_page_version(addr.wrapping_add(3));
|
||||||
}
|
}
|
||||||
self.check_mem_watch(addr, 4, old_lane);
|
if watch { self.check_mem_watch(addr, 4, old_lane); }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_u64(&self, addr: u32, val: u64) {
|
fn write_u64(&self, addr: u32, val: u64) {
|
||||||
if let Some(mmio) = self.find_mmio(addr) {
|
if let Some(mmio) = self.find_mmio(addr) {
|
||||||
|
self.bump_mmio_access();
|
||||||
(mmio.write_callback)(addr, (val >> 32) as u32);
|
(mmio.write_callback)(addr, (val >> 32) as u32);
|
||||||
(mmio.write_callback)(addr.wrapping_add(4), val as u32);
|
(mmio.write_callback)(addr.wrapping_add(4), val as u32);
|
||||||
} else if !self.is_mapped(addr) {
|
} else if !self.is_mapped(addr) {
|
||||||
} else {
|
} else {
|
||||||
let old_lane = self.capture_mem_watch_old(addr, 8);
|
let watch = self.has_mem_watch();
|
||||||
|
let old_lane = if watch { self.capture_mem_watch_old(addr, 8) } else { None };
|
||||||
let ptr = self.translate_virtual_mut(addr);
|
let ptr = self.translate_virtual_mut(addr);
|
||||||
unsafe {
|
unsafe {
|
||||||
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 8);
|
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 8);
|
||||||
@@ -601,7 +643,7 @@ impl MemoryAccess for GuestMemory {
|
|||||||
if (addr & 0xFFF) >= (PAGE_SIZE - 7) {
|
if (addr & 0xFFF) >= (PAGE_SIZE - 7) {
|
||||||
self.bump_page_version(addr.wrapping_add(7));
|
self.bump_page_version(addr.wrapping_add(7));
|
||||||
}
|
}
|
||||||
self.check_mem_watch(addr, 8, old_lane);
|
if watch { self.check_mem_watch(addr, 8, old_lane); }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -181,10 +181,11 @@ impl App {
|
|||||||
y += line_h;
|
y += line_h;
|
||||||
let (fbw, fbh) = rs.frontbuffer_size();
|
let (fbw, fbh) = rs.frontbuffer_size();
|
||||||
let render_line = format!(
|
let render_line = format!(
|
||||||
"Render: xdispatch: xlated={:>5} interp={:>5} xlated-pipelines={:>3} tex-cache={:>3} fb={}x{}",
|
"Render: xdispatch: xlated={:>5} interp={:>5} xlated-pipelines={:>3} real-geo={:>5} tex-cache={:>3} fb={}x{}",
|
||||||
rs.xenos_dispatches_translator,
|
rs.xenos_dispatches_translator,
|
||||||
rs.xenos_dispatches_interpreter,
|
rs.xenos_dispatches_interpreter,
|
||||||
rs.translated_pipeline_count(),
|
rs.translated_pipeline_count(),
|
||||||
|
rs.real_geometry_draws(),
|
||||||
rs.host_texture_count(),
|
rs.host_texture_count(),
|
||||||
fbw,
|
fbw,
|
||||||
fbh,
|
fbh,
|
||||||
@@ -368,53 +369,28 @@ impl ApplicationHandler<SwapEvent> for App {
|
|||||||
.map(|s| s.frame_index)
|
.map(|s| s.frame_index)
|
||||||
.unwrap_or(0);
|
.unwrap_or(0);
|
||||||
if frame_idx != self.last_xenos_swap_frame {
|
if frame_idx != self.last_xenos_swap_frame {
|
||||||
rs.clear_frontbuffer([0.04, 0.04, 0.06, 1.0]);
|
// iterate-3AE: clear to BLACK, matching canary's
|
||||||
|
// splash background. The old navy `[0.04,0.04,0.06]`
|
||||||
|
// was an iterate-3S debug placeholder never matched
|
||||||
|
// to the guest. The splash background-fill draw is a
|
||||||
|
// full-screen Xbox-360 RectangleList (3 verts → a HW
|
||||||
|
// rectangle covering the whole screen); the UI replay
|
||||||
|
// draws it as a single triangle (the 4th implied
|
||||||
|
// corner isn't synthesized), so only the diagonal
|
||||||
|
// half is covered. With a navy clear the uncovered
|
||||||
|
// half showed a navy diagonal in the brief
|
||||||
|
// pre/inter-logo transition frames (where that fill
|
||||||
|
// is the only coverage). Canary's background there is
|
||||||
|
// black, and the guest's fill itself resolves to
|
||||||
|
// black, so a black clear makes the uncovered half
|
||||||
|
// match — the transition is uniformly black like the
|
||||||
|
// oracle. (Full RectangleList→rectangle expansion is
|
||||||
|
// the deeper fix and a separate follow-up; under a
|
||||||
|
// black clear the half-coverage is invisible.)
|
||||||
|
rs.clear_frontbuffer([0.0, 0.0, 0.0, 1.0]);
|
||||||
self.last_xenos_swap_frame = frame_idx;
|
self.last_xenos_swap_frame = frame_idx;
|
||||||
}
|
}
|
||||||
let delta = (draws_total - already) as u32;
|
let delta = (draws_total - already) as u32;
|
||||||
let (verts_hint, prim_kind, vs_key, ps_key) = self
|
|
||||||
.last_swap_info
|
|
||||||
.map(|s| {
|
|
||||||
(
|
|
||||||
s.last_draw_vertex_count.max(3),
|
|
||||||
s.last_draw_prim,
|
|
||||||
s.vs_blob_key,
|
|
||||||
s.ps_blob_key,
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.unwrap_or((3, 4, 0, 0));
|
|
||||||
// Look up blobs + constants from the bridge and
|
|
||||||
// pack into the WGSL-interpreter layout. Empty
|
|
||||||
// slices produce zero-clause packed buffers — the
|
|
||||||
// WGSL walker short-circuits and the placeholder
|
|
||||||
// export path still renders.
|
|
||||||
let raw_vs: Vec<u32> = self
|
|
||||||
.handles
|
|
||||||
.shader_blobs
|
|
||||||
.lock()
|
|
||||||
.ok()
|
|
||||||
.and_then(|g| g.get(&vs_key).cloned())
|
|
||||||
.unwrap_or_default();
|
|
||||||
let raw_ps: Vec<u32> = self
|
|
||||||
.handles
|
|
||||||
.shader_blobs
|
|
||||||
.lock()
|
|
||||||
.ok()
|
|
||||||
.and_then(|g| g.get(&ps_key).cloned())
|
|
||||||
.unwrap_or_default();
|
|
||||||
let parsed_vs = xenia_gpu::ucode::parse_shader(&raw_vs);
|
|
||||||
let parsed_ps = xenia_gpu::ucode::parse_shader(&raw_ps);
|
|
||||||
// First time we see a blob key, run the static
|
|
||||||
// metrics analyzer. Keyed on (stage_tag, blob_key)
|
|
||||||
// because the guest can reuse a key across stages.
|
|
||||||
if self.seen_shader_blobs.insert((0u8, vs_key)) {
|
|
||||||
xenia_gpu::shader_metrics::emit_for(&parsed_vs, "vs");
|
|
||||||
}
|
|
||||||
if self.seen_shader_blobs.insert((1u8, ps_key)) {
|
|
||||||
xenia_gpu::shader_metrics::emit_for(&parsed_ps, "ps");
|
|
||||||
}
|
|
||||||
let vs_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_vs);
|
|
||||||
let ps_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_ps);
|
|
||||||
let constants = self
|
let constants = self
|
||||||
.handles
|
.handles
|
||||||
.xenos_constants
|
.xenos_constants
|
||||||
@@ -431,6 +407,58 @@ impl ApplicationHandler<SwapEvent> for App {
|
|||||||
.ok()
|
.ok()
|
||||||
.and_then(|g| g.clone());
|
.and_then(|g| g.clone());
|
||||||
rs.bind_primary_texture(tex_payload);
|
rs.bind_primary_texture(tex_payload);
|
||||||
|
|
||||||
|
// iterate-3O real-render slice: prefer replaying the
|
||||||
|
// *real* captured guest geometry. The kernel publishes
|
||||||
|
// one `DrawCapture` per `PM4_DRAW_INDX*` this frame
|
||||||
|
// (real vertices + prim type + shader keys). Fall back
|
||||||
|
// to the legacy synthetic dispatch only when no capture
|
||||||
|
// is available (e.g. capture disabled), so we never
|
||||||
|
// regress to a blank screen.
|
||||||
|
let captures: Vec<xenia_gpu::draw_capture::DrawCapture> = self
|
||||||
|
.handles
|
||||||
|
.geometry
|
||||||
|
.lock()
|
||||||
|
.map(|g| g.clone())
|
||||||
|
.unwrap_or_default();
|
||||||
|
let blobs: std::collections::HashMap<u32, Vec<u32>> = self
|
||||||
|
.handles
|
||||||
|
.shader_blobs
|
||||||
|
.lock()
|
||||||
|
.map(|g| g.clone())
|
||||||
|
.unwrap_or_default();
|
||||||
|
if !captures.is_empty() {
|
||||||
|
rs.dispatch_xenos_captures(
|
||||||
|
&captures,
|
||||||
|
&blobs,
|
||||||
|
&constants,
|
||||||
|
&mut self.seen_shader_blobs,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
// Legacy synthetic-geometry fallback (placeholder).
|
||||||
|
let (verts_hint, prim_kind, vs_key, ps_key) = self
|
||||||
|
.last_swap_info
|
||||||
|
.map(|s| {
|
||||||
|
(
|
||||||
|
s.last_draw_vertex_count.max(3),
|
||||||
|
s.last_draw_prim,
|
||||||
|
s.vs_blob_key,
|
||||||
|
s.ps_blob_key,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.unwrap_or((3, 4, 0, 0));
|
||||||
|
let raw_vs = blobs.get(&vs_key).cloned().unwrap_or_default();
|
||||||
|
let raw_ps = blobs.get(&ps_key).cloned().unwrap_or_default();
|
||||||
|
let parsed_vs = xenia_gpu::ucode::parse_shader(&raw_vs);
|
||||||
|
let parsed_ps = xenia_gpu::ucode::parse_shader(&raw_ps);
|
||||||
|
if self.seen_shader_blobs.insert((0u8, vs_key)) {
|
||||||
|
xenia_gpu::shader_metrics::emit_for(&parsed_vs, "vs");
|
||||||
|
}
|
||||||
|
if self.seen_shader_blobs.insert((1u8, ps_key)) {
|
||||||
|
xenia_gpu::shader_metrics::emit_for(&parsed_ps, "ps");
|
||||||
|
}
|
||||||
|
let vs_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_vs);
|
||||||
|
let ps_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_ps);
|
||||||
rs.dispatch_xenos_draws(
|
rs.dispatch_xenos_draws(
|
||||||
already,
|
already,
|
||||||
delta,
|
delta,
|
||||||
@@ -445,6 +473,7 @@ impl ApplicationHandler<SwapEvent> for App {
|
|||||||
&constants,
|
&constants,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
Self::ingest_frontbuffer(
|
Self::ingest_frontbuffer(
|
||||||
&self.handles,
|
&self.handles,
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ use std::sync::Mutex;
|
|||||||
|
|
||||||
use crossbeam_utils::atomic::AtomicCell;
|
use crossbeam_utils::atomic::AtomicCell;
|
||||||
use winit::event_loop::EventLoopProxy;
|
use winit::event_loop::EventLoopProxy;
|
||||||
|
use xenia_gpu::draw_capture::DrawCapture;
|
||||||
use xenia_gpu::texture_cache::TextureKey;
|
use xenia_gpu::texture_cache::TextureKey;
|
||||||
use xenia_gpu::xenos_constants::XenosConstantsBlock;
|
use xenia_gpu::xenos_constants::XenosConstantsBlock;
|
||||||
use xenia_hid::GamepadState;
|
use xenia_hid::GamepadState;
|
||||||
@@ -66,6 +67,10 @@ pub struct UiHandles {
|
|||||||
/// fetch-constant slot 0 into linear bytes that the UI should
|
/// fetch-constant slot 0 into linear bytes that the UI should
|
||||||
/// upload into the host cache and bind at `@group(1) @binding(0)`.
|
/// upload into the host cache and bind at `@group(1) @binding(0)`.
|
||||||
pub primary_texture: Arc<Mutex<Option<(TextureKey, Vec<u8>)>>>,
|
pub primary_texture: Arc<Mutex<Option<(TextureKey, Vec<u8>)>>>,
|
||||||
|
/// iterate-3O: the most recent frame's captured per-draw geometry. The
|
||||||
|
/// redraw path drains this to replay real guest draws. Replaced wholesale
|
||||||
|
/// each `VdSwap`.
|
||||||
|
pub geometry: Arc<Mutex<Vec<DrawCapture>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Swap event posted by the CPU-side `VdSwap` handler via
|
/// Swap event posted by the CPU-side `VdSwap` handler via
|
||||||
@@ -89,6 +94,7 @@ pub fn build(proxy: EventLoopProxy<SwapEvent>) -> (UiHandles, UiBridge) {
|
|||||||
let xenos_constants = Arc::new(Mutex::new(XenosConstantsBlock::default()));
|
let xenos_constants = Arc::new(Mutex::new(XenosConstantsBlock::default()));
|
||||||
let primary_texture: Arc<Mutex<Option<(TextureKey, Vec<u8>)>>> =
|
let primary_texture: Arc<Mutex<Option<(TextureKey, Vec<u8>)>>> =
|
||||||
Arc::new(Mutex::new(None));
|
Arc::new(Mutex::new(None));
|
||||||
|
let geometry: Arc<Mutex<Vec<DrawCapture>>> = Arc::new(Mutex::new(Vec::new()));
|
||||||
|
|
||||||
let kernel_bridge = UiBridge {
|
let kernel_bridge = UiBridge {
|
||||||
gamepad: {
|
gamepad: {
|
||||||
@@ -144,6 +150,14 @@ pub fn build(proxy: EventLoopProxy<SwapEvent>) -> (UiHandles, UiBridge) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
|
publish_geometry: {
|
||||||
|
let geo = Arc::clone(&geometry);
|
||||||
|
Arc::new(move |caps| {
|
||||||
|
if let Ok(mut lock) = geo.lock() {
|
||||||
|
*lock = caps;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
let handles = UiHandles {
|
let handles = UiHandles {
|
||||||
@@ -155,6 +169,7 @@ pub fn build(proxy: EventLoopProxy<SwapEvent>) -> (UiHandles, UiBridge) {
|
|||||||
shader_blobs,
|
shader_blobs,
|
||||||
xenos_constants,
|
xenos_constants,
|
||||||
primary_texture,
|
primary_texture,
|
||||||
|
geometry,
|
||||||
};
|
};
|
||||||
(handles, kernel_bridge)
|
(handles, kernel_bridge)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -84,6 +84,9 @@ pub struct RenderState {
|
|||||||
/// the shader, or (c) we're running the slow interpreter path.
|
/// the shader, or (c) we're running the slow interpreter path.
|
||||||
pub xenos_dispatches_translator: u64,
|
pub xenos_dispatches_translator: u64,
|
||||||
pub xenos_dispatches_interpreter: u64,
|
pub xenos_dispatches_interpreter: u64,
|
||||||
|
/// iterate-3O: running total of replayed draws that carried a real guest
|
||||||
|
/// vertex window (vs. the procedural fallback). Surfaced on the HUD.
|
||||||
|
real_geometry_draws: u64,
|
||||||
/// One-shot latch so we emit a tracing::info! on the **first** real
|
/// One-shot latch so we emit a tracing::info! on the **first** real
|
||||||
/// draw dispatch rather than spamming every frame. Pairs with the
|
/// draw dispatch rather than spamming every frame. Pairs with the
|
||||||
/// "first translator compile" latch below.
|
/// "first translator compile" latch below.
|
||||||
@@ -447,6 +450,7 @@ impl RenderState {
|
|||||||
fallback_rgb: [0.06, 0.06, 0.09],
|
fallback_rgb: [0.06, 0.06, 0.09],
|
||||||
xenos_pipeline,
|
xenos_pipeline,
|
||||||
xenos_draws_rendered: 0,
|
xenos_draws_rendered: 0,
|
||||||
|
real_geometry_draws: 0,
|
||||||
xenos_dispatches_translator: 0,
|
xenos_dispatches_translator: 0,
|
||||||
xenos_dispatches_interpreter: 0,
|
xenos_dispatches_interpreter: 0,
|
||||||
first_dispatch_logged: false,
|
first_dispatch_logged: false,
|
||||||
@@ -657,26 +661,39 @@ impl RenderState {
|
|||||||
draw_index: idx,
|
draw_index: idx,
|
||||||
vertex_count: vertex_count_hint.max(3),
|
vertex_count: vertex_count_hint.max(3),
|
||||||
prim_kind,
|
prim_kind,
|
||||||
|
// Synthetic fallback path: no real vertex window.
|
||||||
|
vertex_base_dwords: 0,
|
||||||
|
// No real geometry → no NDC transform (procedural positions are
|
||||||
|
// already in clip space).
|
||||||
|
ndc_scale: [0.0, 0.0],
|
||||||
|
ndc_offset: [0.0, 0.0],
|
||||||
};
|
};
|
||||||
|
// Synthetic visualizer path (legacy): no captured render state, so
|
||||||
|
// use the opaque default.
|
||||||
|
let rstate = crate::xenos_pipeline::RenderState::OPAQUE;
|
||||||
if use_translated
|
if use_translated
|
||||||
&& let Some(p) = self.xenos_pipeline.translated_pipeline(vs_key, ps_key) {
|
&& self.xenos_pipeline.render_one_translated(
|
||||||
self.xenos_pipeline.render_one_with_pipeline(
|
&self.device,
|
||||||
&self.queue,
|
&self.queue,
|
||||||
&mut encoder,
|
&mut encoder,
|
||||||
&self.frontbuffer_view,
|
&self.frontbuffer_view,
|
||||||
req,
|
req,
|
||||||
p,
|
vs_key,
|
||||||
);
|
ps_key,
|
||||||
metrics::counter!("gpu.shader.use", "path" => "translator")
|
rstate,
|
||||||
.increment(1);
|
)
|
||||||
|
{
|
||||||
|
metrics::counter!("gpu.shader.use", "path" => "translator").increment(1);
|
||||||
served_translator += 1;
|
served_translator += 1;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
self.xenos_pipeline.render_one(
|
self.xenos_pipeline.render_one(
|
||||||
|
&self.device,
|
||||||
&self.queue,
|
&self.queue,
|
||||||
&mut encoder,
|
&mut encoder,
|
||||||
&self.frontbuffer_view,
|
&self.frontbuffer_view,
|
||||||
req,
|
req,
|
||||||
|
rstate,
|
||||||
);
|
);
|
||||||
metrics::counter!("gpu.shader.use", "path" => "interpreter").increment(1);
|
metrics::counter!("gpu.shader.use", "path" => "interpreter").increment(1);
|
||||||
served_interpreter += 1;
|
served_interpreter += 1;
|
||||||
@@ -707,12 +724,201 @@ impl RenderState {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// iterate-3O real-render slice: replay a batch of *real* captured guest
|
||||||
|
/// draws. Unlike [`dispatch_xenos_draws`] (synthetic placeholder geometry),
|
||||||
|
/// each [`DrawCapture`] carries the actual guest vertex window, primitive
|
||||||
|
/// type, host vertex count, and the real (vs, ps) keys. Per capture we:
|
||||||
|
/// 1. upload the captured guest vertex bytes into `vertex_buffer` (b4),
|
||||||
|
/// 2. upload the matching VS/PS microcode + per-frame constants,
|
||||||
|
/// 3. render through the translated (P7) pipeline if it compiled, else
|
||||||
|
/// the interpreter — with `vertex_base_dwords` set so the shader
|
||||||
|
/// rebases its absolute fetch address into the uploaded window.
|
||||||
|
///
|
||||||
|
/// Returns the number of captures that had a real vertex window (vs. the
|
||||||
|
/// procedural fallback), for HUD reporting. `shader_blobs` / `constants`
|
||||||
|
/// come from the bridge; `seen` records which blobs have had static
|
||||||
|
/// metrics emitted (one-shot per blob, matching the legacy path).
|
||||||
|
pub fn dispatch_xenos_captures(
|
||||||
|
&mut self,
|
||||||
|
captures: &[xenia_gpu::draw_capture::DrawCapture],
|
||||||
|
shader_blobs: &std::collections::HashMap<u32, Vec<u32>>,
|
||||||
|
constants: &xenia_gpu::xenos_constants::XenosConstantsBlock,
|
||||||
|
seen: &mut std::collections::HashSet<(u8, u32)>,
|
||||||
|
) -> u32 {
|
||||||
|
if captures.is_empty() {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
let mut real_count = 0u32;
|
||||||
|
// iterate-3X (GPUBUG-111): each captured draw uploads its OWN vertex
|
||||||
|
// window + per-draw constants + shader via `queue.write_buffer`. In
|
||||||
|
// wgpu all `write_buffer` calls staged before a single `queue.submit`
|
||||||
|
// are applied *before any* command in that submit executes — so a single
|
||||||
|
// encoder for the whole batch made every draw read only the LAST draw's
|
||||||
|
// vertex buffer / uniforms (the splash logo quad sampled the fullscreen
|
||||||
|
// background quad's vertices → nothing rendered where the logo was).
|
||||||
|
// Submit ONE encoder PER draw so each draw's writes land before its own
|
||||||
|
// pass. The frontbuffer uses `LoadOp::Load`, so per-draw submits still
|
||||||
|
// composite over each other exactly like before.
|
||||||
|
for cap in captures {
|
||||||
|
// iterate-3T: bind this draw's REAL decoded texture (keyed off the
|
||||||
|
// active PS's tfetch slot, attached in `gpu_system`) so the textured
|
||||||
|
// logo samples the artwork. `None` reverts to the magenta stub for
|
||||||
|
// flat draws. Each `set_texture_view` rebuilds the tex bind group;
|
||||||
|
// the subsequent `render_one*` reads it, so per-draw binding works
|
||||||
|
// even though all draws share one encoder.
|
||||||
|
{
|
||||||
|
let Self {
|
||||||
|
device,
|
||||||
|
queue,
|
||||||
|
xenos_pipeline,
|
||||||
|
host_texture_cache,
|
||||||
|
..
|
||||||
|
} = self;
|
||||||
|
match cap.textures.first() {
|
||||||
|
Some((key, version, bytes)) => {
|
||||||
|
// iterate-3AD: use the decoder's real content `version`
|
||||||
|
// (from `span_max_version`) so the host cache re-uploads
|
||||||
|
// when the guest fills MORE of an evolving atlas. The
|
||||||
|
// publisher and the 2nd splash logo share one K8888
|
||||||
|
// surface (base 0x4dbee000); the 2nd logo's texels land
|
||||||
|
// AFTER the first upload. With the old hardcoded
|
||||||
|
// `version_when_uploaded = 1`, the same `TextureKey`
|
||||||
|
// never re-uploaded, so the 2nd logo sampled its (then
|
||||||
|
// still-zero) atlas region as black. The real version
|
||||||
|
// increases as the guest writes, triggering re-upload.
|
||||||
|
let cached = xenia_gpu::texture_cache::CachedTexture {
|
||||||
|
key: *key,
|
||||||
|
version_when_uploaded: *version,
|
||||||
|
bytes: bytes.clone(),
|
||||||
|
};
|
||||||
|
host_texture_cache.upload(device, queue, &cached);
|
||||||
|
if let Some(view) = host_texture_cache.view_for(key) {
|
||||||
|
xenos_pipeline.set_texture_view(device, Some(view));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => xenos_pipeline.set_texture_view(device, None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let raw_vs = shader_blobs.get(&cap.vs_key).cloned().unwrap_or_default();
|
||||||
|
let raw_ps = shader_blobs.get(&cap.ps_key).cloned().unwrap_or_default();
|
||||||
|
let parsed_vs = xenia_gpu::ucode::parse_shader(&raw_vs);
|
||||||
|
let parsed_ps = xenia_gpu::ucode::parse_shader(&raw_ps);
|
||||||
|
if seen.insert((0u8, cap.vs_key)) {
|
||||||
|
xenia_gpu::shader_metrics::emit_for(&parsed_vs, "vs");
|
||||||
|
}
|
||||||
|
if seen.insert((1u8, cap.ps_key)) {
|
||||||
|
xenia_gpu::shader_metrics::emit_for(&parsed_ps, "ps");
|
||||||
|
}
|
||||||
|
let vs_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_vs);
|
||||||
|
let ps_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_ps);
|
||||||
|
// Upload this draw's shader + constants + real vertex window.
|
||||||
|
self.xenos_pipeline.upload_shader_and_constants(
|
||||||
|
&self.queue,
|
||||||
|
&vs_packed,
|
||||||
|
&ps_packed,
|
||||||
|
constants,
|
||||||
|
);
|
||||||
|
if cap.has_real_vertices && !cap.vertex_dwords.is_empty() {
|
||||||
|
self.xenos_pipeline
|
||||||
|
.upload_vertex_data(&self.queue, &cap.vertex_dwords);
|
||||||
|
real_count += 1;
|
||||||
|
}
|
||||||
|
let use_translated = cap.vs_key != 0
|
||||||
|
&& cap.ps_key != 0
|
||||||
|
&& ensure_translated_pipeline(
|
||||||
|
&mut self.xenos_pipeline,
|
||||||
|
&self.device,
|
||||||
|
cap.vs_key,
|
||||||
|
cap.ps_key,
|
||||||
|
&parsed_vs,
|
||||||
|
&parsed_ps,
|
||||||
|
);
|
||||||
|
let base = if cap.has_real_vertices {
|
||||||
|
cap.window_base_dwords
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
};
|
||||||
|
let req = DrawRequest {
|
||||||
|
draw_index: cap.draw_index,
|
||||||
|
vertex_count: cap.host_vertex_count.max(3),
|
||||||
|
prim_kind: cap.prim_code,
|
||||||
|
vertex_base_dwords: base,
|
||||||
|
// iterate-3S: apply the per-draw guest viewport → host NDC
|
||||||
|
// transform only when we have real geometry (otherwise the
|
||||||
|
// procedural fallback already emits clip-space positions).
|
||||||
|
ndc_scale: if cap.has_real_vertices { cap.ndc_scale } else { [0.0, 0.0] },
|
||||||
|
ndc_offset: if cap.has_real_vertices { cap.ndc_offset } else { [0.0, 0.0] },
|
||||||
|
};
|
||||||
|
// iterate-3Y: replay this draw's real color/blend/write-mask state
|
||||||
|
// (captured from `RB_BLENDCONTROL0` / `RB_COLOR_MASK`) so overlays
|
||||||
|
// composite the way the guest intends instead of opaquely
|
||||||
|
// overwriting the logo.
|
||||||
|
let rstate = crate::xenos_pipeline::RenderState {
|
||||||
|
blend_control: cap.blend_control,
|
||||||
|
color_mask: cap.color_mask,
|
||||||
|
};
|
||||||
|
let mut encoder = self
|
||||||
|
.device
|
||||||
|
.create_command_encoder(&wgpu::CommandEncoderDescriptor {
|
||||||
|
label: Some("xenos capture replay (per-draw)"),
|
||||||
|
});
|
||||||
|
let served_translated = use_translated
|
||||||
|
&& self.xenos_pipeline.render_one_translated(
|
||||||
|
&self.device,
|
||||||
|
&self.queue,
|
||||||
|
&mut encoder,
|
||||||
|
&self.frontbuffer_view,
|
||||||
|
req,
|
||||||
|
cap.vs_key,
|
||||||
|
cap.ps_key,
|
||||||
|
rstate,
|
||||||
|
);
|
||||||
|
if served_translated {
|
||||||
|
self.xenos_dispatches_translator =
|
||||||
|
self.xenos_dispatches_translator.saturating_add(1);
|
||||||
|
} else {
|
||||||
|
self.xenos_pipeline.render_one(
|
||||||
|
&self.device,
|
||||||
|
&self.queue,
|
||||||
|
&mut encoder,
|
||||||
|
&self.frontbuffer_view,
|
||||||
|
req,
|
||||||
|
rstate,
|
||||||
|
);
|
||||||
|
self.xenos_dispatches_interpreter =
|
||||||
|
self.xenos_dispatches_interpreter.saturating_add(1);
|
||||||
|
}
|
||||||
|
self.queue.submit(std::iter::once(encoder.finish()));
|
||||||
|
}
|
||||||
|
self.xenos_draws_rendered = self
|
||||||
|
.xenos_draws_rendered
|
||||||
|
.saturating_add(captures.len() as u64);
|
||||||
|
self.real_geometry_draws = self
|
||||||
|
.real_geometry_draws
|
||||||
|
.saturating_add(real_count as u64);
|
||||||
|
if !self.first_dispatch_logged {
|
||||||
|
self.first_dispatch_logged = true;
|
||||||
|
tracing::info!(
|
||||||
|
captures = captures.len(),
|
||||||
|
real_vertex_draws = real_count,
|
||||||
|
"first Xenos capture batch replayed (real geometry)"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
real_count
|
||||||
|
}
|
||||||
|
|
||||||
/// Count of distinct translator pipelines compiled so far. Surfaced
|
/// Count of distinct translator pipelines compiled so far. Surfaced
|
||||||
/// on the HUD as `xlated=N` to make "is P7 working?" observable.
|
/// on the HUD as `xlated=N` to make "is P7 working?" observable.
|
||||||
pub fn translated_pipeline_count(&self) -> usize {
|
pub fn translated_pipeline_count(&self) -> usize {
|
||||||
self.xenos_pipeline.translated_pipeline_count()
|
self.xenos_pipeline.translated_pipeline_count()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Running count of captured draws that carried a real vertex window
|
||||||
|
/// (surfaced on the HUD). Updated by [`dispatch_xenos_captures`].
|
||||||
|
pub fn real_geometry_draws(&self) -> u64 {
|
||||||
|
self.real_geometry_draws
|
||||||
|
}
|
||||||
|
|
||||||
/// Clear the frontbuffer to `[r,g,b,a]` in linear space. Matches the
|
/// Clear the frontbuffer to `[r,g,b,a]` in linear space. Matches the
|
||||||
/// fallback clear the outer swapchain render does so the two stages
|
/// fallback clear the outer swapchain render does so the two stages
|
||||||
/// agree on "no draws yet = dark navy".
|
/// agree on "no draws yet = dark navy".
|
||||||
|
|||||||
@@ -36,7 +36,142 @@ struct DrawConstants {
|
|||||||
draw_index: u32,
|
draw_index: u32,
|
||||||
vertex_count: u32,
|
vertex_count: u32,
|
||||||
prim_kind: u32,
|
prim_kind: u32,
|
||||||
_pad: u32,
|
/// iterate-3O: guest dword base of the uploaded `vertex_buffer` window.
|
||||||
|
/// The WGSL subtracts this from the absolute vertex-fetch address.
|
||||||
|
vertex_base_dwords: u32,
|
||||||
|
/// iterate-3S: guest→host NDC XY transform (mirrors canary
|
||||||
|
/// `GetHostViewportInfo`). `clip.xy = pos.xy * ndc_scale + ndc_offset*pos.w`.
|
||||||
|
/// Y is pre-flipped for wgpu. 16 bytes so the block stays 16-byte aligned.
|
||||||
|
ndc_scale: [f32; 2],
|
||||||
|
ndc_offset: [f32; 2],
|
||||||
|
}
|
||||||
|
|
||||||
|
/// iterate-3Y: the per-draw host color/blend/write-mask render state, decoded
|
||||||
|
/// from the guest registers (`RB_BLENDCONTROL0` / `RB_COLOR_MASK`). Used both
|
||||||
|
/// as part of the pipeline-cache key and to build the `wgpu::ColorTargetState`.
|
||||||
|
/// Mirrors canary's `GetColorBlendStateForRenderTarget` (D3D12
|
||||||
|
/// `pipeline_cache.cc`): the factors come straight from `RB_BLENDCONTROL`,
|
||||||
|
/// and a zero write-mask forces the no-blend `One,Zero` equation.
|
||||||
|
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
|
||||||
|
pub struct RenderState {
|
||||||
|
/// `RB_BLENDCONTROL0` raw value (RT0). `0x00010001` (One,Zero / One,Zero,
|
||||||
|
/// Add) is the opaque case.
|
||||||
|
pub blend_control: u32,
|
||||||
|
/// RT0 nibble of `RB_COLOR_MASK` (bit0=R … bit3=A). 0 = write nothing.
|
||||||
|
pub color_mask: u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RenderState {
|
||||||
|
/// Fully-opaque, all-channels state (the legacy fixed behaviour). Used for
|
||||||
|
/// procedural/synthetic draws that have no captured guest state.
|
||||||
|
pub const OPAQUE: RenderState = RenderState {
|
||||||
|
blend_control: 0x0001_0001,
|
||||||
|
color_mask: 0xF,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Map a Xenos `BlendFactor` (5-bit field) to a wgpu `BlendFactor`,
|
||||||
|
/// mirroring canary `kBlendFactorMap` (D3D12 `pipeline_cache.cc:1504`).
|
||||||
|
fn map_factor(f: u32) -> wgpu::BlendFactor {
|
||||||
|
match f {
|
||||||
|
0 => wgpu::BlendFactor::Zero,
|
||||||
|
1 => wgpu::BlendFactor::One,
|
||||||
|
4 => wgpu::BlendFactor::Src,
|
||||||
|
5 => wgpu::BlendFactor::OneMinusSrc,
|
||||||
|
6 => wgpu::BlendFactor::SrcAlpha,
|
||||||
|
7 => wgpu::BlendFactor::OneMinusSrcAlpha,
|
||||||
|
8 => wgpu::BlendFactor::Dst,
|
||||||
|
9 => wgpu::BlendFactor::OneMinusDst,
|
||||||
|
10 => wgpu::BlendFactor::DstAlpha,
|
||||||
|
11 => wgpu::BlendFactor::OneMinusDstAlpha,
|
||||||
|
12 => wgpu::BlendFactor::Constant,
|
||||||
|
13 => wgpu::BlendFactor::OneMinusConstant,
|
||||||
|
14 => wgpu::BlendFactor::Constant,
|
||||||
|
15 => wgpu::BlendFactor::OneMinusConstant,
|
||||||
|
16 => wgpu::BlendFactor::SrcAlphaSaturated,
|
||||||
|
// 2/3 and >16 are undefined on Xenos; canary maps to Zero.
|
||||||
|
_ => wgpu::BlendFactor::Zero,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Map a Xenos `BlendFactor` for the *alpha* channel, mirroring canary
|
||||||
|
/// `kBlendFactorAlphaMap` (color-mode factors collapse to alpha).
|
||||||
|
fn map_factor_alpha(f: u32) -> wgpu::BlendFactor {
|
||||||
|
match f {
|
||||||
|
4 => wgpu::BlendFactor::SrcAlpha,
|
||||||
|
5 => wgpu::BlendFactor::OneMinusSrcAlpha,
|
||||||
|
8 => wgpu::BlendFactor::DstAlpha,
|
||||||
|
9 => wgpu::BlendFactor::OneMinusDstAlpha,
|
||||||
|
other => Self::map_factor(other),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn map_op(o: u32) -> wgpu::BlendOperation {
|
||||||
|
match o {
|
||||||
|
0 => wgpu::BlendOperation::Add,
|
||||||
|
1 => wgpu::BlendOperation::Subtract,
|
||||||
|
2 => wgpu::BlendOperation::Min,
|
||||||
|
3 => wgpu::BlendOperation::Max,
|
||||||
|
4 => wgpu::BlendOperation::ReverseSubtract,
|
||||||
|
_ => wgpu::BlendOperation::Add,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build the `wgpu::ColorTargetState` for this draw.
|
||||||
|
fn color_target(&self, format: wgpu::TextureFormat) -> wgpu::ColorTargetState {
|
||||||
|
let bc = self.blend_control;
|
||||||
|
let color_src = bc & 0x1F;
|
||||||
|
let color_op = (bc >> 5) & 0x7;
|
||||||
|
let color_dst = (bc >> 8) & 0x1F;
|
||||||
|
let alpha_src = (bc >> 16) & 0x1F;
|
||||||
|
let alpha_op = (bc >> 21) & 0x7;
|
||||||
|
let alpha_dst = (bc >> 24) & 0x1F;
|
||||||
|
|
||||||
|
// wgpu requires `blend: None` when nothing would be written; also the
|
||||||
|
// `One,Zero,Add` identity is the opaque case (canary's no-blend), which
|
||||||
|
// we express as `blend: None` so it's a plain overwrite.
|
||||||
|
let is_opaque = color_src == 1
|
||||||
|
&& color_dst == 0
|
||||||
|
&& color_op == 0
|
||||||
|
&& alpha_src == 1
|
||||||
|
&& alpha_dst == 0
|
||||||
|
&& alpha_op == 0;
|
||||||
|
let blend = if is_opaque {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(wgpu::BlendState {
|
||||||
|
color: wgpu::BlendComponent {
|
||||||
|
src_factor: Self::map_factor(color_src),
|
||||||
|
dst_factor: Self::map_factor(color_dst),
|
||||||
|
operation: Self::map_op(color_op),
|
||||||
|
},
|
||||||
|
alpha: wgpu::BlendComponent {
|
||||||
|
src_factor: Self::map_factor_alpha(alpha_src),
|
||||||
|
dst_factor: Self::map_factor_alpha(alpha_dst),
|
||||||
|
operation: Self::map_op(alpha_op),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut write_mask = wgpu::ColorWrites::empty();
|
||||||
|
if self.color_mask & 0x1 != 0 {
|
||||||
|
write_mask |= wgpu::ColorWrites::RED;
|
||||||
|
}
|
||||||
|
if self.color_mask & 0x2 != 0 {
|
||||||
|
write_mask |= wgpu::ColorWrites::GREEN;
|
||||||
|
}
|
||||||
|
if self.color_mask & 0x4 != 0 {
|
||||||
|
write_mask |= wgpu::ColorWrites::BLUE;
|
||||||
|
}
|
||||||
|
if self.color_mask & 0x8 != 0 {
|
||||||
|
write_mask |= wgpu::ColorWrites::ALPHA;
|
||||||
|
}
|
||||||
|
|
||||||
|
wgpu::ColorTargetState {
|
||||||
|
format,
|
||||||
|
blend,
|
||||||
|
write_mask,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Submitted to [`XenosPipeline::render_one`] to render one captured draw.
|
/// Submitted to [`XenosPipeline::render_one`] to render one captured draw.
|
||||||
@@ -48,6 +183,13 @@ pub struct DrawRequest {
|
|||||||
pub vertex_count: u32,
|
pub vertex_count: u32,
|
||||||
/// Xenos primitive-type code; shader may branch on it in P3b+.
|
/// Xenos primitive-type code; shader may branch on it in P3b+.
|
||||||
pub prim_kind: u32,
|
pub prim_kind: u32,
|
||||||
|
/// iterate-3O: guest dword base of the per-draw vertex window uploaded to
|
||||||
|
/// `vertex_buffer` (b4). 0 = no real vertex window (procedural fallback).
|
||||||
|
pub vertex_base_dwords: u32,
|
||||||
|
/// iterate-3S: guest→host NDC XY transform (Y pre-flipped). When all-zero
|
||||||
|
/// the shader leaves the position untransformed (procedural fallback).
|
||||||
|
pub ndc_scale: [f32; 2],
|
||||||
|
pub ndc_offset: [f32; 2],
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Reasonable upper bound on a single shader blob (dwords). Most Xbox 360
|
/// Reasonable upper bound on a single shader blob (dwords). Most Xbox 360
|
||||||
@@ -57,7 +199,16 @@ const UCODE_BUFFER_MAX_DWORDS: u64 = 16 * 1024; // 64 KB each for VS & PS
|
|||||||
const VERTEX_BUFFER_MAX_BYTES: u64 = 16 * 1024 * 1024;
|
const VERTEX_BUFFER_MAX_BYTES: u64 = 16 * 1024 * 1024;
|
||||||
|
|
||||||
pub struct XenosPipeline {
|
pub struct XenosPipeline {
|
||||||
|
/// Interpreter pipeline with the legacy fixed (alpha-blend) state. Kept as
|
||||||
|
/// the default; per-state variants are built lazily in `interp_cache`.
|
||||||
pipeline: wgpu::RenderPipeline,
|
pipeline: wgpu::RenderPipeline,
|
||||||
|
/// iterate-3Y: the interpreter WGSL module, retained so per-render-state
|
||||||
|
/// interpreter pipelines can be compiled on demand.
|
||||||
|
interp_shader: wgpu::ShaderModule,
|
||||||
|
/// iterate-3Y: interpreter pipelines keyed on the per-draw `RenderState`
|
||||||
|
/// (blend + write mask), so flat/alpha/opaque draws composite correctly
|
||||||
|
/// even when their (vs,ps) didn't translate.
|
||||||
|
interp_cache: std::collections::HashMap<RenderState, wgpu::RenderPipeline>,
|
||||||
draw_ctx_buffer: wgpu::Buffer,
|
draw_ctx_buffer: wgpu::Buffer,
|
||||||
constants_buffer: wgpu::Buffer,
|
constants_buffer: wgpu::Buffer,
|
||||||
vs_ucode_buffer: wgpu::Buffer,
|
vs_ucode_buffer: wgpu::Buffer,
|
||||||
@@ -78,7 +229,12 @@ pub struct XenosPipeline {
|
|||||||
/// so every (vs, ps) pair gets compiled once and re-used for every
|
/// so every (vs, ps) pair gets compiled once and re-used for every
|
||||||
/// subsequent draw. Interpreter pipeline remains the fallback.
|
/// subsequent draw. Interpreter pipeline remains the fallback.
|
||||||
pipeline_layout: wgpu::PipelineLayout,
|
pipeline_layout: wgpu::PipelineLayout,
|
||||||
translated_cache: std::collections::HashMap<(u32, u32), wgpu::RenderPipeline>,
|
/// iterate-3Y: cached translator pipelines keyed on the shader pair AND the
|
||||||
|
/// per-draw render state, so the same (vs,ps) with different blend/mask
|
||||||
|
/// composites correctly. The translated WGSL module is itself cached per
|
||||||
|
/// (vs,ps) so re-translation only happens once.
|
||||||
|
translated_cache: std::collections::HashMap<(u32, u32, RenderState), wgpu::RenderPipeline>,
|
||||||
|
translated_modules: std::collections::HashMap<(u32, u32), wgpu::ShaderModule>,
|
||||||
pub target_format: wgpu::TextureFormat,
|
pub target_format: wgpu::TextureFormat,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -193,7 +349,9 @@ impl XenosPipeline {
|
|||||||
draw_index: 0,
|
draw_index: 0,
|
||||||
vertex_count: 3,
|
vertex_count: 3,
|
||||||
prim_kind: 4,
|
prim_kind: 4,
|
||||||
_pad: 0,
|
vertex_base_dwords: 0,
|
||||||
|
ndc_scale: [0.0, 0.0],
|
||||||
|
ndc_offset: [0.0, 0.0],
|
||||||
};
|
};
|
||||||
let draw_ctx_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
|
let draw_ctx_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
|
||||||
label: Some("xenos draw ctx"),
|
label: Some("xenos draw ctx"),
|
||||||
@@ -242,8 +400,13 @@ impl XenosPipeline {
|
|||||||
usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST,
|
usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST,
|
||||||
view_formats: &[],
|
view_formats: &[],
|
||||||
});
|
});
|
||||||
// Magenta (255, 0, 255, 255) so a missing-texture read visibly stands
|
// iterate-3Y: transparent black (0,0,0,0). When a textured draw's
|
||||||
// out on-screen when the interpreter does sample it.
|
// real texture can't be resolved (e.g. its sampler slot is shadowed by
|
||||||
|
// a vertex-fetch constant), sampling a *transparent* texel makes the
|
||||||
|
// draw a no-op under its real premultiplied-alpha blend — instead of
|
||||||
|
// fabricating an opaque magenta that overpaints everything (the old
|
||||||
|
// debug stub). This removes a fake rather than adding one: we never
|
||||||
|
// invent visible pixels for an unresolved texture.
|
||||||
queue.write_texture(
|
queue.write_texture(
|
||||||
wgpu::ImageCopyTexture {
|
wgpu::ImageCopyTexture {
|
||||||
texture: &dummy_tex,
|
texture: &dummy_tex,
|
||||||
@@ -251,7 +414,7 @@ impl XenosPipeline {
|
|||||||
origin: wgpu::Origin3d::ZERO,
|
origin: wgpu::Origin3d::ZERO,
|
||||||
aspect: wgpu::TextureAspect::All,
|
aspect: wgpu::TextureAspect::All,
|
||||||
},
|
},
|
||||||
&[0xFFu8, 0x00, 0xFF, 0xFF],
|
&[0x00u8, 0x00, 0x00, 0x00],
|
||||||
wgpu::ImageDataLayout {
|
wgpu::ImageDataLayout {
|
||||||
offset: 0,
|
offset: 0,
|
||||||
bytes_per_row: Some(4),
|
bytes_per_row: Some(4),
|
||||||
@@ -359,6 +522,8 @@ impl XenosPipeline {
|
|||||||
|
|
||||||
Self {
|
Self {
|
||||||
pipeline,
|
pipeline,
|
||||||
|
interp_shader: shader,
|
||||||
|
interp_cache: std::collections::HashMap::new(),
|
||||||
draw_ctx_buffer,
|
draw_ctx_buffer,
|
||||||
constants_buffer,
|
constants_buffer,
|
||||||
vs_ucode_buffer,
|
vs_ucode_buffer,
|
||||||
@@ -371,31 +536,22 @@ impl XenosPipeline {
|
|||||||
dummy_view,
|
dummy_view,
|
||||||
pipeline_layout: layout,
|
pipeline_layout: layout,
|
||||||
translated_cache: std::collections::HashMap::new(),
|
translated_cache: std::collections::HashMap::new(),
|
||||||
|
translated_modules: std::collections::HashMap::new(),
|
||||||
target_format,
|
target_format,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// P7 — does the translator cache already have a pipeline for this
|
/// P7 — has the translator already produced a WGSL *module* for this
|
||||||
/// (vs, ps) pair?
|
/// (vs, ps) pair? (A per-render-state pipeline may still need building.)
|
||||||
pub fn has_translated(&self, vs_blob_key: u32, ps_blob_key: u32) -> bool {
|
pub fn has_translated(&self, vs_blob_key: u32, ps_blob_key: u32) -> bool {
|
||||||
self.translated_cache
|
self.translated_modules
|
||||||
.contains_key(&(vs_blob_key, ps_blob_key))
|
.contains_key(&(vs_blob_key, ps_blob_key))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// P7 — fetch a cached translator pipeline. `None` if not yet built.
|
/// P7 — compile a translator-produced WGSL module and cache it keyed on
|
||||||
pub fn translated_pipeline(
|
/// `(vs_blob_key, ps_blob_key)`. The actual `RenderPipeline` (which also
|
||||||
&self,
|
/// depends on the per-draw blend/mask state) is built lazily by
|
||||||
vs_blob_key: u32,
|
/// [`render_one_translated`]. Returns `true` on success.
|
||||||
ps_blob_key: u32,
|
|
||||||
) -> Option<&wgpu::RenderPipeline> {
|
|
||||||
self.translated_cache
|
|
||||||
.get(&(vs_blob_key, ps_blob_key))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// P7 — compile a translator-produced WGSL module into a
|
|
||||||
/// `wgpu::RenderPipeline` and insert it into the cache keyed on
|
|
||||||
/// `(vs_blob_key, ps_blob_key)`. Returns `true` on success. Duplicate
|
|
||||||
/// inserts are no-ops. Emits `gpu.shader.compile_ok` on success.
|
|
||||||
pub fn insert_translated(
|
pub fn insert_translated(
|
||||||
&mut self,
|
&mut self,
|
||||||
device: &wgpu::Device,
|
device: &wgpu::Device,
|
||||||
@@ -404,7 +560,7 @@ impl XenosPipeline {
|
|||||||
wgsl: &str,
|
wgsl: &str,
|
||||||
) -> bool {
|
) -> bool {
|
||||||
let key = (vs_blob_key, ps_blob_key);
|
let key = (vs_blob_key, ps_blob_key);
|
||||||
if self.translated_cache.contains_key(&key) {
|
if self.translated_modules.contains_key(&key) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
let shader = match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
let shader = match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
||||||
@@ -420,31 +576,42 @@ impl XenosPipeline {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
self.translated_modules.insert(key, shader);
|
||||||
|
metrics::counter!("gpu.shader.compile_ok").increment(1);
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
|
/// iterate-3Y: ensure a translator pipeline exists for `(vs,ps,rstate)`,
|
||||||
|
/// building it from the cached module + the per-draw color/blend target.
|
||||||
|
fn ensure_translated_for_state(
|
||||||
|
&mut self,
|
||||||
|
device: &wgpu::Device,
|
||||||
|
vs_key: u32,
|
||||||
|
ps_key: u32,
|
||||||
|
rstate: RenderState,
|
||||||
|
) -> bool {
|
||||||
|
let pkey = (vs_key, ps_key, rstate);
|
||||||
|
if self.translated_cache.contains_key(&pkey) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
let Some(module) = self.translated_modules.get(&(vs_key, ps_key)) else {
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
let target = rstate.color_target(self.target_format);
|
||||||
let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
|
let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
|
||||||
label: Some("xenos translated pipeline"),
|
label: Some("xenos translated pipeline"),
|
||||||
layout: Some(&self.pipeline_layout),
|
layout: Some(&self.pipeline_layout),
|
||||||
vertex: wgpu::VertexState {
|
vertex: wgpu::VertexState {
|
||||||
module: &shader,
|
module,
|
||||||
entry_point: "vs_main",
|
entry_point: "vs_main",
|
||||||
compilation_options: Default::default(),
|
compilation_options: Default::default(),
|
||||||
buffers: &[],
|
buffers: &[],
|
||||||
},
|
},
|
||||||
fragment: Some(wgpu::FragmentState {
|
fragment: Some(wgpu::FragmentState {
|
||||||
module: &shader,
|
module,
|
||||||
entry_point: "fs_main",
|
entry_point: "fs_main",
|
||||||
compilation_options: Default::default(),
|
compilation_options: Default::default(),
|
||||||
targets: &[Some(wgpu::ColorTargetState {
|
targets: &[Some(target)],
|
||||||
format: self.target_format,
|
|
||||||
blend: Some(wgpu::BlendState {
|
|
||||||
color: wgpu::BlendComponent {
|
|
||||||
src_factor: wgpu::BlendFactor::SrcAlpha,
|
|
||||||
dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha,
|
|
||||||
operation: wgpu::BlendOperation::Add,
|
|
||||||
},
|
|
||||||
alpha: wgpu::BlendComponent::OVER,
|
|
||||||
}),
|
|
||||||
write_mask: wgpu::ColorWrites::ALL,
|
|
||||||
})],
|
|
||||||
}),
|
}),
|
||||||
primitive: wgpu::PrimitiveState {
|
primitive: wgpu::PrimitiveState {
|
||||||
topology: wgpu::PrimitiveTopology::TriangleList,
|
topology: wgpu::PrimitiveTopology::TriangleList,
|
||||||
@@ -460,30 +627,78 @@ impl XenosPipeline {
|
|||||||
multiview: None,
|
multiview: None,
|
||||||
cache: None,
|
cache: None,
|
||||||
});
|
});
|
||||||
self.translated_cache.insert(key, pipeline);
|
self.translated_cache.insert(pkey, pipeline);
|
||||||
metrics::counter!("gpu.shader.compile_ok").increment(1);
|
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Render one draw with the translator-produced pipeline instead of
|
/// iterate-3Y: ensure an interpreter pipeline exists for `rstate`.
|
||||||
/// the interpreter. Mirrors [`render_one`] except the bound pipeline
|
fn ensure_interp_for_state(&mut self, device: &wgpu::Device, rstate: RenderState) {
|
||||||
/// is swapped for `pipeline`.
|
if self.interp_cache.contains_key(&rstate) {
|
||||||
pub fn render_one_with_pipeline(
|
return;
|
||||||
&self,
|
}
|
||||||
|
let target = rstate.color_target(self.target_format);
|
||||||
|
let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
|
||||||
|
label: Some("xenos interp pipeline (per-state)"),
|
||||||
|
layout: Some(&self.pipeline_layout),
|
||||||
|
vertex: wgpu::VertexState {
|
||||||
|
module: &self.interp_shader,
|
||||||
|
entry_point: "vs_main",
|
||||||
|
compilation_options: Default::default(),
|
||||||
|
buffers: &[],
|
||||||
|
},
|
||||||
|
fragment: Some(wgpu::FragmentState {
|
||||||
|
module: &self.interp_shader,
|
||||||
|
entry_point: "fs_main",
|
||||||
|
compilation_options: Default::default(),
|
||||||
|
targets: &[Some(target)],
|
||||||
|
}),
|
||||||
|
primitive: wgpu::PrimitiveState {
|
||||||
|
topology: wgpu::PrimitiveTopology::TriangleList,
|
||||||
|
strip_index_format: None,
|
||||||
|
front_face: wgpu::FrontFace::Ccw,
|
||||||
|
cull_mode: None,
|
||||||
|
polygon_mode: wgpu::PolygonMode::Fill,
|
||||||
|
unclipped_depth: false,
|
||||||
|
conservative: false,
|
||||||
|
},
|
||||||
|
depth_stencil: None,
|
||||||
|
multisample: wgpu::MultisampleState::default(),
|
||||||
|
multiview: None,
|
||||||
|
cache: None,
|
||||||
|
});
|
||||||
|
self.interp_cache.insert(rstate, pipeline);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// iterate-3Y: render one draw through the translator pipeline built for
|
||||||
|
/// this draw's render state. Returns `false` if no module is cached for
|
||||||
|
/// `(vs,ps)` (caller should fall back to the interpreter).
|
||||||
|
pub fn render_one_translated(
|
||||||
|
&mut self,
|
||||||
|
device: &wgpu::Device,
|
||||||
queue: &wgpu::Queue,
|
queue: &wgpu::Queue,
|
||||||
encoder: &mut wgpu::CommandEncoder,
|
encoder: &mut wgpu::CommandEncoder,
|
||||||
target_view: &wgpu::TextureView,
|
target_view: &wgpu::TextureView,
|
||||||
req: DrawRequest,
|
req: DrawRequest,
|
||||||
pipeline: &wgpu::RenderPipeline,
|
vs_key: u32,
|
||||||
) {
|
ps_key: u32,
|
||||||
|
rstate: RenderState,
|
||||||
|
) -> bool {
|
||||||
|
if !self.ensure_translated_for_state(device, vs_key, ps_key, rstate) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
let cb = DrawConstants {
|
let cb = DrawConstants {
|
||||||
draw_index: req.draw_index,
|
draw_index: req.draw_index,
|
||||||
vertex_count: req.vertex_count.max(3),
|
vertex_count: req.vertex_count.max(3),
|
||||||
prim_kind: req.prim_kind,
|
prim_kind: req.prim_kind,
|
||||||
_pad: 0,
|
vertex_base_dwords: req.vertex_base_dwords,
|
||||||
|
ndc_scale: req.ndc_scale,
|
||||||
|
ndc_offset: req.ndc_offset,
|
||||||
};
|
};
|
||||||
queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb));
|
queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb));
|
||||||
|
let pipeline = self
|
||||||
|
.translated_cache
|
||||||
|
.get(&(vs_key, ps_key, rstate))
|
||||||
|
.expect("just ensured");
|
||||||
let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
|
let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
|
||||||
label: Some("xenos translated draw"),
|
label: Some("xenos translated draw"),
|
||||||
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
|
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
|
||||||
@@ -503,6 +718,7 @@ impl XenosPipeline {
|
|||||||
pass.set_bind_group(1, &self.tex_bind_group, &[]);
|
pass.set_bind_group(1, &self.tex_bind_group, &[]);
|
||||||
let rounded = req.vertex_count.div_ceil(3) * 3;
|
let rounded = req.vertex_count.div_ceil(3) * 3;
|
||||||
pass.draw(0..rounded.max(3), 0..1);
|
pass.draw(0..rounded.max(3), 0..1);
|
||||||
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Number of distinct translator pipelines cached. Surfaced to the HUD.
|
/// Number of distinct translator pipelines cached. Surfaced to the HUD.
|
||||||
@@ -594,22 +810,34 @@ impl XenosPipeline {
|
|||||||
queue.write_buffer(&self.vertex_buffer, 0, &bytes[..bytes.len().min(max)]);
|
queue.write_buffer(&self.vertex_buffer, 0, &bytes[..bytes.len().min(max)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Render one captured draw.
|
/// Render one captured draw through the interpreter, using the per-draw
|
||||||
|
/// `rstate` (blend/write-mask) so flat draws composite correctly even
|
||||||
|
/// when their (vs,ps) didn't translate. `RenderState::OPAQUE` reproduces
|
||||||
|
/// the legacy fixed behaviour for procedural/synthetic draws.
|
||||||
pub fn render_one(
|
pub fn render_one(
|
||||||
&self,
|
&mut self,
|
||||||
|
device: &wgpu::Device,
|
||||||
queue: &wgpu::Queue,
|
queue: &wgpu::Queue,
|
||||||
encoder: &mut wgpu::CommandEncoder,
|
encoder: &mut wgpu::CommandEncoder,
|
||||||
target_view: &wgpu::TextureView,
|
target_view: &wgpu::TextureView,
|
||||||
req: DrawRequest,
|
req: DrawRequest,
|
||||||
|
rstate: RenderState,
|
||||||
) {
|
) {
|
||||||
|
self.ensure_interp_for_state(device, rstate);
|
||||||
let cb = DrawConstants {
|
let cb = DrawConstants {
|
||||||
draw_index: req.draw_index,
|
draw_index: req.draw_index,
|
||||||
vertex_count: req.vertex_count.max(3),
|
vertex_count: req.vertex_count.max(3),
|
||||||
prim_kind: req.prim_kind,
|
prim_kind: req.prim_kind,
|
||||||
_pad: 0,
|
vertex_base_dwords: req.vertex_base_dwords,
|
||||||
|
ndc_scale: req.ndc_scale,
|
||||||
|
ndc_offset: req.ndc_offset,
|
||||||
};
|
};
|
||||||
queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb));
|
queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb));
|
||||||
|
|
||||||
|
let pipeline = self
|
||||||
|
.interp_cache
|
||||||
|
.get(&rstate)
|
||||||
|
.expect("just ensured");
|
||||||
let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
|
let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
|
||||||
label: Some("xenos draw"),
|
label: Some("xenos draw"),
|
||||||
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
|
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
|
||||||
@@ -624,7 +852,7 @@ impl XenosPipeline {
|
|||||||
timestamp_writes: None,
|
timestamp_writes: None,
|
||||||
occlusion_query_set: None,
|
occlusion_query_set: None,
|
||||||
});
|
});
|
||||||
pass.set_pipeline(&self.pipeline);
|
pass.set_pipeline(pipeline);
|
||||||
pass.set_bind_group(0, &self.bind_group, &[]);
|
pass.set_bind_group(0, &self.bind_group, &[]);
|
||||||
pass.set_bind_group(1, &self.tex_bind_group, &[]);
|
pass.set_bind_group(1, &self.tex_bind_group, &[]);
|
||||||
let rounded = req.vertex_count.div_ceil(3) * 3;
|
let rounded = req.vertex_count.div_ceil(3) * 3;
|
||||||
@@ -638,6 +866,6 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn draw_constants_layout_matches_wgsl_uniform() {
|
fn draw_constants_layout_matches_wgsl_uniform() {
|
||||||
assert_eq!(std::mem::size_of::<DrawConstants>(), 16);
|
assert_eq!(std::mem::size_of::<DrawConstants>(), 32);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user