diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index 3bb3d56..a205dcf 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -2982,16 +2982,22 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             ctx.pc += 4;
         }
         PpcOpcode::stvewx128 => {
-            let ea = ea_indexed(ctx, instr) & !0xF;
-            // TODO PPCBUG-510: stvewx128 currently writes 16 bytes at ea & !0xF; the EA scope is
-            // wrong (should be word-aligned, 4 bytes only). When P3 fixes EA, this invalidate's
-            // range narrows automatically.
-            // PPCBUG-512: stvewx128 was missing invalidate_for_write.
+            // Mirror of stvewx: word-align the EA, pick one 32-bit lane, write 4 bytes only.
+            // The previous code masked the EA with !0xF and wrote all 16 bytes, corrupting
+            // 12 adjacent bytes on every execution (PPCBUG-510).
+            let ea_unaligned = ea_indexed(ctx, instr);
+            let ea = ea_unaligned & !0x3u32;
+            // Tell the reservation table about the write before it happens (call added for PPCBUG-512).
             if let Some(t) = ctx.reservation_table.as_ref().filter(|t| t.is_enabled()) {
                 if t.has_active_reservers() { t.invalidate_for_write(ea); }
             }
+            // Bits 2-3 of the unaligned EA (counting from the LSB) select the word lane.
+            let slot = ((ea_unaligned & 0xF) >> 2) as usize;
             let bytes = ctx.vr[instr.vs128()].as_bytes();
-            for i in 0..16 { mem.write_u8(ea + i as u32, bytes[i]); }
+            // Assemble the selected lane big-endian from its four register bytes.
+            let lane = &bytes[slot * 4..slot * 4 + 4];
+            let w = u32::from_be_bytes([lane[0], lane[1], lane[2], lane[3]]);
+            mem.write_u32(ea, w);
             ctx.pc += 4;
         }
 
@@ -6463,4 +6469,47 @@ mod tests {
         assert_eq!(r[2], 0x3333_3333, "pack=1 shift=3: lane 2 from prev");
         assert_eq!(r[3], 0x4444_4444, "pack=1 shift=3: lane 3 from prev");
     }
+
+    // ---- PPCBUG-510: stvewx128 should write one word (4 bytes), not 16 ----
+
+    fn encode_stvewx128(vs_lo: u32, ra: u32, rb: u32) -> u32 {
+        // stvewx128 is a VMX128 load/store at op6=4.
+        // decode_op4 key1 = (bits21-27 << 4) | bits30-31 = 0b00110000011 for stvewx128:
+        // bits21-27 = 0b0011000 (host bits 10-4), bits30-31 = 0b11 (host bits 1-0).
+        // VS128[4:0] at host bits 25-21; RA at host bits 20-16; RB at host bits 15-11.
+        // Every bit outside those fields and the key is left zero; callers rely on that
+        // decoding as vs128 == vs_lo. NOTE(review): confirm against the decoder's vs128().
+        debug_assert!(vs_lo < 32 && ra < 32 && rb < 32, "register fields are 5 bits wide");
+        (4u32 << 26)
+            | (vs_lo << 21) // VS128[4:0]
+            | (ra << 16) // RA
+            | (rb << 11) // RB
+            | (0b0011000 << 4) // bits 21-27 of key1 pattern
+            | 0b11 // bits 30-31 of key1 pattern
+    }
+
+    #[test]
+    fn stvewx128_writes_one_word_at_word_aligned_ea() {
+        // PPCBUG-510: old code wrote all 16 bytes at ea & !0xF, corrupting 12 adjacent bytes.
+        // Fix: word-align EA, extract lane from (ea & 0xF) >> 2, write 4 bytes only.
+        let mut ctx = PpcContext::new();
+        let mem = TestMem::new();
+        // encode_stvewx128 places vs_lo in host bits 25-21 and sets no other register bits;
+        // this test relies on that decoding as vs128 == vs_lo, so vs_lo = 3 selects ctx.vr[3].
+        let raw = encode_stvewx128(3, 1, 2);
+        ctx.vr[3] = xenia_types::Vec128::from_u32x4(0x1111_1111, 0x2222_2222, 0x3333_3333, 0x4444_4444);
+        ctx.gpr[1] = 0x1000; // base
+        // Unaligned on purpose: EA = 0x100A → word-aligned EA = 0x1008, slot = (0xA & 0xF) >> 2 = 2.
+        ctx.gpr[2] = 0x00A;
+        write_instr(&mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mem);
+        assert_eq!(ctx.pc, 4, "PC must advance");
+        // Slot 2 → lane 2 = 0x3333_3333
+        assert_eq!(mem.read_u32(0x1008), 0x3333_3333, "only lane 2 word at aligned ea");
+        // The other three words of the 16-byte line must be untouched (mem is zero-init).
+        assert_eq!(mem.read_u32(0x1000), 0x0000_0000, "word at 0x1000 must be untouched");
+        assert_eq!(mem.read_u32(0x1004), 0x0000_0000, "word below ea must be untouched");
+        assert_eq!(mem.read_u32(0x100C), 0x0000_0000, "word above ea must be untouched");
+    }
 }