From 9cadab8fa91a63564774ae7dbe74e7c18715f586 Mon Sep 17 00:00:00 2001 From: merry Date: Tue, 29 Mar 2022 20:57:34 +0100 Subject: [PATCH] backend/emit_x64_memory: Enforce memory ordering --- src/dynarmic/backend/x64/a32_emit_x64.h | 10 +- .../backend/x64/a32_emit_x64_memory.cpp | 134 ++++++----- src/dynarmic/backend/x64/a64_emit_x64.h | 6 +- .../backend/x64/a64_emit_x64_memory.cpp | 224 ++++++++++-------- .../backend/x64/emit_x64_memory.cpp.inc | 62 +++-- src/dynarmic/backend/x64/emit_x64_memory.h | 93 +++++++- src/dynarmic/backend/x64/reg_alloc.cpp | 5 + src/dynarmic/backend/x64/reg_alloc.h | 5 + 8 files changed, 333 insertions(+), 206 deletions(-) diff --git a/src/dynarmic/backend/x64/a32_emit_x64.h b/src/dynarmic/backend/x64/a32_emit_x64.h index e87454b4..7bdc7e02 100644 --- a/src/dynarmic/backend/x64/a32_emit_x64.h +++ b/src/dynarmic/backend/x64/a32_emit_x64.h @@ -71,12 +71,12 @@ protected: std::array fast_dispatch_table; void ClearFastDispatchTable(); - void (*memory_read_128)() = nullptr; // Dummy - void (*memory_write_128)() = nullptr; // Dummy + void (*memory_read_128)() = nullptr; // Dummy + void (*memory_write_128)() = nullptr; // Dummy - std::map, void (*)()> read_fallbacks; - std::map, void (*)()> write_fallbacks; - std::map, void (*)()> exclusive_write_fallbacks; + std::map, void (*)()> read_fallbacks; + std::map, void (*)()> write_fallbacks; + std::map, void (*)()> exclusive_write_fallbacks; void GenFastmemFallbacks(); const void* terminal_handler_pop_rsb_hint; diff --git a/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp b/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp index 6608bc2c..adbe00e2 100644 --- a/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp +++ b/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp @@ -47,77 +47,85 @@ void A32EmitX64::GenFastmemFallbacks() { {64, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive64>(conf.callbacks)}, }}; - for (int vaddr_idx : idxes) { - for (int value_idx : idxes) { - for (const auto& [bitsize, callback] : read_callbacks) { - code.align(); - read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } - callback.EmitCall(code); - if (value_idx != code.ABI_RETURN.getIdx()) { - code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); - } - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); - code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx}); - code.ret(); - PerfMapRegister(read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_read_fallback_{}", bitsize)); - } - - for (const auto& [bitsize, callback] : write_callbacks) { - code.align(); - write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStack(code); - if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { - code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); - } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } - } else { - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } + for (bool ordered : {false, true}) { + for (int vaddr_idx : idxes) { + for (int value_idx : idxes) { + for (const auto& [bitsize, 
callback] : read_callbacks) { + code.align(); + read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); if (vaddr_idx != code.ABI_PARAM2.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); } + if (ordered) { + code.mfence(); + } + callback.EmitCall(code); + if (value_idx != code.ABI_RETURN.getIdx()) { + code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); + } + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); + code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx}); + code.ret(); + PerfMapRegister(read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_read_fallback_{}", bitsize)); } - code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); - callback.EmitCall(code); - ABI_PopCallerSaveRegistersAndAdjustStack(code); - code.ret(); - PerfMapRegister(write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_write_fallback_{}", bitsize)); - } - for (const auto& [bitsize, callback] : exclusive_write_callbacks) { - code.align(); - exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); - if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { - code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); - } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } - } else { - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + for (const auto& [bitsize, callback] : write_callbacks) { + code.align(); + write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStack(code); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } } + code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); + callback.EmitCall(code); + if (ordered) { + code.mfence(); + } + ABI_PopCallerSaveRegistersAndAdjustStack(code); + code.ret(); + PerfMapRegister(write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_write_fallback_{}", bitsize)); + } + + for (const auto& [bitsize, callback] : exclusive_write_callbacks) { + code.align(); + exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + 
code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); + code.mov(code.ABI_PARAM4, rax); + code.ZeroExtendFrom(bitsize, code.ABI_PARAM4); + callback.EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_exclusive_write_fallback_{}", bitsize)); } - code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); - code.mov(code.ABI_PARAM4, rax); - code.ZeroExtendFrom(bitsize, code.ABI_PARAM4); - callback.EmitCall(code); - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); - code.ret(); - PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_exclusive_write_fallback_{}", bitsize)); } } } diff --git a/src/dynarmic/backend/x64/a64_emit_x64.h b/src/dynarmic/backend/x64/a64_emit_x64.h index cbfa28bd..f5d1a917 100644 --- a/src/dynarmic/backend/x64/a64_emit_x64.h +++ b/src/dynarmic/backend/x64/a64_emit_x64.h @@ -71,9 +71,9 @@ protected: void (*memory_exclusive_write_128)(); void GenMemory128Accessors(); - std::map, void (*)()> read_fallbacks; - std::map, void (*)()> write_fallbacks; - std::map, void (*)()> exclusive_write_fallbacks; + std::map, void (*)()> read_fallbacks; + std::map, void (*)()> write_fallbacks; + std::map, void (*)()> exclusive_write_fallbacks; void GenFastmemFallbacks(); const void* terminal_handler_pop_rsb_hint; diff --git a/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp b/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp index 9025b623..0ab2ff9b 100644 --- a/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp +++ b/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp @@ -131,135 +131,149 @@ void A64EmitX64::GenFastmemFallbacks() { {64, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive64>(conf.callbacks)}, }}; - for (int vaddr_idx : idxes) { - if (vaddr_idx == 4 || vaddr_idx == 15) { - continue; - } - - for (int value_idx : idxes) { - code.align(); - read_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } - code.call(memory_read_128); - if (value_idx != 1) { - code.movaps(Xbyak::Xmm{value_idx}, xmm1); - } - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); - code.ret(); - PerfMapRegister(read_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_read_fallback_128"); - - code.align(); - write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStack(code); - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } - if (value_idx != 1) { - code.movaps(xmm1, Xbyak::Xmm{value_idx}); - } - code.call(memory_write_128); - ABI_PopCallerSaveRegistersAndAdjustStack(code); - code.ret(); - PerfMapRegister(write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); - - code.align(); - exclusive_write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr(); - 
ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); - if (value_idx != 1) { - code.movaps(xmm1, Xbyak::Xmm{value_idx}); - } - if (code.HasHostFeature(HostFeature::SSE41)) { - code.movq(xmm2, rax); - code.pinsrq(xmm2, rdx, 1); - } else { - code.movq(xmm2, rax); - code.movq(xmm0, rdx); - code.punpcklqdq(xmm2, xmm0); - } - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } - code.call(memory_exclusive_write_128); - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); - code.ret(); - PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); - - if (value_idx == 4 || value_idx == 15) { + for (bool ordered : {false, true}) { + for (int vaddr_idx : idxes) { + if (vaddr_idx == 4 || vaddr_idx == 15) { continue; } - for (const auto& [bitsize, callback] : read_callbacks) { + for (int value_idx : idxes) { code.align(); - read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); + read_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); if (vaddr_idx != code.ABI_PARAM2.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); } - callback.EmitCall(code); - if (value_idx != code.ABI_RETURN.getIdx()) { - code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); + if (ordered) { + code.mfence(); } - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); - code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx}); + code.call(memory_read_128); + if (value_idx != 1) { + code.movaps(Xbyak::Xmm{value_idx}, xmm1); + } + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); code.ret(); - PerfMapRegister(read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_read_fallback_{}", bitsize)); - } + PerfMapRegister(read_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_read_fallback_128"); - for (const auto& [bitsize, callback] : write_callbacks) { code.align(); - write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); + write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr(); ABI_PushCallerSaveRegistersAndAdjustStack(code); - if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { - code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); - } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } - } else { - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } } - code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); - callback.EmitCall(code); + if (value_idx != 1) { + code.movaps(xmm1, Xbyak::Xmm{value_idx}); + } + code.call(memory_write_128); + if (ordered) { + code.mfence(); + } ABI_PopCallerSaveRegistersAndAdjustStack(code); code.ret(); - PerfMapRegister(write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_write_fallback_{}", bitsize)); - } + 
PerfMapRegister(write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); - for (const auto& [bitsize, callback] : exclusive_write_callbacks) { code.align(); - exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); + exclusive_write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr(); ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); - if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { - code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); - } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } + if (value_idx != 1) { + code.movaps(xmm1, Xbyak::Xmm{value_idx}); + } + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(xmm2, rax); + code.pinsrq(xmm2, rdx, 1); } else { - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } + code.movq(xmm2, rax); + code.movq(xmm0, rdx); + code.punpcklqdq(xmm2, xmm0); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + code.call(memory_exclusive_write_128); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_exclusive_write_fallback_128"); + + if (value_idx == 4 || value_idx == 15) { + continue; + } + + for (const auto& [bitsize, callback] : read_callbacks) { + code.align(); + read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); if (vaddr_idx != code.ABI_PARAM2.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); } + if (ordered) { + code.mfence(); + } + callback.EmitCall(code); + if (value_idx != code.ABI_RETURN.getIdx()) { + code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); + } + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); + code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx}); + code.ret(); + PerfMapRegister(read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_read_fallback_{}", bitsize)); + } + + for (const auto& [bitsize, callback] : write_callbacks) { + code.align(); + write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStack(code); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); + callback.EmitCall(code); + if (ordered) { + code.mfence(); + } + ABI_PopCallerSaveRegistersAndAdjustStack(code); + code.ret(); + PerfMapRegister(write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), 
fmt::format("a64_write_fallback_{}", bitsize)); + } + + for (const auto& [bitsize, callback] : exclusive_write_callbacks) { + code.align(); + exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); + code.mov(code.ABI_PARAM4, rax); + code.ZeroExtendFrom(bitsize, code.ABI_PARAM4); + callback.EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_exclusive_write_fallback_{}", bitsize)); } - code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); - code.mov(code.ABI_PARAM4, rax); - code.ZeroExtendFrom(bitsize, code.ABI_PARAM4); - callback.EmitCall(code); - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); - code.ret(); - PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_exclusive_write_fallback_{}", bitsize)); } } } diff --git a/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc b/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc index 1054f616..74899034 100644 --- a/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc +++ b/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc @@ -52,16 +52,23 @@ FakeCall AxxEmitX64::FastmemCallback(u64 rip_) { template void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool ordered = IsOrdered(args[1].GetImmediateAccType()); const auto fastmem_marker = ShouldFastmem(ctx, inst); if (!conf.page_table && !fastmem_marker) { // Neither fastmem nor page table: Use callbacks if constexpr (bitsize == 128) { ctx.reg_alloc.HostCall(nullptr, {}, args[0]); + if (ordered) { + code.mfence(); + } code.CallFunction(memory_read_128); ctx.reg_alloc.DefineValue(inst, xmm1); } else { ctx.reg_alloc.HostCall(inst, {}, args[0]); + if (ordered) { + code.mfence(); + } Devirtualize(conf.callbacks).EmitCall(code); code.ZeroExtendFrom(bitsize, code.ABI_RETURN); } @@ -71,7 +78,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) { const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); const int value_idx = bitsize == 128 ? 
ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx(); - const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value_idx)]; + const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; Xbyak::Label abort, end; bool require_abort_handling = false; @@ -80,8 +87,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) { // Use fastmem const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); - const auto location = code.getCurr(); - EmitReadMemoryMov(code, value_idx, src_ptr); + const auto location = EmitReadMemoryMov(code, value_idx, src_ptr, ordered); fastmem_patch_info.emplace( Common::BitCast(location), @@ -96,7 +102,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) { ASSERT(conf.page_table); const auto src_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); require_abort_handling = true; - EmitReadMemoryMov(code, value_idx, src_ptr); + EmitReadMemoryMov(code, value_idx, src_ptr, ordered); } code.L(end); @@ -118,6 +124,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) { template void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool ordered = IsOrdered(args[2].GetImmediateAccType()); const auto fastmem_marker = ShouldFastmem(ctx, inst); if (!conf.page_table && !fastmem_marker) { @@ -132,13 +139,18 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); Devirtualize(conf.callbacks).EmitCall(code); } + if (ordered) { + code.mfence(); + } return; } const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const int value_idx = bitsize == 128 ? ctx.reg_alloc.UseXmm(args[1]).getIdx() : ctx.reg_alloc.UseGpr(args[1]).getIdx(); + const int value_idx = bitsize == 128 + ? ctx.reg_alloc.UseXmm(args[1]).getIdx() + : (ordered ? 
ctx.reg_alloc.UseScratchGpr(args[1]).getIdx() : ctx.reg_alloc.UseGpr(args[1]).getIdx()); - const auto wrapped_fn = write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value_idx)]; + const auto wrapped_fn = write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; Xbyak::Label abort, end; bool require_abort_handling = false; @@ -147,8 +159,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) { // Use fastmem const auto dest_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); - const auto location = code.getCurr(); - EmitWriteMemoryMov(code, dest_ptr, value_idx); + const auto location = EmitWriteMemoryMov(code, dest_ptr, value_idx, ordered); fastmem_patch_info.emplace( Common::BitCast(location), @@ -163,7 +174,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) { ASSERT(conf.page_table); const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); require_abort_handling = true; - EmitWriteMemoryMov(code, dest_ptr, value_idx); + EmitWriteMemoryMov(code, dest_ptr, value_idx, ordered); } code.L(end); @@ -180,6 +191,7 @@ template void AxxEmitX64::EmitExclusiveReadMemory(AxxEmitContext& ctx, IR::Inst* inst) { ASSERT(conf.global_monitor != nullptr); auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool ordered = IsOrdered(args[1].GetImmediateAccType()); if constexpr (bitsize != 128) { using T = mp::unsigned_integer_of_size; @@ -188,6 +200,9 @@ void AxxEmitX64::EmitExclusiveReadMemory(AxxEmitContext& ctx, IR::Inst* inst) { code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1)); code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); + if (ordered) { + code.mfence(); + } code.CallLambda( [](AxxUserConfig& conf, Axx::VAddr vaddr) -> T { return conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> T { @@ -205,6 +220,9 @@ void AxxEmitX64::EmitExclusiveReadMemory(AxxEmitContext& ctx, IR::Inst* inst) { code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); + if (ordered) { + code.mfence(); + } code.CallLambda( [](AxxUserConfig& conf, Axx::VAddr vaddr, Vector& ret) { ret = conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> Vector { @@ -222,6 +240,7 @@ template void AxxEmitX64::EmitExclusiveWriteMemory(AxxEmitContext& ctx, IR::Inst* inst) { ASSERT(conf.global_monitor != nullptr); auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool ordered = IsOrdered(args[2].GetImmediateAccType()); if constexpr (bitsize != 128) { ctx.reg_alloc.HostCall(inst, {}, args[0], args[1]); @@ -251,6 +270,9 @@ void AxxEmitX64::EmitExclusiveWriteMemory(AxxEmitContext& ctx, IR::Inst* inst) { ? 0 : 1; }); + if (ordered) { + code.mfence(); + } } else { ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); @@ -264,6 +286,9 @@ void AxxEmitX64::EmitExclusiveWriteMemory(AxxEmitContext& ctx, IR::Inst* inst) { ? 0 : 1; }); + if (ordered) { + code.mfence(); + } ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); } code.L(end); @@ -278,13 +303,14 @@ void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* in } auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool ordered = IsOrdered(args[1].GetImmediateAccType()); const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); const int value_idx = bitsize == 128 ? 
ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx(); const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(); - const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value_idx)]; + const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; EmitExclusiveLock(code, conf, tmp, tmp2.cvt32()); @@ -299,8 +325,7 @@ void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* in const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); - const auto location = code.getCurr(); - EmitReadMemoryMov(code, value_idx, src_ptr); + const auto location = EmitReadMemoryMov(code, value_idx, src_ptr, ordered); fastmem_patch_info.emplace( Common::BitCast(location), @@ -325,7 +350,7 @@ void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* in } code.mov(tmp, Common::BitCast(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); - EmitWriteMemoryMov(code, tmp, value_idx); + EmitWriteMemoryMov(code, tmp, value_idx, false); EmitExclusiveUnlock(code, conf, tmp, tmp2.cvt32()); @@ -345,6 +370,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i } auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool ordered = IsOrdered(args[2].GetImmediateAccType()); const auto value = [&] { if constexpr (bitsize == 128) { @@ -362,7 +388,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); - const auto fallback_fn = exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; + const auto wrapped_fn = exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value.getIdx())]; EmitExclusiveLock(code, conf, tmp, eax); @@ -393,7 +419,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i code.movq(rcx, xmm0); } } else { - EmitReadMemoryMov(code, rax.getIdx(), tmp); + EmitReadMemoryMov(code, rax.getIdx(), tmp, false); } const auto fastmem_marker = ShouldFastmem(ctx, inst); @@ -435,13 +461,13 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i code.SwitchToFarCode(); code.L(abort); - code.call(fallback_fn); + code.call(wrapped_fn); fastmem_patch_info.emplace( Common::BitCast(location), FastmemPatchInfo{ Common::BitCast(code.getCurr()), - Common::BitCast(fallback_fn), + Common::BitCast(wrapped_fn), *fastmem_marker, conf.recompile_on_exclusive_fastmem_failure, }); @@ -452,7 +478,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i code.jmp(end, code.T_NEAR); code.SwitchToNearCode(); } else { - code.call(fallback_fn); + code.call(wrapped_fn); code.cmp(al, 0); code.setz(status.cvt8()); code.movzx(status.cvt32(), status.cvt8()); diff --git a/src/dynarmic/backend/x64/emit_x64_memory.h b/src/dynarmic/backend/x64/emit_x64_memory.h index 23b567ef..c9ce765e 100644 --- a/src/dynarmic/backend/x64/emit_x64_memory.h +++ b/src/dynarmic/backend/x64/emit_x64_memory.h @@ -10,6 +10,7 @@ #include "dynarmic/backend/x64/exclusive_monitor_friend.h" #include "dynarmic/common/spin_lock_x64.h" #include "dynarmic/interface/exclusive_monitor.h" +#include "dynarmic/ir/acc_type.h" namespace Dynarmic::Backend::X64 { @@ -198,49 +199,113 @@ template<> } template -void EmitReadMemoryMov(BlockOfCode& code, int value_idx, 
const Xbyak::RegExp& addr) { +const void* EmitReadMemoryMov(BlockOfCode& code, int value_idx, const Xbyak::RegExp& addr, bool ordered) { + if (ordered) { + if constexpr (bitsize == 128) { + code.mfence(); + } else { + code.xor_(Xbyak::Reg32{value_idx}, Xbyak::Reg32{value_idx}); + } + + const void* fastmem_location = code.getCurr(); + switch (bitsize) { + case 8: + code.lock(); + code.xadd(code.byte[addr], Xbyak::Reg32{value_idx}.cvt8()); + break; + case 16: + code.lock(); + code.xadd(word[addr], Xbyak::Reg32{value_idx}); + break; + case 32: + code.lock(); + code.xadd(dword[addr], Xbyak::Reg32{value_idx}); + break; + case 64: + code.lock(); + code.xadd(qword[addr], Xbyak::Reg64{value_idx}); + break; + case 128: + // TODO (HACK): Detect CPUs where this load is not atomic + code.movaps(Xbyak::Xmm{value_idx}, xword[addr]); + break; + default: + ASSERT_FALSE("Invalid bitsize"); + } + return fastmem_location; + } + + const void* fastmem_location = code.getCurr(); switch (bitsize) { case 8: code.movzx(Xbyak::Reg32{value_idx}, code.byte[addr]); - return; + break; case 16: code.movzx(Xbyak::Reg32{value_idx}, word[addr]); - return; + break; case 32: code.mov(Xbyak::Reg32{value_idx}, dword[addr]); - return; + break; case 64: code.mov(Xbyak::Reg64{value_idx}, qword[addr]); - return; + break; case 128: code.movups(Xbyak::Xmm{value_idx}, xword[addr]); - return; + break; default: ASSERT_FALSE("Invalid bitsize"); } + return fastmem_location; } template -void EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, int value_idx) { +const void* EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, int value_idx, bool ordered) { + if (ordered) { + const void* fastmem_location = code.getCurr(); + switch (bitsize) { + case 8: + code.xchg(code.byte[addr], Xbyak::Reg64{value_idx}.cvt8()); + break; + case 16: + code.xchg(word[addr], Xbyak::Reg16{value_idx}); + break; + case 32: + code.xchg(dword[addr], Xbyak::Reg32{value_idx}); + break; + case 64: + code.xchg(qword[addr], Xbyak::Reg64{value_idx}); + break; + case 128: + code.movaps(xword[addr], Xbyak::Xmm{value_idx}); + code.mfence(); + break; + default: + ASSERT_FALSE("Invalid bitsize"); + } + return fastmem_location; + } + + const void* fastmem_location = code.getCurr(); switch (bitsize) { case 8: code.mov(code.byte[addr], Xbyak::Reg64{value_idx}.cvt8()); - return; + break; case 16: code.mov(word[addr], Xbyak::Reg16{value_idx}); - return; + break; case 32: code.mov(dword[addr], Xbyak::Reg32{value_idx}); - return; + break; case 64: code.mov(qword[addr], Xbyak::Reg64{value_idx}); - return; + break; case 128: code.movups(xword[addr], Xbyak::Xmm{value_idx}); - return; + break; default: ASSERT_FALSE("Invalid bitsize"); } + return fastmem_location; } template @@ -284,6 +349,10 @@ void EmitExclusiveTestAndClear(BlockOfCode& code, const UserConfig& conf, Xbyak: } } +inline bool IsOrdered(IR::AccType acctype) { + return acctype == IR::AccType::ORDERED || acctype == IR::AccType::ORDEREDRW || acctype == IR::AccType::LIMITEDORDERED; +} + } // namespace } // namespace Dynarmic::Backend::X64 diff --git a/src/dynarmic/backend/x64/reg_alloc.cpp b/src/dynarmic/backend/x64/reg_alloc.cpp index ae53a620..7ff8c780 100644 --- a/src/dynarmic/backend/x64/reg_alloc.cpp +++ b/src/dynarmic/backend/x64/reg_alloc.cpp @@ -208,6 +208,11 @@ IR::Cond Argument::GetImmediateCond() const { return value.GetCond(); } +IR::AccType Argument::GetImmediateAccType() const { + ASSERT(IsImmediate() && GetType() == IR::Type::AccType); + return value.GetAccType(); +} + bool 
Argument::IsInGpr() const { if (IsImmediate()) return false; diff --git a/src/dynarmic/backend/x64/reg_alloc.h b/src/dynarmic/backend/x64/reg_alloc.h index 58bfe87f..5002932c 100644 --- a/src/dynarmic/backend/x64/reg_alloc.h +++ b/src/dynarmic/backend/x64/reg_alloc.h @@ -21,6 +21,10 @@ #include "dynarmic/ir/microinstruction.h" #include "dynarmic/ir/value.h" +namespace Dynarmic::IR { +enum class AccType; +} // namespace Dynarmic::IR + namespace Dynarmic::Backend::X64 { class RegAlloc; @@ -75,6 +79,7 @@ public: u64 GetImmediateS32() const; u64 GetImmediateU64() const; IR::Cond GetImmediateCond() const; + IR::AccType GetImmediateAccType() const; /// Is this value currently in a GPR? bool IsInGpr() const;
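
Summary of the lowering used above (a condensed, illustrative sketch of what the patched emitters generate, not an additional hunk): ordered loads are emitted as a zeroed register plus LOCK XADD, ordered stores as XCHG, 128-bit accesses as MOVAPS bracketed by MFENCE, and the call-out fallbacks simply wrap the user callback with MFENCE. The 32-bit case, assuming an Xbyak emitter `code`, an address expression `addr` and a host register index `value_idx` as in EmitReadMemoryMov/EmitWriteMemoryMov:

    // Ordered load: LOCK XADD with a zeroed register is a full barrier;
    // it adds 0 (memory is unchanged) and writes the old memory value
    // into the register, i.e. it behaves as a fenced load.
    code.xor_(Xbyak::Reg32{value_idx}, Xbyak::Reg32{value_idx});
    code.lock();
    code.xadd(dword[addr], Xbyak::Reg32{value_idx});

    // Ordered store: XCHG with a memory operand carries an implicit LOCK
    // prefix and is therefore also a full barrier; no separate MFENCE is
    // required.
    code.xchg(dword[addr], Xbyak::Reg32{value_idx});

    // 128-bit accesses cannot use LOCK'd read-modify-write instructions,
    // so the patch falls back to MOVAPS plus an explicit MFENCE; the
    // non-fastmem fallbacks likewise surround the memory callback with
    // MFENCE when the access is ordered.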