From 5ea2b49ef012d9cac402b5713cde35e652d5a29d Mon Sep 17 00:00:00 2001 From: merry Date: Mon, 28 Feb 2022 08:13:10 +0000 Subject: [PATCH] backend/x64: Inline exclusive memory access operations (#664) * a64_emit_x64_memory: Add Unsafe_IgnoreGlobalMonitor optimization * a32_emit_x64_memory: Add Unsafe_IgnoreGlobalMonitor optimization * a32_emit_x64_memory: Remove dead code * {a32,a64}_emit_x64_memory: Also verify vaddr in Exclusive{Read,Write}MemoryInlineUnsafe * a64_emit_x64_memory: Full fallback for ExclusiveWriteMemoryInlineUnsafe * a64_emit_x64_memory: Inline full locking * a64_emit_x64_memory: Allow inlined locking to be optionally removed * spin_lock: Use xbyak instead of inline asm * a64_emit_x64_memory: Recompile on exclusive fastmem failure * Avoid variable shadowing * a32_emit_x64_memory: Implement recompilation * Fix recompilation * spin_lock: Clang format fix * fix fallback function calls --- src/dynarmic/CMakeLists.txt | 5 + src/dynarmic/backend/x64/a32_emit_x64.h | 6 + .../backend/x64/a32_emit_x64_memory.cpp | 244 ++++++++++- src/dynarmic/backend/x64/a64_emit_x64.h | 8 + .../backend/x64/a64_emit_x64_memory.cpp | 393 ++++++++++++++++-- src/dynarmic/backend/x64/emit_x64_memory.h | 62 +++ .../backend/x64/exclusive_monitor.cpp | 4 +- .../backend/x64/exclusive_monitor_friend.h | 28 ++ src/dynarmic/common/spin_lock.h | 17 + src/dynarmic/common/spin_lock_x64.cpp | 70 ++++ src/dynarmic/common/spin_lock_x64.h | 15 + src/dynarmic/interface/A32/config.h | 9 + src/dynarmic/interface/A64/config.h | 9 + src/dynarmic/interface/exclusive_monitor.h | 9 +- src/dynarmic/interface/optimization_flags.h | 4 + 15 files changed, 836 insertions(+), 47 deletions(-) create mode 100644 src/dynarmic/backend/x64/emit_x64_memory.h create mode 100644 src/dynarmic/backend/x64/exclusive_monitor_friend.h create mode 100644 src/dynarmic/common/spin_lock.h create mode 100644 src/dynarmic/common/spin_lock_x64.cpp create mode 100644 src/dynarmic/common/spin_lock_x64.h diff --git a/src/dynarmic/CMakeLists.txt b/src/dynarmic/CMakeLists.txt index 2e8273ec..126b9dba 100644 --- a/src/dynarmic/CMakeLists.txt +++ b/src/dynarmic/CMakeLists.txt @@ -58,6 +58,7 @@ add_library(dynarmic common/memory_pool.h common/safe_ops.h common/scope_exit.h + common/spin_lock.h common/string_util.h common/u128.cpp common/u128.h @@ -281,6 +282,7 @@ if (ARCHITECTURE STREQUAL "x86_64") backend/x64/emit_x64_crc32.cpp backend/x64/emit_x64_data_processing.cpp backend/x64/emit_x64_floating_point.cpp + backend/x64/emit_x64_memory.h backend/x64/emit_x64_packed.cpp backend/x64/emit_x64_saturation.cpp backend/x64/emit_x64_sm4.cpp @@ -289,6 +291,7 @@ if (ARCHITECTURE STREQUAL "x86_64") backend/x64/emit_x64_vector_saturation.cpp backend/x64/exception_handler.h backend/x64/exclusive_monitor.cpp + backend/x64/exclusive_monitor_friend.h backend/x64/host_feature.h backend/x64/hostloc.cpp backend/x64/hostloc.h @@ -299,6 +302,8 @@ if (ARCHITECTURE STREQUAL "x86_64") backend/x64/reg_alloc.cpp backend/x64/reg_alloc.h backend/x64/stack_layout.h + common/spin_lock_x64.cpp + common/spin_lock_x64.h ) if ("A32" IN_LIST DYNARMIC_FRONTENDS) diff --git a/src/dynarmic/backend/x64/a32_emit_x64.h b/src/dynarmic/backend/x64/a32_emit_x64.h index 6684835f..43f90f6a 100644 --- a/src/dynarmic/backend/x64/a32_emit_x64.h +++ b/src/dynarmic/backend/x64/a32_emit_x64.h @@ -73,6 +73,7 @@ protected: std::map, void (*)()> read_fallbacks; std::map, void (*)()> write_fallbacks; + std::map, void (*)()> exclusive_write_fallbacks; void GenFastmemFallbacks(); const void* 
terminal_handler_pop_rsb_hint; @@ -98,6 +99,7 @@ protected: u64 resume_rip; u64 callback; DoNotFastmemMarker marker; + bool compile; }; tsl::robin_map fastmem_patch_info; std::set do_not_fastmem; @@ -113,6 +115,10 @@ protected: void ExclusiveReadMemory(A32EmitContext& ctx, IR::Inst* inst); template void ExclusiveWriteMemory(A32EmitContext& ctx, IR::Inst* inst); + template + void ExclusiveReadMemoryInline(A32EmitContext& ctx, IR::Inst* inst); + template + void ExclusiveWriteMemoryInline(A32EmitContext& ctx, IR::Inst* inst); // Terminal instruction emitters void EmitSetUpperLocationDescriptor(IR::LocationDescriptor new_location, IR::LocationDescriptor old_location); diff --git a/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp b/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp index 804a114a..33f1ddfc 100644 --- a/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp +++ b/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp @@ -16,6 +16,8 @@ #include "dynarmic/backend/x64/a32_emit_x64.h" #include "dynarmic/backend/x64/abi.h" #include "dynarmic/backend/x64/devirtualize.h" +#include "dynarmic/backend/x64/emit_x64_memory.h" +#include "dynarmic/backend/x64/exclusive_monitor_friend.h" #include "dynarmic/backend/x64/perf_map.h" #include "dynarmic/common/x64_disassemble.h" #include "dynarmic/interface/exclusive_monitor.h" @@ -38,6 +40,12 @@ void A32EmitX64::GenFastmemFallbacks() { {32, Devirtualize<&A32::UserCallbacks::MemoryWrite32>(conf.callbacks)}, {64, Devirtualize<&A32::UserCallbacks::MemoryWrite64>(conf.callbacks)}, }}; + const std::array, 4> exclusive_write_callbacks{{ + {8, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive8>(conf.callbacks)}, + {16, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive16>(conf.callbacks)}, + {32, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive32>(conf.callbacks)}, + {64, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive64>(conf.callbacks)}, + }}; for (int vaddr_idx : idxes) { for (int value_idx : idxes) { @@ -82,6 +90,32 @@ void A32EmitX64::GenFastmemFallbacks() { code.ret(); PerfMapRegister(write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_write_fallback_{}", bitsize)); } + + for (const auto& [bitsize, callback] : exclusive_write_callbacks) { + code.align(); + exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + code.mov(code.ABI_PARAM4, rax); + callback.EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_exclusive_write_fallback_{}", bitsize)); + } } } } @@ -109,15 +143,16 @@ FakeCall A32EmitX64::FastmemCallback(u64 rip_) { ASSERT_FALSE("iter != fastmem_patch_info.end()"); } - if (conf.recompile_on_fastmem_failure) { + if (iter->second.compile) { const 
auto marker = iter->second.marker; do_not_fastmem.emplace(marker); InvalidateBasicBlocks({std::get<0>(marker)}); } - FakeCall ret; - ret.call_rip = iter->second.callback; - ret.ret_rip = iter->second.resume_rip; - return ret; + + return FakeCall{ + .call_rip = iter->second.callback, + .ret_rip = iter->second.resume_rip, + }; } namespace { @@ -265,6 +300,7 @@ void A32EmitX64::EmitMemoryRead(A32EmitContext& ctx, IR::Inst* inst) { Common::BitCast(code.getCurr()), Common::BitCast(wrapped_fn), *fastmem_marker, + conf.recompile_on_fastmem_failure, }); ctx.reg_alloc.DefineValue(inst, value); @@ -318,6 +354,7 @@ void A32EmitX64::EmitMemoryWrite(A32EmitContext& ctx, IR::Inst* inst) { Common::BitCast(code.getCurr()), Common::BitCast(wrapped_fn), *fastmem_marker, + conf.recompile_on_fastmem_failure, }); return; @@ -418,40 +455,221 @@ void A32EmitX64::ExclusiveWriteMemory(A32EmitContext& ctx, IR::Inst* inst) { code.L(end); } +template +void A32EmitX64::ExclusiveReadMemoryInline(A32EmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor && conf.fastmem_pointer); + if (!exception_handler.SupportsFastmem()) { + ExclusiveReadMemory(ctx, inst); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg64 value = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(); + + const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; + + EmitExclusiveLock(code, conf, tmp, tmp2.cvt32()); + + code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(1)); + code.mov(tmp, Common::BitCast(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); + code.mov(qword[tmp], vaddr); + + const auto fastmem_marker = ShouldFastmem(ctx, inst); + if (fastmem_marker) { + Xbyak::Label end; + + const auto src_ptr = r13 + vaddr; + + const auto location = code.getCurr(); + EmitReadMemoryMov(code, value, src_ptr); + + fastmem_patch_info.emplace( + Common::BitCast(location), + FastmemPatchInfo{ + Common::BitCast(code.getCurr()), + Common::BitCast(wrapped_fn), + *fastmem_marker, + conf.recompile_on_exclusive_fastmem_failure, + }); + + code.L(end); + } else { + code.call(wrapped_fn); + } + + code.mov(tmp, Common::BitCast(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); + EmitWriteMemoryMov(code, tmp, value); + + EmitExclusiveUnlock(code, conf, tmp, tmp2.cvt32()); + + ctx.reg_alloc.DefineValue(inst, value); +} + +template +void A32EmitX64::ExclusiveWriteMemoryInline(A32EmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor && conf.fastmem_pointer); + if (!exception_handler.SupportsFastmem()) { + ExclusiveWriteMemory(ctx, inst); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]); + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + + const auto fallback_fn = exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; + + EmitExclusiveLock(code, conf, tmp, eax); + + Xbyak::Label end; + + code.mov(tmp, Common::BitCast(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); + code.mov(status, u32(1)); + code.cmp(code.byte[r15 + offsetof(A32JitState, exclusive_state)], 
u8(0)); + code.je(end, code.T_NEAR); + code.cmp(qword[tmp], vaddr); + code.jne(end, code.T_NEAR); + + EmitExclusiveTestAndClear(code, conf, vaddr, tmp, rax); + + code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); + code.mov(tmp, Common::BitCast(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); + + EmitReadMemoryMov(code, rax, tmp); + + const auto fastmem_marker = ShouldFastmem(ctx, inst); + if (fastmem_marker) { + const auto dest_ptr = r13 + vaddr; + + const auto location = code.getCurr(); + + switch (bitsize) { + case 8: + code.lock(); + code.cmpxchg(code.byte[dest_ptr], value.cvt8()); + break; + case 16: + code.lock(); + code.cmpxchg(word[dest_ptr], value.cvt16()); + break; + case 32: + code.lock(); + code.cmpxchg(dword[dest_ptr], value.cvt32()); + break; + case 64: + code.lock(); + code.cmpxchg(qword[dest_ptr], value.cvt64()); + break; + default: + UNREACHABLE(); + } + + code.setnz(status.cvt8()); + + code.SwitchToFarCode(); + + fastmem_patch_info.emplace( + Common::BitCast(location), + FastmemPatchInfo{ + Common::BitCast(code.getCurr()), + Common::BitCast(fallback_fn), + *fastmem_marker, + conf.recompile_on_exclusive_fastmem_failure, + }); + + code.cmp(al, 0); + code.setz(status.cvt8()); + code.movzx(status.cvt32(), status.cvt8()); + code.jmp(end, code.T_NEAR); + code.SwitchToNearCode(); + } else { + code.call(fallback_fn); + code.cmp(al, 0); + code.setz(status.cvt8()); + code.movzx(status.cvt32(), status.cvt8()); + } + + code.L(end); + + EmitExclusiveUnlock(code, conf, tmp, eax); + + ctx.reg_alloc.DefineValue(inst, status); +} + void A32EmitX64::EmitA32ClearExclusive(A32EmitContext&, IR::Inst*) { code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); } void A32EmitX64::EmitA32ExclusiveReadMemory8(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveReadMemory<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); + if (conf.fastmem_exclusive_access) { + ExclusiveReadMemoryInline<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); + } else { + ExclusiveReadMemory<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); + } } void A32EmitX64::EmitA32ExclusiveReadMemory16(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveReadMemory<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); + if (conf.fastmem_exclusive_access) { + ExclusiveReadMemoryInline<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); + } else { + ExclusiveReadMemory<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); + } } void A32EmitX64::EmitA32ExclusiveReadMemory32(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveReadMemory<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); + if (conf.fastmem_exclusive_access) { + ExclusiveReadMemoryInline<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); + } else { + ExclusiveReadMemory<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); + } } void A32EmitX64::EmitA32ExclusiveReadMemory64(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveReadMemory<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); + if (conf.fastmem_exclusive_access) { + ExclusiveReadMemoryInline<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); + } else { + ExclusiveReadMemory<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); + } } void A32EmitX64::EmitA32ExclusiveWriteMemory8(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveWriteMemory<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + if (conf.fastmem_exclusive_access) { + ExclusiveWriteMemoryInline<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + } else { + ExclusiveWriteMemory<8, 
&A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + } } void A32EmitX64::EmitA32ExclusiveWriteMemory16(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveWriteMemory<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + if (conf.fastmem_exclusive_access) { + ExclusiveWriteMemoryInline<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + } else { + ExclusiveWriteMemory<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + } } void A32EmitX64::EmitA32ExclusiveWriteMemory32(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveWriteMemory<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + if (conf.fastmem_exclusive_access) { + ExclusiveWriteMemoryInline<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + } else { + ExclusiveWriteMemory<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + } } void A32EmitX64::EmitA32ExclusiveWriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveWriteMemory<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + if (conf.fastmem_exclusive_access) { + ExclusiveWriteMemoryInline<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + } else { + ExclusiveWriteMemory<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + } } } // namespace Dynarmic::Backend::X64 diff --git a/src/dynarmic/backend/x64/a64_emit_x64.h b/src/dynarmic/backend/x64/a64_emit_x64.h index 32ab24e8..cbfa28bd 100644 --- a/src/dynarmic/backend/x64/a64_emit_x64.h +++ b/src/dynarmic/backend/x64/a64_emit_x64.h @@ -7,6 +7,7 @@ #include #include +#include #include #include "dynarmic/backend/x64/a64_jitstate.h" @@ -67,10 +68,12 @@ protected: void (*memory_read_128)(); void (*memory_write_128)(); + void (*memory_exclusive_write_128)(); void GenMemory128Accessors(); std::map, void (*)()> read_fallbacks; std::map, void (*)()> write_fallbacks; + std::map, void (*)()> exclusive_write_fallbacks; void GenFastmemFallbacks(); const void* terminal_handler_pop_rsb_hint; @@ -97,6 +100,7 @@ protected: u64 resume_rip; u64 callback; DoNotFastmemMarker marker; + bool recompile; }; tsl::robin_map fastmem_patch_info; std::set do_not_fastmem; @@ -112,6 +116,10 @@ protected: void EmitExclusiveReadMemory(A64EmitContext& ctx, IR::Inst* inst); template void EmitExclusiveWriteMemory(A64EmitContext& ctx, IR::Inst* inst); + template + void EmitExclusiveReadMemoryInline(A64EmitContext& ctx, IR::Inst* inst); + template + void EmitExclusiveWriteMemoryInline(A64EmitContext& ctx, IR::Inst* inst); // Terminal instruction emitters void EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; diff --git a/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp b/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp index 29aeaf01..0b2b6977 100644 --- a/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp +++ b/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp @@ -16,7 +16,10 @@ #include "dynarmic/backend/x64/a64_emit_x64.h" #include "dynarmic/backend/x64/abi.h" #include "dynarmic/backend/x64/devirtualize.h" +#include "dynarmic/backend/x64/emit_x64_memory.h" +#include "dynarmic/backend/x64/exclusive_monitor_friend.h" #include "dynarmic/backend/x64/perf_map.h" +#include "dynarmic/common/spin_lock_x64.h" #include "dynarmic/common/x64_disassemble.h" #include "dynarmic/interface/exclusive_monitor.h" @@ -73,7 +76,38 @@ void A64EmitX64::GenMemory128Accessors() { code.add(rsp, 8); #endif code.ret(); - PerfMapRegister(memory_read_128, code.getCurr(), "a64_memory_write_128"); + 
PerfMapRegister(memory_write_128, code.getCurr(), "a64_memory_write_128"); + + code.align(); + memory_exclusive_write_128 = code.getCurr(); +#ifdef _WIN32 + code.sub(rsp, 8 + 32 + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); + code.lea(code.ABI_PARAM4, ptr[rsp + ABI_SHADOW_SPACE + 16]); + code.movaps(xword[code.ABI_PARAM3], xmm1); + code.movaps(xword[code.ABI_PARAM4], xmm2); + Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive128>(conf.callbacks).EmitCall(code); + code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE); +#else + code.sub(rsp, 8); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(code.ABI_PARAM3, xmm1); + code.pextrq(code.ABI_PARAM4, xmm1, 1); + code.movq(code.ABI_PARAM5, xmm2); + code.pextrq(code.ABI_PARAM6, xmm2, 1); + } else { + code.movq(code.ABI_PARAM3, xmm1); + code.punpckhqdq(xmm1, xmm1); + code.movq(code.ABI_PARAM4, xmm1); + code.movq(code.ABI_PARAM5, xmm2); + code.punpckhqdq(xmm2, xmm2); + code.movq(code.ABI_PARAM6, xmm2); + } + Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive128>(conf.callbacks).EmitCall(code); + code.add(rsp, 8); +#endif + code.ret(); + PerfMapRegister(memory_exclusive_write_128, code.getCurr(), "a64_memory_exclusive_write_128"); } void A64EmitX64::GenFastmemFallbacks() { @@ -90,6 +124,12 @@ void A64EmitX64::GenFastmemFallbacks() { {32, Devirtualize<&A64::UserCallbacks::MemoryWrite32>(conf.callbacks)}, {64, Devirtualize<&A64::UserCallbacks::MemoryWrite64>(conf.callbacks)}, }}; + const std::array, 4> exclusive_write_callbacks{{ + {8, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive8>(conf.callbacks)}, + {16, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive16>(conf.callbacks)}, + {32, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive32>(conf.callbacks)}, + {64, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive64>(conf.callbacks)}, + }}; for (int vaddr_idx : idxes) { if (vaddr_idx == 4 || vaddr_idx == 15) { @@ -125,6 +165,28 @@ void A64EmitX64::GenFastmemFallbacks() { code.ret(); PerfMapRegister(write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); + code.align(); + exclusive_write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStack(code); + if (value_idx != 1) { + code.movaps(xmm1, Xbyak::Xmm{value_idx}); + } + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(xmm2, rax); + code.pinsrq(xmm2, rdx, 1); + } else { + code.movq(xmm2, rax); + code.movq(xmm0, rdx); + code.punpcklqdq(xmm2, xmm0); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + code.call(memory_exclusive_write_128); + ABI_PopCallerSaveRegistersAndAdjustStack(code); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); + if (value_idx == 4 || value_idx == 15) { continue; } @@ -170,6 +232,32 @@ void A64EmitX64::GenFastmemFallbacks() { code.ret(); PerfMapRegister(write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_write_fallback_{}", bitsize)); } + + for (const auto& [bitsize, callback] : exclusive_write_callbacks) { + code.align(); + exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, 
code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + code.mov(code.ABI_PARAM4, rax); + callback.EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_exclusive_write_fallback_{}", bitsize)); + } } } } @@ -197,15 +285,16 @@ FakeCall A64EmitX64::FastmemCallback(u64 rip_) { ASSERT_FALSE("iter != fastmem_patch_info.end()"); } - if (conf.recompile_on_fastmem_failure) { + if (iter->second.recompile) { const auto marker = iter->second.marker; do_not_fastmem.emplace(marker); InvalidateBasicBlocks({std::get<0>(marker)}); } - FakeCall ret; - ret.call_rip = iter->second.callback; - ret.ret_rip = iter->second.resume_rip; - return ret; + + return FakeCall{ + .call_rip = iter->second.callback, + .ret_rip = iter->second.resume_rip, + }; } namespace { @@ -309,24 +398,26 @@ Xbyak::RegExp EmitVAddrLookup(BlockOfCode& code, A64EmitContext& ctx, size_t bit return page + tmp; } -Xbyak::RegExp EmitFastmemVAddr(BlockOfCode& code, A64EmitContext& ctx, Xbyak::Label& abort, Xbyak::Reg64 vaddr, bool& require_abort_handling) { +Xbyak::RegExp EmitFastmemVAddr(BlockOfCode& code, A64EmitContext& ctx, Xbyak::Label& abort, Xbyak::Reg64 vaddr, bool& require_abort_handling, std::optional tmp = std::nullopt) { const size_t unused_top_bits = 64 - ctx.conf.fastmem_address_space_bits; if (unused_top_bits == 0) { return r13 + vaddr; } else if (ctx.conf.silently_mirror_fastmem) { - Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); - if (unused_top_bits < 32) { - code.mov(tmp, vaddr); - code.shl(tmp, int(unused_top_bits)); - code.shr(tmp, int(unused_top_bits)); - } else if (unused_top_bits == 32) { - code.mov(tmp.cvt32(), vaddr.cvt32()); - } else { - code.mov(tmp.cvt32(), vaddr.cvt32()); - code.and_(tmp, u32((1 << ctx.conf.fastmem_address_space_bits) - 1)); + if (!tmp) { + tmp = ctx.reg_alloc.ScratchGpr(); } - return r13 + tmp; + if (unused_top_bits < 32) { + code.mov(*tmp, vaddr); + code.shl(*tmp, int(unused_top_bits)); + code.shr(*tmp, int(unused_top_bits)); + } else if (unused_top_bits == 32) { + code.mov(tmp->cvt32(), vaddr.cvt32()); + } else { + code.mov(tmp->cvt32(), vaddr.cvt32()); + code.and_(*tmp, u32((1 << ctx.conf.fastmem_address_space_bits) - 1)); + } + return r13 + *tmp; } else { if (ctx.conf.fastmem_address_space_bits < 32) { code.test(vaddr, u32(-(1 << ctx.conf.fastmem_address_space_bits))); @@ -334,9 +425,11 @@ Xbyak::RegExp EmitFastmemVAddr(BlockOfCode& code, A64EmitContext& ctx, Xbyak::La require_abort_handling = true; } else { // TODO: Consider having TEST as above but coalesce 64-bit constant in register allocator - Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); - code.mov(tmp, vaddr); - code.shr(tmp, int(ctx.conf.fastmem_address_space_bits)); + if (!tmp) { + tmp = ctx.reg_alloc.ScratchGpr(); + } + code.mov(*tmp, vaddr); + code.shr(*tmp, int(ctx.conf.fastmem_address_space_bits)); code.jnz(abort, code.T_NEAR); require_abort_handling = true; } @@ -432,6 +525,7 @@ void A64EmitX64::EmitMemoryRead(A64EmitContext& ctx, IR::Inst* inst) { 
Common::BitCast(code.getCurr()), Common::BitCast(wrapped_fn), *fastmem_marker, + conf.recompile_on_fastmem_failure, }); } else { // Use page table @@ -498,6 +592,7 @@ void A64EmitX64::EmitMemoryWrite(A64EmitContext& ctx, IR::Inst* inst) { Common::BitCast(code.getCurr()), Common::BitCast(wrapped_fn), *fastmem_marker, + conf.recompile_on_fastmem_failure, }); } else { // Use page table @@ -650,48 +745,284 @@ void A64EmitX64::EmitExclusiveWriteMemory(A64EmitContext& ctx, IR::Inst* inst) { code.L(end); } +template +void A64EmitX64::EmitExclusiveReadMemoryInline(A64EmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor && conf.fastmem_pointer); + if (!exception_handler.SupportsFastmem()) { + EmitExclusiveReadMemory(ctx, inst); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); + const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx(); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(); + + const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value_idx)]; + + EmitExclusiveLock(code, conf, tmp, tmp2.cvt32()); + + code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1)); + code.mov(tmp, Common::BitCast(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); + code.mov(qword[tmp], vaddr); + + const auto fastmem_marker = ShouldFastmem(ctx, inst); + if (fastmem_marker) { + Xbyak::Label abort, end; + bool require_abort_handling = false; + + const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); + + const auto location = code.getCurr(); + EmitReadMemoryMov(code, value_idx, src_ptr); + + fastmem_patch_info.emplace( + Common::BitCast(location), + FastmemPatchInfo{ + Common::BitCast(code.getCurr()), + Common::BitCast(wrapped_fn), + *fastmem_marker, + conf.recompile_on_exclusive_fastmem_failure, + }); + + code.L(end); + + if (require_abort_handling) { + code.SwitchToFarCode(); + code.L(abort); + code.call(wrapped_fn); + code.jmp(end, code.T_NEAR); + code.SwitchToNearCode(); + } + } else { + code.call(wrapped_fn); + } + + code.mov(tmp, Common::BitCast(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); + EmitWriteMemoryMov(code, tmp, value_idx); + + EmitExclusiveUnlock(code, conf, tmp, tmp2.cvt32()); + + if constexpr (bitsize == 128) { + ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx}); + } else { + ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx}); + } +} + +template +void A64EmitX64::EmitExclusiveWriteMemoryInline(A64EmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor && conf.fastmem_pointer); + if (!exception_handler.SupportsFastmem()) { + EmitExclusiveWriteMemory(ctx, inst); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const auto value = [&] { + if constexpr (bitsize == 128) { + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + ctx.reg_alloc.ScratchGpr(HostLoc::RBX); + ctx.reg_alloc.ScratchGpr(HostLoc::RCX); + ctx.reg_alloc.ScratchGpr(HostLoc::RDX); + return ctx.reg_alloc.UseXmm(args[1]); + } else { + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + return ctx.reg_alloc.UseGpr(args[1]); + } + }(); + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + + const auto fallback_fn = 
exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; + + EmitExclusiveLock(code, conf, tmp, eax); + + Xbyak::Label end; + + code.mov(tmp, Common::BitCast(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); + code.mov(status, u32(1)); + code.cmp(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); + code.je(end, code.T_NEAR); + code.cmp(qword[tmp], vaddr); + code.jne(end, code.T_NEAR); + + EmitExclusiveTestAndClear(code, conf, vaddr, tmp, rax); + + code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); + code.mov(tmp, Common::BitCast(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); + + if constexpr (bitsize == 128) { + code.mov(rax, qword[tmp + 0]); + code.mov(rdx, qword[tmp + 8]); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(rbx, value); + code.pextrq(rcx, value, 1); + } else { + code.movaps(xmm0, value); + code.movq(rbx, xmm0); + code.punpckhqdq(xmm0, xmm0); + code.movq(rcx, xmm0); + } + } else { + EmitReadMemoryMov(code, rax.getIdx(), tmp); + } + + const auto fastmem_marker = ShouldFastmem(ctx, inst); + if (fastmem_marker) { + Xbyak::Label abort; + bool require_abort_handling = false; + + const auto dest_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling, tmp); + + const auto location = code.getCurr(); + + if constexpr (bitsize == 128) { + code.lock(); + code.cmpxchg16b(ptr[dest_ptr]); + } else { + switch (bitsize) { + case 8: + code.lock(); + code.cmpxchg(code.byte[dest_ptr], value.cvt8()); + break; + case 16: + code.lock(); + code.cmpxchg(word[dest_ptr], value.cvt16()); + break; + case 32: + code.lock(); + code.cmpxchg(dword[dest_ptr], value.cvt32()); + break; + case 64: + code.lock(); + code.cmpxchg(qword[dest_ptr], value.cvt64()); + break; + default: + UNREACHABLE(); + } + } + + code.setnz(status.cvt8()); + + code.SwitchToFarCode(); + code.L(abort); + code.call(fallback_fn); + + fastmem_patch_info.emplace( + Common::BitCast(location), + FastmemPatchInfo{ + Common::BitCast(code.getCurr()), + Common::BitCast(fallback_fn), + *fastmem_marker, + conf.recompile_on_exclusive_fastmem_failure, + }); + + code.cmp(al, 0); + code.setz(status.cvt8()); + code.movzx(status.cvt32(), status.cvt8()); + code.jmp(end, code.T_NEAR); + code.SwitchToNearCode(); + } else { + code.call(fallback_fn); + code.cmp(al, 0); + code.setz(status.cvt8()); + code.movzx(status.cvt32(), status.cvt8()); + } + + code.L(end); + + EmitExclusiveUnlock(code, conf, tmp, eax); + + ctx.reg_alloc.DefineValue(inst, status); +} + void A64EmitX64::EmitA64ClearExclusive(A64EmitContext&, IR::Inst*) { code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); } void A64EmitX64::EmitA64ExclusiveReadMemory8(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveReadMemory<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst); + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst); + } else { + EmitExclusiveReadMemory<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst); + } } void A64EmitX64::EmitA64ExclusiveReadMemory16(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveReadMemory<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst); + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst); + } else { + EmitExclusiveReadMemory<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst); + } } void A64EmitX64::EmitA64ExclusiveReadMemory32(A64EmitContext& ctx, IR::Inst* 
inst) { - EmitExclusiveReadMemory<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst); + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst); + } else { + EmitExclusiveReadMemory<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst); + } } void A64EmitX64::EmitA64ExclusiveReadMemory64(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveReadMemory<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst); + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst); + } else { + EmitExclusiveReadMemory<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst); + } } void A64EmitX64::EmitA64ExclusiveReadMemory128(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveReadMemory<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst); + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst); + } else { + EmitExclusiveReadMemory<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst); + } } void A64EmitX64::EmitA64ExclusiveWriteMemory8(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveWriteMemory<8, &A64::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<8, &A64::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + } else { + EmitExclusiveWriteMemory<8, &A64::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + } } void A64EmitX64::EmitA64ExclusiveWriteMemory16(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveWriteMemory<16, &A64::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<16, &A64::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + } else { + EmitExclusiveWriteMemory<16, &A64::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + } } void A64EmitX64::EmitA64ExclusiveWriteMemory32(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveWriteMemory<32, &A64::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<32, &A64::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + } else { + EmitExclusiveWriteMemory<32, &A64::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + } } void A64EmitX64::EmitA64ExclusiveWriteMemory64(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveWriteMemory<64, &A64::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<64, &A64::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + } else { + EmitExclusiveWriteMemory<64, &A64::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + } } void A64EmitX64::EmitA64ExclusiveWriteMemory128(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveWriteMemory<128, &A64::UserCallbacks::MemoryWriteExclusive128>(ctx, inst); + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<128, &A64::UserCallbacks::MemoryWriteExclusive128>(ctx, inst); + } else { + EmitExclusiveWriteMemory<128, &A64::UserCallbacks::MemoryWriteExclusive128>(ctx, inst); + } } } // namespace Dynarmic::Backend::X64 diff --git a/src/dynarmic/backend/x64/emit_x64_memory.h b/src/dynarmic/backend/x64/emit_x64_memory.h new file mode 100644 index 00000000..e5cf6a48 --- /dev/null +++ b/src/dynarmic/backend/x64/emit_x64_memory.h @@ -0,0 +1,62 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include + +#include "dynarmic/backend/x64/a64_emit_x64.h" +#include "dynarmic/backend/x64/exclusive_monitor_friend.h" +#include "dynarmic/common/spin_lock_x64.h" +#include "dynarmic/interface/exclusive_monitor.h" + +namespace Dynarmic::Backend::X64 { + +namespace { + +using namespace Xbyak::util; + +template +void EmitExclusiveLock(BlockOfCode& code, const UserConfig& conf, Xbyak::Reg64 pointer, Xbyak::Reg32 tmp) { + if (conf.HasOptimization(OptimizationFlag::Unsafe_IgnoreGlobalMonitor)) { + return; + } + + code.mov(pointer, Common::BitCast(GetExclusiveMonitorLockPointer(conf.global_monitor))); + EmitSpinLockLock(code, pointer, tmp); +} + +template +void EmitExclusiveUnlock(BlockOfCode& code, const UserConfig& conf, Xbyak::Reg64 pointer, Xbyak::Reg32 tmp) { + if (conf.HasOptimization(OptimizationFlag::Unsafe_IgnoreGlobalMonitor)) { + return; + } + + code.mov(pointer, Common::BitCast(GetExclusiveMonitorLockPointer(conf.global_monitor))); + EmitSpinLockUnlock(code, pointer, tmp); +} + +template +void EmitExclusiveTestAndClear(BlockOfCode& code, const UserConfig& conf, Xbyak::Reg64 vaddr, Xbyak::Reg64 pointer, Xbyak::Reg64 tmp) { + if (conf.HasOptimization(OptimizationFlag::Unsafe_IgnoreGlobalMonitor)) { + return; + } + + code.mov(tmp, 0xDEAD'DEAD'DEAD'DEAD); + const size_t processor_count = GetExclusiveMonitorProcessorCount(conf.global_monitor); + for (size_t processor_index = 0; processor_index < processor_count; processor_index++) { + if (processor_index == conf.processor_id) { + continue; + } + Xbyak::Label ok; + code.mov(pointer, Common::BitCast(GetExclusiveMonitorAddressPointer(conf.global_monitor, processor_index))); + code.cmp(qword[pointer], vaddr); + code.jne(ok); + code.mov(qword[pointer], tmp); + code.L(ok); + } +} + +} // namespace + +} // namespace Dynarmic::Backend::X64 diff --git a/src/dynarmic/backend/x64/exclusive_monitor.cpp b/src/dynarmic/backend/x64/exclusive_monitor.cpp index 0f66270f..6a323b9f 100644 --- a/src/dynarmic/backend/x64/exclusive_monitor.cpp +++ b/src/dynarmic/backend/x64/exclusive_monitor.cpp @@ -21,11 +21,11 @@ size_t ExclusiveMonitor::GetProcessorCount() const { } void ExclusiveMonitor::Lock() { - while (is_locked.test_and_set(std::memory_order_acquire)) {} + lock.Lock(); } void ExclusiveMonitor::Unlock() { - is_locked.clear(std::memory_order_release); + lock.Unlock(); } bool ExclusiveMonitor::CheckAndClear(size_t processor_id, VAddr address) { diff --git a/src/dynarmic/backend/x64/exclusive_monitor_friend.h b/src/dynarmic/backend/x64/exclusive_monitor_friend.h new file mode 100644 index 00000000..7f7fa242 --- /dev/null +++ b/src/dynarmic/backend/x64/exclusive_monitor_friend.h @@ -0,0 +1,28 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include "dynarmic/interface/exclusive_monitor.h" + +namespace Dynarmic { + +inline volatile int* GetExclusiveMonitorLockPointer(ExclusiveMonitor* monitor) { + return &monitor->lock.storage; +} + +inline size_t GetExclusiveMonitorProcessorCount(ExclusiveMonitor* monitor) { + return monitor->exclusive_addresses.size(); +} + +inline VAddr* GetExclusiveMonitorAddressPointer(ExclusiveMonitor* monitor, size_t index) { + return monitor->exclusive_addresses.data() + index; +} + +inline Vector* GetExclusiveMonitorValuePointer(ExclusiveMonitor* monitor, size_t index) { + return monitor->exclusive_values.data() + index; +} + +} // namespace Dynarmic diff --git a/src/dynarmic/common/spin_lock.h b/src/dynarmic/common/spin_lock.h new file mode 100644 index 00000000..a6ea9b68 --- /dev/null +++ b/src/dynarmic/common/spin_lock.h @@ -0,0 +1,17 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic { + +struct SpinLock { + void Lock(); + void Unlock(); + + volatile int storage; +}; + +} // namespace Dynarmic diff --git a/src/dynarmic/common/spin_lock_x64.cpp b/src/dynarmic/common/spin_lock_x64.cpp new file mode 100644 index 00000000..0f849931 --- /dev/null +++ b/src/dynarmic/common/spin_lock_x64.cpp @@ -0,0 +1,70 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include + +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/hostloc.h" +#include "dynarmic/common/spin_lock.h" + +namespace Dynarmic { + +void EmitSpinLockLock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp) { + Xbyak::Label start, loop; + + code.jmp(start); + code.L(loop); + code.pause(); + code.L(start); + code.mov(tmp, 1); + code.lock(); + code.xchg(code.dword[ptr], tmp); + code.test(tmp, tmp); + code.jnz(loop); +} + +void EmitSpinLockUnlock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp) { + code.xor_(tmp, tmp); + code.xchg(code.dword[ptr], tmp); + code.mfence(); +} + +namespace { + +struct SpinLockImpl { + SpinLockImpl(); + + Xbyak::CodeGenerator code; + void (*lock)(volatile int*); + void (*unlock)(volatile int*); +}; + +SpinLockImpl impl; + +SpinLockImpl::SpinLockImpl() { + const Xbyak::Reg64 ABI_PARAM1 = Backend::X64::HostLocToReg64(Backend::X64::ABI_PARAM1); + + code.align(); + lock = code.getCurr(); + EmitSpinLockLock(code, ABI_PARAM1, code.eax); + code.ret(); + + code.align(); + unlock = code.getCurr(); + EmitSpinLockUnlock(code, ABI_PARAM1, code.eax); + code.ret(); +} + +} // namespace + +void SpinLock::Lock() { + impl.lock(&storage); +} + +void SpinLock::Unlock() { + impl.unlock(&storage); +} + +} // namespace Dynarmic diff --git a/src/dynarmic/common/spin_lock_x64.h b/src/dynarmic/common/spin_lock_x64.h new file mode 100644 index 00000000..df6a3d74 --- /dev/null +++ b/src/dynarmic/common/spin_lock_x64.h @@ -0,0 +1,15 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <xbyak/xbyak.h>
+
+namespace Dynarmic {
+
+void EmitSpinLockLock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp);
+void EmitSpinLockUnlock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp);
+
+} // namespace Dynarmic
diff --git a/src/dynarmic/interface/A32/config.h b/src/dynarmic/interface/A32/config.h
index dbf64c6b..75c1ec59 100644
--- a/src/dynarmic/interface/A32/config.h
+++ b/src/dynarmic/interface/A32/config.h
@@ -177,6 +177,15 @@ struct UserConfig {
     /// accesses will hit the memory callbacks.
     bool recompile_on_fastmem_failure = true;
 
+    /// Determines if we should use the above fastmem_pointer for exclusive reads and
+    /// writes. On x64, dynarmic currently relies on x64 cmpxchg semantics which may not
+    /// provide fully accurate emulation.
+    bool fastmem_exclusive_access = false;
+    /// Determines if exclusive access instructions that pagefault should cause
+    /// recompilation of that block with fastmem disabled. Recompiled code will use memory
+    /// callbacks.
+    bool recompile_on_exclusive_fastmem_failure = true;
+
     // Coprocessors
     std::array<std::shared_ptr<Coprocessor>, 16> coprocessors{};
 
diff --git a/src/dynarmic/interface/A64/config.h b/src/dynarmic/interface/A64/config.h
index 1d4a758c..7926fb5a 100644
--- a/src/dynarmic/interface/A64/config.h
+++ b/src/dynarmic/interface/A64/config.h
@@ -254,6 +254,15 @@ struct UserConfig {
     /// This is only used if fastmem_pointer is not nullptr.
     bool silently_mirror_fastmem = true;
 
+    /// Determines if we should use the above fastmem_pointer for exclusive reads and
+    /// writes. On x64, dynarmic currently relies on x64 cmpxchg semantics which may not
+    /// provide fully accurate emulation.
+    bool fastmem_exclusive_access = false;
+    /// Determines if exclusive access instructions that pagefault should cause
+    /// recompilation of that block with fastmem disabled. Recompiled code will use memory
+    /// callbacks.
+    bool recompile_on_exclusive_fastmem_failure = true;
+
     /// This option relates to translation. Generally when we run into an unpredictable
     /// instruction the ExceptionRaised callback is called. If this is true, we define
     /// definite behaviour for some unpredictable instructions.
diff --git a/src/dynarmic/interface/exclusive_monitor.h b/src/dynarmic/interface/exclusive_monitor.h
index 70b0c7f8..48136758 100644
--- a/src/dynarmic/interface/exclusive_monitor.h
+++ b/src/dynarmic/interface/exclusive_monitor.h
@@ -12,6 +12,8 @@
 #include <cstdint>
 #include <vector>
 
+#include "dynarmic/common/spin_lock.h"
+
 namespace Dynarmic {
 
 using VAddr = std::uint64_t;
@@ -71,9 +73,14 @@ private:
     void Lock();
     void Unlock();
 
+    friend volatile int* GetExclusiveMonitorLockPointer(ExclusiveMonitor*);
+    friend size_t GetExclusiveMonitorProcessorCount(ExclusiveMonitor*);
+    friend VAddr* GetExclusiveMonitorAddressPointer(ExclusiveMonitor*, size_t index);
+    friend Vector* GetExclusiveMonitorValuePointer(ExclusiveMonitor*, size_t index);
+
     static constexpr VAddr RESERVATION_GRANULE_MASK = 0xFFFF'FFFF'FFFF'FFFFull;
     static constexpr VAddr INVALID_EXCLUSIVE_ADDRESS = 0xDEAD'DEAD'DEAD'DEADull;
-    std::atomic_flag is_locked;
+    SpinLock lock;
     std::vector<VAddr> exclusive_addresses;
     std::vector<Vector> exclusive_values;
 };
diff --git a/src/dynarmic/interface/optimization_flags.h b/src/dynarmic/interface/optimization_flags.h
index df7eee3e..2f65f0bf 100644
--- a/src/dynarmic/interface/optimization_flags.h
+++ b/src/dynarmic/interface/optimization_flags.h
@@ -45,6 +45,10 @@ enum class OptimizationFlag : std::uint32_t {
     /// This is an UNSAFE optimization that causes ASIMD floating-point instructions to be run with incorrect
     /// rounding modes. This may result in inaccurate results with all floating-point ASIMD instructions.
     Unsafe_IgnoreStandardFPCRValue = 0x00080000,
+    /// This is an UNSAFE optimization that causes the global monitor to be ignored. This may
+    /// result in unexpected behaviour in multithreaded scenarios, including but not limited
+    /// to data races and deadlocks.
+    Unsafe_IgnoreGlobalMonitor = 0x00100000,
 };
 
 constexpr OptimizationFlag no_optimizations = static_cast<OptimizationFlag>(0);
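
Note on the spin lock introduced in common/spin_lock_x64.cpp: EmitSpinLockLock generates a `lock xchg` loop with a `pause` hint, and EmitSpinLockUnlock writes zero back; SpinLock::Lock/Unlock simply call through the JITted thunks. The following standalone C++ sketch is illustrative only (it is not code from this patch, and the emitted unlock additionally issues an mfence) but shows the equivalent behaviour:

#include <atomic>
#include <emmintrin.h>  // _mm_pause

// Behavioural analogue of the emitted lock: acquire with an atomic exchange loop
// (the lock xchg / test / jnz sequence), spin politely with pause, release by
// storing zero.
struct SpinLockSketch {
    std::atomic<int> storage{0};

    void Lock() {
        while (storage.exchange(1, std::memory_order_acquire) != 0) {
            _mm_pause();
        }
    }

    void Unlock() {
        storage.store(0, std::memory_order_release);
    }
};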
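
The core of the inlined exclusive store (ExclusiveWriteMemoryInline / EmitExclusiveWriteMemoryInline) is: take the global-monitor spin lock, return status 1 if this core's exclusive_state or reserved address no longer matches, clear the reservation, then attempt a `lock cmpxchg` against the value captured by the exclusive load, with ZF deciding the returned status. Below is a minimal self-contained C++ sketch of that sequence for a 32-bit store; all names are illustrative, it omits EmitExclusiveTestAndClear (which also invalidates other cores' reservations), and it is not the code the backend emits:

#include <atomic>
#include <cstdint>
#include <mutex>

// Illustrative stand-in for one core's slice of the global monitor.
struct MonitorSketch {
    std::mutex lock;               // stands in for the global monitor spin lock
    std::uint64_t reserved_vaddr;  // address marked by the last exclusive load
    std::uint32_t reserved_value;  // value observed by the last exclusive load
};

// Returns 0 on success and 1 on failure, mirroring the MemoryWriteExclusive* contract.
inline std::uint32_t ExclusiveStore32Sketch(MonitorSketch& monitor, bool& exclusive_state,
                                            std::uint64_t vaddr, std::uint32_t value,
                                            std::uint8_t* fastmem_base) {
    std::lock_guard guard{monitor.lock};  // EmitExclusiveLock / EmitExclusiveUnlock

    if (!exclusive_state || monitor.reserved_vaddr != vaddr) {
        return 1;  // monitor already lost: fail without touching memory
    }
    exclusive_state = false;

    // lock cmpxchg [r13 + vaddr], value with RAX = reserved_value: the store happens
    // only if memory still holds the value seen by the exclusive load.
    auto* ptr = reinterpret_cast<std::atomic<std::uint32_t>*>(fastmem_base + vaddr);
    std::uint32_t expected = monitor.reserved_value;
    return ptr->compare_exchange_strong(expected, value) ? 0 : 1;
}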
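
Finally, a hedged usage sketch of the knobs this patch adds. The fields fastmem_exclusive_access and recompile_on_exclusive_fastmem_failure and the flag Unsafe_IgnoreGlobalMonitor come from the diffs above; the include path and the surrounding setup (callbacks, fastmem_pointer, global_monitor) are assumptions about an embedder's existing integration:

#include "dynarmic/interface/A64/config.h"  // in-tree path, as used in this patch

// Opt in to inlined exclusive memory access; assumes fastmem_pointer and
// global_monitor are already configured elsewhere.
void EnableInlinedExclusives(Dynarmic::A64::UserConfig& conf) {
    conf.fastmem_exclusive_access = true;                // emit lock cmpxchg instead of callbacks
    conf.recompile_on_exclusive_fastmem_failure = true;  // on fault, recompile without fastmem

    // UNSAFE and optional: skip the global-monitor spin lock entirely. The exact
    // spelling of the optimizations member may differ per integration:
    // conf.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor;
}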