From f96c43d4224ad451a2915e5957da813d19c74606 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Fri, 7 Sep 2018 21:30:12 +0100 Subject: [PATCH] A32: Implement FastDispatchHint --- include/dynarmic/A32/config.h | 3 + src/backend/x64/a32_emit_x64.cpp | 83 ++++++++++++++++--- src/backend/x64/a32_emit_x64.h | 16 ++++ src/backend/x64/block_of_code.cpp | 4 + src/backend/x64/block_of_code.h | 5 +- .../A32/translate/translate_arm/branch.cpp | 4 +- .../translate/translate_arm/load_store.cpp | 8 +- .../A32/translate/translate_thumb.cpp | 8 +- src/frontend/A64/translate/impl/system.cpp | 2 +- tests/A32/fuzz_arm.cpp | 1 + tests/A32/fuzz_thumb.cpp | 1 + 11 files changed, 112 insertions(+), 23 deletions(-) diff --git a/include/dynarmic/A32/config.h b/include/dynarmic/A32/config.h index 9486c62d..5e18e408 100644 --- a/include/dynarmic/A32/config.h +++ b/include/dynarmic/A32/config.h @@ -87,6 +87,9 @@ struct UserConfig { /// instruction the ExceptionRaised callback is called. If this is true, we define /// definite behaviour for some unpredictable instructions. bool define_unpredictable_behaviour = false; + + /// This enables the fast dispatcher. + bool enable_fast_dispatch = true; }; } // namespace A32 diff --git a/src/backend/x64/a32_emit_x64.cpp b/src/backend/x64/a32_emit_x64.cpp index 3ae0968c..4f2a25c6 100644 --- a/src/backend/x64/a32_emit_x64.cpp +++ b/src/backend/x64/a32_emit_x64.cpp @@ -79,10 +79,11 @@ bool A32EmitContext::FPSCR_DN() const { } A32EmitX64::A32EmitX64(BlockOfCode& code, A32::UserConfig config, A32::Jit* jit_interface) - : EmitX64(code), config(std::move(config)), jit_interface(jit_interface) -{ + : EmitX64(code), config(std::move(config)), jit_interface(jit_interface) { GenMemoryAccessors(); + GenTerminalHandlers(); code.PreludeComplete(); + ClearFastDispatchTable(); } A32EmitX64::~A32EmitX64() = default; @@ -148,10 +149,16 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) { void A32EmitX64::ClearCache() { EmitX64::ClearCache(); block_ranges.ClearCache(); + ClearFastDispatchTable(); } void A32EmitX64::InvalidateCacheRanges(const boost::icl::interval_set& ranges) { InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges)); + ClearFastDispatchTable(); +} + +void A32EmitX64::ClearFastDispatchTable() { + fast_dispatch_table.fill({0xFFFFFFFFFFFFFFFFull, nullptr}); } void A32EmitX64::GenMemoryAccessors() { @@ -220,6 +227,61 @@ void A32EmitX64::GenMemoryAccessors() { PerfMapRegister(write_memory_64, code.getCurr(), "a32_write_memory_64"); } +void A32EmitX64::GenTerminalHandlers() { + // PC ends up in ebp, location_descriptor ends up in rbx + const auto calculate_location_descriptor = [this] { + // This calculation has to match up with IREmitter::PushRSB + // TODO: Optimization is available here based on known state of FPSCR_mode and CPSR_et. + code.mov(ecx, MJitStateReg(A32::Reg::PC)); + code.mov(ebp, ecx); + code.shl(rcx, 32); + code.mov(ebx, dword[r15 + offsetof(A32JitState, FPSCR_mode)]); + code.or_(ebx, dword[r15 + offsetof(A32JitState, CPSR_et)]); + code.or_(rbx, rcx); + }; + + Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss; + + code.align(); + terminal_handler_pop_rsb_hint = code.getCurr(); + calculate_location_descriptor(); + code.mov(eax, dword[r15 + offsetof(A32JitState, rsb_ptr)]); + code.sub(eax, 1); + code.and_(eax, u32(A32JitState::RSBPtrMask)); + code.mov(dword[r15 + offsetof(A32JitState, rsb_ptr)], eax); + code.cmp(rbx, qword[r15 + offsetof(A32JitState, rsb_location_descriptors) + rax * sizeof(u64)]); + if (config.enable_fast_dispatch) { + code.jne(rsb_cache_miss); + } else { + code.jne(code.GetReturnFromRunCodeAddress()); + } + code.mov(rax, qword[r15 + offsetof(A32JitState, rsb_codeptrs) + rax * sizeof(u64)]); + code.jmp(rax); + PerfMapRegister(terminal_handler_pop_rsb_hint, code.getCurr(), "a32_terminal_handler_pop_rsb_hint"); + + if (config.enable_fast_dispatch) { + code.align(); + terminal_handler_fast_dispatch_hint = code.getCurr(); + calculate_location_descriptor(); + code.L(rsb_cache_miss); + code.mov(r12, reinterpret_cast(fast_dispatch_table.data())); + if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE42)) { + code.crc32(ebp, r12d); + } + code.and_(ebp, fast_dispatch_table_mask); + code.lea(rbp, ptr[r12 + rbp]); + code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]); + code.jne(fast_dispatch_cache_miss); + code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]); + code.L(fast_dispatch_cache_miss); + code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx); + code.LookupBlock(); + code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax); + code.jmp(rax); + PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a32_terminal_handler_fast_dispatch_hint"); + } +} + void A32EmitX64::EmitA32GetRegister(A32EmitContext& ctx, IR::Inst* inst) { A32::Reg reg = inst->GetArg(0).GetA32RegRef(); @@ -1222,16 +1284,15 @@ void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::Location } void A32EmitX64::EmitTerminalImpl(IR::Term::PopRSBHint, IR::LocationDescriptor) { - // This calculation has to match up with IREmitter::PushRSB - // TODO: Optimization is available here based on known state of FPSCR_mode and CPSR_et. - code.mov(ecx, MJitStateReg(A32::Reg::PC)); - code.shl(rcx, 32); - code.mov(ebx, dword[r15 + offsetof(A32JitState, FPSCR_mode)]); - code.or_(ebx, dword[r15 + offsetof(A32JitState, CPSR_et)]); - code.or_(rbx, rcx); + code.jmp(terminal_handler_pop_rsb_hint); +} -void A32EmitX64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor initial_location) { - EmitTerminalImpl(IR::Term::ReturnToDispatch{}, initial_location); +void A32EmitX64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor) { + if (config.enable_fast_dispatch) { + code.jmp(terminal_handler_fast_dispatch_hint); + } else { + code.ReturnFromRunCode(); + } } void A32EmitX64::EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location) { diff --git a/src/backend/x64/a32_emit_x64.h b/src/backend/x64/a32_emit_x64.h index 01bfb0cf..670fedad 100644 --- a/src/backend/x64/a32_emit_x64.h +++ b/src/backend/x64/a32_emit_x64.h @@ -6,6 +6,8 @@ #pragma once +#include + #include #include "backend/x64/a32_jitstate.h" @@ -49,6 +51,16 @@ protected: A32::Jit* jit_interface; BlockRangeInformation block_ranges; + struct FastDispatchEntry { + u64 location_descriptor; + const void* code_ptr; + }; + static_assert(sizeof(FastDispatchEntry) == 0x10); + static constexpr u64 fast_dispatch_table_mask = 0xFFFF0; + static constexpr size_t fast_dispatch_table_size = 0x10000; + std::array fast_dispatch_table; + void ClearFastDispatchTable(); + const void* read_memory_8; const void* read_memory_16; const void* read_memory_32; @@ -59,6 +71,10 @@ protected: const void* write_memory_64; void GenMemoryAccessors(); + const void* terminal_handler_pop_rsb_hint; + const void* terminal_handler_fast_dispatch_hint = nullptr; + void GenTerminalHandlers(); + // Microinstruction emitters #define OPCODE(...) #define A32OPC(name, type, ...) void EmitA32##name(A32EmitContext& ctx, IR::Inst* inst); diff --git a/src/backend/x64/block_of_code.cpp b/src/backend/x64/block_of_code.cpp index 73ebb3a3..b2bfec46 100644 --- a/src/backend/x64/block_of_code.cpp +++ b/src/backend/x64/block_of_code.cpp @@ -257,6 +257,10 @@ void BlockOfCode::UpdateTicks() { mov(qword[r15 + jsi.offsetof_cycles_remaining], ABI_RETURN); } +void BlockOfCode::LookupBlock() { + cb.LookupBlock->EmitCall(*this); +} + Xbyak::Address BlockOfCode::MConst(const Xbyak::AddressFrame& frame, u64 lower, u64 upper) { return constant_pool.GetConstant(frame, lower, upper); } diff --git a/src/backend/x64/block_of_code.h b/src/backend/x64/block_of_code.h index a99db2a6..fd5b4ee8 100644 --- a/src/backend/x64/block_of_code.h +++ b/src/backend/x64/block_of_code.h @@ -57,8 +57,11 @@ public: /// Code emitter: Makes saved host MXCSR the current MXCSR void SwitchMxcsrOnExit(); /// Code emitter: Updates cycles remaining my calling cb.AddTicks and cb.GetTicksRemaining - /// @note this clobbers ABI callee-save registers + /// @note this clobbers ABI caller-save registers void UpdateTicks(); + /// Code emitter: Performs a block lookup based on current state + /// @note this clobbers ABI caller-save registers + void LookupBlock(); /// Code emitter: Calls the function template diff --git a/src/frontend/A32/translate/translate_arm/branch.cpp b/src/frontend/A32/translate/translate_arm/branch.cpp index c593e953..b9610c07 100644 --- a/src/frontend/A32/translate/translate_arm/branch.cpp +++ b/src/frontend/A32/translate/translate_arm/branch.cpp @@ -52,7 +52,7 @@ bool ArmTranslatorVisitor::arm_BLX_reg(Cond cond, Reg m) { ir.PushRSB(ir.current_location.AdvancePC(4)); ir.BXWritePC(ir.GetRegister(m)); ir.SetRegister(Reg::LR, ir.Imm32(ir.current_location.PC() + 4)); - ir.SetTerm(IR::Term::ReturnToDispatch{}); + ir.SetTerm(IR::Term::FastDispatchHint{}); return false; } return true; @@ -65,7 +65,7 @@ bool ArmTranslatorVisitor::arm_BX(Cond cond, Reg m) { if (m == Reg::R14) ir.SetTerm(IR::Term::PopRSBHint{}); else - ir.SetTerm(IR::Term::ReturnToDispatch{}); + ir.SetTerm(IR::Term::FastDispatchHint{}); return false; } return true; diff --git a/src/frontend/A32/translate/translate_arm/load_store.cpp b/src/frontend/A32/translate/translate_arm/load_store.cpp index 05fb407a..70a3df76 100644 --- a/src/frontend/A32/translate/translate_arm/load_store.cpp +++ b/src/frontend/A32/translate/translate_arm/load_store.cpp @@ -66,7 +66,7 @@ bool ArmTranslatorVisitor::arm_LDR_lit(Cond cond, bool U, Reg t, Imm12 imm12) { if (t == Reg::PC) { ir.LoadWritePC(data); - ir.SetTerm(IR::Term::ReturnToDispatch{}); + ir.SetTerm(IR::Term::FastDispatchHint{}); return false; } @@ -96,7 +96,7 @@ bool ArmTranslatorVisitor::arm_LDR_imm(Cond cond, bool P, bool U, bool W, Reg n, if (!P && W && n == Reg::R13) ir.SetTerm(IR::Term::PopRSBHint{}); else - ir.SetTerm(IR::Term::ReturnToDispatch{}); + ir.SetTerm(IR::Term::FastDispatchHint{}); return false; } @@ -121,7 +121,7 @@ bool ArmTranslatorVisitor::arm_LDR_reg(Cond cond, bool P, bool U, bool W, Reg n, if (t == Reg::PC) { ir.LoadWritePC(data); - ir.SetTerm(IR::Term::ReturnToDispatch{}); + ir.SetTerm(IR::Term::FastDispatchHint{}); return false; } @@ -623,7 +623,7 @@ static bool LDMHelper(A32::IREmitter& ir, bool W, Reg n, RegList list, IR::U32 s if (n == Reg::R13) ir.SetTerm(IR::Term::PopRSBHint{}); else - ir.SetTerm(IR::Term::ReturnToDispatch{}); + ir.SetTerm(IR::Term::FastDispatchHint{}); return false; } return true; diff --git a/src/frontend/A32/translate/translate_thumb.cpp b/src/frontend/A32/translate/translate_thumb.cpp index 7967d86d..14cf33aa 100644 --- a/src/frontend/A32/translate/translate_thumb.cpp +++ b/src/frontend/A32/translate/translate_thumb.cpp @@ -370,7 +370,7 @@ struct ThumbTranslatorVisitor final { if (d == Reg::PC) { ir.ALUWritePC(result.result); // Return to dispatch as we can't predict what PC is going to be. Stop compilation. - ir.SetTerm(IR::Term::ReturnToDispatch{}); + ir.SetTerm(IR::Term::FastDispatchHint{}); return false; } else { ir.SetRegister(d, result.result); @@ -400,7 +400,7 @@ struct ThumbTranslatorVisitor final { auto result = ir.GetRegister(m); if (d == Reg::PC) { ir.ALUWritePC(result); - ir.SetTerm(IR::Term::ReturnToDispatch{}); + ir.SetTerm(IR::Term::FastDispatchHint{}); return false; } else { ir.SetRegister(d, result); @@ -775,7 +775,7 @@ struct ThumbTranslatorVisitor final { if (m == Reg::R14) ir.SetTerm(IR::Term::PopRSBHint{}); else - ir.SetTerm(IR::Term::ReturnToDispatch{}); + ir.SetTerm(IR::Term::FastDispatchHint{}); return false; } @@ -784,7 +784,7 @@ struct ThumbTranslatorVisitor final { ir.PushRSB(ir.current_location.AdvancePC(2)); ir.BXWritePC(ir.GetRegister(m)); ir.SetRegister(Reg::LR, ir.Imm32((ir.current_location.PC() + 2) | 1)); - ir.SetTerm(IR::Term::ReturnToDispatch{}); + ir.SetTerm(IR::Term::FastDispatchHint{}); return false; } diff --git a/src/frontend/A64/translate/impl/system.cpp b/src/frontend/A64/translate/impl/system.cpp index fe7ca3cb..281ff1c8 100644 --- a/src/frontend/A64/translate/impl/system.cpp +++ b/src/frontend/A64/translate/impl/system.cpp @@ -84,7 +84,7 @@ bool TranslatorVisitor::MSR_reg(Imm<1> o0, Imm<3> op1, Imm<4> CRn, Imm<4> CRm, I case SystemRegisterEncoding::FPCR: ir.SetFPCR(X(32, Rt)); ir.SetPC(ir.Imm64(ir.current_location->PC() + 4)); - ir.SetTerm(IR::Term::ReturnToDispatch{}); + ir.SetTerm(IR::Term::FastDispatchHint{}); return false; case SystemRegisterEncoding::FPSR: ir.SetFPSR(X(32, Rt)); diff --git a/tests/A32/fuzz_arm.cpp b/tests/A32/fuzz_arm.cpp index 3a1ee1b0..ec1b4f6f 100644 --- a/tests/A32/fuzz_arm.cpp +++ b/tests/A32/fuzz_arm.cpp @@ -41,6 +41,7 @@ using Dynarmic::Common::Bits; static Dynarmic::A32::UserConfig GetUserConfig(ArmTestEnv* testenv) { Dynarmic::A32::UserConfig user_config; + user_config.enable_fast_dispatch = false; user_config.callbacks = testenv; return user_config; } diff --git a/tests/A32/fuzz_thumb.cpp b/tests/A32/fuzz_thumb.cpp index 5cc4cd1c..6f159bff 100644 --- a/tests/A32/fuzz_thumb.cpp +++ b/tests/A32/fuzz_thumb.cpp @@ -32,6 +32,7 @@ static Dynarmic::A32::UserConfig GetUserConfig(ThumbTestEnv* testenv) { Dynarmic::A32::UserConfig user_config; + user_config.enable_fast_dispatch = false; user_config.callbacks = testenv; return user_config; }