diff --git a/include/dynarmic/A64/config.h b/include/dynarmic/A64/config.h
index a00fb8c3..64c05b72 100644
--- a/include/dynarmic/A64/config.h
+++ b/include/dynarmic/A64/config.h
@@ -163,6 +163,9 @@ struct UserConfig {
     /// definite behaviour for some unpredictable instructions.
     bool define_unpredictable_behaviour = false;
 
+    /// Enables the fast dispatcher: FastDispatchHint terminals and RSB misses use an in-JIT lookup table instead of returning from run code.
+    bool enable_fast_dispatch = true;
+
     // The below options relate to accuracy of floating-point emulation.
 
     /// Determines how accurate NaN handling is.
diff --git a/src/backend/x64/a64_emit_x64.cpp b/src/backend/x64/a64_emit_x64.cpp
index 46e2c74b..25fde3a6 100644
--- a/src/backend/x64/a64_emit_x64.cpp
+++ b/src/backend/x64/a64_emit_x64.cpp
@@ -64,11 +64,12 @@ bool A64EmitContext::AccurateNaN() const {
 }
 
 A64EmitX64::A64EmitX64(BlockOfCode& code, A64::UserConfig conf, A64::Jit* jit_interface)
-    : EmitX64(code), conf(conf), jit_interface{jit_interface}
-{
+    : EmitX64(code), conf(conf), jit_interface{jit_interface} {
     GenMemory128Accessors();
     GenFastmemFallbacks();
+    GenTerminalHandlers();
     code.PreludeComplete();
+    ClearFastDispatchTable();
 }
 
 A64EmitX64::~A64EmitX64() = default;
@@ -134,10 +135,16 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) {
 void A64EmitX64::ClearCache() {
     EmitX64::ClearCache();
     block_ranges.ClearCache();
+    ClearFastDispatchTable();
 }
 
 void A64EmitX64::InvalidateCacheRanges(const boost::icl::interval_set<u64>& ranges) {
     InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges));
+    ClearFastDispatchTable();
+}
+
+void A64EmitX64::ClearFastDispatchTable() {
+    fast_dispatch_table.fill({0xFFFFFFFFFFFFFFFFull, nullptr}); // every entry reset to an all-ones descriptor and a null code pointer
 }
 
 void A64EmitX64::GenMemory128Accessors() {
@@ -290,6 +297,62 @@ void A64EmitX64::GenFastmemFallbacks() {
     }
 }
 
+void A64EmitX64::GenTerminalHandlers() {
+    // PC ends up in rbp, location_descriptor ends up in rbx
+    const auto calculate_location_descriptor = [this] {
+        // This calculation has to match up with A64::LocationDescriptor::UniqueHash
+        // TODO: Optimization is available here based on known state of FPSCR_mode and CPSR_et.
+        code.mov(rbp, qword[r15 + offsetof(A64JitState, pc)]);
+        code.mov(rcx, A64::LocationDescriptor::PC_MASK);
+        code.and_(rcx, rbp);
+        code.mov(ebx, dword[r15 + offsetof(A64JitState, fpcr)]); // 32-bit load zero-extends into rbx
+        code.and_(ebx, A64::LocationDescriptor::FPCR_MASK);
+        code.shl(rbx, 37); // must be a 64-bit shift: a 32-bit shl masks its count to 5 bits and would shift by 5
+        code.or_(rbx, rcx);
+    };
+
+    Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss;
+
+    code.align();
+    terminal_handler_pop_rsb_hint = code.getCurr();
+    calculate_location_descriptor();
+    code.mov(eax, dword[r15 + offsetof(A64JitState, rsb_ptr)]);
+    code.sub(eax, 1);
+    code.and_(eax, u32(A64JitState::RSBPtrMask));
+    code.mov(dword[r15 + offsetof(A64JitState, rsb_ptr)], eax);
+    code.cmp(rbx, qword[r15 + offsetof(A64JitState, rsb_location_descriptors) + rax * sizeof(u64)]);
+    if (conf.enable_fast_dispatch) {
+        code.jne(rsb_cache_miss); // RSB miss: continue in the fast dispatch handler below
+    } else {
+        code.jne(code.GetReturnFromRunCodeAddress());
+    }
+    code.mov(rax, qword[r15 + offsetof(A64JitState, rsb_codeptrs) + rax * sizeof(u64)]);
+    code.jmp(rax);
+    PerfMapRegister(terminal_handler_pop_rsb_hint, code.getCurr(), "a64_terminal_handler_pop_rsb_hint");
+
+    if (conf.enable_fast_dispatch) {
+        code.align();
+        terminal_handler_fast_dispatch_hint = code.getCurr();
+        calculate_location_descriptor();
+        code.L(rsb_cache_miss); // entered from the RSB handler with rbp/rbx already computed
+        code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE42)) {
+            code.crc32(rbp, r12d);
+        }
+        code.and_(ebp, fast_dispatch_table_mask); // reduce to a byte offset inside the table
+        code.lea(rbp, ptr[r12 + rbp]);
+        code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]);
+        code.jne(fast_dispatch_cache_miss);
+        code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]);
+        code.L(fast_dispatch_cache_miss);
+        code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx);
+        code.LookupBlock();
+        code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax);
+        code.jmp(rax);
+        PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a64_terminal_handler_fast_dispatch_hint");
+    }
+}
+
 void A64EmitX64::EmitA64SetCheckBit(A64EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8();
@@ -1051,27 +1114,15 @@ void A64EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::Location
 }
 
 void A64EmitX64::EmitTerminalImpl(IR::Term::PopRSBHint, IR::LocationDescriptor) {
-    // This calculation has to match up with A64::LocationDescriptor::UniqueHash
-    // TODO: Optimization is available here based on known state of FPSCR_mode and CPSR_et.
-    code.mov(rcx, A64::LocationDescriptor::PC_MASK);
-    code.and_(rcx, qword[r15 + offsetof(A64JitState, pc)]);
-    code.mov(ebx, dword[r15 + offsetof(A64JitState, fpcr)]);
-    code.and_(ebx, A64::LocationDescriptor::FPCR_MASK);
-    code.shl(ebx, 37);
-    code.or_(rbx, rcx);
-
-    code.mov(eax, dword[r15 + offsetof(A64JitState, rsb_ptr)]);
-    code.sub(eax, 1);
-    code.and_(eax, u32(A64JitState::RSBPtrMask));
-    code.mov(dword[r15 + offsetof(A64JitState, rsb_ptr)], eax);
-    code.cmp(rbx, qword[r15 + offsetof(A64JitState, rsb_location_descriptors) + rax * sizeof(u64)]);
-    code.jne(code.GetReturnFromRunCodeAddress());
-    code.mov(rax, qword[r15 + offsetof(A64JitState, rsb_codeptrs) + rax * sizeof(u64)]);
-    code.jmp(rax);
+    code.jmp(terminal_handler_pop_rsb_hint);
 }
 
-void A64EmitX64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor initial_location) {
-    EmitTerminalImpl(IR::Term::ReturnToDispatch{}, initial_location);
+void A64EmitX64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor) {
+    if (conf.enable_fast_dispatch) {
+        code.jmp(terminal_handler_fast_dispatch_hint);
+    } else {
+        code.ReturnFromRunCode();
+    }
 }
 
 void A64EmitX64::EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location) {
diff --git a/src/backend/x64/a64_emit_x64.h b/src/backend/x64/a64_emit_x64.h
index 8a169a89..d349e238 100644
--- a/src/backend/x64/a64_emit_x64.h
+++ b/src/backend/x64/a64_emit_x64.h
@@ -53,6 +53,16 @@ protected:
     A64::Jit* jit_interface;
     BlockRangeInformation block_ranges;
 
+    struct FastDispatchEntry {
+        u64 location_descriptor;
+        const void* code_ptr;
+    };
+    static_assert(sizeof(FastDispatchEntry) == 0x10);
+    static constexpr u64 fast_dispatch_table_mask = 0xFFFFF0; // byte-offset mask: (fast_dispatch_table_size - 1) * sizeof(FastDispatchEntry)
+    static constexpr size_t fast_dispatch_table_size = 0x100000;
+    std::array<FastDispatchEntry, fast_dispatch_table_size> fast_dispatch_table;
+    void ClearFastDispatchTable();
+
     void (*memory_read_128)();
     void (*memory_write_128)();
     void GenMemory128Accessors();
@@ -61,6 +71,10 @@ protected:
     std::map<std::tuple<size_t, int, int>, void(*)()> write_fallbacks;
     void GenFastmemFallbacks();
 
+    const void* terminal_handler_pop_rsb_hint = nullptr;
+    const void* terminal_handler_fast_dispatch_hint = nullptr; // stays null when enable_fast_dispatch is false
+    void GenTerminalHandlers();
+
     void EmitDirectPageTableMemoryRead(A64EmitContext& ctx, IR::Inst* inst, size_t bitsize);
     void EmitDirectPageTableMemoryWrite(A64EmitContext& ctx, IR::Inst* inst, size_t bitsize);
     void EmitExclusiveWrite(A64EmitContext& ctx, IR::Inst* inst, size_t bitsize);
diff --git a/src/frontend/A64/translate/impl/branch.cpp b/src/frontend/A64/translate/impl/branch.cpp
index 061f8280..3129264e 100644
--- a/src/frontend/A64/translate/impl/branch.cpp
+++ b/src/frontend/A64/translate/impl/branch.cpp
@@ -44,7 +44,7 @@ bool TranslatorVisitor::BLR(Reg Rn) {
     ir.PushRSB(ir.current_location->AdvancePC(4));
 
     ir.SetPC(target);
-    ir.SetTerm(IR::Term::ReturnToDispatch{});
+    ir.SetTerm(IR::Term::FastDispatchHint{});
     return false;
 }
 
@@ -52,7 +52,7 @@ bool TranslatorVisitor::BR(Reg Rn) {
     auto target = X(64, Rn);
 
     ir.SetPC(target);
-    ir.SetTerm(IR::Term::ReturnToDispatch{});
+    ir.SetTerm(IR::Term::FastDispatchHint{});
     return false;
 }
 
diff --git a/tests/A64/fuzz_with_unicorn.cpp b/tests/A64/fuzz_with_unicorn.cpp
index 35a35499..4851b20f 100644
--- a/tests/A64/fuzz_with_unicorn.cpp
+++ b/tests/A64/fuzz_with_unicorn.cpp
@@ -150,6 +150,7 @@ static u32 GenFloatInst(u64 pc, bool is_last_inst) {
 
 static Dynarmic::A64::UserConfig GetUserConfig(A64TestEnv& jit_env) {
     Dynarmic::A64::UserConfig jit_user_config{&jit_env};
+    jit_user_config.enable_fast_dispatch = false;
     // The below corresponds to the settings for qemu's aarch64_max_initfn
     jit_user_config.dczid_el0 = 7;
     jit_user_config.ctr_el0 = 0x80038003;