A64: Implement FastDispatchHint

This commit is contained in:
MerryMage 2018-09-07 21:51:42 +01:00
parent f96c43d422
commit 9b65100660
5 changed files with 92 additions and 23 deletions

View file

@ -163,6 +163,9 @@ struct UserConfig {
/// definite behaviour for some unpredictable instructions. /// definite behaviour for some unpredictable instructions.
bool define_unpredictable_behaviour = false; bool define_unpredictable_behaviour = false;
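/// This enables the fast dispatcher: FastDispatchHint terminals (and return stack
/// buffer misses) are resolved through an in-JIT dispatch table. When disabled,
/// those cases return from run code instead.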
bool enable_fast_dispatch = true;
// The below options relate to accuracy of floating-point emulation. // The below options relate to accuracy of floating-point emulation.
/// Determines how accurate NaN handling is. /// Determines how accurate NaN handling is.

View file

@ -64,11 +64,12 @@ bool A64EmitContext::AccurateNaN() const {
} }
/// Constructs the A64 emitter: emits the shared prelude routines into the code
/// buffer, marks the prelude complete, then resets the fast dispatch table.
A64EmitX64::A64EmitX64(BlockOfCode& code, A64::UserConfig conf, A64::Jit* jit_interface)
    : EmitX64(code)
    , conf(conf)
    , jit_interface{jit_interface}
{
    // Everything generated here is emitted before PreludeComplete() is called.
    GenMemory128Accessors();
    GenFastmemFallbacks();
    GenTerminalHandlers();
    code.PreludeComplete();

    // Start with the fast dispatch table in its cleared state.
    ClearFastDispatchTable();
}
A64EmitX64::~A64EmitX64() = default; A64EmitX64::~A64EmitX64() = default;
@ -134,10 +135,16 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) {
void A64EmitX64::ClearCache() { void A64EmitX64::ClearCache() {
EmitX64::ClearCache(); EmitX64::ClearCache();
block_ranges.ClearCache(); block_ranges.ClearCache();
ClearFastDispatchTable();
} }
/// Invalidates the basic blocks that block_ranges reports for the given ranges,
/// then resets the whole fast dispatch table.
void A64EmitX64::InvalidateCacheRanges(const boost::icl::interval_set<u64>& ranges) {
    InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges));

    // The table is reset wholesale rather than per entry.
    ClearFastDispatchTable();
}
void A64EmitX64::ClearFastDispatchTable() {
fast_dispatch_table.fill({0xFFFFFFFFFFFFFFFFull, nullptr});
} }
void A64EmitX64::GenMemory128Accessors() { void A64EmitX64::GenMemory128Accessors() {
@ -290,6 +297,62 @@ void A64EmitX64::GenFastmemFallbacks() {
} }
} }
/// Emits the shared terminal handlers that block terminals jump to.
///
/// The PopRSBHint handler is always emitted and its entry point is stored in
/// terminal_handler_pop_rsb_hint. The FastDispatchHint handler is emitted, and
/// terminal_handler_fast_dispatch_hint set, only when conf.enable_fast_dispatch
/// is true.
void A64EmitX64::GenTerminalHandlers() {
    // PC ends up in rbp, location_descriptor ends up in rbx (rcx is used as scratch)
    const auto calculate_location_descriptor = [this] {
        // This calculation has to match up with A64::LocationDescriptor::UniqueHash
        // TODO: Optimization is available here based on known state of FPSCR_mode and CPSR_et.
        code.mov(rbp, qword[r15 + offsetof(A64JitState, pc)]);
        code.mov(rcx, A64::LocationDescriptor::PC_MASK);
        code.and_(rcx, rbp);
        code.mov(ebx, dword[r15 + offsetof(A64JitState, fpcr)]);
        code.and_(ebx, A64::LocationDescriptor::FPCR_MASK);
        // The shift must operate on the 64-bit register: x86 masks the shift count
        // of a 32-bit operand to 5 bits, so `shl ebx, 37` would shift by only 5.
        // The 32-bit mov/and above have already zero-extended the value into rbx.
        code.shl(rbx, 37);
        code.or_(rbx, rcx);
    };

    Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss;

    // --- PopRSBHint handler ---
    code.align();
    terminal_handler_pop_rsb_hint = code.getCurr<const void*>();
    calculate_location_descriptor();
    // Step rsb_ptr back by one slot, wrap it with RSBPtrMask and store it back.
    code.mov(eax, dword[r15 + offsetof(A64JitState, rsb_ptr)]);
    code.sub(eax, 1);
    code.and_(eax, u32(A64JitState::RSBPtrMask));
    code.mov(dword[r15 + offsetof(A64JitState, rsb_ptr)], eax);
    // Does that slot hold the location we are about to execute?
    code.cmp(rbx, qword[r15 + offsetof(A64JitState, rsb_location_descriptors) + rax * sizeof(u64)]);
    if (conf.enable_fast_dispatch) {
        // On a mismatch, continue in the fast dispatch lookup below; rbp and rbx
        // already hold the PC and location descriptor it expects.
        code.jne(rsb_cache_miss);
    } else {
        code.jne(code.GetReturnFromRunCodeAddress());
    }
    // Match: jump to the code pointer stored in the same slot.
    code.mov(rax, qword[r15 + offsetof(A64JitState, rsb_codeptrs) + rax * sizeof(u64)]);
    code.jmp(rax);
    PerfMapRegister(terminal_handler_pop_rsb_hint, code.getCurr(), "a64_terminal_handler_pop_rsb_hint");

    // --- FastDispatchHint handler ---
    if (conf.enable_fast_dispatch) {
        code.align();
        terminal_handler_fast_dispatch_hint = code.getCurr<const void*>();
        calculate_location_descriptor();
        code.L(rsb_cache_miss);
        // r12 = base address of the table; rbp (the PC) is turned into a byte offset.
        code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
        if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE42)) {
            // Scramble the PC with crc32 when available; otherwise the masked PC is used as-is.
            code.crc32(rbp, r12d);
        }
        code.and_(ebp, fast_dispatch_table_mask);
        code.lea(rbp, ptr[r12 + rbp]);
        // rbp now points at the candidate entry.
        code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]);
        code.jne(fast_dispatch_cache_miss);
        code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]);
        // Miss: claim the entry for this descriptor, look the block up, then cache
        // and jump to the result.
        // NOTE(review): relies on LookupBlock() leaving the host code pointer in rax
        // and preserving rbx/rbp - confirm against BlockOfCode.
        code.L(fast_dispatch_cache_miss);
        code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx);
        code.LookupBlock();
        code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax);
        code.jmp(rax);
        PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a64_terminal_handler_fast_dispatch_hint");
    }
}
void A64EmitX64::EmitA64SetCheckBit(A64EmitContext& ctx, IR::Inst* inst) { void A64EmitX64::EmitA64SetCheckBit(A64EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8(); Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8();
@ -1051,27 +1114,15 @@ void A64EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::Location
} }
/// Emits a PopRSBHint terminal: an unconditional jump to the shared handler whose
/// address is held in terminal_handler_pop_rsb_hint.
void A64EmitX64::EmitTerminalImpl(IR::Term::PopRSBHint, IR::LocationDescriptor) {
    // The return stack buffer lookup itself lives in the shared handler, so each
    // block only pays for a single jump here.
    code.jmp(terminal_handler_pop_rsb_hint);
}
/// Emits a FastDispatchHint terminal: returns from run code when fast dispatch is
/// disabled, otherwise jumps to the shared fast dispatch handler.
void A64EmitX64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor) {
    if (!conf.enable_fast_dispatch) {
        // terminal_handler_fast_dispatch_hint is only generated when the option is on.
        code.ReturnFromRunCode();
        return;
    }

    code.jmp(terminal_handler_fast_dispatch_hint);
}
void A64EmitX64::EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location) { void A64EmitX64::EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location) {

View file

@ -53,6 +53,16 @@ protected:
A64::Jit* jit_interface; A64::Jit* jit_interface;
BlockRangeInformation<u64> block_ranges; BlockRangeInformation<u64> block_ranges;
struct FastDispatchEntry {
u64 location_descriptor;
const void* code_ptr;
};
static_assert(sizeof(FastDispatchEntry) == 0x10);
static constexpr u64 fast_dispatch_table_mask = 0xFFFFF0;
static constexpr size_t fast_dispatch_table_size = 0x100000;
std::array<FastDispatchEntry, fast_dispatch_table_size> fast_dispatch_table;
void ClearFastDispatchTable();
void (*memory_read_128)(); void (*memory_read_128)();
void (*memory_write_128)(); void (*memory_write_128)();
void GenMemory128Accessors(); void GenMemory128Accessors();
@ -61,6 +71,10 @@ protected:
std::map<std::tuple<size_t, int, int>, void(*)()> write_fallbacks; std::map<std::tuple<size_t, int, int>, void(*)()> write_fallbacks;
void GenFastmemFallbacks(); void GenFastmemFallbacks();
/// Entry points of the shared terminal handlers; assigned by GenTerminalHandlers().
/// Both start out null so neither is ever read uninitialized.
const void* terminal_handler_pop_rsb_hint = nullptr;
const void* terminal_handler_fast_dispatch_hint = nullptr;
/// Emits the shared terminal handlers and records their entry points above.
void GenTerminalHandlers();
void EmitDirectPageTableMemoryRead(A64EmitContext& ctx, IR::Inst* inst, size_t bitsize); void EmitDirectPageTableMemoryRead(A64EmitContext& ctx, IR::Inst* inst, size_t bitsize);
void EmitDirectPageTableMemoryWrite(A64EmitContext& ctx, IR::Inst* inst, size_t bitsize); void EmitDirectPageTableMemoryWrite(A64EmitContext& ctx, IR::Inst* inst, size_t bitsize);
void EmitExclusiveWrite(A64EmitContext& ctx, IR::Inst* inst, size_t bitsize); void EmitExclusiveWrite(A64EmitContext& ctx, IR::Inst* inst, size_t bitsize);

View file

@ -44,7 +44,7 @@ bool TranslatorVisitor::BLR(Reg Rn) {
ir.PushRSB(ir.current_location->AdvancePC(4)); ir.PushRSB(ir.current_location->AdvancePC(4));
ir.SetPC(target); ir.SetPC(target);
ir.SetTerm(IR::Term::ReturnToDispatch{}); ir.SetTerm(IR::Term::FastDispatchHint{});
return false; return false;
} }
@ -52,7 +52,7 @@ bool TranslatorVisitor::BR(Reg Rn) {
auto target = X(64, Rn); auto target = X(64, Rn);
ir.SetPC(target); ir.SetPC(target);
ir.SetTerm(IR::Term::ReturnToDispatch{}); ir.SetTerm(IR::Term::FastDispatchHint{});
return false; return false;
} }

View file

@ -150,6 +150,7 @@ static u32 GenFloatInst(u64 pc, bool is_last_inst) {
static Dynarmic::A64::UserConfig GetUserConfig(A64TestEnv& jit_env) { static Dynarmic::A64::UserConfig GetUserConfig(A64TestEnv& jit_env) {
Dynarmic::A64::UserConfig jit_user_config{&jit_env}; Dynarmic::A64::UserConfig jit_user_config{&jit_env};
jit_user_config.enable_fast_dispatch = false;
// The below corresponds to the settings for qemu's aarch64_max_initfn // The below corresponds to the settings for qemu's aarch64_max_initfn
jit_user_config.dczid_el0 = 7; jit_user_config.dczid_el0 = 7;
jit_user_config.ctr_el0 = 0x80038003; jit_user_config.ctr_el0 = 0x80038003;