From 960d14d18e8dc4a9fa07e7ef71badcd6de121f86 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Sat, 13 Aug 2016 00:10:23 +0100 Subject: [PATCH] Optimization: Implement Return Stack Buffer --- src/backend_x64/block_of_code.cpp | 20 ++++++--- src/backend_x64/block_of_code.h | 10 +++++ src/backend_x64/emit_x64.cpp | 44 ++++++++++++++++++- src/backend_x64/emit_x64.h | 2 + src/backend_x64/interface_x64.cpp | 16 ++++--- src/backend_x64/jitstate.cpp | 20 ++++++--- src/backend_x64/jitstate.h | 13 +++++- src/backend_x64/reg_alloc.cpp | 2 +- src/frontend/arm_types.h | 21 ++++++--- src/frontend/ir/ir.cpp | 11 +++++ src/frontend/ir/ir.h | 3 ++ src/frontend/ir/ir_emitter.cpp | 4 ++ src/frontend/ir/ir_emitter.h | 1 + src/frontend/ir/opcodes.inc | 3 ++ .../translate/translate_arm/branch.cpp | 8 +++- .../translate/translate_arm/load_store.cpp | 10 ++++- src/frontend/translate/translate_thumb.cpp | 9 +++- src/ir_opt/dead_code_elimination_pass.cpp | 1 + 18 files changed, 167 insertions(+), 31 deletions(-) diff --git a/src/backend_x64/block_of_code.cpp b/src/backend_x64/block_of_code.cpp index adf3de7e..e45bd719 100644 --- a/src/backend_x64/block_of_code.cpp +++ b/src/backend_x64/block_of_code.cpp @@ -15,7 +15,7 @@ using namespace Gen; namespace Dynarmic { namespace BackendX64 { -BlockOfCode::BlockOfCode() { +BlockOfCode::BlockOfCode() : Gen::XCodeBlock() { AllocCodeSpace(128 * 1024 * 1024); ClearCache(false); } @@ -29,6 +29,7 @@ void BlockOfCode::ClearCache(bool poison_memory) { GenConstants(); GenRunCode(); + GenReturnFromRunCode(); } size_t BlockOfCode::RunCode(JitState* jit_state, CodePtr basic_block, size_t cycles_to_run) const { @@ -41,11 +42,7 @@ size_t BlockOfCode::RunCode(JitState* jit_state, CodePtr basic_block, size_t cyc } void BlockOfCode::ReturnFromRunCode(bool MXCSR_switch) { - if (MXCSR_switch) - SwitchMxcsrOnExit(); - - ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); - RET(); + JMP(MXCSR_switch ? 
return_from_run_code : return_from_run_code_without_mxcsr_switch, true); } void BlockOfCode::GenConstants() { @@ -80,6 +77,17 @@ void BlockOfCode::GenRunCode() { JMPptr(R(ABI_PARAM2)); } +void BlockOfCode::GenReturnFromRunCode() { + return_from_run_code = GetCodePtr(); + + SwitchMxcsrOnExit(); + + return_from_run_code_without_mxcsr_switch = GetCodePtr(); + + ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); + RET(); +} + void BlockOfCode::SwitchMxcsrOnEntry() { STMXCSR(MDisp(R15, offsetof(JitState, save_host_MXCSR))); LDMXCSR(MDisp(R15, offsetof(JitState, guest_MXCSR))); diff --git a/src/backend_x64/block_of_code.h b/src/backend_x64/block_of_code.h index db1a168d..a5ba4eec 100644 --- a/src/backend_x64/block_of_code.h +++ b/src/backend_x64/block_of_code.h @@ -6,6 +6,8 @@ #pragma once +#include + #include "backend_x64/jitstate.h" #include "common/common_types.h" #include "common/x64/emitter.h" @@ -51,6 +53,10 @@ public: return Gen::M(const_FloatPenultimatePositiveDenormal64); } + CodePtr GetReturnFromRunCodeAddress() const { + return return_from_run_code; + } + private: const u8* const_FloatNegativeZero32; const u8* const_FloatNaN32; @@ -64,6 +70,10 @@ private: using RunCodeFuncType = void(*)(JitState*, CodePtr); RunCodeFuncType run_code; void GenRunCode(); + + CodePtr return_from_run_code; + CodePtr return_from_run_code_without_mxcsr_switch; + void GenReturnFromRunCode(); }; } // namespace BackendX64 diff --git a/src/backend_x64/emit_x64.cpp b/src/backend_x64/emit_x64.cpp index 4d3f5cde..460e20f5 100644 --- a/src/backend_x64/emit_x64.cpp +++ b/src/backend_x64/emit_x64.cpp @@ -67,6 +67,7 @@ EmitX64::BlockDescriptor EmitX64::Emit(const Arm::LocationDescriptor descriptor, code->INT3(); const CodePtr code_ptr = code->GetCodePtr(); basic_blocks[descriptor].code_ptr = code_ptr; + unique_hash_to_code_ptr[descriptor.UniqueHash()] = code_ptr; EmitCondPrelude(block.cond, block.cond_failed, block.location); @@ -328,6 +329,27 @@ void 
EmitX64::EmitCallSupervisor(IR::Block&, IR::Inst* inst) {
     code->SwitchMxcsrOnEntry();
 }
 
+void EmitX64::EmitPushRSB(IR::Block&, IR::Inst* inst) {
+    ASSERT(inst->GetArg(0).IsImmediate());
+    u64 imm64 = inst->GetArg(0).GetU64();
+    // imm64 is the return location's UniqueHash(). Until that block has been emitted, point the entry at the return-from-run-code stub; Patch() fixes it up later.
+    X64Reg tmp = reg_alloc.ScratchRegister({HostLoc::RCX});
+    X64Reg rsb_index = reg_alloc.ScratchRegister(any_gpr);
+    u64 code_ptr = unique_hash_to_code_ptr.find(imm64) != unique_hash_to_code_ptr.end()
+        ? u64(unique_hash_to_code_ptr[imm64])
+        : u64(code->GetReturnFromRunCodeAddress());
+    // Write (hash, code pointer) into slot rsb_ptr % RSBSize. EmitTerminalPopRSBHint reads both arrays at i * sizeof(u64), so the index must be scaled by 8.
+    code->MOV(32, R(rsb_index), MDisp(R15, offsetof(JitState, rsb_ptr)));
+    code->AND(32, R(rsb_index), Imm32(u32(JitState::RSBSize - 1)));
+    code->MOV(64, R(tmp), Imm64(imm64));
+    code->MOV(64, MComplex(R15, rsb_index, SCALE_8, offsetof(JitState, rsb_location_descriptors)), R(tmp));
+    patch_unique_hash_locations[imm64].emplace_back(code->GetCodePtr());
+    code->MOV(64, R(tmp), Imm64(code_ptr)); // This line has to match up with EmitX64::Patch (which re-emits MOV RCX, imm64 here).
+    code->MOV(64, MComplex(R15, rsb_index, SCALE_8, offsetof(JitState, rsb_codeptrs)), R(tmp));
+    code->ADD(32, R(rsb_index), Imm32(1));
+    code->MOV(32, MDisp(R15, offsetof(JitState, rsb_ptr)), R(rsb_index));
+}
+
 void EmitX64::EmitGetCarryFromOp(IR::Block&, IR::Inst*) {
     ASSERT_MSG(0, "should never happen");
 }
@@ -1696,7 +1718,22 @@ void EmitX64::EmitTerminalLinkBlockFast(IR::Term::LinkBlockFast terminal, Arm::L
 }
 
 void EmitX64::EmitTerminalPopRSBHint(IR::Term::PopRSBHint, Arm::LocationDescriptor initial_location) {
-    EmitTerminalReturnToDispatch({}, initial_location); // TODO: Implement RSB
+    // Rebuild LocationDescriptor::UniqueHash() in RBX: (((CPSR & (T|E)) >> 2) | FPSCR mode) << 32 | PC. This calculation has to match up with IREmitter::PushRSB
+    code->MOV(32, R(RBX), MJitStateCpsr());
+    code->MOV(32, R(RCX), MJitStateReg(Arm::Reg::PC));
+    code->AND(32, R(RBX), Imm32((1 << 5) | (1 << 9)));
+    code->SHR(32, R(RBX), Imm8(2));
+    code->OR(32, R(RBX), MDisp(R15, offsetof(JitState, guest_FPSCR_mode)));
+    code->SHL(64, R(RBX), Imm8(32));
+    code->OR(64, R(RBX), R(RCX));
+    // Default target is the return-from-run-code stub; any RSB slot whose stored hash equals RBX overrides it.
+    code->MOV(64, R(RAX), Imm64(u64(code->GetReturnFromRunCodeAddress())));
+    for (size_t i = 0; i < JitState::RSBSize; ++i) {
+        code->CMP(64, R(RBX), MDisp(R15, int(offsetof(JitState, rsb_location_descriptors) + i * sizeof(u64))));
+        code->CMOVcc(64, RAX, MDisp(R15, int(offsetof(JitState, rsb_codeptrs) + i * sizeof(u64))), CC_E);
+    }
+    code->SUB(32, MDisp(R15, offsetof(JitState, rsb_ptr)), Imm32(1));
+    code->JMPptr(R(RAX));
 }
 
 void EmitX64::EmitTerminalIf(IR::Term::If terminal, Arm::LocationDescriptor initial_location) {
@@ -1716,6 +1753,11 @@ void EmitX64::Patch(Arm::LocationDescriptor desc, CodePtr bb) {
         ASSERT(code->GetCodePtr() - location == 6);
     }
 
+    for (CodePtr location : patch_unique_hash_locations[desc.UniqueHash()]) {
+        code->SetCodePtr(const_cast<u8*>(location));
+        code->MOV(64, R(RCX), Imm64(u64(bb)));
+    }
+
     code->SetCodePtr(save_code_ptr);
 }
 
diff --git a/src/backend_x64/emit_x64.h b/src/backend_x64/emit_x64.h
index fc2f185d..eb437105 100644
--- a/src/backend_x64/emit_x64.h
+++ b/src/backend_x64/emit_x64.h
@@ -74,6 +74,8 @@ private:
     BlockOfCode* code;
     UserCallbacks cb;
     Jit* jit_interface;
+    std::unordered_map<u64, CodePtr> unique_hash_to_code_ptr;
+    std::unordered_map<u64, std::vector<CodePtr>> patch_unique_hash_locations;
     std::unordered_map<Arm::LocationDescriptor, BlockDescriptor, Arm::LocationDescriptorHash> basic_blocks;
     std::unordered_map<Arm::LocationDescriptor, std::vector<CodePtr>, Arm::LocationDescriptorHash> patch_jg_locations;
 };
diff --git a/src/backend_x64/interface_x64.cpp b/src/backend_x64/interface_x64.cpp
index 801f9e3f..7f48d0e2 100644
--- a/src/backend_x64/interface_x64.cpp
+++ b/src/backend_x64/interface_x64.cpp
@@ -29,10 +29,15 @@ namespace Dynarmic {
 using namespace BackendX64;
 
 struct Jit::Impl {
-    Impl(Jit* jit, UserCallbacks callbacks) : emitter(&block_of_code, callbacks, jit), callbacks(callbacks) {}
+    Impl(Jit* jit, UserCallbacks callbacks)
+            : block_of_code()
+            , jit_state(&block_of_code)
+            , emitter(&block_of_code, callbacks, jit)
+            , callbacks(callbacks)
+    {}
 
-    JitState jit_state{};
-    BlockOfCode block_of_code{};
+    BlockOfCode block_of_code;
+    JitState jit_state;
     EmitX64 emitter;
     const UserCallbacks callbacks;
 
@@ -41,7 +46,7 @@ struct Jit::Impl { bool TFlag = Common::Bit<5>(jit_state.Cpsr); bool EFlag = Common::Bit<9>(jit_state.Cpsr); - Arm::LocationDescriptor descriptor{pc, TFlag, EFlag, jit_state.guest_FPSCR_flags}; + Arm::LocationDescriptor descriptor{pc, TFlag, EFlag, jit_state.guest_FPSCR_mode}; CodePtr code_ptr = GetBasicBlock(descriptor).code_ptr; return block_of_code.RunCode(&jit_state, code_ptr, cycle_count); @@ -121,11 +126,12 @@ void Jit::ClearCache(bool poison_memory) { ASSERT(!is_executing); impl->block_of_code.ClearCache(poison_memory); impl->emitter.ClearCache(); + impl->jit_state.ResetRSB(&impl->block_of_code); } void Jit::Reset() { ASSERT(!is_executing); - impl->jit_state = {}; + impl->jit_state = JitState(&impl->block_of_code); } void Jit::HaltExecution() { diff --git a/src/backend_x64/jitstate.cpp b/src/backend_x64/jitstate.cpp index 36e90b42..bc67520b 100644 --- a/src/backend_x64/jitstate.cpp +++ b/src/backend_x64/jitstate.cpp @@ -4,14 +4,22 @@ * General Public License version 2 or any later version. 
*/ +#include "backend_x64/block_of_code.h" #include "backend_x64/jitstate.h" #include "common/assert.h" #include "common/bit_util.h" #include "common/common_types.h" +#include "frontend/arm_types.h" namespace Dynarmic { namespace BackendX64 { +void JitState::ResetRSB(BlockOfCode* code) { + for (auto& value : rsb_codeptrs) { + value = u64(code->GetReturnFromRunCodeAddress()); + } +} + /** * Comparing MXCSR and FPSCR * ========================= @@ -68,14 +76,16 @@ namespace BackendX64 { */ // NZCV; QC (ASMID only), AHP; DN, FZ, RMode, Stride; SBZP; Len; trap enables; cumulative bits -constexpr u32 FPSCR_MASK = 0b1111'00'111111'0'111'10011111'00000000; +constexpr u32 FPSCR_MODE_MASK = Arm::LocationDescriptor::FPSCR_MODE_MASK; +constexpr u32 FPSCR_NZCV_MASK = 0xF0000000; u32 JitState::Fpscr() const { - ASSERT((guest_FPSCR_flags & ~FPSCR_MASK) == 0); + ASSERT((guest_FPSCR_mode & ~FPSCR_MODE_MASK) == 0); + ASSERT((guest_FPSCR_nzcv & ~FPSCR_NZCV_MASK) == 0); ASSERT((FPSCR_IDC & ~(1 << 7)) == 0); ASSERT((FPSCR_UFC & ~(1 << 3)) == 0); - u32 FPSCR = guest_FPSCR_flags; + u32 FPSCR = guest_FPSCR_mode | guest_FPSCR_nzcv; FPSCR |= (guest_MXCSR & 0b0000000000001); // IOC = IE FPSCR |= (guest_MXCSR & 0b0000000111100) >> 1; // IXC, UFC, OFC, DZC = PE, UE, OE, ZE FPSCR |= FPSCR_IDC; @@ -86,7 +96,8 @@ u32 JitState::Fpscr() const { void JitState::SetFpscr(u32 FPSCR) { old_FPSCR = FPSCR; - guest_FPSCR_flags = FPSCR & FPSCR_MASK; + guest_FPSCR_mode = FPSCR & FPSCR_MODE_MASK; + guest_FPSCR_nzcv = FPSCR & FPSCR_NZCV_MASK; guest_MXCSR = 0; // Exception masks / enables @@ -114,6 +125,5 @@ void JitState::SetFpscr(u32 FPSCR) { } } - } // namespace BackendX64 } // namespace Dynarmic diff --git a/src/backend_x64/jitstate.h b/src/backend_x64/jitstate.h index e1e2618d..f110a40b 100644 --- a/src/backend_x64/jitstate.h +++ b/src/backend_x64/jitstate.h @@ -13,9 +13,13 @@ namespace Dynarmic { namespace BackendX64 { +class BlockOfCode; + constexpr size_t SpillCount = 32; struct JitState { + 
JitState(BlockOfCode* code) { ResetRSB(code); } + u32 Cpsr = 0; std::array Reg{}; // Current register file. // TODO: Mode-specific register sets unimplemented. @@ -34,9 +38,16 @@ struct JitState { u32 exclusive_state = 0; u32 exclusive_address = 0; + static constexpr size_t RSBSize = 4; // MUST be a power of 2. + u32 rsb_ptr = 0; + std::array rsb_location_descriptors; + std::array rsb_codeptrs; + void ResetRSB(BlockOfCode* code); + u32 FPSCR_IDC = 0; u32 FPSCR_UFC = 0; - u32 guest_FPSCR_flags = 0; + u32 guest_FPSCR_mode = 0; + u32 guest_FPSCR_nzcv = 0; u32 old_FPSCR = 0; u32 Fpscr() const; void SetFpscr(u32 FPSCR); diff --git a/src/backend_x64/reg_alloc.cpp b/src/backend_x64/reg_alloc.cpp index 83ab32c4..b5e1242e 100644 --- a/src/backend_x64/reg_alloc.cpp +++ b/src/backend_x64/reg_alloc.cpp @@ -45,7 +45,7 @@ static Gen::X64Reg HostLocToX64(HostLoc loc) { } static Gen::OpArg SpillToOpArg(HostLoc loc) { - static_assert(std::is_same::value, "Spill must be u64"); + static_assert(std::is_same::value, "Spill must be u64"); DEBUG_ASSERT(HostLocIsSpill(loc)); size_t i = static_cast(loc) - static_cast(HostLoc::FirstSpill); diff --git a/src/frontend/arm_types.h b/src/frontend/arm_types.h index 670471a8..a525c779 100644 --- a/src/frontend/arm_types.h +++ b/src/frontend/arm_types.h @@ -71,10 +71,10 @@ enum class SignExtendRotation { * tells us if the processor is in Thumb or Arm mode. 
*/ struct LocationDescriptor { - static constexpr u32 FPSCR_MASK = 0x3F79F9F; + static constexpr u32 FPSCR_MODE_MASK = 0x03F79F00; LocationDescriptor(u32 arm_pc, bool tflag, bool eflag, u32 fpscr) - : arm_pc(arm_pc), tflag(tflag), eflag(eflag), fpscr(fpscr & FPSCR_MASK) {} + : arm_pc(arm_pc), tflag(tflag), eflag(eflag), fpscr(fpscr & FPSCR_MODE_MASK) {} u32 PC() const { return arm_pc; } bool TFlag() const { return tflag; } @@ -106,7 +106,17 @@ struct LocationDescriptor { } LocationDescriptor SetFPSCR(u32 new_fpscr) const { - return LocationDescriptor(arm_pc, tflag, eflag, new_fpscr & FPSCR_MASK); + return LocationDescriptor(arm_pc, tflag, eflag, new_fpscr & FPSCR_MODE_MASK); + } + + u64 UniqueHash() const { + // This value MUST BE UNIQUE. + // This calculation has to match up with EmitX64::EmitTerminalPopRSBHint + u64 pc_u64 = u64(arm_pc); + u64 fpscr_u64 = u64(fpscr) << 32; + u64 t_u64 = tflag ? (1ull << 35) : 0; + u64 e_u64 = eflag ? (1ull << 39) : 0; + return pc_u64 | fpscr_u64 | t_u64 | e_u64; } private: @@ -118,10 +128,7 @@ private: struct LocationDescriptorHash { size_t operator()(const LocationDescriptor& x) const { - return std::hash()(static_cast(x.PC()) - ^ static_cast(x.TFlag()) - ^ (static_cast(x.EFlag()) << 1) - ^ (static_cast(x.FPSCR()) << 32)); + return std::hash()(x.UniqueHash()); } }; diff --git a/src/frontend/ir/ir.cpp b/src/frontend/ir/ir.cpp index fac45ff7..88fbba88 100644 --- a/src/frontend/ir/ir.cpp +++ b/src/frontend/ir/ir.cpp @@ -41,6 +41,10 @@ Value::Value(u32 value) : type(Type::U32) { inner.imm_u32 = value; } +Value::Value(u64 value) : type(Type::U64) { + inner.imm_u64 = value; +} + bool Value::IsImmediate() const { if (type == Type::Opaque) return inner.inst->GetOpcode() == Opcode::Identity ? 
inner.inst->GetArg(0).IsImmediate() : false; @@ -98,6 +102,13 @@ u32 Value::GetU32() const { return inner.imm_u32; } +u64 Value::GetU64() const { + if (type == Type::Opaque && inner.inst->GetOpcode() == Opcode::Identity) + return inner.inst->GetArg(0).GetU64(); + DEBUG_ASSERT(type == Type::U64); + return inner.imm_u64; +} + // Inst class member definitions Value Inst::GetArg(size_t index) const { diff --git a/src/frontend/ir/ir.h b/src/frontend/ir/ir.h index fceb49f5..d4be966d 100644 --- a/src/frontend/ir/ir.h +++ b/src/frontend/ir/ir.h @@ -50,6 +50,7 @@ public: explicit Value(bool value); explicit Value(u8 value); explicit Value(u32 value); + explicit Value(u64 value); bool IsEmpty() const; bool IsImmediate() const; @@ -61,6 +62,7 @@ public: bool GetU1() const; u8 GetU8() const; u32 GetU32() const; + u64 GetU64() const; private: Type type; @@ -72,6 +74,7 @@ private: bool imm_u1; u8 imm_u8; u32 imm_u32; + u64 imm_u64; } inner; }; diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp index b59d65e5..8fbba46a 100644 --- a/src/frontend/ir/ir_emitter.cpp +++ b/src/frontend/ir/ir_emitter.cpp @@ -98,6 +98,10 @@ void IREmitter::CallSupervisor(const IR::Value& value) { Inst(IR::Opcode::CallSupervisor, {value}); } +void IREmitter::PushRSB(const LocationDescriptor& return_location) { + Inst(IR::Opcode::PushRSB, {IR::Value(return_location.UniqueHash())}); +} + IR::Value IREmitter::GetCFlag() { return Inst(IR::Opcode::GetCFlag, {}); } diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h index 89372087..58ac6f75 100644 --- a/src/frontend/ir/ir_emitter.h +++ b/src/frontend/ir/ir_emitter.h @@ -54,6 +54,7 @@ public: void BXWritePC(const IR::Value& value); void LoadWritePC(const IR::Value& value); void CallSupervisor(const IR::Value& value); + void PushRSB(const LocationDescriptor& return_location); IR::Value GetCFlag(); void SetNFlag(const IR::Value& value); diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc index 
6aac8ad2..c81be50e 100644 --- a/src/frontend/ir/opcodes.inc +++ b/src/frontend/ir/opcodes.inc @@ -22,6 +22,9 @@ OPCODE(OrQFlag, T::Void, T::U1 OPCODE(BXWritePC, T::Void, T::U32 ) OPCODE(CallSupervisor, T::Void, T::U32 ) +// Hints +OPCODE(PushRSB, T::Void, T::U64 ) + // Pseudo-operation, handled specially at final emit OPCODE(GetCarryFromOp, T::U1, T::U32 ) OPCODE(GetOverflowFromOp, T::U1, T::U32 ) diff --git a/src/frontend/translate/translate_arm/branch.cpp b/src/frontend/translate/translate_arm/branch.cpp index 36ce9b20..b9b3596a 100644 --- a/src/frontend/translate/translate_arm/branch.cpp +++ b/src/frontend/translate/translate_arm/branch.cpp @@ -26,6 +26,7 @@ bool ArmTranslatorVisitor::arm_BL(Cond cond, Imm24 imm24) { u32 imm32 = Common::SignExtend<26, u32>(imm24 << 2) + 8; // BL