diff --git a/src/dynarmic/backend/arm64/a32_address_space.cpp b/src/dynarmic/backend/arm64/a32_address_space.cpp index 982789a6..f839f3c3 100644 --- a/src/dynarmic/backend/arm64/a32_address_space.cpp +++ b/src/dynarmic/backend/arm64/a32_address_space.cpp @@ -152,55 +152,6 @@ void A32AddressSpace::EmitPrelude() { mem.unprotect(); - prelude_info.run_code = code.ptr(); - ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout)); - - code.MOV(Xstate, X1); - code.MOV(Xhalt, X2); - - code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor)); - code.AND(Wscratch0, Wscratch0, 0xffff0000); - code.MRS(Xscratch1, oaknut::SystemReg::FPCR); - code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr)); - code.MSR(oaknut::SystemReg::FPCR, Xscratch0); - - code.BR(X0); - - prelude_info.step_code = code.ptr(); - ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout)); - - code.MOV(Xstate, X1); - code.MOV(Xhalt, X2); - - code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor)); - code.AND(Wscratch0, Wscratch0, 0xffff0000); - code.MRS(Xscratch1, oaknut::SystemReg::FPCR); - code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr)); - code.MSR(oaknut::SystemReg::FPCR, Xscratch0); - - oaknut::Label step_hr_loop; - code.l(step_hr_loop); - code.LDAXR(Wscratch0, Xhalt); - code.ORR(Wscratch0, Wscratch0, static_cast(HaltReason::Step)); - code.STLXR(Wscratch1, Wscratch0, Xhalt); - code.CBNZ(Wscratch1, step_hr_loop); - - code.BR(X0); - - prelude_info.return_from_run_code = code.ptr(); - - code.LDR(Wscratch0, SP, offsetof(StackLayout, save_host_fpcr)); - code.MSR(oaknut::SystemReg::FPCR, Xscratch0); - - oaknut::Label exit_hr_loop; - code.l(exit_hr_loop); - code.LDAXR(W0, Xhalt); - code.STLXR(Wscratch0, WZR, Xhalt); - code.CBNZ(Wscratch0, exit_hr_loop); - - ABI_PopRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout)); - code.RET(); - prelude_info.read_memory_8 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead8>(code, conf.callbacks); prelude_info.read_memory_16 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead16>(code, conf.callbacks); prelude_info.read_memory_32 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead32>(code, conf.callbacks); @@ -223,6 +174,112 @@ void A32AddressSpace::EmitPrelude() { prelude_info.add_ticks = EmitCallTrampoline<&A32::UserCallbacks::AddTicks>(code, conf.callbacks); prelude_info.get_ticks_remaining = EmitCallTrampoline<&A32::UserCallbacks::GetTicksRemaining>(code, conf.callbacks); + oaknut::Label return_from_run_code; + + prelude_info.run_code = code.ptr(); + { + ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout)); + + code.MOV(X19, X0); + code.MOV(Xstate, X1); + code.MOV(Xhalt, X2); + + if (conf.enable_cycle_counting) { + code.BL(prelude_info.get_ticks_remaining); + code.MOV(Xticks, X0); + code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run)); + } + + code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor)); + code.AND(Wscratch0, Wscratch0, 0xffff0000); + code.MRS(Xscratch1, oaknut::SystemReg::FPCR); + code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr)); + code.MSR(oaknut::SystemReg::FPCR, Xscratch0); + + code.BR(X19); + } + + prelude_info.step_code = code.ptr(); + { + ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout)); + + code.MOV(X19, X0); + code.MOV(Xstate, X1); + code.MOV(Xhalt, X2); + + if (conf.enable_cycle_counting) { + code.MOV(Xticks, 1); + code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run)); + } + + code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor)); + code.AND(Wscratch0, Wscratch0, 0xffff0000); + code.MRS(Xscratch1, oaknut::SystemReg::FPCR); + code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr)); + code.MSR(oaknut::SystemReg::FPCR, Xscratch0); + + oaknut::Label step_hr_loop; + code.l(step_hr_loop); + code.LDAXR(Wscratch0, Xhalt); + code.ORR(Wscratch0, Wscratch0, static_cast(HaltReason::Step)); + code.STLXR(Wscratch1, Wscratch0, Xhalt); + code.CBNZ(Wscratch1, step_hr_loop); + + code.BR(X19); + } + + prelude_info.return_to_dispatcher = code.ptr(); + { + oaknut::Label l_this, l_addr; + + code.LDAR(Wscratch0, Xhalt); + code.CBNZ(Wscratch0, return_from_run_code); + + if (conf.enable_cycle_counting) { + code.CMP(Xticks, 0); + code.B(LE, return_from_run_code); + } + + code.LDR(X0, l_this); + code.MOV(X1, Xstate); + code.LDR(Xscratch0, l_addr); + code.BLR(Xscratch0); + code.BR(X0); + + const auto fn = [](A32AddressSpace& self, A32JitState& context) -> CodePtr { + return self.GetOrEmit(context.GetLocationDescriptor()); + }; + + code.align(8); + code.l(l_this); + code.dx(mcl::bit_cast(this)); + code.l(l_addr); + code.dx(mcl::bit_cast(Common::FptrCast(fn))); + } + + prelude_info.return_from_run_code = code.ptr(); + { + code.l(return_from_run_code); + + if (conf.enable_cycle_counting) { + code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run)); + code.SUB(X1, X1, Xticks); + code.BL(prelude_info.add_ticks); + } + + code.LDR(Wscratch0, SP, offsetof(StackLayout, save_host_fpcr)); + code.MSR(oaknut::SystemReg::FPCR, Xscratch0); + + oaknut::Label exit_hr_loop; + code.l(exit_hr_loop); + code.LDAXR(W0, Xhalt); + code.STLXR(Wscratch0, WZR, Xhalt); + code.CBNZ(Wscratch0, exit_hr_loop); + + ABI_PopRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout)); + code.RET(); + } + prelude_info.end_of_prelude = code.ptr(); mem.invalidate_all(); @@ -267,6 +324,9 @@ void A32AddressSpace::Link(EmittedBlockInfo& block_info) { CodeGenerator c{reinterpret_cast(block_info.entry_point + ptr_offset)}; switch (target) { + case LinkTarget::ReturnToDispatcher: + c.B(prelude_info.return_to_dispatcher); + break; case LinkTarget::ReturnFromRunCode: c.B(prelude_info.return_from_run_code); break; diff --git a/src/dynarmic/backend/arm64/a32_address_space.h b/src/dynarmic/backend/arm64/a32_address_space.h index b4c276f4..7bca0885 100644 --- a/src/dynarmic/backend/arm64/a32_address_space.h +++ b/src/dynarmic/backend/arm64/a32_address_space.h @@ -55,6 +55,7 @@ private: using RunCodeFuncType = HaltReason (*)(CodePtr entry_point, A32JitState* context, volatile u32* halt_reason); RunCodeFuncType run_code; RunCodeFuncType step_code; + void* return_to_dispatcher; void* return_from_run_code; void* read_memory_8; diff --git a/src/dynarmic/backend/arm64/abi.h b/src/dynarmic/backend/arm64/abi.h index 599ea720..3a14848a 100644 --- a/src/dynarmic/backend/arm64/abi.h +++ b/src/dynarmic/backend/arm64/abi.h @@ -14,6 +14,7 @@ namespace Dynarmic::Backend::Arm64 { constexpr oaknut::XReg Xstate{28}; constexpr oaknut::XReg Xhalt{27}; +constexpr oaknut::XReg Xticks{26}; constexpr oaknut::XReg Xscratch0{16}, Xscratch1{17}; constexpr oaknut::WReg Wscratch0{16}, Wscratch1{17}; @@ -40,7 +41,7 @@ constexpr auto Rscratch1() { } } -constexpr std::initializer_list GPR_ORDER{19, 20, 21, 22, 23, 24, 25, 26, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8}; +constexpr std::initializer_list GPR_ORDER{19, 20, 21, 22, 23, 24, 25, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8}; constexpr std::initializer_list FPR_ORDER{8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; using RegisterList = u64; diff --git a/src/dynarmic/backend/arm64/emit_arm64.cpp b/src/dynarmic/backend/arm64/emit_arm64.cpp index 91dd28ce..0e72a80b 100644 --- a/src/dynarmic/backend/arm64/emit_arm64.cpp +++ b/src/dynarmic/backend/arm64/emit_arm64.cpp @@ -135,15 +135,17 @@ void EmitIR(oaknut::CodeGenerator&, EmitContext ctx.reg_alloc.DefineAsExisting(inst, args[0]); } -static void EmitAddCycles(oaknut::CodeGenerator& code, EmitContext&, size_t cycles_to_add) { - code.LDR(Xscratch0, SP, offsetof(StackLayout, cycles_remaining)); +static void EmitAddCycles(oaknut::CodeGenerator& code, EmitContext& ctx, size_t cycles_to_add) { + if (!ctx.conf.enable_cycle_counting) { + return; + } + if (oaknut::AddSubImm::is_valid(cycles_to_add)) { - code.SUBS(Xscratch0, Xscratch0, cycles_to_add); + code.SUB(Xticks, Xticks, cycles_to_add); } else { code.MOV(Xscratch1, cycles_to_add); - code.SUBS(Xscratch0, Xscratch0, Xscratch1); + code.SUB(Xticks, Xticks, Xscratch1); } - code.STR(Xscratch0, SP, offsetof(StackLayout, cycles_remaining)); } EmittedBlockInfo EmitArm64(oaknut::CodeGenerator& code, IR::Block block, const EmitConfig& conf) { @@ -161,9 +163,7 @@ EmittedBlockInfo EmitArm64(oaknut::CodeGenerator& code, IR::Block block, const E ASSERT(ctx.block.HasConditionFailedLocation()); oaknut::Label pass = EmitA32Cond(code, ctx, ctx.block.GetCondition()); - if (conf.enable_cycle_counting) { - EmitAddCycles(code, ctx, ctx.block.ConditionFailedCycleCount()); - } + EmitAddCycles(code, ctx, ctx.block.ConditionFailedCycleCount()); EmitA32ConditionFailedTerminal(code, ctx); code.l(pass); } @@ -201,10 +201,7 @@ EmittedBlockInfo EmitArm64(oaknut::CodeGenerator& code, IR::Block block, const E reg_alloc.AssertNoMoreUses(); - if (ctx.conf.enable_cycle_counting) { - EmitAddCycles(code, ctx, block.CycleCount()); - } - + EmitAddCycles(code, ctx, block.CycleCount()); EmitA32Terminal(code, ctx); ebi.size = code.ptr() - ebi.entry_point; diff --git a/src/dynarmic/backend/arm64/emit_arm64.h b/src/dynarmic/backend/arm64/emit_arm64.h index db8c544b..ed03fa84 100644 --- a/src/dynarmic/backend/arm64/emit_arm64.h +++ b/src/dynarmic/backend/arm64/emit_arm64.h @@ -39,6 +39,7 @@ namespace Dynarmic::Backend::Arm64 { using CodePtr = std::byte*; enum class LinkTarget { + ReturnToDispatcher, ReturnFromRunCode, ReadMemory8, ReadMemory16, diff --git a/src/dynarmic/backend/arm64/emit_arm64_a32.cpp b/src/dynarmic/backend/arm64/emit_arm64_a32.cpp index 53bc3029..f713d688 100644 --- a/src/dynarmic/backend/arm64/emit_arm64_a32.cpp +++ b/src/dynarmic/backend/arm64/emit_arm64_a32.cpp @@ -38,7 +38,7 @@ void EmitA32Terminal(oaknut::CodeGenerator&, EmitContext&, IR::Term::Interpret, } void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) { - EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode); + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); } void EmitSetUpperLocationDescriptor(oaknut::CodeGenerator& code, EmitContext& ctx, IR::LocationDescriptor new_location, IR::LocationDescriptor old_location) { @@ -63,7 +63,7 @@ void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Li code.MOV(Wscratch0, A32::LocationDescriptor{terminal.next}.PC()); code.STR(Wscratch0, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * 15); - EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode); + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); // TODO: Implement LinkBlock optimization } @@ -73,19 +73,19 @@ void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Li code.MOV(Wscratch0, A32::LocationDescriptor{terminal.next}.PC()); code.STR(Wscratch0, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * 15); - EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode); + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); // TODO: Implement LinkBlockFast optimization } void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::PopRSBHint, IR::LocationDescriptor, bool) { - EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode); + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); // TODO: Implement PopRSBHint optimization } void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::FastDispatchHint, IR::LocationDescriptor, bool) { - EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode); + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); // TODO: Implement FastDispatchHint optimization } @@ -112,7 +112,7 @@ void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Ch code.CBNZ(Wscratch0, fail); EmitA32Terminal(code, ctx, terminal.else_, initial_location, is_single_step); code.l(fail); - EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode); + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); } void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step) { @@ -508,11 +508,9 @@ void EmitIR(oaknut::CodeGenerator& code, EmitCont auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.PrepareForCall(nullptr); - static_assert(offsetof(StackLayout, cycles_remaining) + sizeof(u64) == offsetof(StackLayout, cycles_to_run)); - if (ctx.conf.enable_cycle_counting) { - code.LDP(Xscratch0, Xscratch1, SP, offsetof(StackLayout, cycles_remaining)); - code.SUB(Xscratch0, Xscratch1, Xscratch0); + code.LDR(Xscratch0, SP, offsetof(StackLayout, cycles_to_run)); + code.SUB(Xscratch0, Xscratch0, Xticks); EmitRelocation(code, ctx, LinkTarget::AddTicks); } @@ -521,7 +519,8 @@ void EmitIR(oaknut::CodeGenerator& code, EmitCont if (ctx.conf.enable_cycle_counting) { EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining); - code.STP(X0, X0, SP, offsetof(StackLayout, cycles_remaining)); + code.STR(X0, SP, offsetof(StackLayout, cycles_to_run)); + code.MOV(Xticks, X0); } } @@ -530,11 +529,9 @@ void EmitIR(oaknut::CodeGenerator& code, EmitCon auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.PrepareForCall(nullptr); - static_assert(offsetof(StackLayout, cycles_remaining) + sizeof(u64) == offsetof(StackLayout, cycles_to_run)); - if (ctx.conf.enable_cycle_counting) { - code.LDP(Xscratch0, Xscratch1, SP, offsetof(StackLayout, cycles_remaining)); - code.SUB(Xscratch0, Xscratch1, Xscratch0); + code.LDR(Xscratch0, SP, offsetof(StackLayout, cycles_to_run)); + code.SUB(Xscratch0, Xscratch0, Xticks); EmitRelocation(code, ctx, LinkTarget::AddTicks); } @@ -544,7 +541,8 @@ void EmitIR(oaknut::CodeGenerator& code, EmitCon if (ctx.conf.enable_cycle_counting) { EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining); - code.STP(X0, X0, SP, offsetof(StackLayout, cycles_remaining)); + code.STR(X0, SP, offsetof(StackLayout, cycles_to_run)); + code.MOV(Xticks, X0); } } diff --git a/src/dynarmic/backend/arm64/stack_layout.h b/src/dynarmic/backend/arm64/stack_layout.h index 9eafdfc4..63218b18 100644 --- a/src/dynarmic/backend/arm64/stack_layout.h +++ b/src/dynarmic/backend/arm64/stack_layout.h @@ -19,11 +19,10 @@ constexpr size_t SpillCount = 64; #endif struct alignas(16) StackLayout { - s64 cycles_remaining; - s64 cycles_to_run; - std::array, SpillCount> spill; + s64 cycles_to_run; + u32 save_host_fpcr; bool check_bit;