/* This file is part of the dynarmic project. * Copyright (c) 2016 MerryMage * SPDX-License-Identifier: 0BSD */ #include #include #include #include #include #include "backend/x64/a64_emit_x64.h" #include "backend/x64/a64_jitstate.h" #include "backend/x64/abi.h" #include "backend/x64/block_of_code.h" #include "backend/x64/devirtualize.h" #include "backend/x64/emit_x64.h" #include "backend/x64/nzcv_util.h" #include "backend/x64/perf_map.h" #include "common/assert.h" #include "common/bit_util.h" #include "common/common_types.h" #include "common/scope_exit.h" #include "frontend/A64/location_descriptor.h" #include "frontend/A64/types.h" #include "ir/basic_block.h" #include "ir/cond.h" #include "ir/microinstruction.h" #include "ir/opcodes.h" // TODO: Have ARM flags in host flags and not have them use up GPR registers unless necessary. // TODO: Actually implement that proper instruction selector you've always wanted to sweetheart. namespace Dynarmic::Backend::X64 { using namespace Xbyak::util; A64EmitContext::A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block) : EmitContext(reg_alloc, block), conf(conf) {} A64::LocationDescriptor A64EmitContext::Location() const { return A64::LocationDescriptor{block.Location()}; } bool A64EmitContext::IsSingleStep() const { return Location().SingleStepping(); } FP::FPCR A64EmitContext::FPCR(bool fpcr_controlled) const { return fpcr_controlled ? Location().FPCR() : Location().FPCR().ASIMDStandardValue(); } A64EmitX64::A64EmitX64(BlockOfCode& code, A64::UserConfig conf, A64::Jit* jit_interface) : EmitX64(code), conf(conf), jit_interface{jit_interface} { GenMemory128Accessors(); GenFastmemFallbacks(); GenTerminalHandlers(); code.PreludeComplete(); ClearFastDispatchTable(); } A64EmitX64::~A64EmitX64() = default; A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) { code.EnableWriting(); SCOPE_EXIT { code.DisableWriting(); }; static const std::vector gpr_order = [this]{ std::vector gprs{any_gpr}; if (conf.page_table) { gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R14)); } return gprs; }(); RegAlloc reg_alloc{code, A64JitState::SpillCount, SpillToOpArg, gpr_order, any_xmm}; A64EmitContext ctx{conf, reg_alloc, block}; // Start emitting. code.align(); const u8* const entrypoint = code.getCurr(); code.SwitchToFarCode(); const u8* const entrypoint_far = code.getCurr(); code.SwitchToNearCode(); ASSERT(block.GetCondition() == IR::Cond::AL); for (auto iter = block.begin(); iter != block.end(); ++iter) { IR::Inst* inst = &*iter; // Call the relevant Emit* member function. switch (inst->GetOpcode()) { #define OPCODE(name, type, ...) \ case IR::Opcode::name: \ A64EmitX64::Emit##name(ctx, inst); \ break; #define A32OPC(...) #define A64OPC(name, type, ...) \ case IR::Opcode::A64##name: \ A64EmitX64::EmitA64##name(ctx, inst); \ break; #include "ir/opcodes.inc" #undef OPCODE #undef A32OPC #undef A64OPC default: ASSERT_MSG(false, "Invalid opcode: {}", inst->GetOpcode()); break; } ctx.reg_alloc.EndOfAllocScope(); } reg_alloc.AssertNoMoreUses(); EmitAddCycles(block.CycleCount()); EmitX64::EmitTerminal(block.GetTerminal(), ctx.Location().SetSingleStepping(false), ctx.IsSingleStep()); code.int3(); const size_t size = static_cast(code.getCurr() - entrypoint); const A64::LocationDescriptor descriptor{block.Location()}; const A64::LocationDescriptor end_location{block.EndLocation()}; const auto range = boost::icl::discrete_interval::closed(descriptor.PC(), end_location.PC() - 1); block_ranges.AddRange(range, descriptor); return RegisterBlock(descriptor, entrypoint, entrypoint_far, size); } void A64EmitX64::ClearCache() { EmitX64::ClearCache(); block_ranges.ClearCache(); ClearFastDispatchTable(); } void A64EmitX64::InvalidateCacheRanges(const boost::icl::interval_set& ranges) { InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges)); } void A64EmitX64::ClearFastDispatchTable() { if (conf.HasOptimization(OptimizationFlag::FastDispatch)) { fast_dispatch_table.fill({}); } } void A64EmitX64::GenMemory128Accessors() { code.align(); memory_read_128 = code.getCurr(); #ifdef _WIN32 Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCallWithReturnPointer(code, [&](Xbyak::Reg64 return_value_ptr, [[maybe_unused]] RegList args) { code.mov(code.ABI_PARAM3, code.ABI_PARAM2); code.sub(rsp, 8 + 16 + ABI_SHADOW_SPACE); code.lea(return_value_ptr, ptr[rsp + ABI_SHADOW_SPACE]); }); code.movups(xmm1, xword[code.ABI_RETURN]); code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE); #else code.sub(rsp, 8); Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCall(code); if (code.HasSSE41()) { code.movq(xmm1, code.ABI_RETURN); code.pinsrq(xmm1, code.ABI_RETURN2, 1); } else { code.movq(xmm1, code.ABI_RETURN); code.movq(xmm2, code.ABI_RETURN2); code.punpcklqdq(xmm1, xmm2); } code.add(rsp, 8); #endif code.ret(); PerfMapRegister(memory_read_128, code.getCurr(), "a64_memory_read_128"); code.align(); memory_write_128 = code.getCurr(); #ifdef _WIN32 code.sub(rsp, 8 + 16 + ABI_SHADOW_SPACE); code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); code.movaps(xword[code.ABI_PARAM3], xmm1); Devirtualize<&A64::UserCallbacks::MemoryWrite128>(conf.callbacks).EmitCall(code); code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE); #else code.sub(rsp, 8); if (code.HasSSE41()) { code.movq(code.ABI_PARAM3, xmm1); code.pextrq(code.ABI_PARAM4, xmm1, 1); } else { code.movq(code.ABI_PARAM3, xmm1); code.punpckhqdq(xmm1, xmm1); code.movq(code.ABI_PARAM4, xmm1); } Devirtualize<&A64::UserCallbacks::MemoryWrite128>(conf.callbacks).EmitCall(code); code.add(rsp, 8); #endif code.ret(); PerfMapRegister(memory_read_128, code.getCurr(), "a64_memory_write_128"); } void A64EmitX64::GenFastmemFallbacks() { const std::initializer_list idxes{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const std::array, 4> read_callbacks{{ {8, Devirtualize<&A64::UserCallbacks::MemoryRead8>(conf.callbacks)}, {16, Devirtualize<&A64::UserCallbacks::MemoryRead16>(conf.callbacks)}, {32, Devirtualize<&A64::UserCallbacks::MemoryRead32>(conf.callbacks)}, {64, Devirtualize<&A64::UserCallbacks::MemoryRead64>(conf.callbacks)}, }}; const std::array, 4> write_callbacks{{ {8, Devirtualize<&A64::UserCallbacks::MemoryWrite8>(conf.callbacks)}, {16, Devirtualize<&A64::UserCallbacks::MemoryWrite16>(conf.callbacks)}, {32, Devirtualize<&A64::UserCallbacks::MemoryWrite32>(conf.callbacks)}, {64, Devirtualize<&A64::UserCallbacks::MemoryWrite64>(conf.callbacks)}, }}; for (int vaddr_idx : idxes) { if (vaddr_idx == 4 || vaddr_idx == 15) { continue; } for (int value_idx : idxes) { code.align(); read_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr(); ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); if (vaddr_idx != code.ABI_PARAM2.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); } code.call(memory_read_128); if (value_idx != 1) { code.movaps(Xbyak::Xmm{value_idx}, xmm1); } ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); code.ret(); PerfMapRegister(read_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_read_fallback_128"); code.align(); write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr(); ABI_PushCallerSaveRegistersAndAdjustStack(code); if (vaddr_idx != code.ABI_PARAM2.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); } if (value_idx != 1) { code.movaps(xmm1, Xbyak::Xmm{value_idx}); } code.call(memory_write_128); ABI_PopCallerSaveRegistersAndAdjustStack(code); code.ret(); PerfMapRegister(write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); if (value_idx == 4 || value_idx == 15) { continue; } for (const auto& [bitsize, callback] : read_callbacks) { code.align(); read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); if (vaddr_idx != code.ABI_PARAM2.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); } callback.EmitCall(code); if (value_idx != code.ABI_RETURN.getIdx()) { code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); } ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); code.ret(); PerfMapRegister(read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_read_fallback_{}", bitsize)); } for (const auto& [bitsize, callback] : write_callbacks) { code.align(); write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); ABI_PushCallerSaveRegistersAndAdjustStack(code); if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); if (value_idx != code.ABI_PARAM3.getIdx()) { code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); } } else { if (value_idx != code.ABI_PARAM3.getIdx()) { code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); } if (vaddr_idx != code.ABI_PARAM2.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); } } callback.EmitCall(code); ABI_PopCallerSaveRegistersAndAdjustStack(code); code.ret(); PerfMapRegister(write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_write_fallback_{}", bitsize)); } } } } void A64EmitX64::GenTerminalHandlers() { // PC ends up in rbp, location_descriptor ends up in rbx const auto calculate_location_descriptor = [this] { // This calculation has to match up with A64::LocationDescriptor::UniqueHash // TODO: Optimization is available here based on known state of fpcr. code.mov(rbp, qword[r15 + offsetof(A64JitState, pc)]); code.mov(rcx, A64::LocationDescriptor::pc_mask); code.and_(rcx, rbp); code.mov(ebx, dword[r15 + offsetof(A64JitState, fpcr)]); code.and_(ebx, A64::LocationDescriptor::fpcr_mask); code.shl(rbx, A64::LocationDescriptor::fpcr_shift); code.or_(rbx, rcx); }; Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss; code.align(); terminal_handler_pop_rsb_hint = code.getCurr(); calculate_location_descriptor(); code.mov(eax, dword[r15 + offsetof(A64JitState, rsb_ptr)]); code.sub(eax, 1); code.and_(eax, u32(A64JitState::RSBPtrMask)); code.mov(dword[r15 + offsetof(A64JitState, rsb_ptr)], eax); code.cmp(rbx, qword[r15 + offsetof(A64JitState, rsb_location_descriptors) + rax * sizeof(u64)]); if (conf.HasOptimization(OptimizationFlag::FastDispatch)) { code.jne(rsb_cache_miss); } else { code.jne(code.GetReturnFromRunCodeAddress()); } code.mov(rax, qword[r15 + offsetof(A64JitState, rsb_codeptrs) + rax * sizeof(u64)]); code.jmp(rax); PerfMapRegister(terminal_handler_pop_rsb_hint, code.getCurr(), "a64_terminal_handler_pop_rsb_hint"); if (conf.HasOptimization(OptimizationFlag::FastDispatch)) { code.align(); terminal_handler_fast_dispatch_hint = code.getCurr(); calculate_location_descriptor(); code.L(rsb_cache_miss); code.mov(r12, reinterpret_cast(fast_dispatch_table.data())); if (code.HasSSE42()) { code.crc32(rbx, r12d); } code.and_(ebp, fast_dispatch_table_mask); code.lea(rbp, ptr[r12 + rbp]); code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]); code.jne(fast_dispatch_cache_miss); code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]); code.L(fast_dispatch_cache_miss); code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx); code.LookupBlock(); code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax); code.jmp(rax); PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a64_terminal_handler_fast_dispatch_hint"); code.align(); fast_dispatch_table_lookup = code.getCurr(); code.mov(code.ABI_PARAM2, reinterpret_cast(fast_dispatch_table.data())); if (code.HasSSE42()) { code.crc32(code.ABI_PARAM1, code.ABI_PARAM2); } code.and_(code.ABI_PARAM1.cvt32(), fast_dispatch_table_mask); code.lea(code.ABI_RETURN, code.ptr[code.ABI_PARAM1 + code.ABI_PARAM2]); code.ret(); PerfMapRegister(fast_dispatch_table_lookup, code.getCurr(), "a64_fast_dispatch_table_lookup"); } } void A64EmitX64::EmitPushRSB(EmitContext& ctx, IR::Inst* inst) { if (!conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) { return; } EmitX64::EmitPushRSB(ctx, inst); } void A64EmitX64::EmitA64SetCheckBit(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8(); code.mov(code.byte[r15 + offsetof(A64JitState, check_bit)], to_store); } void A64EmitX64::EmitA64GetCFlag(A64EmitContext& ctx, IR::Inst* inst) { const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); code.mov(result, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]); code.shr(result, NZCV::x64_c_flag_bit); code.and_(result, 1); ctx.reg_alloc.DefineValue(inst, result); } void A64EmitX64::EmitA64GetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) { const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.ScratchGpr().cvt32(); code.mov(nzcv_raw, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]); if (code.HasFastBMI2()) { const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); code.mov(tmp, NZCV::x64_mask); code.pext(nzcv_raw, nzcv_raw, tmp); code.shl(nzcv_raw, 28); } else { code.and_(nzcv_raw, NZCV::x64_mask); code.imul(nzcv_raw, nzcv_raw, NZCV::from_x64_multiplier); code.and_(nzcv_raw, NZCV::arm_mask); } ctx.reg_alloc.DefineValue(inst, nzcv_raw); } void A64EmitX64::EmitA64SetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); code.shr(nzcv_raw, 28); if (code.HasFastBMI2()) { const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); code.mov(tmp, NZCV::x64_mask); code.pdep(nzcv_raw, nzcv_raw, tmp); } else { code.imul(nzcv_raw, nzcv_raw, NZCV::to_x64_multiplier); code.and_(nzcv_raw, NZCV::x64_mask); } code.mov(dword[r15 + offsetof(A64JitState, cpsr_nzcv)], nzcv_raw); } void A64EmitX64::EmitA64SetNZCV(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); code.mov(dword[r15 + offsetof(A64JitState, cpsr_nzcv)], to_store); } void A64EmitX64::EmitA64GetW(A64EmitContext& ctx, IR::Inst* inst) { const A64::Reg reg = inst->GetArg(0).GetA64RegRef(); const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); code.mov(result, dword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast(reg)]); ctx.reg_alloc.DefineValue(inst, result); } void A64EmitX64::EmitA64GetX(A64EmitContext& ctx, IR::Inst* inst) { const A64::Reg reg = inst->GetArg(0).GetA64RegRef(); const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); code.mov(result, qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast(reg)]); ctx.reg_alloc.DefineValue(inst, result); } void A64EmitX64::EmitA64GetS(A64EmitContext& ctx, IR::Inst* inst) { const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); const auto addr = qword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast(vec)]; const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); code.movd(result, addr); ctx.reg_alloc.DefineValue(inst, result); } void A64EmitX64::EmitA64GetD(A64EmitContext& ctx, IR::Inst* inst) { const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); const auto addr = qword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast(vec)]; const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); code.movq(result, addr); ctx.reg_alloc.DefineValue(inst, result); } void A64EmitX64::EmitA64GetQ(A64EmitContext& ctx, IR::Inst* inst) { const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast(vec)]; const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); code.movaps(result, addr); ctx.reg_alloc.DefineValue(inst, result); } void A64EmitX64::EmitA64GetSP(A64EmitContext& ctx, IR::Inst* inst) { const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); code.mov(result, qword[r15 + offsetof(A64JitState, sp)]); ctx.reg_alloc.DefineValue(inst, result); } void A64EmitX64::EmitA64GetFPCR(A64EmitContext& ctx, IR::Inst* inst) { const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); code.mov(result, dword[r15 + offsetof(A64JitState, fpcr)]); ctx.reg_alloc.DefineValue(inst, result); } static u32 GetFPSRImpl(A64JitState* jit_state) { return jit_state->GetFpsr(); } void A64EmitX64::EmitA64GetFPSR(A64EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.HostCall(inst); code.mov(code.ABI_PARAM1, code.r15); code.stmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]); code.CallFunction(GetFPSRImpl); } void A64EmitX64::EmitA64SetW(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const A64::Reg reg = inst->GetArg(0).GetA64RegRef(); const auto addr = qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast(reg)]; if (args[1].FitsInImmediateS32()) { code.mov(addr, args[1].GetImmediateS32()); } else { // TODO: zext tracking, xmm variant const Xbyak::Reg64 to_store = ctx.reg_alloc.UseScratchGpr(args[1]); code.mov(to_store.cvt32(), to_store.cvt32()); code.mov(addr, to_store); } } void A64EmitX64::EmitA64SetX(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const A64::Reg reg = inst->GetArg(0).GetA64RegRef(); const auto addr = qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast(reg)]; if (args[1].FitsInImmediateS32()) { code.mov(addr, args[1].GetImmediateS32()); } else if (args[1].IsInXmm()) { const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]); code.movq(addr, to_store); } else { const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[1]); code.mov(addr, to_store); } } void A64EmitX64::EmitA64SetS(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast(vec)]; const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); // TODO: Optimize code.pxor(tmp, tmp); code.movss(tmp, to_store); code.movaps(addr, tmp); } void A64EmitX64::EmitA64SetD(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast(vec)]; const Xbyak::Xmm to_store = ctx.reg_alloc.UseScratchXmm(args[1]); code.movq(to_store, to_store); // TODO: Remove when able code.movaps(addr, to_store); } void A64EmitX64::EmitA64SetQ(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast(vec)]; const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]); code.movaps(addr, to_store); } void A64EmitX64::EmitA64SetSP(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const auto addr = qword[r15 + offsetof(A64JitState, sp)]; if (args[0].FitsInImmediateS32()) { code.mov(addr, args[0].GetImmediateS32()); } else if (args[0].IsInXmm()) { const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[0]); code.movq(addr, to_store); } else { const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[0]); code.mov(addr, to_store); } } static void SetFPCRImpl(A64JitState* jit_state, u32 value) { jit_state->SetFpcr(value); } void A64EmitX64::EmitA64SetFPCR(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(nullptr, {}, args[0]); code.mov(code.ABI_PARAM1, code.r15); code.CallFunction(SetFPCRImpl); code.ldmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]); } static void SetFPSRImpl(A64JitState* jit_state, u32 value) { jit_state->SetFpsr(value); } void A64EmitX64::EmitA64SetFPSR(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(nullptr, {}, args[0]); code.mov(code.ABI_PARAM1, code.r15); code.CallFunction(SetFPSRImpl); code.ldmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]); } void A64EmitX64::EmitA64OrQC(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (args[0].IsImmediate()) { if (!args[0].GetImmediateU1()) { return; } code.mov(code.byte[code.r15 + offsetof(A64JitState, fpsr_qc)], u8(1)); return; } const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8(); code.or_(code.byte[code.r15 + offsetof(A64JitState, fpsr_qc)], to_store); } void A64EmitX64::EmitA64SetPC(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const auto addr = qword[r15 + offsetof(A64JitState, pc)]; if (args[0].FitsInImmediateS32()) { code.mov(addr, args[0].GetImmediateS32()); } else if (args[0].IsInXmm()) { const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[0]); code.movq(addr, to_store); } else { const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[0]); code.mov(addr, to_store); } } void A64EmitX64::EmitA64CallSupervisor(A64EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.HostCall(nullptr); auto args = ctx.reg_alloc.GetArgumentInfo(inst); ASSERT(args[0].IsImmediate()); const u32 imm = args[0].GetImmediateU32(); Devirtualize<&A64::UserCallbacks::CallSVC>(conf.callbacks).EmitCall(code, [&](RegList param) { code.mov(param[0], imm); }); // The kernel would have to execute ERET to get here, which would clear exclusive state. code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); } void A64EmitX64::EmitA64ExceptionRaised(A64EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.HostCall(nullptr); auto args = ctx.reg_alloc.GetArgumentInfo(inst); ASSERT(args[0].IsImmediate() && args[1].IsImmediate()); const u64 pc = args[0].GetImmediateU64(); const u64 exception = args[1].GetImmediateU64(); Devirtualize<&A64::UserCallbacks::ExceptionRaised>(conf.callbacks).EmitCall(code, [&](RegList param) { code.mov(param[0], pc); code.mov(param[1], exception); }); } void A64EmitX64::EmitA64DataCacheOperationRaised(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); Devirtualize<&A64::UserCallbacks::DataCacheOperationRaised>(conf.callbacks).EmitCall(code); } void A64EmitX64::EmitA64InstructionCacheOperationRaised(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); Devirtualize<&A64::UserCallbacks::InstructionCacheOperationRaised>(conf.callbacks).EmitCall(code); } void A64EmitX64::EmitA64DataSynchronizationBarrier(A64EmitContext&, IR::Inst*) { code.mfence(); } void A64EmitX64::EmitA64DataMemoryBarrier(A64EmitContext&, IR::Inst*) { code.lfence(); } void A64EmitX64::EmitA64InstructionSynchronizationBarrier(A64EmitContext& ctx, IR::Inst*) { if (!conf.hook_isb) { return; } ctx.reg_alloc.HostCall(nullptr); Devirtualize<&A64::UserCallbacks::InstructionSynchronizationBarrierRaised>(conf.callbacks).EmitCall(code); } void A64EmitX64::EmitA64GetCNTFRQ(A64EmitContext& ctx, IR::Inst* inst) { const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); code.mov(result, conf.cntfrq_el0); ctx.reg_alloc.DefineValue(inst, result); } void A64EmitX64::EmitA64GetCNTPCT(A64EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.HostCall(inst); if (!conf.wall_clock_cntpct) { code.UpdateTicks(); } Devirtualize<&A64::UserCallbacks::GetCNTPCT>(conf.callbacks).EmitCall(code); } void A64EmitX64::EmitA64GetCTR(A64EmitContext& ctx, IR::Inst* inst) { const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); code.mov(result, conf.ctr_el0); ctx.reg_alloc.DefineValue(inst, result); } void A64EmitX64::EmitA64GetDCZID(A64EmitContext& ctx, IR::Inst* inst) { const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); code.mov(result, conf.dczid_el0); ctx.reg_alloc.DefineValue(inst, result); } void A64EmitX64::EmitA64GetTPIDR(A64EmitContext& ctx, IR::Inst* inst) { const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); if (conf.tpidr_el0) { code.mov(result, u64(conf.tpidr_el0)); code.mov(result, qword[result]); } else { code.xor_(result.cvt32(), result.cvt32()); } ctx.reg_alloc.DefineValue(inst, result); } void A64EmitX64::EmitA64GetTPIDRRO(A64EmitContext& ctx, IR::Inst* inst) { const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); if (conf.tpidrro_el0) { code.mov(result, u64(conf.tpidrro_el0)); code.mov(result, qword[result]); } else { code.xor_(result.cvt32(), result.cvt32()); } ctx.reg_alloc.DefineValue(inst, result); } void A64EmitX64::EmitA64SetTPIDR(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[0]); const Xbyak::Reg64 addr = ctx.reg_alloc.ScratchGpr(); if (conf.tpidr_el0) { code.mov(addr, u64(conf.tpidr_el0)); code.mov(qword[addr], value); } } void A64EmitX64::EmitA64ClearExclusive(A64EmitContext&, IR::Inst*) { code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); } namespace { constexpr size_t page_bits = 12; constexpr size_t page_size = 1 << page_bits; constexpr size_t page_mask = (1 << page_bits) - 1; void EmitDetectMisaignedVAddr(BlockOfCode& code, A64EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr, Xbyak::Reg64 tmp) { if (bitsize == 8 || (ctx.conf.detect_misaligned_access_via_page_table & bitsize) == 0) { return; } const u32 align_mask = [bitsize]() -> u32 { switch (bitsize) { case 16: return 0b1; case 32: return 0b11; case 64: return 0b111; case 128: return 0b1111; } UNREACHABLE(); }(); code.test(vaddr, align_mask); if (!ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) { code.jnz(abort, code.T_NEAR); return; } const u32 page_align_mask = static_cast(page_size - 1) & ~align_mask; Xbyak::Label detect_boundary, resume; code.jnz(detect_boundary, code.T_NEAR); code.L(resume); code.SwitchToFarCode(); code.L(detect_boundary); code.mov(tmp, vaddr); code.and_(tmp, page_align_mask); code.cmp(tmp, page_align_mask); code.jne(resume, code.T_NEAR); // NOTE: We expect to fallthrough into abort code here. code.SwitchToNearCode(); } Xbyak::RegExp EmitVAddrLookup(BlockOfCode& code, A64EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr) { const size_t valid_page_index_bits = ctx.conf.page_table_address_space_bits - page_bits; const size_t unused_top_bits = 64 - ctx.conf.page_table_address_space_bits; const Xbyak::Reg64 page = ctx.reg_alloc.ScratchGpr(); const Xbyak::Reg64 tmp = ctx.conf.absolute_offset_page_table ? page : ctx.reg_alloc.ScratchGpr(); EmitDetectMisaignedVAddr(code, ctx, bitsize, abort, vaddr, tmp); if (unused_top_bits == 0) { code.mov(tmp, vaddr); code.shr(tmp, int(page_bits)); } else if (ctx.conf.silently_mirror_page_table) { if (valid_page_index_bits >= 32) { if (code.HasBMI2()) { const Xbyak::Reg64 bit_count = ctx.reg_alloc.ScratchGpr(); code.mov(bit_count, unused_top_bits); code.bzhi(tmp, vaddr, bit_count); code.shr(tmp, int(page_bits)); ctx.reg_alloc.Release(bit_count); } else { code.mov(tmp, vaddr); code.shl(tmp, int(unused_top_bits)); code.shr(tmp, int(unused_top_bits + page_bits)); } } else { code.mov(tmp, vaddr); code.shr(tmp, int(page_bits)); code.and_(tmp, u32((1 << valid_page_index_bits) - 1)); } } else { ASSERT(valid_page_index_bits < 32); code.mov(tmp, vaddr); code.shr(tmp, int(page_bits)); code.test(tmp, u32(-(1 << valid_page_index_bits))); code.jnz(abort, code.T_NEAR); } code.mov(page, qword[r14 + tmp * sizeof(void*)]); if (ctx.conf.page_table_pointer_mask_bits == 0) { code.test(page, page); } else { code.and_(page, ~u32(0) << ctx.conf.page_table_pointer_mask_bits); } code.jz(abort, code.T_NEAR); if (ctx.conf.absolute_offset_page_table) { return page + vaddr; } code.mov(tmp, vaddr); code.and_(tmp, static_cast(page_mask)); return page + tmp; } template void EmitReadMemoryMov(BlockOfCode& code, const Xbyak::Reg64& value, const Xbyak::RegExp& addr) { switch (bitsize) { case 8: code.movzx(value.cvt32(), code.byte[addr]); return; case 16: code.movzx(value.cvt32(), word[addr]); return; case 32: code.mov(value.cvt32(), dword[addr]); return; case 64: code.mov(value, qword[addr]); return; default: ASSERT_FALSE("Invalid bitsize"); } } template void EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, const Xbyak::Reg64& value) { switch (bitsize) { case 8: code.mov(code.byte[addr], value.cvt8()); return; case 16: code.mov(word[addr], value.cvt16()); return; case 32: code.mov(dword[addr], value.cvt32()); return; case 64: code.mov(qword[addr], value); return; default: ASSERT_FALSE("Invalid bitsize"); } } } // anonymous namepsace template void A64EmitX64::EmitDirectPageTableMemoryRead(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); const Xbyak::Reg64 value = ctx.reg_alloc.ScratchGpr(); const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; Xbyak::Label abort, end; const auto src_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); EmitReadMemoryMov(code, value, src_ptr); code.L(end); code.SwitchToFarCode(); code.L(abort); code.call(wrapped_fn); code.jmp(end, code.T_NEAR); code.SwitchToNearCode(); ctx.reg_alloc.DefineValue(inst, value); } template void A64EmitX64::EmitDirectPageTableMemoryWrite(A64EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]); const auto wrapped_fn = write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; Xbyak::Label abort, end; const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); EmitWriteMemoryMov(code, dest_ptr, value); code.L(end); code.SwitchToFarCode(); code.L(abort); code.call(wrapped_fn); code.jmp(end, code.T_NEAR); code.SwitchToNearCode(); } void A64EmitX64::EmitA64ReadMemory8(A64EmitContext& ctx, IR::Inst* inst) { if (conf.page_table) { EmitDirectPageTableMemoryRead<8>(ctx, inst); return; } auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(inst, {}, args[0]); Devirtualize<&A64::UserCallbacks::MemoryRead8>(conf.callbacks).EmitCall(code); } void A64EmitX64::EmitA64ReadMemory16(A64EmitContext& ctx, IR::Inst* inst) { if (conf.page_table) { EmitDirectPageTableMemoryRead<16>(ctx, inst); return; } auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(inst, {}, args[0]); Devirtualize<&A64::UserCallbacks::MemoryRead16>(conf.callbacks).EmitCall(code); } void A64EmitX64::EmitA64ReadMemory32(A64EmitContext& ctx, IR::Inst* inst) { if (conf.page_table) { EmitDirectPageTableMemoryRead<32>(ctx, inst); return; } auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(inst, {}, args[0]); Devirtualize<&A64::UserCallbacks::MemoryRead32>(conf.callbacks).EmitCall(code); } void A64EmitX64::EmitA64ReadMemory64(A64EmitContext& ctx, IR::Inst* inst) { if (conf.page_table) { EmitDirectPageTableMemoryRead<64>(ctx, inst); return; } auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(inst, {}, args[0]); Devirtualize<&A64::UserCallbacks::MemoryRead64>(conf.callbacks).EmitCall(code); } void A64EmitX64::EmitA64ReadMemory128(A64EmitContext& ctx, IR::Inst* inst) { if (conf.page_table) { Xbyak::Label abort, end; auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(); const auto src_ptr = EmitVAddrLookup(code, ctx, 128, abort, vaddr); code.movups(value, xword[src_ptr]); code.L(end); code.SwitchToFarCode(); code.L(abort); code.call(read_fallbacks[std::make_tuple(128, vaddr.getIdx(), value.getIdx())]); code.jmp(end, code.T_NEAR); code.SwitchToNearCode(); ctx.reg_alloc.DefineValue(inst, value); return; } auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(nullptr, {}, args[0]); code.CallFunction(memory_read_128); ctx.reg_alloc.DefineValue(inst, xmm1); } void A64EmitX64::EmitA64WriteMemory8(A64EmitContext& ctx, IR::Inst* inst) { if (conf.page_table) { EmitDirectPageTableMemoryWrite<8>(ctx, inst); return; } auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); Devirtualize<&A64::UserCallbacks::MemoryWrite8>(conf.callbacks).EmitCall(code); } void A64EmitX64::EmitA64WriteMemory16(A64EmitContext& ctx, IR::Inst* inst) { if (conf.page_table) { EmitDirectPageTableMemoryWrite<16>(ctx, inst); return; } auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); Devirtualize<&A64::UserCallbacks::MemoryWrite16>(conf.callbacks).EmitCall(code); } void A64EmitX64::EmitA64WriteMemory32(A64EmitContext& ctx, IR::Inst* inst) { if (conf.page_table) { EmitDirectPageTableMemoryWrite<32>(ctx, inst); return; } auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); Devirtualize<&A64::UserCallbacks::MemoryWrite32>(conf.callbacks).EmitCall(code); } void A64EmitX64::EmitA64WriteMemory64(A64EmitContext& ctx, IR::Inst* inst) { if (conf.page_table) { EmitDirectPageTableMemoryWrite<64>(ctx, inst); return; } auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); Devirtualize<&A64::UserCallbacks::MemoryWrite64>(conf.callbacks).EmitCall(code); } void A64EmitX64::EmitA64WriteMemory128(A64EmitContext& ctx, IR::Inst* inst) { if (conf.page_table) { Xbyak::Label abort, end; auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[1]); const auto dest_ptr = EmitVAddrLookup(code, ctx, 128, abort, vaddr); code.movups(xword[dest_ptr], value); code.L(end); code.SwitchToFarCode(); code.L(abort); code.call(write_fallbacks[std::make_tuple(128, vaddr.getIdx(), value.getIdx())]); code.jmp(end, code.T_NEAR); code.SwitchToNearCode(); return; } auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.Use(args[0], ABI_PARAM2); ctx.reg_alloc.Use(args[1], HostLoc::XMM1); ctx.reg_alloc.EndOfAllocScope(); ctx.reg_alloc.HostCall(nullptr); code.CallFunction(memory_write_128); } template void A64EmitX64::EmitExclusiveReadMemory(A64EmitContext& ctx, IR::Inst* inst) { ASSERT(conf.global_monitor != nullptr); auto args = ctx.reg_alloc.GetArgumentInfo(inst); if constexpr (bitsize != 128) { using T = mp::unsigned_integer_of_size; ctx.reg_alloc.HostCall(inst, {}, args[0]); code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1)); code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); code.CallLambda( [](A64::UserConfig& conf, u64 vaddr) -> T { return conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> T { return (conf.callbacks->*callback)(vaddr); }); } ); } else { const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); ctx.reg_alloc.Use(args[0], ABI_PARAM2); ctx.reg_alloc.EndOfAllocScope(); ctx.reg_alloc.HostCall(nullptr); code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1)); code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); code.sub(rsp, 16 + ABI_SHADOW_SPACE); code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); code.CallLambda( [](A64::UserConfig& conf, u64 vaddr, A64::Vector& ret) { ret = conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> A64::Vector { return (conf.callbacks->*callback)(vaddr); }); } ); code.movups(result, xword[rsp + ABI_SHADOW_SPACE]); code.add(rsp, 16 + ABI_SHADOW_SPACE); ctx.reg_alloc.DefineValue(inst, result); } } void A64EmitX64::EmitA64ExclusiveReadMemory8(A64EmitContext& ctx, IR::Inst* inst) { EmitExclusiveReadMemory<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst); } void A64EmitX64::EmitA64ExclusiveReadMemory16(A64EmitContext& ctx, IR::Inst* inst) { EmitExclusiveReadMemory<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst); } void A64EmitX64::EmitA64ExclusiveReadMemory32(A64EmitContext& ctx, IR::Inst* inst) { EmitExclusiveReadMemory<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst); } void A64EmitX64::EmitA64ExclusiveReadMemory64(A64EmitContext& ctx, IR::Inst* inst) { EmitExclusiveReadMemory<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst); } void A64EmitX64::EmitA64ExclusiveReadMemory128(A64EmitContext& ctx, IR::Inst* inst) { EmitExclusiveReadMemory<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst); } template void A64EmitX64::EmitExclusiveWriteMemory(A64EmitContext& ctx, IR::Inst* inst) { ASSERT(conf.global_monitor != nullptr); auto args = ctx.reg_alloc.GetArgumentInfo(inst); if constexpr (bitsize != 128) { ctx.reg_alloc.HostCall(inst, {}, args[0], args[1]); } else { ctx.reg_alloc.Use(args[0], ABI_PARAM2); ctx.reg_alloc.Use(args[1], HostLoc::XMM1); ctx.reg_alloc.EndOfAllocScope(); ctx.reg_alloc.HostCall(inst); } Xbyak::Label end; code.mov(code.ABI_RETURN, u32(1)); code.cmp(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); code.je(end); code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); if constexpr (bitsize != 128) { using T = mp::unsigned_integer_of_size; code.CallLambda( [](A64::UserConfig& conf, u64 vaddr, T value) -> u32 { return conf.global_monitor->DoExclusiveOperation(conf.processor_id, vaddr, [&](T expected) -> bool { return (conf.callbacks->*callback)(vaddr, value, expected); }) ? 0 : 1; } ); } else { code.sub(rsp, 16 + ABI_SHADOW_SPACE); code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); code.movaps(xword[code.ABI_PARAM3], xmm1); code.CallLambda( [](A64::UserConfig& conf, u64 vaddr, A64::Vector& value) -> u32 { return conf.global_monitor->DoExclusiveOperation(conf.processor_id, vaddr, [&](A64::Vector expected) -> bool { return (conf.callbacks->*callback)(vaddr, value, expected); }) ? 0 : 1; } ); code.add(rsp, 16 + ABI_SHADOW_SPACE); } code.L(end); } void A64EmitX64::EmitA64ExclusiveWriteMemory8(A64EmitContext& ctx, IR::Inst* inst) { EmitExclusiveWriteMemory<8, &A64::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); } void A64EmitX64::EmitA64ExclusiveWriteMemory16(A64EmitContext& ctx, IR::Inst* inst) { EmitExclusiveWriteMemory<16, &A64::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); } void A64EmitX64::EmitA64ExclusiveWriteMemory32(A64EmitContext& ctx, IR::Inst* inst) { EmitExclusiveWriteMemory<32, &A64::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); } void A64EmitX64::EmitA64ExclusiveWriteMemory64(A64EmitContext& ctx, IR::Inst* inst) { EmitExclusiveWriteMemory<64, &A64::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); } void A64EmitX64::EmitA64ExclusiveWriteMemory128(A64EmitContext& ctx, IR::Inst* inst) { EmitExclusiveWriteMemory<128, &A64::UserCallbacks::MemoryWriteExclusive128>(ctx, inst); } std::string A64EmitX64::LocationDescriptorToFriendlyName(const IR::LocationDescriptor& ir_descriptor) const { const A64::LocationDescriptor descriptor{ir_descriptor}; return fmt::format("a64_{:016X}_fpcr{:08X}", descriptor.PC(), descriptor.FPCR().Value()); } void A64EmitX64::EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor, bool) { code.SwitchMxcsrOnExit(); Devirtualize<&A64::UserCallbacks::InterpreterFallback>(conf.callbacks).EmitCall(code, [&](RegList param) { code.mov(param[0], A64::LocationDescriptor{terminal.next}.PC()); code.mov(qword[r15 + offsetof(A64JitState, pc)], param[0]); code.mov(param[1].cvt32(), terminal.num_instructions); }); code.ReturnFromRunCode(true); // TODO: Check cycles } void A64EmitX64::EmitTerminalImpl(IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) { code.ReturnFromRunCode(); } void A64EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor, bool is_single_step) { if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) { code.mov(rax, A64::LocationDescriptor{terminal.next}.PC()); code.mov(qword[r15 + offsetof(A64JitState, pc)], rax); code.ReturnFromRunCode(); return; } code.cmp(qword[r15 + offsetof(A64JitState, cycles_remaining)], 0); patch_information[terminal.next].jg.emplace_back(code.getCurr()); if (auto next_bb = GetBasicBlock(terminal.next)) { EmitPatchJg(terminal.next, next_bb->entrypoint); } else { EmitPatchJg(terminal.next); } code.mov(rax, A64::LocationDescriptor{terminal.next}.PC()); code.mov(qword[r15 + offsetof(A64JitState, pc)], rax); code.ForceReturnFromRunCode(); } void A64EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor, bool is_single_step) { if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) { code.mov(rax, A64::LocationDescriptor{terminal.next}.PC()); code.mov(qword[r15 + offsetof(A64JitState, pc)], rax); code.ReturnFromRunCode(); return; } patch_information[terminal.next].jmp.emplace_back(code.getCurr()); if (auto next_bb = GetBasicBlock(terminal.next)) { EmitPatchJmp(terminal.next, next_bb->entrypoint); } else { EmitPatchJmp(terminal.next); } } void A64EmitX64::EmitTerminalImpl(IR::Term::PopRSBHint, IR::LocationDescriptor, bool is_single_step) { if (!conf.HasOptimization(OptimizationFlag::ReturnStackBuffer) || is_single_step) { code.ReturnFromRunCode(); return; } code.jmp(terminal_handler_pop_rsb_hint); } void A64EmitX64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor, bool is_single_step) { if (!conf.HasOptimization(OptimizationFlag::FastDispatch) || is_single_step) { code.ReturnFromRunCode(); return; } code.jmp(terminal_handler_fast_dispatch_hint); } void A64EmitX64::EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) { switch (terminal.if_) { case IR::Cond::AL: case IR::Cond::NV: EmitTerminal(terminal.then_, initial_location, is_single_step); break; default: Xbyak::Label pass = EmitCond(terminal.if_); EmitTerminal(terminal.else_, initial_location, is_single_step); code.L(pass); EmitTerminal(terminal.then_, initial_location, is_single_step); break; } } void A64EmitX64::EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) { Xbyak::Label fail; code.cmp(code.byte[r15 + offsetof(A64JitState, check_bit)], u8(0)); code.jz(fail); EmitTerminal(terminal.then_, initial_location, is_single_step); code.L(fail); EmitTerminal(terminal.else_, initial_location, is_single_step); } void A64EmitX64::EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) { code.cmp(code.byte[r15 + offsetof(A64JitState, halt_requested)], u8(0)); code.jne(code.GetForceReturnFromRunCodeAddress()); EmitTerminal(terminal.else_, initial_location, is_single_step); } void A64EmitX64::EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) { const CodePtr patch_location = code.getCurr(); if (target_code_ptr) { code.jg(target_code_ptr); } else { code.mov(rax, A64::LocationDescriptor{target_desc}.PC()); code.mov(qword[r15 + offsetof(A64JitState, pc)], rax); code.jg(code.GetReturnFromRunCodeAddress()); } code.EnsurePatchLocationSize(patch_location, 23); } void A64EmitX64::EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) { const CodePtr patch_location = code.getCurr(); if (target_code_ptr) { code.jmp(target_code_ptr); } else { code.mov(rax, A64::LocationDescriptor{target_desc}.PC()); code.mov(qword[r15 + offsetof(A64JitState, pc)], rax); code.jmp(code.GetReturnFromRunCodeAddress()); } code.EnsurePatchLocationSize(patch_location, 22); } void A64EmitX64::EmitPatchMovRcx(CodePtr target_code_ptr) { if (!target_code_ptr) { target_code_ptr = code.GetReturnFromRunCodeAddress(); } const CodePtr patch_location = code.getCurr(); code.mov(code.rcx, reinterpret_cast(target_code_ptr)); code.EnsurePatchLocationSize(patch_location, 10); } void A64EmitX64::Unpatch(const IR::LocationDescriptor& location) { EmitX64::Unpatch(location); if (conf.HasOptimization(OptimizationFlag::FastDispatch)) { (*fast_dispatch_table_lookup)(location.Value()) = {}; } } } // namespace Dynarmic::Backend::X64