4a9a0d07f7
Add a config entry to mask out the lower bits of page table pointers. This allows users of Dynarmic to pack small integers into those pointers and update the pointer/integer pair atomically without locks. The lower bits can be masked out because pointers inside the page table are expected to be aligned. For this usage, ANDing the pointer with the mask also serves as the null-pointer check that the TEST instruction performed. When the mask value is zero, TEST is still emitted to keep the previous behavior.
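A minimal sketch of the tagging scheme this config entry enables (illustrative only, not part of the commit; kTagBits, PackTag and UnpackPtr are hypothetical names):

    #include <cstdint>

    constexpr unsigned kTagBits = 2;  // corresponds to page_table_pointer_mask_bits
    constexpr std::uintptr_t kTagMask = (std::uintptr_t(1) << kTagBits) - 1;

    // Pack a small integer into the low bits of a sufficiently aligned pointer.
    std::uintptr_t PackTag(void* ptr, std::uintptr_t tag) {
        return reinterpret_cast<std::uintptr_t>(ptr) | (tag & kTagMask);
    }

    // Recover the pointer; the same AND the JIT emits leaves zero (and so sets
    // ZF) exactly when the stored pointer was null.
    void* UnpackPtr(std::uintptr_t packed) {
        return reinterpret_cast<void*>(packed & ~kTagMask);
    }
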
/* This file is part of the dynarmic project.
 * Copyright (c) 2016 MerryMage
 * SPDX-License-Identifier: 0BSD
 */

#include <initializer_list>

#include <fmt/format.h>
#include <fmt/ostream.h>
#include <mp/traits/integer_of_size.h>

#include <dynarmic/exclusive_monitor.h>

#include "backend/x64/a64_emit_x64.h"
#include "backend/x64/a64_jitstate.h"
#include "backend/x64/abi.h"
#include "backend/x64/block_of_code.h"
#include "backend/x64/devirtualize.h"
#include "backend/x64/emit_x64.h"
#include "backend/x64/nzcv_util.h"
#include "backend/x64/perf_map.h"
#include "common/assert.h"
#include "common/bit_util.h"
#include "common/common_types.h"
#include "common/scope_exit.h"
#include "frontend/A64/location_descriptor.h"
#include "frontend/A64/types.h"
#include "frontend/ir/basic_block.h"
#include "frontend/ir/cond.h"
#include "frontend/ir/microinstruction.h"
#include "frontend/ir/opcodes.h"

// TODO: Have ARM flags in host flags and not have them use up GPR registers unless necessary.
// TODO: Actually implement that proper instruction selector you've always wanted to sweetheart.

namespace Dynarmic::Backend::X64 {

using namespace Xbyak::util;

A64EmitContext::A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block)
    : EmitContext(reg_alloc, block), conf(conf) {}

A64::LocationDescriptor A64EmitContext::Location() const {
    return A64::LocationDescriptor{block.Location()};
}

bool A64EmitContext::IsSingleStep() const {
    return Location().SingleStepping();
}

FP::FPCR A64EmitContext::FPCR(bool fpcr_controlled) const {
    return fpcr_controlled ? Location().FPCR() : Location().FPCR().ASIMDStandardValue();
}

A64EmitX64::A64EmitX64(BlockOfCode& code, A64::UserConfig conf, A64::Jit* jit_interface)
    : EmitX64(code), conf(conf), jit_interface{jit_interface} {
    GenMemory128Accessors();
    GenFastmemFallbacks();
    GenTerminalHandlers();
    code.PreludeComplete();
    ClearFastDispatchTable();
}

A64EmitX64::~A64EmitX64() = default;

A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) {
    code.EnableWriting();
    SCOPE_EXIT { code.DisableWriting(); };

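    // When a page table is configured, R14 is reserved as the page table base
    // pointer (see EmitVAddrLookup), so keep it away from the register allocator.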
    static const std::vector<HostLoc> gpr_order = [this]{
        std::vector<HostLoc> gprs{any_gpr};
        if (conf.page_table) {
            gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R14));
        }
        return gprs;
    }();

    RegAlloc reg_alloc{code, A64JitState::SpillCount, SpillToOpArg<A64JitState>, gpr_order, any_xmm};
    A64EmitContext ctx{conf, reg_alloc, block};

    // Start emitting.
    code.align();
    const u8* const entrypoint = code.getCurr();

    ASSERT(block.GetCondition() == IR::Cond::AL);

    for (auto iter = block.begin(); iter != block.end(); ++iter) {
        IR::Inst* inst = &*iter;

        // Call the relevant Emit* member function.
        switch (inst->GetOpcode()) {
#define OPCODE(name, type, ...)                \
    case IR::Opcode::name:                     \
        A64EmitX64::Emit##name(ctx, inst);     \
        break;
#define A32OPC(...)
#define A64OPC(name, type, ...)                \
    case IR::Opcode::A64##name:                \
        A64EmitX64::EmitA64##name(ctx, inst);  \
        break;
#include "frontend/ir/opcodes.inc"
#undef OPCODE
#undef A32OPC
#undef A64OPC

        default:
            ASSERT_MSG(false, "Invalid opcode: {}", inst->GetOpcode());
            break;
        }

        ctx.reg_alloc.EndOfAllocScope();
    }

    reg_alloc.AssertNoMoreUses();

    EmitAddCycles(block.CycleCount());
    EmitX64::EmitTerminal(block.GetTerminal(), ctx.Location().SetSingleStepping(false), ctx.IsSingleStep());
    code.int3();

    const size_t size = static_cast<size_t>(code.getCurr() - entrypoint);

    const A64::LocationDescriptor descriptor{block.Location()};
    const A64::LocationDescriptor end_location{block.EndLocation()};

    const auto range = boost::icl::discrete_interval<u64>::closed(descriptor.PC(), end_location.PC() - 1);
    block_ranges.AddRange(range, descriptor);

    return RegisterBlock(descriptor, entrypoint, size);
}

void A64EmitX64::ClearCache() {
    EmitX64::ClearCache();
    block_ranges.ClearCache();
    ClearFastDispatchTable();
}

void A64EmitX64::InvalidateCacheRanges(const boost::icl::interval_set<u64>& ranges) {
    InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges));
}

void A64EmitX64::ClearFastDispatchTable() {
    if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
        fast_dispatch_table.fill({});
    }
}

void A64EmitX64::GenMemory128Accessors() {
    code.align();
    memory_read_128 = code.getCurr<void(*)()>();
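    // The Win64 ABI returns 128-bit values through a hidden pointer parameter;
    // other ABIs return them in the two integer return registers.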
#ifdef _WIN32
    Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCallWithReturnPointer(code,
        [&](Xbyak::Reg64 return_value_ptr, [[maybe_unused]] RegList args) {
            code.mov(code.ABI_PARAM3, code.ABI_PARAM2);
            code.sub(rsp, 8 + 16 + ABI_SHADOW_SPACE);
            code.lea(return_value_ptr, ptr[rsp + ABI_SHADOW_SPACE]);
        });
    code.movups(xmm1, xword[code.ABI_RETURN]);
    code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE);
#else
    code.sub(rsp, 8);
    Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCall(code);
    if (code.HasSSE41()) {
        code.movq(xmm1, code.ABI_RETURN);
        code.pinsrq(xmm1, code.ABI_RETURN2, 1);
    } else {
        code.movq(xmm1, code.ABI_RETURN);
        code.movq(xmm2, code.ABI_RETURN2);
        code.punpcklqdq(xmm1, xmm2);
    }
    code.add(rsp, 8);
#endif
    code.ret();
    PerfMapRegister(memory_read_128, code.getCurr(), "a64_memory_read_128");

    code.align();
    memory_write_128 = code.getCurr<void(*)()>();
#ifdef _WIN32
    code.sub(rsp, 8 + 16 + ABI_SHADOW_SPACE);
    code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]);
    code.movaps(xword[code.ABI_PARAM3], xmm1);
    Devirtualize<&A64::UserCallbacks::MemoryWrite128>(conf.callbacks).EmitCall(code);
    code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE);
#else
    code.sub(rsp, 8);
    if (code.HasSSE41()) {
        code.movq(code.ABI_PARAM3, xmm1);
        code.pextrq(code.ABI_PARAM4, xmm1, 1);
    } else {
        code.movq(code.ABI_PARAM3, xmm1);
        code.punpckhqdq(xmm1, xmm1);
        code.movq(code.ABI_PARAM4, xmm1);
    }
    Devirtualize<&A64::UserCallbacks::MemoryWrite128>(conf.callbacks).EmitCall(code);
    code.add(rsp, 8);
#endif
    code.ret();
PerfMapRegister(memory_read_128, code.getCurr(), "a64_memory_write_128");
}

void A64EmitX64::GenFastmemFallbacks() {
    const std::initializer_list<int> idxes{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    const std::array<std::pair<size_t, ArgCallback>, 4> read_callbacks{{
        {8, Devirtualize<&A64::UserCallbacks::MemoryRead8>(conf.callbacks)},
        {16, Devirtualize<&A64::UserCallbacks::MemoryRead16>(conf.callbacks)},
        {32, Devirtualize<&A64::UserCallbacks::MemoryRead32>(conf.callbacks)},
        {64, Devirtualize<&A64::UserCallbacks::MemoryRead64>(conf.callbacks)},
    }};
    const std::array<std::pair<size_t, ArgCallback>, 4> write_callbacks{{
        {8, Devirtualize<&A64::UserCallbacks::MemoryWrite8>(conf.callbacks)},
        {16, Devirtualize<&A64::UserCallbacks::MemoryWrite16>(conf.callbacks)},
        {32, Devirtualize<&A64::UserCallbacks::MemoryWrite32>(conf.callbacks)},
        {64, Devirtualize<&A64::UserCallbacks::MemoryWrite64>(conf.callbacks)},
    }};

    for (int vaddr_idx : idxes) {
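        // Skip RSP (index 4) and R15 (index 15): R15 permanently holds the JIT state pointer.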
        if (vaddr_idx == 4 || vaddr_idx == 15) {
            continue;
        }

        for (int value_idx : idxes) {
            code.align();
            read_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr<void(*)()>();
            ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx));
            if (vaddr_idx != code.ABI_PARAM2.getIdx()) {
                code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
            }
            code.call(memory_read_128);
            if (value_idx != 1) {
                code.movaps(Xbyak::Xmm{value_idx}, xmm1);
            }
            ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx));
            code.ret();
            PerfMapRegister(read_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_read_fallback_128");

            code.align();
            write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr<void(*)()>();
            ABI_PushCallerSaveRegistersAndAdjustStack(code);
            if (vaddr_idx != code.ABI_PARAM2.getIdx()) {
                code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
            }
            if (value_idx != 1) {
                code.movaps(xmm1, Xbyak::Xmm{value_idx});
            }
            code.call(memory_write_128);
            ABI_PopCallerSaveRegistersAndAdjustStack(code);
            code.ret();
            PerfMapRegister(write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128");

            if (value_idx == 4 || value_idx == 15) {
                continue;
            }

            for (const auto& [bitsize, callback] : read_callbacks) {
                code.align();
                read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr<void(*)()>();
                ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx));
                if (vaddr_idx != code.ABI_PARAM2.getIdx()) {
                    code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
                }
                callback.EmitCall(code);
                if (value_idx != code.ABI_RETURN.getIdx()) {
                    code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN);
                }
                ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx));
                code.ret();
                PerfMapRegister(read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_read_fallback_{}", bitsize));
            }

            for (const auto& [bitsize, callback] : write_callbacks) {
                code.align();
                write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr<void(*)()>();
                ABI_PushCallerSaveRegistersAndAdjustStack(code);
                if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) {
                    code.xchg(code.ABI_PARAM2, code.ABI_PARAM3);
                } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) {
                    code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
                    if (value_idx != code.ABI_PARAM3.getIdx()) {
                        code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx});
                    }
                } else {
                    if (value_idx != code.ABI_PARAM3.getIdx()) {
                        code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx});
                    }
                    if (vaddr_idx != code.ABI_PARAM2.getIdx()) {
                        code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
                    }
                }
                callback.EmitCall(code);
                ABI_PopCallerSaveRegistersAndAdjustStack(code);
                code.ret();
                PerfMapRegister(write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_write_fallback_{}", bitsize));
            }
        }
    }
}

void A64EmitX64::GenTerminalHandlers() {
    // PC ends up in rbp, location_descriptor ends up in rbx
    const auto calculate_location_descriptor = [this] {
        // This calculation has to match up with A64::LocationDescriptor::UniqueHash
        // TODO: Optimization is available here based on known state of fpcr.
        code.mov(rbp, qword[r15 + offsetof(A64JitState, pc)]);
        code.mov(rcx, A64::LocationDescriptor::pc_mask);
        code.and_(rcx, rbp);
        code.mov(ebx, dword[r15 + offsetof(A64JitState, fpcr)]);
        code.and_(ebx, A64::LocationDescriptor::fpcr_mask);
        code.shl(rbx, A64::LocationDescriptor::fpcr_shift);
        code.or_(rbx, rcx);
    };

    Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss;

    code.align();
    terminal_handler_pop_rsb_hint = code.getCurr<const void*>();
    calculate_location_descriptor();
    code.mov(eax, dword[r15 + offsetof(A64JitState, rsb_ptr)]);
    code.sub(eax, 1);
    code.and_(eax, u32(A64JitState::RSBPtrMask));
    code.mov(dword[r15 + offsetof(A64JitState, rsb_ptr)], eax);
    code.cmp(rbx, qword[r15 + offsetof(A64JitState, rsb_location_descriptors) + rax * sizeof(u64)]);
    if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
        code.jne(rsb_cache_miss);
    } else {
        code.jne(code.GetReturnFromRunCodeAddress());
    }
    code.mov(rax, qword[r15 + offsetof(A64JitState, rsb_codeptrs) + rax * sizeof(u64)]);
    code.jmp(rax);
    PerfMapRegister(terminal_handler_pop_rsb_hint, code.getCurr(), "a64_terminal_handler_pop_rsb_hint");

    if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
        code.align();
        terminal_handler_fast_dispatch_hint = code.getCurr<const void*>();
        calculate_location_descriptor();
        code.L(rsb_cache_miss);
        code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
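        // When SSE4.2 is available, CRC32 scrambles the descriptor before it is
        // used as the stored table key, matching fast_dispatch_table_lookup below.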
        if (code.HasSSE42()) {
            code.crc32(rbx, r12d);
        }
        code.and_(ebp, fast_dispatch_table_mask);
        code.lea(rbp, ptr[r12 + rbp]);
        code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]);
        code.jne(fast_dispatch_cache_miss);
        code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]);
        code.L(fast_dispatch_cache_miss);
        code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx);
        code.LookupBlock();
        code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax);
        code.jmp(rax);
        PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a64_terminal_handler_fast_dispatch_hint");

        code.align();
        fast_dispatch_table_lookup = code.getCurr<FastDispatchEntry&(*)(u64)>();
        code.mov(code.ABI_PARAM2, reinterpret_cast<u64>(fast_dispatch_table.data()));
        if (code.HasSSE42()) {
            code.crc32(code.ABI_PARAM1, code.ABI_PARAM2);
        }
        code.and_(code.ABI_PARAM1.cvt32(), fast_dispatch_table_mask);
        code.lea(code.ABI_RETURN, code.ptr[code.ABI_PARAM1 + code.ABI_PARAM2]);
        code.ret();
    }
}

void A64EmitX64::EmitPushRSB(EmitContext& ctx, IR::Inst* inst) {
    if (!conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) {
        return;
    }

    EmitX64::EmitPushRSB(ctx, inst);
}

void A64EmitX64::EmitA64SetCheckBit(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8();
    code.mov(code.byte[r15 + offsetof(A64JitState, check_bit)], to_store);
}

void A64EmitX64::EmitA64GetCFlag(A64EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
    code.mov(result, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]);
    code.shr(result, NZCV::x64_c_flag_bit);
    code.and_(result, 1);
    ctx.reg_alloc.DefineValue(inst, result);
}

void A64EmitX64::EmitA64GetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.ScratchGpr().cvt32();

    code.mov(nzcv_raw, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]);

    if (code.HasFastBMI2()) {
        const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
        code.mov(tmp, NZCV::x64_mask);
        code.pext(nzcv_raw, nzcv_raw, tmp);
        code.shl(nzcv_raw, 28);
    } else {
        code.and_(nzcv_raw, NZCV::x64_mask);
        code.imul(nzcv_raw, nzcv_raw, NZCV::from_x64_multiplier);
        code.and_(nzcv_raw, NZCV::arm_mask);
    }

    ctx.reg_alloc.DefineValue(inst, nzcv_raw);
}

void A64EmitX64::EmitA64SetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();

    code.shr(nzcv_raw, 28);
    if (code.HasFastBMI2()) {
        const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
        code.mov(tmp, NZCV::x64_mask);
        code.pdep(nzcv_raw, nzcv_raw, tmp);
    } else {
        code.imul(nzcv_raw, nzcv_raw, NZCV::to_x64_multiplier);
        code.and_(nzcv_raw, NZCV::x64_mask);
    }
    code.mov(dword[r15 + offsetof(A64JitState, cpsr_nzcv)], nzcv_raw);
}

void A64EmitX64::EmitA64SetNZCV(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
    code.mov(dword[r15 + offsetof(A64JitState, cpsr_nzcv)], to_store);
}

void A64EmitX64::EmitA64GetW(A64EmitContext& ctx, IR::Inst* inst) {
    const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
    const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();

    code.mov(result, dword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)]);
    ctx.reg_alloc.DefineValue(inst, result);
}

void A64EmitX64::EmitA64GetX(A64EmitContext& ctx, IR::Inst* inst) {
    const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
    const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();

    code.mov(result, qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)]);
    ctx.reg_alloc.DefineValue(inst, result);
}

void A64EmitX64::EmitA64GetS(A64EmitContext& ctx, IR::Inst* inst) {
    const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
    const auto addr = qword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];

    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
    code.movd(result, addr);
    ctx.reg_alloc.DefineValue(inst, result);
}

void A64EmitX64::EmitA64GetD(A64EmitContext& ctx, IR::Inst* inst) {
    const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
    const auto addr = qword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];

    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
    code.movq(result, addr);
    ctx.reg_alloc.DefineValue(inst, result);
}

void A64EmitX64::EmitA64GetQ(A64EmitContext& ctx, IR::Inst* inst) {
    const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
    const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];

    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
    code.movaps(result, addr);
    ctx.reg_alloc.DefineValue(inst, result);
}

void A64EmitX64::EmitA64GetSP(A64EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
    code.mov(result, qword[r15 + offsetof(A64JitState, sp)]);
    ctx.reg_alloc.DefineValue(inst, result);
}

void A64EmitX64::EmitA64GetFPCR(A64EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
    code.mov(result, dword[r15 + offsetof(A64JitState, fpcr)]);
    ctx.reg_alloc.DefineValue(inst, result);
}

static u32 GetFPSRImpl(A64JitState* jit_state) {
    return jit_state->GetFpsr();
}

void A64EmitX64::EmitA64GetFPSR(A64EmitContext& ctx, IR::Inst* inst) {
    ctx.reg_alloc.HostCall(inst);
    code.mov(code.ABI_PARAM1, code.r15);
    code.stmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]);
    code.CallFunction(GetFPSRImpl);
}

void A64EmitX64::EmitA64SetW(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
    const auto addr = qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)];
    if (args[1].FitsInImmediateS32()) {
        code.mov(addr, args[1].GetImmediateS32());
    } else {
        // TODO: zext tracking, xmm variant
        const Xbyak::Reg64 to_store = ctx.reg_alloc.UseScratchGpr(args[1]);
        code.mov(to_store.cvt32(), to_store.cvt32());
        code.mov(addr, to_store);
    }
}

void A64EmitX64::EmitA64SetX(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
    const auto addr = qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)];
    if (args[1].FitsInImmediateS32()) {
        code.mov(addr, args[1].GetImmediateS32());
    } else if (args[1].IsInXmm()) {
        const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
        code.movq(addr, to_store);
    } else {
        const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[1]);
        code.mov(addr, to_store);
    }
}

void A64EmitX64::EmitA64SetS(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
    const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];

    const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
    // TODO: Optimize
    code.pxor(tmp, tmp);
    code.movss(tmp, to_store);
    code.movaps(addr, tmp);
}

void A64EmitX64::EmitA64SetD(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
    const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];

    const Xbyak::Xmm to_store = ctx.reg_alloc.UseScratchXmm(args[1]);
    code.movq(to_store, to_store); // TODO: Remove when able
    code.movaps(addr, to_store);
}

void A64EmitX64::EmitA64SetQ(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
    const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];

    const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
    code.movaps(addr, to_store);
}

void A64EmitX64::EmitA64SetSP(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto addr = qword[r15 + offsetof(A64JitState, sp)];
    if (args[0].FitsInImmediateS32()) {
        code.mov(addr, args[0].GetImmediateS32());
    } else if (args[0].IsInXmm()) {
        const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[0]);
        code.movq(addr, to_store);
    } else {
        const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[0]);
        code.mov(addr, to_store);
    }
}

static void SetFPCRImpl(A64JitState* jit_state, u32 value) {
    jit_state->SetFpcr(value);
}

void A64EmitX64::EmitA64SetFPCR(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.HostCall(nullptr, {}, args[0]);
    code.mov(code.ABI_PARAM1, code.r15);
    code.CallFunction(SetFPCRImpl);
    code.ldmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]);
}

static void SetFPSRImpl(A64JitState* jit_state, u32 value) {
    jit_state->SetFpsr(value);
}

void A64EmitX64::EmitA64SetFPSR(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.HostCall(nullptr, {}, args[0]);
    code.mov(code.ABI_PARAM1, code.r15);
    code.CallFunction(SetFPSRImpl);
    code.ldmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]);
}

void A64EmitX64::EmitA64OrQC(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    if (args[0].IsImmediate()) {
        if (!args[0].GetImmediateU1()) {
            return;
        }

        code.mov(code.byte[code.r15 + offsetof(A64JitState, fpsr_qc)], u8(1));
        return;
    }

    const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8();
    code.or_(code.byte[code.r15 + offsetof(A64JitState, fpsr_qc)], to_store);
}

void A64EmitX64::EmitA64SetPC(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto addr = qword[r15 + offsetof(A64JitState, pc)];
    if (args[0].FitsInImmediateS32()) {
        code.mov(addr, args[0].GetImmediateS32());
    } else if (args[0].IsInXmm()) {
        const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[0]);
        code.movq(addr, to_store);
    } else {
        const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[0]);
        code.mov(addr, to_store);
    }
}

void A64EmitX64::EmitA64CallSupervisor(A64EmitContext& ctx, IR::Inst* inst) {
    ctx.reg_alloc.HostCall(nullptr);
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ASSERT(args[0].IsImmediate());
    const u32 imm = args[0].GetImmediateU32();
    Devirtualize<&A64::UserCallbacks::CallSVC>(conf.callbacks).EmitCall(code,
        [&](RegList param) {
            code.mov(param[0], imm);
        });
    // The kernel would have to execute ERET to get here, which would clear exclusive state.
    code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0));
}

void A64EmitX64::EmitA64ExceptionRaised(A64EmitContext& ctx, IR::Inst* inst) {
    ctx.reg_alloc.HostCall(nullptr);
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ASSERT(args[0].IsImmediate() && args[1].IsImmediate());
    const u64 pc = args[0].GetImmediateU64();
    const u64 exception = args[1].GetImmediateU64();
    Devirtualize<&A64::UserCallbacks::ExceptionRaised>(conf.callbacks).EmitCall(code,
        [&](RegList param) {
            code.mov(param[0], pc);
            code.mov(param[1], exception);
        });
}

void A64EmitX64::EmitA64DataCacheOperationRaised(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.HostCall(nullptr, args[0], args[1]);
    Devirtualize<&A64::UserCallbacks::DataCacheOperationRaised>(conf.callbacks).EmitCall(code);
}

void A64EmitX64::EmitA64DataSynchronizationBarrier(A64EmitContext&, IR::Inst*) {
    code.mfence();
}

void A64EmitX64::EmitA64DataMemoryBarrier(A64EmitContext&, IR::Inst*) {
    code.lfence();
}

void A64EmitX64::EmitA64InstructionSynchronizationBarrier(A64EmitContext& ctx, IR::Inst*) {
    ctx.reg_alloc.HostCall(nullptr);

    code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(jit_interface));
    code.CallLambda([](A64::Jit* jit) { jit->ClearCache(); });
}

void A64EmitX64::EmitA64GetCNTFRQ(A64EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
    code.mov(result, conf.cntfrq_el0);
    ctx.reg_alloc.DefineValue(inst, result);
}

void A64EmitX64::EmitA64GetCNTPCT(A64EmitContext& ctx, IR::Inst* inst) {
    ctx.reg_alloc.HostCall(inst);
    if (!conf.wall_clock_cntpct) {
        code.UpdateTicks();
    }
    Devirtualize<&A64::UserCallbacks::GetCNTPCT>(conf.callbacks).EmitCall(code);
}

void A64EmitX64::EmitA64GetCTR(A64EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
    code.mov(result, conf.ctr_el0);
    ctx.reg_alloc.DefineValue(inst, result);
}

void A64EmitX64::EmitA64GetDCZID(A64EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
    code.mov(result, conf.dczid_el0);
    ctx.reg_alloc.DefineValue(inst, result);
}

void A64EmitX64::EmitA64GetTPIDR(A64EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
    if (conf.tpidr_el0) {
        code.mov(result, u64(conf.tpidr_el0));
        code.mov(result, qword[result]);
    } else {
        code.xor_(result.cvt32(), result.cvt32());
    }
    ctx.reg_alloc.DefineValue(inst, result);
}

void A64EmitX64::EmitA64GetTPIDRRO(A64EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
    if (conf.tpidrro_el0) {
        code.mov(result, u64(conf.tpidrro_el0));
        code.mov(result, qword[result]);
    } else {
        code.xor_(result.cvt32(), result.cvt32());
    }
    ctx.reg_alloc.DefineValue(inst, result);
}

void A64EmitX64::EmitA64SetTPIDR(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[0]);
    const Xbyak::Reg64 addr = ctx.reg_alloc.ScratchGpr();
    if (conf.tpidr_el0) {
        code.mov(addr, u64(conf.tpidr_el0));
        code.mov(qword[addr], value);
    }
}

void A64EmitX64::EmitA64ClearExclusive(A64EmitContext&, IR::Inst*) {
    code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0));
}

namespace {

constexpr size_t page_bits = 12;
constexpr size_t page_size = 1 << page_bits;
constexpr size_t page_mask = (1 << page_bits) - 1;

void EmitDetectMisaignedVAddr(BlockOfCode& code, A64EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr, Xbyak::Reg64 tmp) {
    if (bitsize == 8 || (ctx.conf.detect_misaligned_access_via_page_table & bitsize) == 0) {
        return;
    }

    const u32 align_mask = [bitsize]() -> u32 {
        switch (bitsize) {
        case 16:
            return 0b1;
        case 32:
            return 0b11;
        case 64:
            return 0b111;
        case 128:
            return 0b1111;
        }
        UNREACHABLE();
    }();

    code.test(vaddr, align_mask);

    if (!ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) {
        code.jnz(abort, code.T_NEAR);
        return;
    }

    const u32 page_align_mask = static_cast<u32>(page_size - 1) & ~align_mask;

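    // A misaligned access only needs the slow path when it also straddles a
    // page boundary; that check is done off the hot path in far code.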
    Xbyak::Label detect_boundary, resume;

    code.jnz(detect_boundary, code.T_NEAR);
    code.L(resume);

    code.SwitchToFarCode();
    code.L(detect_boundary);
    code.mov(tmp, vaddr);
    code.and_(tmp, page_align_mask);
    code.cmp(tmp, page_align_mask);
    code.jne(resume, code.T_NEAR);
    // NOTE: We expect to fallthrough into abort code here.
    code.SwitchToNearCode();
}

Xbyak::RegExp EmitVAddrLookup(BlockOfCode& code, A64EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr) {
    const size_t valid_page_index_bits = ctx.conf.page_table_address_space_bits - page_bits;
    const size_t unused_top_bits = 64 - ctx.conf.page_table_address_space_bits;

    const Xbyak::Reg64 page = ctx.reg_alloc.ScratchGpr();
    const Xbyak::Reg64 tmp = ctx.conf.absolute_offset_page_table ? page : ctx.reg_alloc.ScratchGpr();

    EmitDetectMisalignedVAddr(code, ctx, bitsize, abort, vaddr, tmp);

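    // Reduce vaddr to a page-table index; how the unused top bits are handled
    // depends on the configured address-space width and mirroring behavior.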
    if (unused_top_bits == 0) {
        code.mov(tmp, vaddr);
        code.shr(tmp, int(page_bits));
    } else if (ctx.conf.silently_mirror_page_table) {
        if (valid_page_index_bits >= 32) {
            if (code.HasBMI2()) {
                const Xbyak::Reg64 bit_count = ctx.reg_alloc.ScratchGpr();
                code.mov(bit_count, unused_top_bits);
                code.bzhi(tmp, vaddr, bit_count);
                code.shr(tmp, int(page_bits));
                ctx.reg_alloc.Release(bit_count);
            } else {
                code.mov(tmp, vaddr);
                code.shl(tmp, int(unused_top_bits));
                code.shr(tmp, int(unused_top_bits + page_bits));
            }
        } else {
            code.mov(tmp, vaddr);
            code.shr(tmp, int(page_bits));
            code.and_(tmp, u32((1 << valid_page_index_bits) - 1));
        }
    } else {
        ASSERT(valid_page_index_bits < 32);
        code.mov(tmp, vaddr);
        code.shr(tmp, int(page_bits));
        code.test(tmp, u32(-(1 << valid_page_index_bits)));
        code.jnz(abort, code.T_NEAR);
    }
    code.mov(page, qword[r14 + tmp * sizeof(void*)]);
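    // With page_table_pointer_mask_bits set, the AND that strips the packed tag
    // bits also sets ZF when the pointer is null, so it replaces the TEST below.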
    if (ctx.conf.page_table_pointer_mask_bits == 0) {
        code.test(page, page);
    } else {
        code.and_(page, ~u32(0) << ctx.conf.page_table_pointer_mask_bits);
    }
    code.jz(abort, code.T_NEAR);
    if (ctx.conf.absolute_offset_page_table) {
        return page + vaddr;
    }
    code.mov(tmp, vaddr);
    code.and_(tmp, static_cast<u32>(page_mask));
    return page + tmp;
}

template<std::size_t bitsize>
void EmitReadMemoryMov(BlockOfCode& code, const Xbyak::Reg64& value, const Xbyak::RegExp& addr) {
    switch (bitsize) {
    case 8:
        code.movzx(value.cvt32(), code.byte[addr]);
        return;
    case 16:
        code.movzx(value.cvt32(), word[addr]);
        return;
    case 32:
        code.mov(value.cvt32(), dword[addr]);
        return;
    case 64:
        code.mov(value, qword[addr]);
        return;
    default:
        ASSERT_FALSE("Invalid bitsize");
    }
}

template<std::size_t bitsize>
void EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, const Xbyak::Reg64& value) {
    switch (bitsize) {
    case 8:
        code.mov(code.byte[addr], value.cvt8());
        return;
    case 16:
        code.mov(word[addr], value.cvt16());
        return;
    case 32:
        code.mov(dword[addr], value.cvt32());
        return;
    case 64:
        code.mov(qword[addr], value);
        return;
    default:
        ASSERT_FALSE("Invalid bitsize");
    }
}

} // anonymous namespace

template<std::size_t bitsize>
void A64EmitX64::EmitDirectPageTableMemoryRead(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]);
    const Xbyak::Reg64 value = ctx.reg_alloc.ScratchGpr();

    const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())];

    Xbyak::Label abort, end;

    const auto src_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr);
    EmitReadMemoryMov<bitsize>(code, value, src_ptr);
    code.L(end);

    code.SwitchToFarCode();
    code.L(abort);
    code.call(wrapped_fn);
    code.jmp(end, code.T_NEAR);
    code.SwitchToNearCode();

    ctx.reg_alloc.DefineValue(inst, value);
}

template<std::size_t bitsize>
void A64EmitX64::EmitDirectPageTableMemoryWrite(A64EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]);
    const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]);

    const auto wrapped_fn = write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())];

    Xbyak::Label abort, end;

    const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr);
    EmitWriteMemoryMov<bitsize>(code, dest_ptr, value);
    code.L(end);

    code.SwitchToFarCode();
    code.L(abort);
    code.call(wrapped_fn);
    code.jmp(end, code.T_NEAR);
    code.SwitchToNearCode();
}

void A64EmitX64::EmitA64ReadMemory8(A64EmitContext& ctx, IR::Inst* inst) {
    if (conf.page_table) {
        EmitDirectPageTableMemoryRead<8>(ctx, inst);
        return;
    }

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.HostCall(inst, {}, args[0]);
    Devirtualize<&A64::UserCallbacks::MemoryRead8>(conf.callbacks).EmitCall(code);
}

void A64EmitX64::EmitA64ReadMemory16(A64EmitContext& ctx, IR::Inst* inst) {
    if (conf.page_table) {
        EmitDirectPageTableMemoryRead<16>(ctx, inst);
        return;
    }

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.HostCall(inst, {}, args[0]);
    Devirtualize<&A64::UserCallbacks::MemoryRead16>(conf.callbacks).EmitCall(code);
}

void A64EmitX64::EmitA64ReadMemory32(A64EmitContext& ctx, IR::Inst* inst) {
    if (conf.page_table) {
        EmitDirectPageTableMemoryRead<32>(ctx, inst);
        return;
    }

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.HostCall(inst, {}, args[0]);
    Devirtualize<&A64::UserCallbacks::MemoryRead32>(conf.callbacks).EmitCall(code);
}

void A64EmitX64::EmitA64ReadMemory64(A64EmitContext& ctx, IR::Inst* inst) {
    if (conf.page_table) {
        EmitDirectPageTableMemoryRead<64>(ctx, inst);
        return;
    }

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.HostCall(inst, {}, args[0]);
    Devirtualize<&A64::UserCallbacks::MemoryRead64>(conf.callbacks).EmitCall(code);
}

void A64EmitX64::EmitA64ReadMemory128(A64EmitContext& ctx, IR::Inst* inst) {
    if (conf.page_table) {
        Xbyak::Label abort, end;

        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
        const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]);
        const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm();

        const auto src_ptr = EmitVAddrLookup(code, ctx, 128, abort, vaddr);
        code.movups(value, xword[src_ptr]);
        code.L(end);

        code.SwitchToFarCode();
        code.L(abort);
        code.call(read_fallbacks[std::make_tuple(128, vaddr.getIdx(), value.getIdx())]);
        code.jmp(end, code.T_NEAR);
        code.SwitchToNearCode();

        ctx.reg_alloc.DefineValue(inst, value);
        return;
    }

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.HostCall(nullptr, {}, args[0]);
    code.CallFunction(memory_read_128);
    ctx.reg_alloc.DefineValue(inst, xmm1);
}

void A64EmitX64::EmitA64WriteMemory8(A64EmitContext& ctx, IR::Inst* inst) {
    if (conf.page_table) {
        EmitDirectPageTableMemoryWrite<8>(ctx, inst);
        return;
    }

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]);
    Devirtualize<&A64::UserCallbacks::MemoryWrite8>(conf.callbacks).EmitCall(code);
}

void A64EmitX64::EmitA64WriteMemory16(A64EmitContext& ctx, IR::Inst* inst) {
    if (conf.page_table) {
        EmitDirectPageTableMemoryWrite<16>(ctx, inst);
        return;
    }

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]);
    Devirtualize<&A64::UserCallbacks::MemoryWrite16>(conf.callbacks).EmitCall(code);
}

void A64EmitX64::EmitA64WriteMemory32(A64EmitContext& ctx, IR::Inst* inst) {
    if (conf.page_table) {
        EmitDirectPageTableMemoryWrite<32>(ctx, inst);
        return;
    }

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]);
    Devirtualize<&A64::UserCallbacks::MemoryWrite32>(conf.callbacks).EmitCall(code);
}

void A64EmitX64::EmitA64WriteMemory64(A64EmitContext& ctx, IR::Inst* inst) {
    if (conf.page_table) {
        EmitDirectPageTableMemoryWrite<64>(ctx, inst);
        return;
    }

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]);
    Devirtualize<&A64::UserCallbacks::MemoryWrite64>(conf.callbacks).EmitCall(code);
}

void A64EmitX64::EmitA64WriteMemory128(A64EmitContext& ctx, IR::Inst* inst) {
    if (conf.page_table) {
        Xbyak::Label abort, end;

        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
        const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]);
        const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[1]);

        const auto dest_ptr = EmitVAddrLookup(code, ctx, 128, abort, vaddr);
        code.movups(xword[dest_ptr], value);
        code.L(end);

        code.SwitchToFarCode();
        code.L(abort);
        code.call(write_fallbacks[std::make_tuple(128, vaddr.getIdx(), value.getIdx())]);
        code.jmp(end, code.T_NEAR);
        code.SwitchToNearCode();
        return;
    }

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.Use(args[0], ABI_PARAM2);
    ctx.reg_alloc.Use(args[1], HostLoc::XMM1);
    ctx.reg_alloc.EndOfAllocScope();
    ctx.reg_alloc.HostCall(nullptr);
    code.CallFunction(memory_write_128);
}

template<std::size_t bitsize, auto callback>
void A64EmitX64::EmitExclusiveReadMemory(A64EmitContext& ctx, IR::Inst* inst) {
    ASSERT(conf.global_monitor != nullptr);
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    if constexpr (bitsize != 128) {
        using T = mp::unsigned_integer_of_size<bitsize>;

        ctx.reg_alloc.HostCall(inst, {}, args[0]);

        code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1));
        code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf));
        code.CallLambda(
            [](A64::UserConfig& conf, u64 vaddr) -> T {
                return conf.global_monitor->ReadAndMark<T>(conf.processor_id, vaddr, [&]() -> T {
                    return (conf.callbacks->*callback)(vaddr);
                });
            }
        );
    } else {
        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
        ctx.reg_alloc.Use(args[0], ABI_PARAM2);
        ctx.reg_alloc.EndOfAllocScope();
        ctx.reg_alloc.HostCall(nullptr);

        code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1));
        code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf));
        code.sub(rsp, 16 + ABI_SHADOW_SPACE);
        code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]);
        code.CallLambda(
            [](A64::UserConfig& conf, u64 vaddr, A64::Vector& ret) {
                ret = conf.global_monitor->ReadAndMark<A64::Vector>(conf.processor_id, vaddr, [&]() -> A64::Vector {
                    return (conf.callbacks->*callback)(vaddr);
                });
            }
        );
        code.movups(result, xword[rsp + ABI_SHADOW_SPACE]);
        code.add(rsp, 16 + ABI_SHADOW_SPACE);

        ctx.reg_alloc.DefineValue(inst, result);
    }
}

void A64EmitX64::EmitA64ExclusiveReadMemory8(A64EmitContext& ctx, IR::Inst* inst) {
    EmitExclusiveReadMemory<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst);
}

void A64EmitX64::EmitA64ExclusiveReadMemory16(A64EmitContext& ctx, IR::Inst* inst) {
    EmitExclusiveReadMemory<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst);
}

void A64EmitX64::EmitA64ExclusiveReadMemory32(A64EmitContext& ctx, IR::Inst* inst) {
    EmitExclusiveReadMemory<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst);
}

void A64EmitX64::EmitA64ExclusiveReadMemory64(A64EmitContext& ctx, IR::Inst* inst) {
    EmitExclusiveReadMemory<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst);
}

void A64EmitX64::EmitA64ExclusiveReadMemory128(A64EmitContext& ctx, IR::Inst* inst) {
    EmitExclusiveReadMemory<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst);
}

template<std::size_t bitsize, auto callback>
void A64EmitX64::EmitExclusiveWriteMemory(A64EmitContext& ctx, IR::Inst* inst) {
    ASSERT(conf.global_monitor != nullptr);
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    if constexpr (bitsize != 128) {
        ctx.reg_alloc.HostCall(inst, {}, args[0], args[1]);
    } else {
        ctx.reg_alloc.Use(args[0], ABI_PARAM2);
        ctx.reg_alloc.Use(args[1], HostLoc::XMM1);
        ctx.reg_alloc.EndOfAllocScope();
        ctx.reg_alloc.HostCall(inst);
    }

    Xbyak::Label end;

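    // Result defaults to 1 (failure): if no exclusive access is pending we skip
    // to the end; otherwise the monitor callback's 0/1 result replaces it.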
    code.mov(code.ABI_RETURN, u32(1));
    code.cmp(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0));
    code.je(end);
    code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0));
    code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf));
    if constexpr (bitsize != 128) {
        using T = mp::unsigned_integer_of_size<bitsize>;

        code.CallLambda(
            [](A64::UserConfig& conf, u64 vaddr, T value) -> u32 {
                return conf.global_monitor->DoExclusiveOperation<T>(conf.processor_id, vaddr,
                    [&](T expected) -> bool {
                        return (conf.callbacks->*callback)(vaddr, value, expected);
                    }) ? 0 : 1;
            }
        );
    } else {
        code.sub(rsp, 16 + ABI_SHADOW_SPACE);
        code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]);
        code.movaps(xword[code.ABI_PARAM3], xmm1);
        code.CallLambda(
            [](A64::UserConfig& conf, u64 vaddr, A64::Vector& value) -> u32 {
                return conf.global_monitor->DoExclusiveOperation<A64::Vector>(conf.processor_id, vaddr,
                    [&](A64::Vector expected) -> bool {
                        return (conf.callbacks->*callback)(vaddr, value, expected);
                    }) ? 0 : 1;
            }
        );
        code.add(rsp, 16 + ABI_SHADOW_SPACE);
    }
    code.L(end);
}

void A64EmitX64::EmitA64ExclusiveWriteMemory8(A64EmitContext& ctx, IR::Inst* inst) {
    EmitExclusiveWriteMemory<8, &A64::UserCallbacks::MemoryWriteExclusive8>(ctx, inst);
}

void A64EmitX64::EmitA64ExclusiveWriteMemory16(A64EmitContext& ctx, IR::Inst* inst) {
    EmitExclusiveWriteMemory<16, &A64::UserCallbacks::MemoryWriteExclusive16>(ctx, inst);
}

void A64EmitX64::EmitA64ExclusiveWriteMemory32(A64EmitContext& ctx, IR::Inst* inst) {
    EmitExclusiveWriteMemory<32, &A64::UserCallbacks::MemoryWriteExclusive32>(ctx, inst);
}

void A64EmitX64::EmitA64ExclusiveWriteMemory64(A64EmitContext& ctx, IR::Inst* inst) {
    EmitExclusiveWriteMemory<64, &A64::UserCallbacks::MemoryWriteExclusive64>(ctx, inst);
}

void A64EmitX64::EmitA64ExclusiveWriteMemory128(A64EmitContext& ctx, IR::Inst* inst) {
    EmitExclusiveWriteMemory<128, &A64::UserCallbacks::MemoryWriteExclusive128>(ctx, inst);
}

std::string A64EmitX64::LocationDescriptorToFriendlyName(const IR::LocationDescriptor& ir_descriptor) const {
    const A64::LocationDescriptor descriptor{ir_descriptor};
    return fmt::format("a64_{:016X}_fpcr{:08X}",
                       descriptor.PC(),
                       descriptor.FPCR().Value());
}

void A64EmitX64::EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor, bool) {
    code.SwitchMxcsrOnExit();
    Devirtualize<&A64::UserCallbacks::InterpreterFallback>(conf.callbacks).EmitCall(code,
        [&](RegList param) {
            code.mov(param[0], A64::LocationDescriptor{terminal.next}.PC());
            code.mov(qword[r15 + offsetof(A64JitState, pc)], param[0]);
            code.mov(param[1].cvt32(), terminal.num_instructions);
        });
    code.ReturnFromRunCode(true); // TODO: Check cycles
}

void A64EmitX64::EmitTerminalImpl(IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) {
    code.ReturnFromRunCode();
}

void A64EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor, bool is_single_step) {
    if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) {
        code.mov(rax, A64::LocationDescriptor{terminal.next}.PC());
        code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
        code.ReturnFromRunCode();
        return;
    }

    code.cmp(qword[r15 + offsetof(A64JitState, cycles_remaining)], 0);

    patch_information[terminal.next].jg.emplace_back(code.getCurr());
    if (auto next_bb = GetBasicBlock(terminal.next)) {
        EmitPatchJg(terminal.next, next_bb->entrypoint);
    } else {
        EmitPatchJg(terminal.next);
    }
    code.mov(rax, A64::LocationDescriptor{terminal.next}.PC());
    code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
    code.ForceReturnFromRunCode();
}

void A64EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor, bool is_single_step) {
    if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) {
        code.mov(rax, A64::LocationDescriptor{terminal.next}.PC());
        code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
        code.ReturnFromRunCode();
        return;
    }

    patch_information[terminal.next].jmp.emplace_back(code.getCurr());
    if (auto next_bb = GetBasicBlock(terminal.next)) {
        EmitPatchJmp(terminal.next, next_bb->entrypoint);
    } else {
        EmitPatchJmp(terminal.next);
    }
}

void A64EmitX64::EmitTerminalImpl(IR::Term::PopRSBHint, IR::LocationDescriptor, bool is_single_step) {
    if (!conf.HasOptimization(OptimizationFlag::ReturnStackBuffer) || is_single_step) {
        code.ReturnFromRunCode();
        return;
    }

    code.jmp(terminal_handler_pop_rsb_hint);
}

void A64EmitX64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor, bool is_single_step) {
    if (!conf.HasOptimization(OptimizationFlag::FastDispatch) || is_single_step) {
        code.ReturnFromRunCode();
        return;
    }

    code.jmp(terminal_handler_fast_dispatch_hint);
}

void A64EmitX64::EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
    switch (terminal.if_) {
    case IR::Cond::AL:
    case IR::Cond::NV:
        EmitTerminal(terminal.then_, initial_location, is_single_step);
        break;
    default:
        Xbyak::Label pass = EmitCond(terminal.if_);
        EmitTerminal(terminal.else_, initial_location, is_single_step);
        code.L(pass);
        EmitTerminal(terminal.then_, initial_location, is_single_step);
        break;
    }
}

void A64EmitX64::EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
    Xbyak::Label fail;
    code.cmp(code.byte[r15 + offsetof(A64JitState, check_bit)], u8(0));
    code.jz(fail);
    EmitTerminal(terminal.then_, initial_location, is_single_step);
    code.L(fail);
    EmitTerminal(terminal.else_, initial_location, is_single_step);
}

void A64EmitX64::EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
    code.cmp(code.byte[r15 + offsetof(A64JitState, halt_requested)], u8(0));
    code.jne(code.GetForceReturnFromRunCodeAddress());
    EmitTerminal(terminal.else_, initial_location, is_single_step);
}

void A64EmitX64::EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) {
    const CodePtr patch_location = code.getCurr();
    if (target_code_ptr) {
        code.jg(target_code_ptr);
    } else {
        code.mov(rax, A64::LocationDescriptor{target_desc}.PC());
        code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
        code.jg(code.GetReturnFromRunCodeAddress());
    }
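    // Pad the patch site to a fixed size so it can later be overwritten in place.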
    code.EnsurePatchLocationSize(patch_location, 23);
}

void A64EmitX64::EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) {
    const CodePtr patch_location = code.getCurr();
    if (target_code_ptr) {
        code.jmp(target_code_ptr);
    } else {
        code.mov(rax, A64::LocationDescriptor{target_desc}.PC());
        code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
        code.jmp(code.GetReturnFromRunCodeAddress());
    }
    code.EnsurePatchLocationSize(patch_location, 22);
}

void A64EmitX64::EmitPatchMovRcx(CodePtr target_code_ptr) {
    if (!target_code_ptr) {
        target_code_ptr = code.GetReturnFromRunCodeAddress();
    }
    const CodePtr patch_location = code.getCurr();
    code.mov(code.rcx, reinterpret_cast<u64>(target_code_ptr));
    code.EnsurePatchLocationSize(patch_location, 10);
}

void A64EmitX64::Unpatch(const IR::LocationDescriptor& location) {
    EmitX64::Unpatch(location);
    if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
        (*fast_dispatch_table_lookup)(location.Value()) = {};
    }
}

} // namespace Dynarmic::Backend::X64