diff --git a/src/dynarmic/backend/arm64/a64_address_space.cpp b/src/dynarmic/backend/arm64/a64_address_space.cpp
index fe063f5a..e6045ef8 100644
--- a/src/dynarmic/backend/arm64/a64_address_space.cpp
+++ b/src/dynarmic/backend/arm64/a64_address_space.cpp
@@ -96,6 +96,123 @@ static void* EmitExclusiveWriteCallTrampoline(oaknut::CodeGenerator& code, const
     return target;
 }
 
+/* =========================== 128-bit versions =========================== */
+// Emits a trampoline that calls the MemoryRead128 callback and leaves the X0:X1 pair it finds afterwards in Q0; returns the trampoline's start.
+static void* EmitRead128CallTrampoline(oaknut::CodeGenerator& code, A64::UserCallbacks* this_) {
+    using namespace oaknut::util;
+
+    const auto info = Devirtualize<&A64::UserCallbacks::MemoryRead128>(this_);  // supplies the this_ptr / fn_ptr literals emitted below
+
+    oaknut::Label l_addr, l_this;
+
+    void* target = code.ptr();  // captured before anything is emitted; NOTE(review): an explicit template argument may have been lost in extraction - confirm
+    ABI_PushRegisters(code, (1ull << 29) | (1ull << 30), sizeof(Vector));  // save list has bits 29 and 30 set (presumably X29/X30 - confirm RegisterList encoding); also requests sizeof(Vector) bytes of frame
+    code.LDR(X0, l_this);         // X0 = literal stored at l_this (info.this_ptr)
+    code.LDR(Xscratch0, l_addr);  // Xscratch0 = literal stored at l_addr (info.fn_ptr)
+    code.BLR(Xscratch0);          // call with link, so execution resumes at the next instruction
+    code.STP(X0, X1, SP);         // spill the X0:X1 pair to [SP] (assumes the requested frame sits at SP - confirm in abi.cpp) ...
+    code.LDR(Q0, SP);             // ... and reload the same 16 bytes as Q0
+    ABI_PopRegisters(code, (1ull << 29) | (1ull << 30), sizeof(Vector));  // must mirror the push above
+    code.RET();
+
+    code.align(8);  // literal pool read by the two LDRs above
+    code.l(l_this);
+    code.dx(info.this_ptr);
+    code.l(l_addr);
+    code.dx(info.fn_ptr);
+
+    return target;
+}
+// Emits a trampoline that calls a local helper which reads through conf.global_monitor->ReadAndMark, then leaves the X0:X1 pair in Q0.
+static void* EmitExclusiveRead128CallTrampoline(oaknut::CodeGenerator& code, const A64::UserConfig& conf) {
+    using namespace oaknut::util;
+
+    oaknut::Label l_addr, l_this;
+
+    auto fn = [](const A64::UserConfig& conf, A64::VAddr vaddr) -> Vector {  // helper the trampoline calls; its first argument is the &conf literal loaded into X0 below
+        return conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> Vector {
+            return conf.callbacks->MemoryRead128(vaddr);  // the actual read is delegated to the user callback
+        });
+    };
+
+    void* target = code.ptr();  // captured before anything is emitted
+    ABI_PushRegisters(code, (1ull << 29) | (1ull << 30), sizeof(Vector));  // same save list and frame size as EmitRead128CallTrampoline
+    code.LDR(X0, l_this);         // X0 = literal stored at l_this (&conf)
+    code.LDR(Xscratch0, l_addr);  // Xscratch0 = literal stored at l_addr (address of fn)
+    code.BLR(Xscratch0);
+    code.STP(X0, X1, SP);         // spill the X0:X1 pair to [SP] ...
+    code.LDR(Q0, SP);             // ... and reload the same 16 bytes as Q0
+    ABI_PopRegisters(code, (1ull << 29) | (1ull << 30), sizeof(Vector));
+    code.RET();
+
+    code.align(8);
+    code.l(l_this);
+    code.dx(mcl::bit_cast(&conf));  // bakes the address of conf into the emitted code: the referenced UserConfig must stay valid while this trampoline can run
+    code.l(l_addr);
+    code.dx(mcl::bit_cast(Common::FptrCast(fn)));  // NOTE(review): bit_cast appears without a target type here - possibly stripped in extraction, confirm
+
+    return target;
+}
+// Emits a trampoline that copies Q0 into X2:X3 and then branches (BR, no link) to the MemoryWrite128 callback; returns the trampoline's start.
+static void* EmitWrite128CallTrampoline(oaknut::CodeGenerator& code, A64::UserCallbacks* this_) {
+    using namespace oaknut::util;
+
+    const auto info = Devirtualize<&A64::UserCallbacks::MemoryWrite128>(this_);  // supplies the this_ptr / fn_ptr literals emitted below
+
+    oaknut::Label l_addr, l_this;
+
+    void* target = code.ptr();  // captured before anything is emitted; NOTE(review): an explicit template argument may have been lost in extraction - confirm
+    ABI_PushRegisters(code, 0, sizeof(Vector));  // empty save list: only requests sizeof(Vector) bytes of scratch frame
+    code.STR(Q0, SP);                            // spill Q0 to [SP] (assumes the requested frame sits at SP - confirm in abi.cpp) ...
+    code.LDP(X2, X3, SP);                        // ... and reload the same 16 bytes as the X2:X3 pair
+    ABI_PopRegisters(code, 0, sizeof(Vector));   // release the scratch frame before branching away
+
+    code.LDR(X0, l_this);         // X0 = literal stored at l_this (info.this_ptr)
+    code.LDR(Xscratch0, l_addr);  // Xscratch0 = literal stored at l_addr (info.fn_ptr)
+    code.BR(Xscratch0);           // branch without link: nothing after this instruction is executed by the trampoline
+
+    code.align(8);  // literal pool read by the two LDRs above
+    code.l(l_this);
+    code.dx(info.this_ptr);
+    code.l(l_addr);
+    code.dx(info.fn_ptr);
+
+    return target;
+}
+// Emits a trampoline that copies Q0 into X2:X3 and branches to a local helper wrapping conf.global_monitor->DoExclusiveOperation.
+static void* EmitExclusiveWrite128CallTrampoline(oaknut::CodeGenerator& code, const A64::UserConfig& conf) {
+    using namespace oaknut::util;
+
+    oaknut::Label l_addr, l_this;
+
+    auto fn = [](const A64::UserConfig& conf, A64::VAddr vaddr, Vector value) -> u32 {  // helper the trampoline branches to; its first argument is the &conf literal loaded into X0 below
+        return conf.global_monitor->DoExclusiveOperation(conf.processor_id, vaddr,
+                                                         [&](Vector expected) -> bool {
+                                                             return conf.callbacks->MemoryWriteExclusive128(vaddr, value, expected);
+                                                         })
+                 ? 0   // helper yields 0 when DoExclusiveOperation returns true
+                 : 1;  // and 1 otherwise
+    };
+
+    void* target = code.ptr();  // captured before anything is emitted
+    ABI_PushRegisters(code, 0, sizeof(Vector));  // empty save list: only requests sizeof(Vector) bytes of scratch frame
+    code.STR(Q0, SP);                            // spill Q0 to [SP] ...
+    code.LDP(X2, X3, SP);                        // ... and reload the same 16 bytes as the X2:X3 pair
+    ABI_PopRegisters(code, 0, sizeof(Vector));
+
+    code.LDR(X0, l_this);         // X0 = literal stored at l_this (&conf)
+    code.LDR(Xscratch0, l_addr);  // Xscratch0 = literal stored at l_addr (address of fn)
+    code.BR(Xscratch0);           // branch without link
+
+    code.align(8);
+    code.l(l_this);
+    code.dx(mcl::bit_cast(&conf));  // bakes the address of conf into the emitted code: the referenced UserConfig must stay valid while this trampoline can run
+    code.l(l_addr);
+    code.dx(mcl::bit_cast(Common::FptrCast(fn)));  // NOTE(review): bit_cast appears without a target type here - possibly stripped in extraction, confirm
+
+    return target;
+}
+
 A64AddressSpace::A64AddressSpace(const A64::UserConfig& conf)
         : conf(conf)
         , mem(conf.code_cache_size)
@@ -161,22 +278,22 @@ void A64AddressSpace::EmitPrelude() {
     prelude_info.read_memory_16 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead16>(code, conf.callbacks);
     prelude_info.read_memory_32 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead32>(code, conf.callbacks);
     prelude_info.read_memory_64 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead64>(code, conf.callbacks);
-    prelude_info.read_memory_128 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead128>(code, conf.callbacks);
+    prelude_info.read_memory_128 = EmitRead128CallTrampoline(code, conf.callbacks);  // 128-bit read now uses the dedicated trampoline instead of the generic template
     prelude_info.exclusive_read_memory_8 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead8, u8>(code, conf);
     prelude_info.exclusive_read_memory_16 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead16, u16>(code, conf);
     prelude_info.exclusive_read_memory_32 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead32, u32>(code, conf);
     prelude_info.exclusive_read_memory_64 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead64, u64>(code, conf);
-    prelude_info.exclusive_read_memory_128 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead128, Vector>(code, conf);
+    prelude_info.exclusive_read_memory_128 = EmitExclusiveRead128CallTrampoline(code, conf);  // dedicated 128-bit exclusive-read trampoline
     prelude_info.write_memory_8 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite8>(code, conf.callbacks);
     prelude_info.write_memory_16 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite16>(code, conf.callbacks);
     prelude_info.write_memory_32 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite32>(code, conf.callbacks);
     prelude_info.write_memory_64 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite64>(code, conf.callbacks);
-    prelude_info.write_memory_128 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite128>(code, conf.callbacks);
+    prelude_info.write_memory_128 = EmitWrite128CallTrampoline(code, conf.callbacks);  // dedicated 128-bit write trampoline
     prelude_info.exclusive_write_memory_8 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive8, u8>(code, conf);
     prelude_info.exclusive_write_memory_16 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive16, u16>(code, conf);
     prelude_info.exclusive_write_memory_32 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive32, u32>(code, conf);
     prelude_info.exclusive_write_memory_64 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive64, u64>(code, conf);
-    prelude_info.exclusive_write_memory_128 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive128, Vector>(code, conf);
+    prelude_info.exclusive_write_memory_128 = EmitExclusiveWrite128CallTrampoline(code, conf);  // dedicated 128-bit exclusive-write trampoline
     prelude_info.call_svc = EmitCallTrampoline<&A64::UserCallbacks::CallSVC>(code, conf.callbacks);
     prelude_info.exception_raised = EmitCallTrampoline<&A64::UserCallbacks::ExceptionRaised>(code, conf.callbacks);
     prelude_info.isb_raised = EmitCallTrampoline<&A64::UserCallbacks::InstructionSynchronizationBarrierRaised>(code, conf.callbacks);
diff --git a/src/dynarmic/backend/arm64/abi.cpp b/src/dynarmic/backend/arm64/abi.cpp
index b34f807b..e6913f0d 100644
--- a/src/dynarmic/backend/arm64/abi.cpp
+++ b/src/dynarmic/backend/arm64/abi.cpp
@@ -55,13 +55,15 @@ static FrameInfo CalculateFrameInfo(RegisterList rl, size_t frame_size) {
     };
 }
 
-#define DO_IT(TYPE, REG_TYPE, PAIR_OP, SINGLE_OP, OFFSET) \
-    for (size_t i = 0; i < frame_info.TYPE##s.size() - 1; i += 2) { \
-        code.PAIR_OP(oaknut::REG_TYPE{frame_info.TYPE##s[i]}, oaknut::REG_TYPE{frame_info.TYPE##s[i + 1]}, SP, (OFFSET) + i * TYPE##_size); \
-    } \
-    if (frame_info.TYPE##s.size() % 2 == 1) { \
-        const size_t i = frame_info.TYPE##s.size() - 1; \
-        code.SINGLE_OP(oaknut::REG_TYPE{frame_info.TYPE##s[i]}, SP, (OFFSET) + i * TYPE##_size); \
+#define DO_IT(TYPE, REG_TYPE, PAIR_OP, SINGLE_OP, OFFSET) /* emits PAIR_OP for each register pair and SINGLE_OP for an odd leftover */ \
+    if (frame_info.TYPE##s.size() > 0) { /* skip empty lists; presumably guards the unsigned size() - 1 loop bound below */ \
+        for (size_t i = 0; i < frame_info.TYPE##s.size() - 1; i += 2) { \
+            code.PAIR_OP(oaknut::REG_TYPE{frame_info.TYPE##s[i]}, oaknut::REG_TYPE{frame_info.TYPE##s[i + 1]}, SP, (OFFSET) + i * TYPE##_size); \
+        } \
+        if (frame_info.TYPE##s.size() % 2 == 1) { /* odd count: the last register has no partner */ \
+            const size_t i = frame_info.TYPE##s.size() - 1; \
+            code.SINGLE_OP(oaknut::REG_TYPE{frame_info.TYPE##s[i]}, SP, (OFFSET) + i * TYPE##_size); \
+        } \
     }
 
 void ABI_PushRegisters(oaknut::CodeGenerator& code, RegisterList rl, size_t frame_size) {
diff --git a/src/dynarmic/backend/arm64/emit_arm64_a64.cpp b/src/dynarmic/backend/arm64/emit_arm64_a64.cpp
index ca19890c..df593085 100644
--- a/src/dynarmic/backend/arm64/emit_arm64_a64.cpp
+++ b/src/dynarmic/backend/arm64/emit_arm64_a64.cpp
@@ -465,7 +465,7 @@ void EmitIR(oaknut::CodeGenerator& code, EmitContext& c
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     auto Xvalue = ctx.reg_alloc.ReadX(args[0]);
     RegAlloc::Realize(Xvalue);
-    code.MOV(Xscratch0, mcl::bit_cast(ctx.conf.tpidrro_el0));
+    code.MOV(Xscratch0, mcl::bit_cast(ctx.conf.tpidr_el0));  // the STR below now targets conf.tpidr_el0 instead of conf.tpidrro_el0
     code.STR(Xvalue, Xscratch0);
 }
 
diff --git a/src/dynarmic/backend/arm64/emit_arm64_data_processing.cpp b/src/dynarmic/backend/arm64/emit_arm64_data_processing.cpp
index 9321f45c..5cf5f49f 100644
--- a/src/dynarmic/backend/arm64/emit_arm64_data_processing.cpp
+++ b/src/dynarmic/backend/arm64/emit_arm64_data_processing.cpp
@@ -1353,9 +1353,13 @@ void EmitIR(oaknut::CodeGenerator&, EmitContex
 }
 
 template<>
-void EmitIR(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) {
+void EmitIR(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {  // NOTE(review): the opcode template argument is not visible here - possibly stripped in extraction, confirm
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    ctx.reg_alloc.DefineAsExisting(inst, args[0]);
+    auto Xvalue = ctx.reg_alloc.ReadX(args[0]);  // source operand read as an X register
+    auto Qresult = ctx.reg_alloc.WriteQ(inst);   // result is now a freshly written Q register rather than an alias of args[0]
+    RegAlloc::Realize(Xvalue, Qresult);
+
+    code.FMOV(Qresult->toD(), Xvalue);  // copy Xvalue into the D view of the result; NOTE(review): relies on a D-register write clearing the upper 64 bits - confirm against the A64 ISA
 }
 
 template<>
diff --git a/src/dynarmic/backend/arm64/reg_alloc.cpp b/src/dynarmic/backend/arm64/reg_alloc.cpp
index 82bb02e7..2dff0080 100644
--- a/src/dynarmic/backend/arm64/reg_alloc.cpp
+++ b/src/dynarmic/backend/arm64/reg_alloc.cpp
@@ -157,15 +157,36 @@ void RegAlloc::PrepareForCall(IR::Inst* result, std::optional, 4> args{arg0, arg1, arg2, arg3};
+
+    // AAPCS64 Next General-purpose Register Number
+    int ngrn = 0;  // index of the next X register to hand out
+    // AAPCS64 Next SIMD and Floating-point Register Number
+    int nsrn = 0;  // index of the next Q register to hand out
+
     for (int i = 0; i < 4; i++) {
         if (args[i]) {
-            ASSERT(gprs[i].IsCompletelyEmpty());
-            LoadCopyInto(args[i]->get().value, oaknut::XReg{i});
+            if (args[i]->get().GetType() == IR::Type::U128) {  // U128 arguments are placed in the next Q register
+                ASSERT(fprs[nsrn].IsCompletelyEmpty());
+                LoadCopyInto(args[i]->get().value, oaknut::QReg{nsrn});
+                nsrn++;
+            } else {  // every other type is placed in the next X register
+                ASSERT(gprs[ngrn].IsCompletelyEmpty());
+                LoadCopyInto(args[i]->get().value, oaknut::XReg{ngrn});
+                ngrn++;
+            }
+        } else {
+            // Gaps are assumed to be in general-purpose registers
+            // TODO: should there be a separate list passed for FPRs instead?
+            ngrn++;  // an absent argument still consumes one X register slot
         }
     }
 
     if (result) {
-        DefineAsRegister(result, X0);
+        if (result->GetType() == IR::Type::U128) {  // U128 results are bound to Q0, all other results to X0
+            DefineAsRegister(result, Q0);
+        } else {
+            DefineAsRegister(result, X0);
+        }
     }
 }