diff --git a/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp b/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp index 087499ce..ce9bab2f 100644 --- a/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp +++ b/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp @@ -3,15 +3,31 @@ * SPDX-License-Identifier: 0BSD */ +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include "dynarmic/backend/arm64/a32_jitstate.h" +#include "dynarmic/backend/arm64/a64_jitstate.h" #include "dynarmic/backend/arm64/abi.h" #include "dynarmic/backend/arm64/emit_arm64.h" #include "dynarmic/backend/arm64/emit_context.h" #include "dynarmic/backend/arm64/fpsr_manager.h" #include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/common/cast_util.h" +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" #include "dynarmic/common/fp/info.h" +#include "dynarmic/common/fp/op.h" +#include "dynarmic/common/fp/rounding_mode.h" +#include "dynarmic/common/lut_from_list.h" #include "dynarmic/ir/basic_block.h" #include "dynarmic/ir/microinstruction.h" #include "dynarmic/ir/opcodes.h" @@ -19,6 +35,15 @@ namespace Dynarmic::Backend::Arm64 { using namespace oaknut::util; +namespace mp = mcl::mp; + +using A64FullVectorWidth = std::integral_constant; + +// Array alias that always sizes itself according to the given type T +// relative to the size of a vector register. e.g. T = u32 would result +// in a std::array<u32, 4>. 
// NOTE: the line below is flattened unified-diff text; a leading '+' marks text the patch adds and
// '@@ ... @@' marks a hunk header. Template parameter lists and template arguments appear to have
// been lost in extraction (e.g. "template static void", "static_cast*>") -- TODO restore from the
// real file before relying on any signature shown here.
//
// VectorArray: std::array alias used by the fallback lambdas as the element-wise view of one
// vector operand (its template arguments are not visible here).
//
// EmitTwoOpFallbackWithoutRegAlloc(code, ctx, Qresult, Qarg1, lambda, fpcr_controlled)
//   Emits a call to a host function implementing a one-input / one-output vector operation.
//   The caller must already have realized Qarg1 and Qresult; this helper does no register allocation.
//   Emitted sequence, in order:
//     1. `lambda` is converted to a plain function pointer `fn`; the FPCR value is taken from
//        ctx.FPCR(fpcr_controlled).Value() and baked into the generated code as an immediate.
//     2. ABI_PushRegisters(ABI_CALLER_SAVE minus one mask bit, stack_size) with stack_size =
//        4 * sizeof(u64) = 32 bytes, i.e. two 16-byte slots (presumably reserved at SP -- confirm
//        against ABI_PushRegisters).
//     3. Xscratch0 = SP; Xscratch1 = 64-bit literal loaded from label fn_ptr.
//     4. X0 = Xscratch0 + 0 (output slot), X1 = Xscratch0 + 16 (input slot), X2 = fpcr,
//        X3 = Xstate + offsetof(JitState, fpsr); Qarg1 is stored to [X1]; BLR Xscratch1.
//     5. Qresult is loaded back from [SP] (the output slot), then ABI_PopRegisters with the same
//        mask and size.
//     6. B end; the function address is emitted as 8-byte-aligned data at fn_ptr, and `end` is
//        bound after it so execution skips the literal.
//   NOTE(review): the save mask clears bit `Qresult.index()`. Whether that bit denotes the Q register
//   (rather than a general-purpose register with the same index) depends on the RegisterList layout
//   in abi.h, which is not visible here. If Qresult were still in the pushed set, ABI_PopRegisters
//   would overwrite the value just reloaded from SP -- confirm.
//
// EmitTwoOpFallback(code, ctx, inst, lambda)
//   Register-allocating wrapper: reads args[0] as a Q register, allocates a Q register for the
//   result of `inst`, realizes both, calls ctx.reg_alloc.SpillFlags() and ctx.fpsr.Spill(), reads
//   the fpcr_controlled flag from args[fpcr_controlled_arg_index].GetImmediateU1(), and then
//   delegates to EmitTwoOpFallbackWithoutRegAlloc. The spills presumably exist because the called
//   function may clobber flags and accesses fpsr through JitState -- TODO confirm.
+template +using VectorArray = std::array>; template static void MaybeStandardFPSCRValue(oaknut::CodeGenerator& code, EmitContext& ctx, bool fpcr_controlled, EmitFn emit) { @@ -233,6 +258,51 @@ void EmitToFixed(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) }); } +template +static void EmitTwoOpFallbackWithoutRegAlloc(oaknut::CodeGenerator& code, EmitContext& ctx, oaknut::QReg Qresult, oaknut::QReg Qarg1, Lambda lambda, bool fpcr_controlled) { + const auto fn = static_cast*>(lambda); + + const u32 fpcr = ctx.FPCR(fpcr_controlled).Value(); + constexpr u64 stack_size = sizeof(u64) * 4; // sizeof(u128)*2 + oaknut::Label fn_ptr, end; + + ABI_PushRegisters(code, ABI_CALLER_SAVE & ~(1ull << Qresult.index()), stack_size); + code.MOV(Xscratch0, SP); + code.LDR(Xscratch1, fn_ptr); + + // Call lambda(Vec&, Vec&, fpcr, fpsr&) + code.ADD(X0, Xscratch0, 0 * 16); + code.ADD(X1, Xscratch0, 1 * 16); + code.MOV(X2, fpcr); + code.ADD(X3, Xstate, offsetof(JitState, fpsr)); + code.STR(Qarg1, X1); + code.BLR(Xscratch1); + + // Reload result + code.LDR(Qresult, SP); + ABI_PopRegisters(code, ABI_CALLER_SAVE & ~(1ull << Qresult.index()), stack_size); + + code.B(end); + code.align(8); + code.l(fn_ptr); + code.dx(mcl::bit_cast(fn)); + code.l(end); +} + +template +static void EmitTwoOpFallback(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qarg1 = ctx.reg_alloc.ReadQ(args[0]); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + RegAlloc::Realize(Qarg1, Qresult); + + ctx.reg_alloc.SpillFlags(); + ctx.fpsr.Spill(); + + const bool fpcr_controlled = args[fpcr_controlled_arg_index].GetImmediateU1(); + EmitTwoOpFallbackWithoutRegAlloc(code, ctx, Qresult, Qarg1, lambda, fpcr_controlled); +} + template<> void EmitIR(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { constexpr u16 non_sign_mask = FP::FPInfo::sign_mask - u16{1u}; @@ -494,10 +564,39 @@ void EmitIR(oaknut::CodeGenerator& 
// NOTE: the line below is flattened unified-diff text; '-' marks removed text, '+' marks added text.
// The leading "code, E" is the tail of the truncated hunk-header context. The opcode template
// argument of this EmitIR specialization and the values inside rounding_list/exact_list appear to
// have been lost in extraction -- TODO restore from the real file.
//
// EmitIR specialization (16-bit element variant, per `using FPT = u16`):
//   Replaces the previous stub, which discarded its arguments and hit ASSERT_FALSE("Unimplemented"),
//   with a software fallback that rounds each vector element via FP::FPRoundInt.
//   - `rounding` is read from instruction argument 1 (a U8 cast to an enum whose name is stripped)
//     and `exact` from argument 2 (U1).
//   - `lut` is a function-local static built once by Common::GenerateLookupTableFromList over
//     mp::cartesian_product of rounding_list (five entries) and exact_list. Each entry pairs the
//     lowered tuple key with a function pointer (Common::FptrCast) to a lambda taking
//     (output, input, fpcr, fpsr). The lambda takes rounding_mode and exact from its list element I
//     as compile-time constants and writes FPRoundInt(input[i], fpcr, rounding_mode, exact, fpsr)
//     into output[i] for every element of output.
//   - The lambda's `exact` constant is distinct from this function's `exact` local of the same name.
//   - Dispatch: ctx.conf.is_a64 selects A64JitState or A32JitState. The template argument 3 is the
//     fpcr_controlled_arg_index passed to EmitTwoOpFallback, so the fpcr_controlled flag is read
//     from instruction argument 3.
//   NOTE(review): the table is queried with lut.at(std::make_tuple(rounding, exact)); what happens
//   for a rounding mode that is not in rounding_list depends on the table's type, which is not
//   visible here -- confirm.
code, E template<> void EmitIR(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { - (void)code; - (void)ctx; - (void)inst; - ASSERT_FALSE("Unimplemented"); + const auto rounding = static_cast(inst->GetArg(1).GetU8()); + const bool exact = inst->GetArg(2).GetU1(); + + using rounding_list = mp::list< + mp::lift_value, + mp::lift_value, + mp::lift_value, + mp::lift_value, + mp::lift_value>; + using exact_list = mp::list; + + static const auto lut = Common::GenerateLookupTableFromList( + [](I) { + using FPT = u16; + return std::pair{ + mp::lower_to_tuple_v, + Common::FptrCast( + [](VectorArray& output, const VectorArray& input, FP::FPCR fpcr, FP::FPSR& fpsr) { + constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value; + constexpr bool exact = mp::get<1, I>::value; + + for (size_t i = 0; i < output.size(); ++i) { + output[i] = static_cast(FP::FPRoundInt(input[i], fpcr, rounding_mode, exact, fpsr)); + } + })}; + }, + mp::cartesian_product{}); + + if (ctx.conf.is_a64) { + EmitTwoOpFallback<3, A64JitState>(code, ctx, inst, lut.at(std::make_tuple(rounding, exact))); + } else { + EmitTwoOpFallback<3, A32JitState>(code, ctx, inst, lut.at(std::make_tuple(rounding, exact))); + } } template<>