diff --git a/src/backend_x64/emit_x64_vector_floating_point.cpp b/src/backend_x64/emit_x64_vector_floating_point.cpp index e96ef6b9..9ebbcb1e 100644 --- a/src/backend_x64/emit_x64_vector_floating_point.cpp +++ b/src/backend_x64/emit_x64_vector_floating_point.cpp @@ -4,9 +4,12 @@ * General Public License version 2 or any later version. */ +#include + #include "backend_x64/abi.h" #include "backend_x64/block_of_code.h" #include "backend_x64/emit_x64.h" +#include "common/bit_util.h" #include "common/fp_util.h" #include "frontend/ir/basic_block.h" #include "frontend/ir/microinstruction.h" @@ -15,6 +18,74 @@ namespace Dynarmic::BackendX64 { using namespace Xbyak::util; +template +struct NaNWrapper; + +template <> +struct NaNWrapper { + static constexpr u32 value = 0x7fc00000; +}; + +template <> +struct NaNWrapper { + static constexpr u64 value = 0x7ff8'0000'0000'0000; +}; + +template +static void HandleNaNs(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& xmm_a, + const Xbyak::Xmm& xmm_b, const Xbyak::Xmm& result, const Xbyak::Xmm& nan_mask) { + static_assert(std::is_same_v || std::is_same_v, "T must be either u32 or u64"); + + if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { + code.ptest(nan_mask, nan_mask); + } else { + const Xbyak::Reg32 bitmask = ctx.reg_alloc.ScratchGpr().cvt32(); + code.movmskps(bitmask, nan_mask); + code.cmp(bitmask, 0); + } + + Xbyak::Label end; + Xbyak::Label nan; + + code.jz(end); + code.jmp(nan, code.T_NEAR); + code.L(end); + + code.SwitchToFarCode(); + code.L(nan); + code.sub(rsp, 8); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + const size_t stack_space = 3 * 16; + code.sub(rsp, stack_space + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]); + code.movaps(xword[code.ABI_PARAM1], result); + code.movaps(xword[code.ABI_PARAM2], xmm_a); + code.movaps(xword[code.ABI_PARAM3], xmm_b); + + using Elements = std::integral_constant()>; + using RegArray = std::array; + code.CallFunction(static_cast( + [](RegArray& result, const RegArray& a, const RegArray& b) { + for (size_t i = 0; i < result.size(); ++i) { + if (auto r = Common::ProcessNaNs(a[i], b[i])) { + result[i] = *r; + } else if (Common::IsNaN(result[i])) { + result[i] = NaNWrapper::value; + } + } + } + )); + + code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]); + code.add(rsp, stack_space + ABI_SHADOW_SPACE); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + code.add(rsp, 8); + code.jmp(end, code.T_NEAR); + code.SwitchToNearCode(); +} + template static void EmitVectorOperation32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) { @@ -42,7 +113,6 @@ static void EmitVectorOperation32(BlockOfCode& code, EmitContext& ctx, IR::Inst* auto args = ctx.reg_alloc.GetArgumentInfo(inst); - Xbyak::Label end, nan; Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]); Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); @@ -53,46 +123,8 @@ static void EmitVectorOperation32(BlockOfCode& code, EmitContext& ctx, IR::Inst* code.cmpunordps(nan_mask, xmm_a); (code.*fn)(result, xmm_b); code.cmpunordps(nan_mask, result); - if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { - code.ptest(nan_mask, nan_mask); - } else { - Xbyak::Reg32 bitmask = ctx.reg_alloc.ScratchGpr().cvt32(); - code.movmskps(bitmask, nan_mask); - code.cmp(bitmask, 0); - } - code.jz(end); - code.jmp(nan, code.T_NEAR); - code.L(end); - code.SwitchToFarCode(); - code.L(nan); - code.sub(rsp, 8); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); - const size_t stack_space = 3 * 16; - code.sub(rsp, stack_space + ABI_SHADOW_SPACE); - code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]); - code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]); - code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]); - code.movaps(xword[code.ABI_PARAM1], result); - code.movaps(xword[code.ABI_PARAM2], xmm_a); - code.movaps(xword[code.ABI_PARAM3], xmm_b); - code.CallFunction(static_cast&, const std::array&, const std::array&)>( - [](std::array& result, const std::array& a, const std::array& b) { - for (size_t i = 0; i < result.size(); ++i) { - if (auto r = Common::ProcessNaNs(a[i], b[i])) { - result[i] = *r; - } else if (Common::IsNaN(result[i])) { - result[i] = 0x7fc00000; - } - } - } - )); - code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]); - code.add(rsp, stack_space + ABI_SHADOW_SPACE); - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); - code.add(rsp, 8); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); + HandleNaNs(code, ctx, xmm_a, xmm_b, result, nan_mask); ctx.reg_alloc.DefineValue(inst, result); } @@ -135,46 +167,8 @@ static void EmitVectorOperation64(BlockOfCode& code, EmitContext& ctx, IR::Inst* code.cmpunordpd(nan_mask, xmm_a); (code.*fn)(result, xmm_b); code.cmpunordpd(nan_mask, result); - if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { - code.ptest(nan_mask, nan_mask); - } else { - Xbyak::Reg32 bitmask = ctx.reg_alloc.ScratchGpr().cvt32(); - code.movmskps(bitmask, nan_mask); - code.cmp(bitmask, 0); - } - code.jz(end); - code.jmp(nan, code.T_NEAR); - code.L(end); - code.SwitchToFarCode(); - code.L(nan); - code.sub(rsp, 8); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); - const size_t stack_space = 3 * 16; - code.sub(rsp, stack_space + ABI_SHADOW_SPACE); - code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]); - code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]); - code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]); - code.movaps(xword[code.ABI_PARAM1], result); - code.movaps(xword[code.ABI_PARAM2], xmm_a); - code.movaps(xword[code.ABI_PARAM3], xmm_b); - code.CallFunction(static_cast&, const std::array&, const std::array&)>( - [](std::array& result, const std::array& a, const std::array& b) { - for (size_t i = 0; i < result.size(); ++i) { - if (auto r = Common::ProcessNaNs(a[i], b[i])) { - result[i] = *r; - } else if (Common::IsNaN(result[i])) { - result[i] = 0x7ff8'0000'0000'0000; - } - } - } - )); - code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]); - code.add(rsp, stack_space + ABI_SHADOW_SPACE); - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); - code.add(rsp, 8); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); + HandleNaNs(code, ctx, xmm_a, xmm_b, result, nan_mask); ctx.reg_alloc.DefineValue(inst, result); }