diff --git a/src/backend_x64/emit_x64_vector_floating_point.cpp b/src/backend_x64/emit_x64_vector_floating_point.cpp
index 7a1c1d03..d140991f 100644
--- a/src/backend_x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend_x64/emit_x64_vector_floating_point.cpp
@@ -34,8 +34,10 @@ namespace Dynarmic::BackendX64 {
 using namespace Xbyak::util;
 namespace mp = Common::mp;
 
+namespace {
+
 template<size_t fsize, typename T>
-static T ChooseOnFsize([[maybe_unused]] T f32, [[maybe_unused]] T f64) {
+T ChooseOnFsize([[maybe_unused]] T f32, [[maybe_unused]] T f64) {
     static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
 
     if constexpr (fsize == 32) {
@@ -78,7 +80,7 @@ private:
 };
 
 template<size_t fsize, size_t nargs, typename NaNHandler>
-static void HandleNaNs(BlockOfCode& code, EmitContext& ctx, std::array<Xbyak::Xmm, nargs + 1> xmms, const Xbyak::Xmm& nan_mask, NaNHandler nan_handler) {
+void HandleNaNs(BlockOfCode& code, EmitContext& ctx, std::array<Xbyak::Xmm, nargs + 1> xmms, const Xbyak::Xmm& nan_mask, NaNHandler nan_handler) {
     static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
 
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
@@ -120,8 +122,27 @@ static void HandleNaNs(BlockOfCode& code, EmitContext& ctx, std::array
 }
 
+template<size_t fsize>
+void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
+    if (ctx.FPSCR_DN()) {
+        const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+        code.pcmpeqw(tmp, tmp);
+        code.movaps(nan_mask, result);
+        FCODE(cmpordp)(nan_mask, nan_mask);
+        code.andps(result, nan_mask);
+        code.xorps(nan_mask, tmp);
+        code.andps(nan_mask, fsize == 32 ? code.MConst(xword, 0x7fc0'0000'7fc0'0000, 0x7fc0'0000'7fc0'0000) : code.MConst(xword, 0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000));
+        code.orps(result, nan_mask);
+    }
+}
+
 template<typename T>
 struct DefaultIndexer {
+    std::tuple<T> operator()(size_t i, const VectorArray<T>& a) {
+        return std::make_tuple(a[i]);
+    }
+
     std::tuple<T, T> operator()(size_t i, const VectorArray<T>& a, const VectorArray<T>& b) {
         return std::make_tuple(a[i], b[i]);
     }
@@ -173,7 +194,48 @@ struct PairedLowerIndexer {
 };
 
 template<size_t fsize, template<typename> class Indexer, typename Function>
-static void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler::function_type nan_handler = NaNHandler::GetDefault()) {
+void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler::function_type nan_handler = NaNHandler::GetDefault()) {
+    static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
+
+    if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+        if constexpr (std::is_member_function_pointer_v<Function>) {
+            (code.*fn)(xmm_a);
+        } else {
+            fn(xmm_a);
+        }
+
+        ForceToDefaultNaN<fsize>(code, ctx, xmm_a);
+
+        ctx.reg_alloc.DefineValue(inst, xmm_a);
+        return;
+    }
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
+
+    code.movaps(nan_mask, xmm_a);
+    code.movaps(result, xmm_a);
+    FCODE(cmpunordp)(nan_mask, nan_mask);
+    if constexpr (std::is_member_function_pointer_v<Function>) {
+        (code.*fn)(result);
+    } else {
+        fn(result);
+    }
+    FCODE(cmpunordp)(nan_mask, result);
+
+    HandleNaNs(code, ctx, {result, xmm_a}, nan_mask, nan_handler);
+
+    ctx.reg_alloc.DefineValue(inst, result);
+}
+
+template<size_t fsize, template<typename> class Indexer, typename Function>
+void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler::function_type nan_handler = NaNHandler::GetDefault()) {
     static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
 
     if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
@@ -187,17 +249,7 @@ static void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::
             fn(xmm_a, xmm_b);
         }
 
-        if (ctx.FPSCR_DN()) {
-            const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
-            const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
-            code.pcmpeqw(tmp, tmp);
-            code.movaps(nan_mask, xmm_a);
-            FCODE(cmpordp)(nan_mask, nan_mask);
-            code.andps(xmm_a, nan_mask);
-            code.xorps(nan_mask, tmp);
-            code.andps(nan_mask, fsize == 32 ? code.MConst(xword, 0x7fc0'0000'7fc0'0000, 0x7fc0'0000'7fc0'0000) : code.MConst(xword, 0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000));
-            code.orps(xmm_a, nan_mask);
-        }
+        ForceToDefaultNaN<fsize>(code, ctx, xmm_a);
 
         ctx.reg_alloc.DefineValue(inst, xmm_a);
         return;
@@ -226,7 +278,7 @@ static void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::
 }
 
 template<size_t fsize, template<typename> class Indexer, typename Function>
-static void EmitFourOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler::function_type nan_handler = NaNHandler::GetDefault()) {
+void EmitFourOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler::function_type nan_handler = NaNHandler::GetDefault()) {
     static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
 
     if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
@@ -241,17 +293,7 @@ static void EmitFourOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::I
             fn(xmm_a, xmm_b, xmm_c);
         }
 
-        if (ctx.FPSCR_DN()) {
-            const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
-            const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
-            code.pcmpeqw(tmp, tmp);
-            code.movaps(nan_mask, xmm_a);
-            FCODE(cmpordp)(nan_mask, nan_mask);
-            code.andps(xmm_a, nan_mask);
-            code.xorps(nan_mask, tmp);
-            code.andps(nan_mask, fsize == 32 ? code.MConst(xword, 0x7fc0'0000'7fc0'0000, 0x7fc0'0000'7fc0'0000) : code.MConst(xword, 0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000));
-            code.orps(xmm_a, nan_mask);
-        }
+        ForceToDefaultNaN<fsize>(code, ctx, xmm_a);
 
         ctx.reg_alloc.DefineValue(inst, xmm_a);
         return;
@@ -282,7 +324,7 @@ static void EmitFourOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::I
 }
 
 template<typename Lambda>
-inline void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
     const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
 
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -307,7 +349,7 @@ inline void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
 }
 
 template<typename Lambda>
-inline void EmitThreeOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+void EmitThreeOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
     const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
 
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -351,7 +393,7 @@ inline void EmitThreeOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
 }
 
 template<typename Lambda>
-inline void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
     const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
 
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -398,6 +440,8 @@ inline void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
     ctx.reg_alloc.DefineValue(inst, xmm0);
 }
 
+} // anonymous namespace
+
 void EmitX64::EmitFPVectorAbs16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -736,6 +780,34 @@ void EmitFPVectorRoundInt(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
     const bool exact = inst->GetArg(2).GetU1();
 
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) {
+        const u8 round_imm = [&]() -> u8 {
+            switch (rounding) {
+            case FP::RoundingMode::ToNearest_TieEven:
+                return 0b00;
+            case FP::RoundingMode::TowardsPlusInfinity:
+                return 0b10;
+            case FP::RoundingMode::TowardsMinusInfinity:
+                return 0b01;
+            case FP::RoundingMode::TowardsZero:
+                return 0b11;
+            default:
+                UNREACHABLE();
+            }
+            return 0;
+        }();
+
+        EmitTwoOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result){
+            if constexpr (fsize == 32) {
+                code.roundps(result, result, round_imm);
+            } else {
+                code.roundpd(result, result, round_imm);
+            }
+        });
+
+        return;
+    }
+
     using rounding_list = mp::list<
         std::integral_constant<FP::RoundingMode, FP::RoundingMode::ToNearest_TieEven>,
         std::integral_constant<FP::RoundingMode, FP::RoundingMode::TowardsPlusInfinity>,
diff --git a/src/common/fp/util.h b/src/common/fp/util.h
index e3a4bced..fafe63c4 100644
--- a/src/common/fp/util.h
+++ b/src/common/fp/util.h
@@ -36,6 +36,17 @@ constexpr bool IsNaN(u32 value) {
     return IsQNaN(value) || IsSNaN(value);
 }
 
+/// Given a single argument, return the NaN value which would be returned by an ARM processor.
+/// If the argument isn't a NaN, returns boost::none.
+inline boost::optional<u32> ProcessNaNs(u32 a) {
+    if (IsSNaN(a)) {
+        return a | 0x00400000;
+    } else if (IsQNaN(a)) {
+        return a;
+    }
+    return boost::none;
+}
+
 /// Given a pair of arguments, return the NaN value which would be returned by an ARM processor.
 /// If neither argument is a NaN, returns boost::none.
 inline boost::optional<u32> ProcessNaNs(u32 a, u32 b) {
@@ -96,6 +107,17 @@ constexpr bool IsNaN(u64 value) {
     return IsQNaN(value) || IsSNaN(value);
 }
 
+/// Given a single argument, return the NaN value which would be returned by an ARM processor.
+/// If the argument isn't a NaN, returns boost::none.
+inline boost::optional<u64> ProcessNaNs(u64 a) {
+    if (IsSNaN(a)) {
+        return a | 0x0008'0000'0000'0000;
+    } else if (IsQNaN(a)) {
+        return a;
+    }
+    return boost::none;
+}
+
 /// Given a pair of arguments, return the NaN value which would be returned by an ARM processor.
 /// If neither argument is a NaN, returns boost::none.
 inline boost::optional<u64> ProcessNaNs(u64 a, u64 b) {
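Note (not part of the patch): below is a minimal standalone sketch of the scalar semantics the new helpers encode, written against plain C++17 with `std::optional` standing in for `boost::optional`. The names `IsSNaN32`, `IsQNaN32`, `kQuietBit32`, `kDefaultNaN32`, `ProcessNaN32`, and `ForceToDefaultNaN32` are illustrative only and do not exist in the repository. The idea: a single-precision signalling NaN is quietened by setting mantissa bit 22 (`0x00400000`; bit 51, `0x0008'0000'0000'0000`, for doubles), and when FPSCR.DN is set every NaN result collapses to the default NaN (`0x7fc00000` / `0x7ff8'0000'0000'0000`, the per-lane pattern packed into the `MConst` values that `ForceToDefaultNaN` ANDs into the NaN lanes).

```cpp
// Standalone sketch, assuming C++17; mirrors the intent of ProcessNaNs(u32)
// and the FPSCR.DN path of ForceToDefaultNaN<32> on a single lane.
#include <cassert>
#include <cstdint>
#include <optional>

constexpr std::uint32_t kQuietBit32 = 0x00400000;    // top mantissa bit of a binary32
constexpr std::uint32_t kDefaultNaN32 = 0x7fc00000;  // value forced when FPSCR.DN is set

constexpr bool IsSNaN32(std::uint32_t v) {
    return (v & 0x7f800000) == 0x7f800000 && (v & 0x007fffff) != 0 && (v & kQuietBit32) == 0;
}
constexpr bool IsQNaN32(std::uint32_t v) {
    return (v & 0x7fc00000) == 0x7fc00000;
}

// Analogue of the new single-argument ProcessNaNs(u32): quieten an SNaN,
// pass a QNaN through, and report "not a NaN" otherwise.
std::optional<std::uint32_t> ProcessNaN32(std::uint32_t a) {
    if (IsSNaN32(a)) {
        return a | kQuietBit32;
    }
    if (IsQNaN32(a)) {
        return a;
    }
    return std::nullopt;
}

// Analogue of what ForceToDefaultNaN<32> does per lane when FPSCR.DN is set:
// any NaN result is replaced with the default NaN, non-NaNs are left alone.
std::uint32_t ForceToDefaultNaN32(std::uint32_t result) {
    return (IsSNaN32(result) || IsQNaN32(result)) ? kDefaultNaN32 : result;
}

int main() {
    assert(*ProcessNaN32(0x7f800001) == 0x7fc00001);  // SNaN -> quietened
    assert(*ProcessNaN32(0x7fc12345) == 0x7fc12345);  // QNaN passes through
    assert(!ProcessNaN32(0x3f800000));                // 1.0f is not a NaN
    assert(ForceToDefaultNaN32(0x7fc12345) == kDefaultNaN32);
    assert(ForceToDefaultNaN32(0x3f800000) == 0x3f800000);
}
```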