emit_x64_vector_floating_point: AVX512 implementation of EmitFPVectorToFixed

AVX512 introduces _unsigned_ variants of the truncating float-to-integer conversion instructions via `vcvttp{sd}2u{dq}q`. When a value is not representable as an unsigned integer, the instruction returns all ones (`0xFFFFF...`). Once negative values have been masked away, this provides "free" saturation for floating point values that exceed the unsigned range. https://www.felixcloutier.com/x86/vcvttps2udq https://www.felixcloutier.com/x86/vcvttpd2uqq This PR also speeds up the _signed_ conversion function for fp64->int64 by using `vcvttpd2qq`. https://www.felixcloutier.com/x86/vcvttpd2qq
2021-07-05 17:54:01 -07:00 · 2021-07-05 17:54:01 -07:00 · f33bd69ec2
commit f33bd69ec2
parent 048da372e9
1 changed files with 47 additions and 31 deletions
--- a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
@ -1715,8 +1715,6 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(2).GetU8());
    [[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1();

-    // TODO: AVX512 implementation
-
    if constexpr (fsize != 16) {
        if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@ -1744,6 +1742,9 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {

                    if constexpr (fsize == 32) {
                        code.cvttps2dq(src, src);
+                    } else {
+                        if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+                            code.vcvttpd2qq(src, src);
                        } else {
                            const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr();
                            const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr();
@ -1757,6 +1758,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
                            ctx.reg_alloc.Release(hi);
                            ctx.reg_alloc.Release(lo);
                        }
+                    }
                };

                if (fbits != 0) {
@ -1773,6 +1775,19 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
                [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000;

                if constexpr (unsigned_) {
+                    if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+                        // Mask positive values
+                        code.xorps(xmm0, xmm0);
+                        FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ);
+
+                        // Convert positive values to unsigned integers, write 0 anywhere else
+                        // vcvttp*2u*q already saturates out-of-range values to (0xFFFF...)
+                        if constexpr (fsize == 32) {
+                            code.vcvttps2udq(src | k1 | T_z, src);
+                        } else {
+                            code.vcvttpd2uqq(src | k1 | T_z, src);
+                        }
+                    } else {
                        // Zero is minimum
                        code.xorps(xmm0, xmm0);
                        FCODE(cmplep)(xmm0, src);
@ -1796,6 +1811,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {

                        // Saturate to max
                        FCODE(orp)(src, exceed_unsigned);
+                    }
                } else {
                    constexpr u64 integer_max = static_cast<FPT>(std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max());