emit_x64_vector_floating_point: AVX512 implementation of EmitFPVectorToFixed

AVX512 introduces _unsigned_ float-to-integer conversion instructions,
`vcvttp{sd}2u{dq}q`. When a value is not representable as an unsigned
integer, the result is `0xFFFFF...`, which gives "free" saturation when
the floating-point value exceeds the unsigned range, once negative values
have been masked away.

https://www.felixcloutier.com/x86/vcvttps2udq
https://www.felixcloutier.com/x86/vcvttpd2uqq

This PR also speeds up the _signed_ fp64->int64 conversion by using `vcvttpd2qq`:
https://www.felixcloutier.com/x86/vcvttpd2qq
This commit is contained in:
Wunkolo 2021-07-05 17:54:01 -07:00 committed by merry
parent 048da372e9
commit f33bd69ec2

View file

@ -1715,8 +1715,6 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(2).GetU8()); const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(2).GetU8());
[[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1(); [[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1();
// TODO: AVX512 implementation
if constexpr (fsize != 16) { if constexpr (fsize != 16) {
if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) { if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@ -1744,6 +1742,9 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
if constexpr (fsize == 32) { if constexpr (fsize == 32) {
code.cvttps2dq(src, src); code.cvttps2dq(src, src);
} else {
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
code.vcvttpd2qq(src, src);
} else { } else {
const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr(); const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr(); const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr();
@ -1757,6 +1758,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.Release(hi); ctx.reg_alloc.Release(hi);
ctx.reg_alloc.Release(lo); ctx.reg_alloc.Release(lo);
} }
}
}; };
if (fbits != 0) { if (fbits != 0) {
@ -1773,6 +1775,19 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
[[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000; [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000;
if constexpr (unsigned_) { if constexpr (unsigned_) {
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
// Mask positive values
code.xorps(xmm0, xmm0);
FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ);
// Convert positive values to unsigned integers, write 0 anywhere else
// vcvttp*2u*q already saturates out-of-range values to (0xFFFF...)
if constexpr (fsize == 32) {
code.vcvttps2udq(src | k1 | T_z, src);
} else {
code.vcvttpd2uqq(src | k1 | T_z, src);
}
} else {
// Zero is minimum // Zero is minimum
code.xorps(xmm0, xmm0); code.xorps(xmm0, xmm0);
FCODE(cmplep)(xmm0, src); FCODE(cmplep)(xmm0, src);
@ -1796,6 +1811,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
// Saturate to max // Saturate to max
FCODE(orp)(src, exceed_unsigned); FCODE(orp)(src, exceed_unsigned);
}
} else { } else {
constexpr u64 integer_max = static_cast<FPT>(std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max()); constexpr u64 integer_max = static_cast<FPT>(std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max());