emit_x64_vector_floating_point: AVX512 implementation of EmitFPVectorToFixed
AVX512 introduces _unsigned_ variants of the truncating float-to-integer conversion instructions: `vcvttps2udq` and `vcvttpd2uqq` (i.e. `vcvttp{s,d}2u{d,q}q`). When a value is not representable as an unsigned integer, these instructions produce `0xFFFFF...`, which gives "free" saturation whenever the floating-point value exceeds the unsigned range, once negative values have been masked away. References: https://www.felixcloutier.com/x86/vcvttps2udq and https://www.felixcloutier.com/x86/vcvttpd2uqq. This PR also speeds up the _signed_ fp64->int64 conversion by using `vcvttpd2qq`: https://www.felixcloutier.com/x86/vcvttpd2qq
This commit is contained in:
parent
048da372e9
commit
f33bd69ec2
1 changed files with 47 additions and 31 deletions
|
@ -1715,8 +1715,6 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(2).GetU8());
|
const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(2).GetU8());
|
||||||
[[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1();
|
[[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1();
|
||||||
|
|
||||||
// TODO: AVX512 implementation
|
|
||||||
|
|
||||||
if constexpr (fsize != 16) {
|
if constexpr (fsize != 16) {
|
||||||
if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
|
if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
@ -1745,17 +1743,21 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
if constexpr (fsize == 32) {
|
if constexpr (fsize == 32) {
|
||||||
code.cvttps2dq(src, src);
|
code.cvttps2dq(src, src);
|
||||||
} else {
|
} else {
|
||||||
const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr();
|
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
||||||
const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr();
|
code.vcvttpd2qq(src, src);
|
||||||
|
} else {
|
||||||
|
const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr();
|
||||||
|
const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr();
|
||||||
|
|
||||||
code.cvttsd2si(lo, src);
|
code.cvttsd2si(lo, src);
|
||||||
code.punpckhqdq(src, src);
|
code.punpckhqdq(src, src);
|
||||||
code.cvttsd2si(hi, src);
|
code.cvttsd2si(hi, src);
|
||||||
code.movq(src, lo);
|
code.movq(src, lo);
|
||||||
code.pinsrq(src, hi, 1);
|
code.pinsrq(src, hi, 1);
|
||||||
|
|
||||||
ctx.reg_alloc.Release(hi);
|
ctx.reg_alloc.Release(hi);
|
||||||
ctx.reg_alloc.Release(lo);
|
ctx.reg_alloc.Release(lo);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1773,29 +1775,43 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
[[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000;
|
[[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000;
|
||||||
|
|
||||||
if constexpr (unsigned_) {
|
if constexpr (unsigned_) {
|
||||||
// Zero is minimum
|
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
||||||
code.xorps(xmm0, xmm0);
|
// Mask positive values
|
||||||
FCODE(cmplep)(xmm0, src);
|
code.xorps(xmm0, xmm0);
|
||||||
FCODE(andp)(src, xmm0);
|
FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ);
|
||||||
|
|
||||||
// Will we exceed unsigned range?
|
// Convert positive values to unsigned integers, write 0 anywhere else
|
||||||
const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm();
|
// vcvttp*2u*q already saturates out-of-range values to (0xFFFF...)
|
||||||
code.movaps(exceed_unsigned, GetVectorOf<fsize, float_upper_limit_unsigned>(code));
|
if constexpr (fsize == 32) {
|
||||||
FCODE(cmplep)(exceed_unsigned, src);
|
code.vcvttps2udq(src | k1 | T_z, src);
|
||||||
|
} else {
|
||||||
|
code.vcvttpd2uqq(src | k1 | T_z, src);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Zero is minimum
|
||||||
|
code.xorps(xmm0, xmm0);
|
||||||
|
FCODE(cmplep)(xmm0, src);
|
||||||
|
FCODE(andp)(src, xmm0);
|
||||||
|
|
||||||
// Will be exceed signed range?
|
// Will we exceed unsigned range?
|
||||||
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movaps(tmp, GetVectorOf<fsize, float_upper_limit_signed>(code));
|
code.movaps(exceed_unsigned, GetVectorOf<fsize, float_upper_limit_unsigned>(code));
|
||||||
code.movaps(xmm0, tmp);
|
FCODE(cmplep)(exceed_unsigned, src);
|
||||||
FCODE(cmplep)(xmm0, src);
|
|
||||||
FCODE(andp)(tmp, xmm0);
|
|
||||||
FCODE(subp)(src, tmp);
|
|
||||||
perform_conversion(src);
|
|
||||||
ICODE(psll)(xmm0, static_cast<u8>(fsize - 1));
|
|
||||||
FCODE(orp)(src, xmm0);
|
|
||||||
|
|
||||||
// Saturate to max
|
// Will be exceed signed range?
|
||||||
FCODE(orp)(src, exceed_unsigned);
|
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
|
code.movaps(tmp, GetVectorOf<fsize, float_upper_limit_signed>(code));
|
||||||
|
code.movaps(xmm0, tmp);
|
||||||
|
FCODE(cmplep)(xmm0, src);
|
||||||
|
FCODE(andp)(tmp, xmm0);
|
||||||
|
FCODE(subp)(src, tmp);
|
||||||
|
perform_conversion(src);
|
||||||
|
ICODE(psll)(xmm0, static_cast<u8>(fsize - 1));
|
||||||
|
FCODE(orp)(src, xmm0);
|
||||||
|
|
||||||
|
// Saturate to max
|
||||||
|
FCODE(orp)(src, exceed_unsigned);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
constexpr u64 integer_max = static_cast<FPT>(std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max());
|
constexpr u64 integer_max = static_cast<FPT>(std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max());
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue