emit_x64_vector_floating_point: AVX512 implementation of EmitFPVectorToFixed
AVX512 introduces _unsigned_ variants of the float-to-integer conversion instructions, `vcvttps2udq` and `vcvttpd2uqq` (written collectively as `vcvttp{sd}2u{dq}q`). When a source value is not representable as an unsigned integer, the instruction writes `0xFFFFF...` (all bits set) to that element; once negative values have been masked away, this provides "free" saturation for floating-point values that exceed the unsigned range. References: https://www.felixcloutier.com/x86/vcvttps2udq and https://www.felixcloutier.com/x86/vcvttpd2uqq . This PR also speeds up the _signed_ fp64->int64 conversion by using `vcvttpd2qq`: https://www.felixcloutier.com/x86/vcvttpd2qq
This commit is contained in:
parent
048da372e9
commit
f33bd69ec2
1 changed file with 47 additions and 31 deletions
|
@ -1715,8 +1715,6 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(2).GetU8());
|
const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(2).GetU8());
|
||||||
[[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1();
|
[[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1();
|
||||||
|
|
||||||
// TODO: AVX512 implementation
|
|
||||||
|
|
||||||
if constexpr (fsize != 16) {
|
if constexpr (fsize != 16) {
|
||||||
if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
|
if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
@ -1744,6 +1742,9 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
if constexpr (fsize == 32) {
|
if constexpr (fsize == 32) {
|
||||||
code.cvttps2dq(src, src);
|
code.cvttps2dq(src, src);
|
||||||
|
} else {
|
||||||
|
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
||||||
|
code.vcvttpd2qq(src, src);
|
||||||
} else {
|
} else {
|
||||||
const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr();
|
const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr();
|
||||||
const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr();
|
const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr();
|
||||||
|
@ -1757,6 +1758,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
ctx.reg_alloc.Release(hi);
|
ctx.reg_alloc.Release(hi);
|
||||||
ctx.reg_alloc.Release(lo);
|
ctx.reg_alloc.Release(lo);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
|
@ -1773,6 +1775,19 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
[[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000;
|
[[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000;
|
||||||
|
|
||||||
if constexpr (unsigned_) {
|
if constexpr (unsigned_) {
|
||||||
|
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
||||||
|
// Mask positive values
|
||||||
|
code.xorps(xmm0, xmm0);
|
||||||
|
FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ);
|
||||||
|
|
||||||
|
// Convert positive values to unsigned integers, write 0 anywhere else
|
||||||
|
// vcvttp*2u*q already saturates out-of-range values to (0xFFFF...)
|
||||||
|
if constexpr (fsize == 32) {
|
||||||
|
code.vcvttps2udq(src | k1 | T_z, src);
|
||||||
|
} else {
|
||||||
|
code.vcvttpd2uqq(src | k1 | T_z, src);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
// Zero is minimum
|
// Zero is minimum
|
||||||
code.xorps(xmm0, xmm0);
|
code.xorps(xmm0, xmm0);
|
||||||
FCODE(cmplep)(xmm0, src);
|
FCODE(cmplep)(xmm0, src);
|
||||||
|
@ -1796,6 +1811,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
// Saturate to max
|
// Saturate to max
|
||||||
FCODE(orp)(src, exceed_unsigned);
|
FCODE(orp)(src, exceed_unsigned);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
constexpr u64 integer_max = static_cast<FPT>(std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max());
|
constexpr u64 integer_max = static_cast<FPT>(std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max());
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue