emit_x64_{vector_}floating_point: Add AVX512 implementation for ForceToDefaultNaN

`vfpclassp* k, xmm, i8` has better latency(4->3) and allocates better
execution ports(01->5) that are out of the way of ALU-ports than
`vcmpunordp* xmm, xmm, xmm`(`vcmpp* xmm, xmm, xmm, i8`) and removes the
pipeline dependency on `xmm0` in favor AVX512 `k`-mask registers.

`vblendmp* xmm, k, xmm, mem` is about the same throughput and latency as
`blendvp* xmm. mem` but has the benefit of embedded broadcasts to reduce
memory bandwidth(32/64-bit read rather than 128-bit) and lends itself to
a future size optimization feature of `constant_pool`.
This commit is contained in:
Wunkolo 2022-06-18 01:12:36 -07:00 committed by merry
parent 6367a26e62
commit 4d78d167d6
2 changed files with 12 additions and 3 deletions

View file

@ -136,7 +136,11 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch)
template<size_t fsize>
void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) {
if (code.HasHostFeature(HostFeature::AVX)) {
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
const Xbyak::Opmask nan_mask = k1;
FCODE(vfpclasss)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN));
FCODE(vblendmp)(result | nan_mask, result, code.MConst(ptr_b, fsize == 32 ? f32_nan : f64_nan));
} else if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vcmpunords)(xmm0, result, result);
FCODE(blendvp)(result, code.MConst(xword, fsize == 32 ? f32_nan : f64_nan));
} else {

View file

@ -190,11 +190,16 @@ Xbyak::Address GetVectorOf(BlockOfCode& code) {
template<size_t fsize>
void ForceToDefaultNaN(BlockOfCode& code, FP::FPCR fpcr, Xbyak::Xmm result) {
if (fpcr.DN()) {
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
const Xbyak::Opmask nan_mask = k1;
FCODE(vfpclassp)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN));
FCODE(vblendmp)(result | nan_mask, result, GetNaNVector<fsize>(code));
} else if (code.HasHostFeature(HostFeature::AVX)) {
const Xbyak::Xmm nan_mask = xmm0;
if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vcmpunordp)(nan_mask, result, result);
FCODE(blendvp)(result, GetNaNVector<fsize>(code));
} else {
const Xbyak::Xmm nan_mask = xmm0;
code.movaps(nan_mask, result);
FCODE(cmpordp)(nan_mask, nan_mask);
code.andps(result, nan_mask);