emit_x64_{vector_}floating_point: Add AVX512 implementation for DenormalsAreZero

Both single- and double-precision floating-point numbers, as well as the
packed and scalar (unpacked) versions of this instruction, can use the
same memory constant. This takes advantage of the fact that `VFIXUPIMM*`
doesn't just copy from the source: it converts the source value to `0.0`
if that value is a denormal and the `MXCSR.DAZ` flag is set.

```
tsrc[31:0]←((src1[30:23] = 0) AND (MXCSR.DAZ =1)) ? 0.0 : src1[31:0]
...
CASE(token_response[3:0]) {
    ...
    0001: dest[31:0]←tsrc[31:0]; ; pass through src1 normal input value, denormal as zero
    ...
```
This commit is contained in:
Wunkolo 2022-06-18 00:02:59 -07:00 committed by merry
parent 3ed2aebb20
commit 6367a26e62
2 changed files with 43 additions and 0 deletions

View file

@ -16,6 +16,7 @@
#include <mcl/mp/typelist/lower_to_tuple.hpp> #include <mcl/mp/typelist/lower_to_tuple.hpp>
#include <mcl/stdint.hpp> #include <mcl/stdint.hpp>
#include <mcl/type_traits/integer_of_size.hpp> #include <mcl/type_traits/integer_of_size.hpp>
#include <xbyak/xbyak.h>
#include "dynarmic/backend/x64/abi.h" #include "dynarmic/backend/x64/abi.h"
#include "dynarmic/backend/x64/block_of_code.h" #include "dynarmic/backend/x64/block_of_code.h"
@ -79,6 +80,27 @@ constexpr u64 f64_max_s64_lim = 0x43e0000000000000u; // 2^63 as a double (actua
template<size_t fsize> template<size_t fsize>
void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz) { void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz) {
if (ctx.FPCR().FZ()) { if (ctx.FPCR().FZ()) {
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
constexpr u32 denormal_to_zero = FixupLUT(
FpFixup::Norm_Src,
FpFixup::Norm_Src,
FpFixup::Norm_Src,
FpFixup::Norm_Src,
FpFixup::Norm_Src,
FpFixup::Norm_Src,
FpFixup::Norm_Src,
FpFixup::Norm_Src);
constexpr u64 denormal_to_zero64 = mcl::bit::replicate_element<fsize, u64>(denormal_to_zero);
const Xbyak::Xmm tmp = xmm16;
FCODE(vmovap)(tmp, code.MConst(xword, u64(denormal_to_zero64), u64(denormal_to_zero64)));
for (const Xbyak::Xmm& xmm : to_daz) {
FCODE(vfixupimms)(xmm, xmm, tmp, u8(0));
}
return;
}
for (const Xbyak::Xmm& xmm : to_daz) { for (const Xbyak::Xmm& xmm : to_daz) {
code.movaps(xmm0, code.MConst(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask)); code.movaps(xmm0, code.MConst(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
code.andps(xmm0, xmm); code.andps(xmm0, xmm);

View file

@ -18,6 +18,7 @@
#include <mcl/mp/typelist/lower_to_tuple.hpp> #include <mcl/mp/typelist/lower_to_tuple.hpp>
#include <mcl/type_traits/function_info.hpp> #include <mcl/type_traits/function_info.hpp>
#include <mcl/type_traits/integer_of_size.hpp> #include <mcl/type_traits/integer_of_size.hpp>
#include <xbyak/xbyak.h>
#include "dynarmic/backend/x64/abi.h" #include "dynarmic/backend/x64/abi.h"
#include "dynarmic/backend/x64/block_of_code.h" #include "dynarmic/backend/x64/block_of_code.h"
@ -223,6 +224,26 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) {
template<size_t fsize> template<size_t fsize>
void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) { void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
if (fpcr.FZ()) { if (fpcr.FZ()) {
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
constexpr u32 denormal_to_zero = FixupLUT(
FpFixup::Norm_Src,
FpFixup::Norm_Src,
FpFixup::Norm_Src,
FpFixup::Norm_Src,
FpFixup::Norm_Src,
FpFixup::Norm_Src,
FpFixup::Norm_Src,
FpFixup::Norm_Src);
constexpr u64 denormal_to_zero64 = mcl::bit::replicate_element<fsize, u64>(denormal_to_zero);
FCODE(vmovap)(tmp, code.MConst(xword, u64(denormal_to_zero64), u64(denormal_to_zero64)));
for (const Xbyak::Xmm& xmm : to_daz) {
FCODE(vfixupimmp)(xmm, xmm, tmp, u8(0));
}
return;
}
if (fpcr.RMode() != FP::RoundingMode::TowardsMinusInfinity) { if (fpcr.RMode() != FP::RoundingMode::TowardsMinusInfinity) {
code.movaps(tmp, GetNegativeZeroVector<fsize>(code)); code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
} else { } else {