emit_x64_{vector_}floating_point: Add AVX512 implementation for DenormalsAreZero

Single- and double-precision floating-point numbers, in both the packed and unpacked (scalar) versions of this instruction, can all use the same memory constant. This takes advantage of the fact that `VFIXUPIMM*` does not just copy from the source: it converts the value to `0.0` if it is a denormal and the `MXCSR.DAZ` flag is set.

```
tsrc[31:0] ← ((src1[30:23] = 0) AND (MXCSR.DAZ = 1)) ? 0.0 : src1[31:0]
...
CASE(token_response[3:0]) {
    ...
    0001: dest[31:0] ← tsrc[31:0];  ; pass through src1 normal input value, denormal as zero
    ...
```
2022-06-18 00:02:59 -07:00 · 2022-06-18 00:02:59 -07:00 · 6367a26e62
commit 6367a26e62
parent 3ed2aebb20
2 changed files with 43 additions and 0 deletions
--- a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
@ -16,6 +16,7 @@
 #include <mcl/mp/typelist/lower_to_tuple.hpp>
 #include <mcl/stdint.hpp>
 #include <mcl/type_traits/integer_of_size.hpp>
+#include <xbyak/xbyak.h>

 #include "dynarmic/backend/x64/abi.h"
 #include "dynarmic/backend/x64/block_of_code.h"
@ -79,6 +80,27 @@ constexpr u64 f64_max_s64_lim = 0x43e0000000000000u;  // 2^63 as a double (actua
 template<size_t fsize>
 void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz) {
    if (ctx.FPCR().FZ()) {
+        if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+            constexpr u32 denormal_to_zero = FixupLUT(
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src);
+            constexpr u64 denormal_to_zero64 = mcl::bit::replicate_element<fsize, u64>(denormal_to_zero);
+
+            const Xbyak::Xmm tmp = xmm16;
+            FCODE(vmovap)(tmp, code.MConst(xword, u64(denormal_to_zero64), u64(denormal_to_zero64)));
+
+            for (const Xbyak::Xmm& xmm : to_daz) {
+                FCODE(vfixupimms)(xmm, xmm, tmp, u8(0));
+            }
+            return;
+        }
+
        for (const Xbyak::Xmm& xmm : to_daz) {
            code.movaps(xmm0, code.MConst(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
            code.andps(xmm0, xmm);
--- a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
@ -18,6 +18,7 @@
 #include <mcl/mp/typelist/lower_to_tuple.hpp>
 #include <mcl/type_traits/function_info.hpp>
 #include <mcl/type_traits/integer_of_size.hpp>
+#include <xbyak/xbyak.h>

 #include "dynarmic/backend/x64/abi.h"
 #include "dynarmic/backend/x64/block_of_code.h"
@ -223,6 +224,26 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) {
 template<size_t fsize>
 void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
    if (fpcr.FZ()) {
+        if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+            constexpr u32 denormal_to_zero = FixupLUT(
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src);
+            constexpr u64 denormal_to_zero64 = mcl::bit::replicate_element<fsize, u64>(denormal_to_zero);
+
+            FCODE(vmovap)(tmp, code.MConst(xword, u64(denormal_to_zero64), u64(denormal_to_zero64)));
+
+            for (const Xbyak::Xmm& xmm : to_daz) {
+                FCODE(vfixupimmp)(xmm, xmm, tmp, u8(0));
+            }
+            return;
+        }
+
        if (fpcr.RMode() != FP::RoundingMode::TowardsMinusInfinity) {
            code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
        } else {