diff --git a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp index 1f067f28..a66920ad 100644 --- a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp +++ b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include "dynarmic/backend/x64/abi.h" #include "dynarmic/backend/x64/block_of_code.h" @@ -79,6 +80,27 @@ constexpr u64 f64_max_s64_lim = 0x43e0000000000000u; // 2^63 as a double (actua template void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list to_daz) { if (ctx.FPCR().FZ()) { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + constexpr u32 denormal_to_zero = FixupLUT( + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src); + constexpr u64 denormal_to_zero64 = mcl::bit::replicate_element(denormal_to_zero); + + const Xbyak::Xmm tmp = xmm16; + FCODE(vmovap)(tmp, code.MConst(xword, u64(denormal_to_zero64), u64(denormal_to_zero64))); + + for (const Xbyak::Xmm& xmm : to_daz) { + FCODE(vfixupimms)(xmm, xmm, tmp, u8(0)); + } + return; + } + for (const Xbyak::Xmm& xmm : to_daz) { code.movaps(xmm0, code.MConst(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask)); code.andps(xmm0, xmm); diff --git a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp index 8ce25004..416f3e1d 100644 --- a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp +++ b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include "dynarmic/backend/x64/abi.h" #include "dynarmic/backend/x64/block_of_code.h" @@ -223,6 +224,26 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) { template void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list to_daz, Xbyak::Xmm tmp) { if (fpcr.FZ()) { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + constexpr u32 denormal_to_zero = FixupLUT( + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src); + constexpr u64 denormal_to_zero64 = mcl::bit::replicate_element(denormal_to_zero); + + FCODE(vmovap)(tmp, code.MConst(xword, u64(denormal_to_zero64), u64(denormal_to_zero64))); + + for (const Xbyak::Xmm& xmm : to_daz) { + FCODE(vfixupimmp)(xmm, xmm, tmp, u8(0)); + } + return; + } + if (fpcr.RMode() != FP::RoundingMode::TowardsMinusInfinity) { code.movaps(tmp, GetNegativeZeroVector(code)); } else {