emit_x64_{vector_}floating_point: Add AVX512 implementation for DenormalsAreZero
Both single- and double-precision floating-point numbers, as well as the packed and unpacked (scalar) versions of this instruction, will be able to use the same memory constant. This takes advantage of the fact that `VFIXUPIMM*` doesn't just copy from the source: it first converts the source value to `0.0` if it is a denormal and the `MXCSR.DAZ` flag is set.

```
tsrc[31:0]←((src1[30:23] = 0) AND (MXCSR.DAZ =1)) ? 0.0 : src1[31:0]
...
CASE(token_response[3:0]) {
    ...
    0001: dest[31:0]←tsrc[31:0]; ; pass through src1 normal input value, denormal as zero
    ...
```
This commit is contained in:
parent
3ed2aebb20
commit
6367a26e62
2 changed files with 43 additions and 0 deletions
|
@ -16,6 +16,7 @@
|
||||||
#include <mcl/mp/typelist/lower_to_tuple.hpp>
|
#include <mcl/mp/typelist/lower_to_tuple.hpp>
|
||||||
#include <mcl/stdint.hpp>
|
#include <mcl/stdint.hpp>
|
||||||
#include <mcl/type_traits/integer_of_size.hpp>
|
#include <mcl/type_traits/integer_of_size.hpp>
|
||||||
|
#include <xbyak/xbyak.h>
|
||||||
|
|
||||||
#include "dynarmic/backend/x64/abi.h"
|
#include "dynarmic/backend/x64/abi.h"
|
||||||
#include "dynarmic/backend/x64/block_of_code.h"
|
#include "dynarmic/backend/x64/block_of_code.h"
|
||||||
|
@ -79,6 +80,27 @@ constexpr u64 f64_max_s64_lim = 0x43e0000000000000u; // 2^63 as a double (actua
|
||||||
template<size_t fsize>
|
template<size_t fsize>
|
||||||
void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz) {
|
void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz) {
|
||||||
if (ctx.FPCR().FZ()) {
|
if (ctx.FPCR().FZ()) {
|
||||||
|
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
||||||
|
constexpr u32 denormal_to_zero = FixupLUT(
|
||||||
|
FpFixup::Norm_Src,
|
||||||
|
FpFixup::Norm_Src,
|
||||||
|
FpFixup::Norm_Src,
|
||||||
|
FpFixup::Norm_Src,
|
||||||
|
FpFixup::Norm_Src,
|
||||||
|
FpFixup::Norm_Src,
|
||||||
|
FpFixup::Norm_Src,
|
||||||
|
FpFixup::Norm_Src);
|
||||||
|
constexpr u64 denormal_to_zero64 = mcl::bit::replicate_element<fsize, u64>(denormal_to_zero);
|
||||||
|
|
||||||
|
const Xbyak::Xmm tmp = xmm16;
|
||||||
|
FCODE(vmovap)(tmp, code.MConst(xword, u64(denormal_to_zero64), u64(denormal_to_zero64)));
|
||||||
|
|
||||||
|
for (const Xbyak::Xmm& xmm : to_daz) {
|
||||||
|
FCODE(vfixupimms)(xmm, xmm, tmp, u8(0));
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
for (const Xbyak::Xmm& xmm : to_daz) {
|
for (const Xbyak::Xmm& xmm : to_daz) {
|
||||||
code.movaps(xmm0, code.MConst(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
|
code.movaps(xmm0, code.MConst(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
|
||||||
code.andps(xmm0, xmm);
|
code.andps(xmm0, xmm);
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
#include <mcl/mp/typelist/lower_to_tuple.hpp>
|
#include <mcl/mp/typelist/lower_to_tuple.hpp>
|
||||||
#include <mcl/type_traits/function_info.hpp>
|
#include <mcl/type_traits/function_info.hpp>
|
||||||
#include <mcl/type_traits/integer_of_size.hpp>
|
#include <mcl/type_traits/integer_of_size.hpp>
|
||||||
|
#include <xbyak/xbyak.h>
|
||||||
|
|
||||||
#include "dynarmic/backend/x64/abi.h"
|
#include "dynarmic/backend/x64/abi.h"
|
||||||
#include "dynarmic/backend/x64/block_of_code.h"
|
#include "dynarmic/backend/x64/block_of_code.h"
|
||||||
|
@ -223,6 +224,26 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) {
|
||||||
template<size_t fsize>
|
template<size_t fsize>
|
||||||
void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
|
void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
|
||||||
if (fpcr.FZ()) {
|
if (fpcr.FZ()) {
|
||||||
|
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
||||||
|
constexpr u32 denormal_to_zero = FixupLUT(
|
||||||
|
FpFixup::Norm_Src,
|
||||||
|
FpFixup::Norm_Src,
|
||||||
|
FpFixup::Norm_Src,
|
||||||
|
FpFixup::Norm_Src,
|
||||||
|
FpFixup::Norm_Src,
|
||||||
|
FpFixup::Norm_Src,
|
||||||
|
FpFixup::Norm_Src,
|
||||||
|
FpFixup::Norm_Src);
|
||||||
|
constexpr u64 denormal_to_zero64 = mcl::bit::replicate_element<fsize, u64>(denormal_to_zero);
|
||||||
|
|
||||||
|
FCODE(vmovap)(tmp, code.MConst(xword, u64(denormal_to_zero64), u64(denormal_to_zero64)));
|
||||||
|
|
||||||
|
for (const Xbyak::Xmm& xmm : to_daz) {
|
||||||
|
FCODE(vfixupimmp)(xmm, xmm, tmp, u8(0));
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (fpcr.RMode() != FP::RoundingMode::TowardsMinusInfinity) {
|
if (fpcr.RMode() != FP::RoundingMode::TowardsMinusInfinity) {
|
||||||
code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
|
code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
|
||||||
} else {
|
} else {
|
||||||
|
|
Loading…
Reference in a new issue