From 776208742b22558e16406a8afd32538f89e77112 Mon Sep 17 00:00:00 2001
From: Wunkolo <Wunkolo@gmail.com>
Date: Wed, 9 Jun 2021 09:44:55 -0700
Subject: [PATCH] emit_x64_{vector_}floating_point: Centralize implementation
 of FP{Vector}{Abs,Neg}

Removes dependency on the constants at the top of some files
such as `f16_negative_zero` and `f32_non_sign_mask` in favor
of the `FPInfo` trait-type.

Also removes bypass delays by selecting between instructions
such as `pand`, `andps`, or `andpd` depending on the type
and keeps them in their respective uop domain.

See https://www.agner.org/optimize/instruction_tables.pdf for
more info on bypass delays.
---
 .../backend/x64/emit_x64_floating_point.cpp   | 55 +++++++--------
 .../x64/emit_x64_vector_floating_point.cpp    | 70 ++++++++-----------
 2 files changed, 55 insertions(+), 70 deletions(-)
diff --git a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
index dfe834fa..5e740469 100644
--- a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
@@ -39,9 +39,6 @@ namespace {
 
 const Xbyak::Reg64 INVALID_REG = Xbyak::Reg64(-1);
 
-constexpr u64 f16_negative_zero = 0x8000;
-constexpr u64 f16_non_sign_mask = 0x7fff;
-
 constexpr u64 f32_negative_zero = 0x80000000u;
 constexpr u64 f32_nan = 0x7fc00000u;
 constexpr u64 f32_non_sign_mask = 0x7fffffffu;
@@ -328,58 +325,56 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn)
 
 }  // anonymous namespace
 
-void EmitX64::EmitFPAbs16(EmitContext& ctx, IR::Inst* inst) {
+template<size_t fsize>
+void FPAbs(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+    constexpr FPT non_sign_mask = FP::FPInfo<FPT>::sign_mask - FPT(1u);
+
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Address mask = code.MConst(xword, non_sign_mask);
 
-    code.pand(result, code.MConst(xword, f16_non_sign_mask));
+    code.andps(result, mask);
 
     ctx.reg_alloc.DefineValue(inst, result);
 }
 
+void EmitX64::EmitFPAbs16(EmitContext& ctx, IR::Inst* inst) {
+    FPAbs<16>(code, ctx, inst);
+}
+
 void EmitX64::EmitFPAbs32(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-
-    code.pand(result, code.MConst(xword, f32_non_sign_mask));
-
-    ctx.reg_alloc.DefineValue(inst, result);
+    FPAbs<32>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPAbs64(EmitContext& ctx, IR::Inst* inst) {
+    FPAbs<64>(code, ctx, inst);
+}
+
+template<size_t fsize>
+void FPNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+    constexpr FPT sign_mask = FP::FPInfo<FPT>::sign_mask;
+
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Address mask = code.MConst(xword, u64(sign_mask));
 
-    code.pand(result, code.MConst(xword, f64_non_sign_mask));
+    code.xorps(result, mask);
 
     ctx.reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitFPNeg16(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-
-    code.pxor(result, code.MConst(xword, f16_negative_zero));
-
-    ctx.reg_alloc.DefineValue(inst, result);
+    FPNeg<16>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPNeg32(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-
-    code.pxor(result, code.MConst(xword, f32_negative_zero));
-
-    ctx.reg_alloc.DefineValue(inst, result);
+    FPNeg<32>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPNeg64(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-
-    code.pxor(result, code.MConst(xword, f64_negative_zero));
-
-    ctx.reg_alloc.DefineValue(inst, result);
+    FPNeg<64>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPAdd32(EmitContext& ctx, IR::Inst* inst) {
diff --git a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
index 9df3e61f..aeba7e2b 100644
--- a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
@@ -557,37 +557,32 @@ void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lam
 
 }  // anonymous namespace
 
-void EmitX64::EmitFPVectorAbs16(EmitContext& ctx, IR::Inst* inst) {
+template<size_t fsize>
+void FPVectorAbs(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+    constexpr FPT non_sign_mask = FP::FPInfo<FPT>::sign_mask - FPT(1u);
+    constexpr u64 non_sign_mask64 = Common::Replicate<u64>(non_sign_mask, fsize);
+
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
     const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Address mask = code.MConst(xword, 0x7FFF7FFF7FFF7FFF, 0x7FFF7FFF7FFF7FFF);
-
-    code.pand(a, mask);
-
-    ctx.reg_alloc.DefineValue(inst, a);
-}
-
-void EmitX64::EmitFPVectorAbs32(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Address mask = code.MConst(xword, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF);
+    const Xbyak::Address mask = code.MConst(xword, non_sign_mask64, non_sign_mask64);
 
     code.andps(a, mask);
 
     ctx.reg_alloc.DefineValue(inst, a);
 }
 
+void EmitX64::EmitFPVectorAbs16(EmitContext& ctx, IR::Inst* inst) {
+    FPVectorAbs<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorAbs32(EmitContext& ctx, IR::Inst* inst) {
+    FPVectorAbs<32>(code, ctx, inst);
+}
+
 void EmitX64::EmitFPVectorAbs64(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Address mask = code.MConst(xword, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF);
-
-    code.andpd(a, mask);
-
-    ctx.reg_alloc.DefineValue(inst, a);
+    FPVectorAbs<64>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPVectorAdd32(EmitContext& ctx, IR::Inst* inst) {
@@ -1229,37 +1224,32 @@ void EmitX64::EmitFPVectorMulX64(EmitContext& ctx, IR::Inst* inst) {
     EmitFPVectorMulX<64>(code, ctx, inst);
 }
 
-void EmitX64::EmitFPVectorNeg16(EmitContext& ctx, IR::Inst* inst) {
+template<size_t fsize>
+void FPVectorNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+    constexpr FPT sign_mask = FP::FPInfo<FPT>::sign_mask;
+    constexpr u64 sign_mask64 = Common::Replicate<u64>(sign_mask, fsize);
+
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
     const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Address mask = code.MConst(xword, 0x8000800080008000, 0x8000800080008000);
+    const Xbyak::Address mask = code.MConst(xword, sign_mask64, sign_mask64);
 
-    code.pxor(a, mask);
+    code.xorps(a, mask);
 
     ctx.reg_alloc.DefineValue(inst, a);
 }
 
+void EmitX64::EmitFPVectorNeg16(EmitContext& ctx, IR::Inst* inst) {
+    FPVectorNeg<16>(code, ctx, inst);
+}
+
 void EmitX64::EmitFPVectorNeg32(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Address mask = code.MConst(xword, 0x8000000080000000, 0x8000000080000000);
-
-    code.pxor(a, mask);
-
-    ctx.reg_alloc.DefineValue(inst, a);
+    FPVectorNeg<32>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPVectorNeg64(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Address mask = code.MConst(xword, 0x8000000000000000, 0x8000000000000000);
-
-    code.pxor(a, mask);
-
-    ctx.reg_alloc.DefineValue(inst, a);
+    FPVectorNeg<64>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {