IR: Implement FPRecipStepFused, FPVectorRecipStepFused

parent f66f61d8ab
commit 901bd9b4e2

10 changed files with 135 additions and 0 deletions
@@ -27,6 +27,8 @@ add_library(dynarmic
    common/fp/op/FPMulAdd.h
    common/fp/op/FPRecipEstimate.cpp
    common/fp/op/FPRecipEstimate.h
    common/fp/op/FPRecipStepFused.cpp
    common/fp/op/FPRecipStepFused.h
    common/fp/op/FPRoundInt.cpp
    common/fp/op/FPRoundInt.h
    common/fp/op/FPRSqrtEstimate.cpp
@@ -830,6 +830,23 @@ void EmitX64::EmitFPRecipEstimate64(EmitContext& ctx, IR::Inst* inst) {
    EmitFPRecipEstimate<u64>(code, ctx, inst);
}

template<typename FPT>
static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.HostCall(inst, args[0], args[1]);
    code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR());
    code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
    code.CallFunction(&FP::FPRecipStepFused<FPT>);
}

void EmitX64::EmitFPRecipStepFused32(EmitContext& ctx, IR::Inst* inst) {
    EmitFPRecipStepFused<u32>(code, ctx, inst);
}

void EmitX64::EmitFPRecipStepFused64(EmitContext& ctx, IR::Inst* inst) {
    EmitFPRecipStepFused<u64>(code, ctx, inst);
}

static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, size_t fsize) {
    const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
    const bool exact = inst->GetArg(2).GetU1();
@@ -631,6 +631,23 @@ void EmitX64::EmitFPVectorRecipEstimate64(EmitContext& ctx, IR::Inst* inst) {
    EmitRecipEstimate<u64>(code, ctx, inst);
}

template<typename FPT>
static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    EmitThreeOpFallback(code, ctx, inst, [](VectorArray<FPT>& result, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) {
        for (size_t i = 0; i < result.size(); i++) {
            result[i] = FP::FPRecipStepFused<FPT>(op1[i], op2[i], fpcr, fpsr);
        }
    });
}

void EmitX64::EmitFPVectorRecipStepFused32(EmitContext& ctx, IR::Inst* inst) {
    EmitRecipStepFused<u32>(code, ctx, inst);
}

void EmitX64::EmitFPVectorRecipStepFused64(EmitContext& ctx, IR::Inst* inst) {
    EmitRecipStepFused<u64>(code, ctx, inst);
}

template<typename FPT>
static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    EmitTwoOpFallback(code, ctx, inst, [](VectorArray<FPT>& result, const VectorArray<FPT>& operand, FP::FPCR fpcr, FP::FPSR& fpsr) {
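Like the scalar path, the vector opcodes take a fallback route: EmitThreeOpFallback invokes the scalar soft-float routine once per lane and writes each result back into the destination vector. A self-contained sketch of that lane-wise shape in plain C++ (per_lane is a hypothetical helper, not dynarmic's EmitThreeOpFallback, and plain doubles stand in for the soft-float pipeline):

#include <array>
#include <cstddef>
#include <cstdio>

// Apply a scalar binary op to each lane of two fixed-size "vectors".
template<typename T, std::size_t N, typename Fn>
std::array<T, N> per_lane(const std::array<T, N>& a, const std::array<T, N>& b, Fn op) {
    std::array<T, N> result{};
    for (std::size_t i = 0; i < N; ++i) {
        result[i] = op(a[i], b[i]);
    }
    return result;
}

int main() {
    const std::array<double, 2> op1{3.0, 5.0};
    const std::array<double, 2> op2{0.25, 0.125};
    // 2.0 - op1*op2 per lane: the arithmetic FPRecipStepFused performs outside its special cases.
    const auto r = per_lane(op1, op2, [](double a, double b) { return 2.0 - a * b; });
    std::printf("%g %g\n", r[0], r[1]);   // prints: 1.25 1.375
}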
@@ -34,6 +34,7 @@ struct FPInfo<u32> {
    static constexpr u32 MaxNormal(bool sign) { return (exponent_mask - 1) | Zero(sign); }
    static constexpr u32 DefaultNaN() { return exponent_mask | (u32(1) << (explicit_mantissa_width - 1)); }
    static constexpr u32 OnePointFive(bool sign) { return Zero(sign) | (u32(1) << (explicit_mantissa_width - 1)) | (u32(exponent_bias) << explicit_mantissa_width); }
    static constexpr u32 Two(bool sign) { return Zero(sign) | (u32(exponent_bias + 1) << explicit_mantissa_width); }
};

template<>
@@ -57,6 +58,7 @@ struct FPInfo<u64> {
    static constexpr u64 MaxNormal(bool sign) { return (exponent_mask - 1) | Zero(sign); }
    static constexpr u64 DefaultNaN() { return exponent_mask | (u64(1) << (explicit_mantissa_width - 1)); }
    static constexpr u64 OnePointFive(bool sign) { return Zero(sign) | (u64(1) << (explicit_mantissa_width - 1)) | (u64(exponent_bias) << explicit_mantissa_width); }
    static constexpr u64 Two(bool sign) { return Zero(sign) | (u64(exponent_bias + 1) << explicit_mantissa_width); }
};

} // namespace Dynarmic::FP
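The helper constants here encode ±2.0 (Two) and ±1.5 (OnePointFive); FPRecipStepFused below returns Two(false) for the infinity-times-zero case. A minimal standalone check of the binary32 bit patterns these formulas produce, assuming the standard IEEE-754 single-precision parameters exponent_bias = 127 and explicit_mantissa_width = 23:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    // Same construction as FPInfo<u32>::Two(false) and ::OnePointFive(false).
    const std::uint32_t two            = (127u + 1u) << 23;          // 0x40000000
    const std::uint32_t one_point_five = (1u << 22) | (127u << 23);  // 0x3FC00000
    float f2, f15;
    std::memcpy(&f2, &two, sizeof(f2));
    std::memcpy(&f15, &one_point_five, sizeof(f15));
    std::printf("%08X -> %g, %08X -> %g\n", two, f2, one_point_five, f15);  // 2 and 1.5
}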
@@ -8,6 +8,7 @@

#include "common/fp/op/FPMulAdd.h"
#include "common/fp/op/FPRecipEstimate.h"
#include "common/fp/op/FPRecipStepFused.h"
#include "common/fp/op/FPRoundInt.h"
#include "common/fp/op/FPRSqrtEstimate.h"
#include "common/fp/op/FPRSqrtStepFused.h"
src/common/fp/op/FPRecipStepFused.cpp (new file, 55 lines)
@@ -0,0 +1,55 @@
/* This file is part of the dynarmic project.
 * Copyright (c) 2018 MerryMage
 * This software may be used and distributed according to the terms of the GNU
 * General Public License version 2 or any later version.
 */

#include "common/fp/fpcr.h"
#include "common/fp/fpsr.h"
#include "common/fp/fused.h"
#include "common/fp/info.h"
#include "common/fp/op/FPNeg.h"
#include "common/fp/op/FPRecipStepFused.h"
#include "common/fp/process_exception.h"
#include "common/fp/process_nan.h"
#include "common/fp/unpacked.h"

namespace Dynarmic::FP {

template<typename FPT>
FPT FPRecipStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
    op1 = FPNeg(op1);

    const auto [type1, sign1, value1] = FPUnpack<FPT>(op1, fpcr, fpsr);
    const auto [type2, sign2, value2] = FPUnpack<FPT>(op2, fpcr, fpsr);

    if (const auto maybe_nan = FPProcessNaNs(type1, type2, op1, op2, fpcr, fpsr)) {
        return *maybe_nan;
    }

    const bool inf1 = type1 == FPType::Infinity;
    const bool inf2 = type2 == FPType::Infinity;
    const bool zero1 = type1 == FPType::Zero;
    const bool zero2 = type2 == FPType::Zero;

    if ((inf1 && zero2) || (zero1 && inf2)) {
        return FPInfo<FPT>::Two(false);
    }

    if (inf1 || inf2) {
        return FPInfo<FPT>::Infinity(sign1 != sign2);
    }

    // result_value = 2.0 + (value1 * value2)
    FPUnpacked result_value = FusedMulAdd(ToNormalized(false, 0, 2), value1, value2);

    if (result_value.mantissa == 0) {
        return FPInfo<FPT>::Zero(fpcr.RMode() == RoundingMode::TowardsMinusInfinity);
    }
    return FPRound<FPT>(result_value, fpcr, fpsr);
}

template u32 FPRecipStepFused<u32>(u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr);
template u64 FPRecipStepFused<u64>(u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr);

} // namespace Dynarmic::FP
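Outside the NaN, infinity and zero special cases, this routine computes 2.0 - op1*op2. That is the FRECPS-style reciprocal step: guest code typically uses it to refine an FPRecipEstimate result with Newton-Raphson, x(n+1) = x(n) * (2 - d*x(n)). A plain-double sketch of that refinement loop (illustrative only; the real routine operates on raw bit patterns through the soft-float pipeline above):

#include <cstdio>

// Mirrors the core arithmetic of FPRecipStepFused: 2.0 + (-op1) * op2.
static double recip_step(double op1, double op2) {
    return 2.0 - op1 * op2;
}

int main() {
    const double d = 3.0;   // value whose reciprocal is wanted
    double x = 0.3;         // coarse estimate, as FPRecipEstimate/FRECPE would supply
    for (int i = 0; i < 4; ++i) {
        x *= recip_step(d, x);   // x(n+1) = x(n) * (2 - d*x(n))
    }
    std::printf("%.17g\n", x);   // converges towards 1/3
}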
src/common/fp/op/FPRecipStepFused.h (new file, 17 lines)
@@ -0,0 +1,17 @@
/* This file is part of the dynarmic project.
 * Copyright (c) 2018 MerryMage
 * This software may be used and distributed according to the terms of the GNU
 * General Public License version 2 or any later version.
 */

#pragma once

namespace Dynarmic::FP {

class FPCR;
class FPSR;

template<typename FPT>
FPT FPRecipStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr);

} // namespace Dynarmic::FP
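For reference, a direct call into the new helper could look like this. This is a sketch only: it assumes dynarmic's src/ directory is on the include path, that common/common_types.h provides u32, and that FPCR and FPSR are default-constructible with their reset values.

#include "common/common_types.h"
#include "common/fp/fpcr.h"
#include "common/fp/fpsr.h"
#include "common/fp/op/FPRecipStepFused.h"

u32 demo() {
    Dynarmic::FP::FPCR fpcr;   // assumed: default state is acceptable here
    Dynarmic::FP::FPSR fpsr;
    // op1 = 1.5f (0x3FC00000), op2 = 2.0f (0x40000000):
    // the step yields 2.0 - 1.5*2.0 = -1.0f, i.e. 0xBF800000.
    return Dynarmic::FP::FPRecipStepFused<u32>(0x3FC00000, 0x40000000, fpcr, fpsr);
}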
@@ -1495,6 +1495,13 @@ U32U64 IREmitter::FPRecipEstimate(const U32U64& a) {
    return Inst<U64>(Opcode::FPRecipEstimate64, a);
}

U32U64 IREmitter::FPRecipStepFused(const U32U64& a, const U32U64& b) {
    if (a.GetType() == Type::U32) {
        return Inst<U32>(Opcode::FPRecipStepFused32, a, b);
    }
    return Inst<U64>(Opcode::FPRecipStepFused64, a, b);
}

U32U64 IREmitter::FPRoundInt(const U32U64& a, FP::RoundingMode rounding, bool exact) {
    if (a.GetType() == Type::U32) {
        return Inst<U32>(Opcode::FPRoundInt32, a, static_cast<u8>(rounding), Imm1(exact));
@@ -1760,6 +1767,17 @@ U128 IREmitter::FPVectorRecipEstimate(size_t esize, const U128& a) {
    return {};
}

U128 IREmitter::FPVectorRecipStepFused(size_t esize, const U128& a, const U128& b) {
    switch (esize) {
    case 32:
        return Inst<U128>(Opcode::FPVectorRecipStepFused32, a, b);
    case 64:
        return Inst<U128>(Opcode::FPVectorRecipStepFused64, a, b);
    }
    UNREACHABLE();
    return {};
}

U128 IREmitter::FPVectorRSqrtEstimate(size_t esize, const U128& a) {
    switch (esize) {
    case 32:
@@ -270,6 +270,7 @@ public:
    U32U64 FPMulAdd(const U32U64& addend, const U32U64& op1, const U32U64& op2, bool fpscr_controlled);
    U32U64 FPNeg(const U32U64& a);
    U32U64 FPRecipEstimate(const U32U64& a);
    U32U64 FPRecipStepFused(const U32U64& a, const U32U64& b);
    U32U64 FPRoundInt(const U32U64& a, FP::RoundingMode rounding, bool exact);
    U32U64 FPRSqrtEstimate(const U32U64& a);
    U32U64 FPRSqrtStepFused(const U32U64& a, const U32U64& b);
@@ -306,6 +307,7 @@ public:
    U128 FPVectorPairedAdd(size_t esize, const U128& a, const U128& b);
    U128 FPVectorPairedAddLower(size_t esize, const U128& a, const U128& b);
    U128 FPVectorRecipEstimate(size_t esize, const U128& a);
    U128 FPVectorRecipStepFused(size_t esize, const U128& a, const U128& b);
    U128 FPVectorRSqrtEstimate(size_t esize, const U128& a);
    U128 FPVectorRSqrtStepFused(size_t esize, const U128& a, const U128& b);
    U128 FPVectorSub(size_t esize, const U128& a, const U128& b);
@@ -396,6 +396,8 @@ OPCODE(FPNeg32, T::U32, T::U32
OPCODE(FPNeg64,                  T::U64,  T::U64                   )
OPCODE(FPRecipEstimate32,        T::U32,  T::U32                   )
OPCODE(FPRecipEstimate64,        T::U64,  T::U64                   )
OPCODE(FPRecipStepFused32,       T::U32,  T::U32,  T::U32          )
OPCODE(FPRecipStepFused64,       T::U64,  T::U64,  T::U64          )
OPCODE(FPRoundInt32,             T::U32,  T::U32,  T::U8,  T::U1   )
OPCODE(FPRoundInt64,             T::U64,  T::U64,  T::U8,  T::U1   )
OPCODE(FPRSqrtEstimate32,        T::U32,  T::U32                   )
@@ -454,6 +456,8 @@ OPCODE(FPVectorPairedAdd32, T::U128, T::U128,
OPCODE(FPVectorPairedAdd64,      T::U128, T::U128, T::U128         )
OPCODE(FPVectorRecipEstimate32,  T::U128, T::U128                  )
OPCODE(FPVectorRecipEstimate64,  T::U128, T::U128                  )
OPCODE(FPVectorRecipStepFused32, T::U128, T::U128, T::U128         )
OPCODE(FPVectorRecipStepFused64, T::U128, T::U128, T::U128         )
OPCODE(FPVectorRSqrtEstimate32,  T::U128, T::U128                  )
OPCODE(FPVectorRSqrtEstimate64,  T::U128, T::U128                  )
OPCODE(FPVectorRSqrtStepFused32, T::U128, T::U128, T::U128         )