IR: Implement FPRecipStepFused, FPVectorRecipStepFused
commit 901bd9b4e2
parent f66f61d8ab

10 changed files with 135 additions and 0 deletions
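Note: FPRecipStepFused implements the AArch64 FRECPS operation: it returns 2.0 - op1 * op2, computed with a single rounding. Paired with FPRecipEstimate, it supplies the refinement step of a Newton-Raphson reciprocal: given an estimate x of 1/a, x * (2 - a*x) is a better estimate. A minimal sketch of that iteration with plain floats, ignoring the FPCR/FPSR plumbing and the fused rounding of the real soft-float code:

    #include <cstdio>

    // Plain-float analogue of FP::FPRecipStepFused: 2.0 - op1 * op2.
    static float RecipStep(float a, float x) {
        return 2.0f - a * x;
    }

    int main() {
        const float a = 3.0f;
        float x = 0.3f;                  // rough initial estimate of 1/a (cf. FPRecipEstimate)
        for (int i = 0; i < 3; i++) {
            x = x * RecipStep(a, x);     // Newton-Raphson: x <- x * (2 - a*x)
        }
        std::printf("%.9f\n", x);        // converges towards 1/3
        return 0;
    }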
src/CMakeLists.txt
@@ -27,6 +27,8 @@ add_library(dynarmic
     common/fp/op/FPMulAdd.h
     common/fp/op/FPRecipEstimate.cpp
     common/fp/op/FPRecipEstimate.h
+    common/fp/op/FPRecipStepFused.cpp
+    common/fp/op/FPRecipStepFused.h
     common/fp/op/FPRoundInt.cpp
     common/fp/op/FPRoundInt.h
     common/fp/op/FPRSqrtEstimate.cpp
src/backend_x64/emit_x64_floating_point.cpp
@@ -830,6 +830,23 @@ void EmitX64::EmitFPRecipEstimate64(EmitContext& ctx, IR::Inst* inst) {
     EmitFPRecipEstimate<u64>(code, ctx, inst);
 }
 
+template<typename FPT>
+static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    ctx.reg_alloc.HostCall(inst, args[0], args[1]);
+    code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR());
+    code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+    code.CallFunction(&FP::FPRecipStepFused<FPT>);
+}
+
+void EmitX64::EmitFPRecipStepFused32(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPRecipStepFused<u32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPRecipStepFused64(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPRecipStepFused<u64>(code, ctx, inst);
+}
+
 static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, size_t fsize) {
     const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
     const bool exact = inst->GetArg(2).GetU1();
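Note: the scalar emitter does not lower this to SSE; FRECPS has no direct single-instruction x64 equivalent, so the JIT falls back to the soft-float implementation via a host call. HostCall places the two value operands in the first two ABI parameter registers, the FPCR value goes in ABI_PARAM3, and a pointer to the JIT state's accumulated FPSR exception bits goes in ABI_PARAM4, matching the fallback's signature. A declaration-only sketch of that mapping, with simplified stand-in types for FPCR/FPSR (not the dynarmic classes):

    #include <cstdint>

    struct FPCR { std::uint32_t value; };  // passed by value     -> ABI_PARAM3
    struct FPSR { std::uint32_t exc; };    // passed by reference -> pointer in ABI_PARAM4

    template<typename FPT>
    FPT FPRecipStepFused(FPT op1,          // ABI_PARAM1 (HostCall, args[0])
                         FPT op2,          // ABI_PARAM2 (HostCall, args[1])
                         FPCR fpcr,        // ABI_PARAM3 (code.mov of ctx.FPCR())
                         FPSR& fpsr);      // ABI_PARAM4 (code.lea of &fpsr_exc)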
src/backend_x64/emit_x64_vector_floating_point.cpp
@@ -631,6 +631,23 @@ void EmitX64::EmitFPVectorRecipEstimate64(EmitContext& ctx, IR::Inst* inst) {
     EmitRecipEstimate<u64>(code, ctx, inst);
 }
 
+template<typename FPT>
+static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    EmitThreeOpFallback(code, ctx, inst, [](VectorArray<FPT>& result, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) {
+        for (size_t i = 0; i < result.size(); i++) {
+            result[i] = FP::FPRecipStepFused<FPT>(op1[i], op2[i], fpcr, fpsr);
+        }
+    });
+}
+
+void EmitX64::EmitFPVectorRecipStepFused32(EmitContext& ctx, IR::Inst* inst) {
+    EmitRecipStepFused<u32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorRecipStepFused64(EmitContext& ctx, IR::Inst* inst) {
+    EmitRecipStepFused<u64>(code, ctx, inst);
+}
+
 template<typename FPT>
 static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     EmitTwoOpFallback(code, ctx, inst, [](VectorArray<FPT>& result, const VectorArray<FPT>& operand, FP::FPCR fpcr, FP::FPSR& fpsr) {
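Note: EmitThreeOpFallback runs the scalar soft-float routine once per vector lane. Behaviourally (modulo the bit-exact soft-float arithmetic and flag handling), each 128-bit operand is treated as an array of lanes and the recip-step is applied elementwise, e.g. for four f32 lanes:

    #include <array>
    #include <cstdio>

    // Plain-float sketch of the lanewise fallback; the real code works on raw
    // bit patterns through FP::FPRecipStepFused and an FPCR/FPSR pair.
    int main() {
        const std::array<float, 4> op1{1.0f, 2.0f, 3.0f, 4.0f};
        const std::array<float, 4> op2{0.9f, 0.4f, 0.3f, 0.2f};
        std::array<float, 4> result{};
        for (std::size_t i = 0; i < result.size(); i++) {
            result[i] = 2.0f - op1[i] * op2[i]; // one scalar recip-step per lane
        }
        for (const float r : result) {
            std::printf("%f\n", r);
        }
        return 0;
    }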
src/common/fp/info.h
@@ -34,6 +34,7 @@ struct FPInfo<u32> {
     static constexpr u32 MaxNormal(bool sign) { return (exponent_mask - 1) | Zero(sign); }
     static constexpr u32 DefaultNaN() { return exponent_mask | (u32(1) << (explicit_mantissa_width - 1)); }
     static constexpr u32 OnePointFive(bool sign) { return Zero(sign) | (u32(1) << (explicit_mantissa_width - 1)) | (u32(exponent_bias) << explicit_mantissa_width); }
+    static constexpr u32 Two(bool sign) { return Zero(sign) | (u32(exponent_bias + 1) << explicit_mantissa_width); }
 };
 
 template<>
@@ -57,6 +58,7 @@ struct FPInfo<u64> {
     static constexpr u64 MaxNormal(bool sign) { return (exponent_mask - 1) | Zero(sign); }
     static constexpr u64 DefaultNaN() { return exponent_mask | (u64(1) << (explicit_mantissa_width - 1)); }
     static constexpr u64 OnePointFive(bool sign) { return Zero(sign) | (u64(1) << (explicit_mantissa_width - 1)) | (u64(exponent_bias) << explicit_mantissa_width); }
+    static constexpr u64 Two(bool sign) { return Zero(sign) | (u64(exponent_bias + 1) << explicit_mantissa_width); }
 };
 
 } // namespace Dynarmic::FP
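Note: the new Two() helper encodes the value 2.0 directly: exponent field = exponent_bias + 1, zero mantissa, optional sign bit. A quick worked check for binary32 (the constants below are the standard IEEE-754 parameters that FPInfo<u32> uses):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        constexpr std::uint32_t exponent_bias = 127;
        constexpr std::uint32_t explicit_mantissa_width = 23;
        // FPInfo<u32>::Two(false): (bias + 1) << mantissa_width, zero mantissa.
        constexpr std::uint32_t two = (exponent_bias + 1) << explicit_mantissa_width;

        float f;
        std::memcpy(&f, &two, sizeof(f));
        std::printf("0x%08x = %f\n", two, f); // prints 0x40000000 = 2.000000
        return 0;
    }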
src/common/fp/op.h
@@ -8,6 +8,7 @@
 
 #include "common/fp/op/FPMulAdd.h"
 #include "common/fp/op/FPRecipEstimate.h"
+#include "common/fp/op/FPRecipStepFused.h"
 #include "common/fp/op/FPRoundInt.h"
 #include "common/fp/op/FPRSqrtEstimate.h"
 #include "common/fp/op/FPRSqrtStepFused.h"
src/common/fp/op/FPRecipStepFused.cpp (new file, 55 lines)
@@ -0,0 +1,55 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * This software may be used and distributed according to the terms of the GNU
+ * General Public License version 2 or any later version.
+ */
+
+#include "common/fp/fpcr.h"
+#include "common/fp/fpsr.h"
+#include "common/fp/fused.h"
+#include "common/fp/info.h"
+#include "common/fp/op/FPNeg.h"
+#include "common/fp/op/FPRecipStepFused.h"
+#include "common/fp/process_exception.h"
+#include "common/fp/process_nan.h"
+#include "common/fp/unpacked.h"
+
+namespace Dynarmic::FP {
+
+template<typename FPT>
+FPT FPRecipStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
+    op1 = FPNeg(op1);
+
+    const auto [type1, sign1, value1] = FPUnpack<FPT>(op1, fpcr, fpsr);
+    const auto [type2, sign2, value2] = FPUnpack<FPT>(op2, fpcr, fpsr);
+
+    if (const auto maybe_nan = FPProcessNaNs(type1, type2, op1, op2, fpcr, fpsr)) {
+        return *maybe_nan;
+    }
+
+    const bool inf1 = type1 == FPType::Infinity;
+    const bool inf2 = type2 == FPType::Infinity;
+    const bool zero1 = type1 == FPType::Zero;
+    const bool zero2 = type2 == FPType::Zero;
+
+    if ((inf1 && zero2) || (zero1 && inf2)) {
+        return FPInfo<FPT>::Two(false);
+    }
+
+    if (inf1 || inf2) {
+        return FPInfo<FPT>::Infinity(sign1 != sign2);
+    }
+
+    // result_value = 2.0 + (value1 * value2)
+    FPUnpacked result_value = FusedMulAdd(ToNormalized(false, 0, 2), value1, value2);
+
+    if (result_value.mantissa == 0) {
+        return FPInfo<FPT>::Zero(fpcr.RMode() == RoundingMode::TowardsMinusInfinity);
+    }
+    return FPRound<FPT>(result_value, fpcr, fpsr);
+}
+
+template u32 FPRecipStepFused<u32>(u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr);
+template u64 FPRecipStepFused<u64>(u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
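Note: the special-case ordering mirrors the ARM pseudocode. NaNs are processed first; then an infinity times a zero (in either order) yields exactly +2.0 rather than the NaN that plain IEEE-754 multiplication would produce; remaining infinities give a signed infinity; otherwise 2.0 + (-op1)*op2 is computed with a single rounding via FusedMulAdd. A behavioural reference in plain floats (a sketch of the semantics, not the bit-exact soft-float code; NaN propagation and FPCR-controlled rounding are omitted):

    #include <cmath>
    #include <cstdio>
    #include <limits>

    float RecipStepRef(float op1, float op2) {
        if ((std::isinf(op1) && op2 == 0.0f) || (op1 == 0.0f && std::isinf(op2))) {
            return 2.0f;                 // ARM-defined result for inf * 0
        }
        return 2.0f - op1 * op2;         // IEEE rules cover the remaining infinities
    }

    int main() {
        const float inf = std::numeric_limits<float>::infinity();
        std::printf("%f\n", RecipStepRef(inf, 0.0f));   // 2.000000, not NaN
        std::printf("%f\n", RecipStepRef(3.0f, 0.25f)); // 1.250000
        return 0;
    }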
src/common/fp/op/FPRecipStepFused.h (new file, 17 lines)
@@ -0,0 +1,17 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * This software may be used and distributed according to the terms of the GNU
+ * General Public License version 2 or any later version.
+ */
+
+#pragma once
+
+namespace Dynarmic::FP {
+
+class FPCR;
+class FPSR;
+
+template<typename FPT>
+FPT FPRecipStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
src/frontend/ir/ir_emitter.cpp
@@ -1495,6 +1495,13 @@ U32U64 IREmitter::FPRecipEstimate(const U32U64& a) {
     return Inst<U64>(Opcode::FPRecipEstimate64, a);
 }
 
+U32U64 IREmitter::FPRecipStepFused(const U32U64& a, const U32U64& b) {
+    if (a.GetType() == Type::U32) {
+        return Inst<U32>(Opcode::FPRecipStepFused32, a, b);
+    }
+    return Inst<U64>(Opcode::FPRecipStepFused64, a, b);
+}
+
 U32U64 IREmitter::FPRoundInt(const U32U64& a, FP::RoundingMode rounding, bool exact) {
     if (a.GetType() == Type::U32) {
         return Inst<U32>(Opcode::FPRoundInt32, a, static_cast<u8>(rounding), Imm1(exact));
@@ -1760,6 +1767,17 @@ U128 IREmitter::FPVectorRecipEstimate(size_t esize, const U128& a) {
     return {};
 }
 
+U128 IREmitter::FPVectorRecipStepFused(size_t esize, const U128& a, const U128& b) {
+    switch (esize) {
+    case 32:
+        return Inst<U128>(Opcode::FPVectorRecipStepFused32, a, b);
+    case 64:
+        return Inst<U128>(Opcode::FPVectorRecipStepFused64, a, b);
+    }
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::FPVectorRSqrtEstimate(size_t esize, const U128& a) {
     switch (esize) {
     case 32:
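Note: frontends consume these helpers without caring about operand width; the helper selects the 32- or 64-bit opcode from the operand type (scalar) or the esize argument (vector). Illustrative usage only (the identifiers a, b, v_a, v_b are hypothetical values already present in the IR):

    // Scalar: picks FPRecipStepFused32/64 from the type of a.
    const IR::U32U64 r = ir.FPRecipStepFused(a, b);
    // Vector: picks FPVectorRecipStepFused32/64 from esize.
    const IR::U128 vr = ir.FPVectorRecipStepFused(32, v_a, v_b);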
src/frontend/ir/ir_emitter.h
@@ -270,6 +270,7 @@ public:
     U32U64 FPMulAdd(const U32U64& addend, const U32U64& op1, const U32U64& op2, bool fpscr_controlled);
     U32U64 FPNeg(const U32U64& a);
     U32U64 FPRecipEstimate(const U32U64& a);
+    U32U64 FPRecipStepFused(const U32U64& a, const U32U64& b);
     U32U64 FPRoundInt(const U32U64& a, FP::RoundingMode rounding, bool exact);
     U32U64 FPRSqrtEstimate(const U32U64& a);
     U32U64 FPRSqrtStepFused(const U32U64& a, const U32U64& b);
@@ -306,6 +307,7 @@ public:
     U128 FPVectorPairedAdd(size_t esize, const U128& a, const U128& b);
     U128 FPVectorPairedAddLower(size_t esize, const U128& a, const U128& b);
     U128 FPVectorRecipEstimate(size_t esize, const U128& a);
+    U128 FPVectorRecipStepFused(size_t esize, const U128& a, const U128& b);
     U128 FPVectorRSqrtEstimate(size_t esize, const U128& a);
     U128 FPVectorRSqrtStepFused(size_t esize, const U128& a, const U128& b);
     U128 FPVectorSub(size_t esize, const U128& a, const U128& b);
src/frontend/ir/opcodes.inc
@@ -396,6 +396,8 @@ OPCODE(FPNeg32, T::U32, T::U32
 OPCODE(FPNeg64,                  T::U64,    T::U64                        )
 OPCODE(FPRecipEstimate32,        T::U32,    T::U32                        )
 OPCODE(FPRecipEstimate64,        T::U64,    T::U64                        )
+OPCODE(FPRecipStepFused32,       T::U32,    T::U32,    T::U32             )
+OPCODE(FPRecipStepFused64,       T::U64,    T::U64,    T::U64             )
 OPCODE(FPRoundInt32,             T::U32,    T::U32,    T::U8,    T::U1    )
 OPCODE(FPRoundInt64,             T::U64,    T::U64,    T::U8,    T::U1    )
 OPCODE(FPRSqrtEstimate32,        T::U32,    T::U32                        )
@@ -454,6 +456,8 @@ OPCODE(FPVectorPairedAdd32, T::U128, T::U128,
 OPCODE(FPVectorPairedAdd64,      T::U128,   T::U128,   T::U128            )
 OPCODE(FPVectorRecipEstimate32,  T::U128,   T::U128                       )
 OPCODE(FPVectorRecipEstimate64,  T::U128,   T::U128                       )
+OPCODE(FPVectorRecipStepFused32, T::U128,   T::U128,   T::U128            )
+OPCODE(FPVectorRecipStepFused64, T::U128,   T::U128,   T::U128            )
 OPCODE(FPVectorRSqrtEstimate32,  T::U128,   T::U128                       )
 OPCODE(FPVectorRSqrtEstimate64,  T::U128,   T::U128                       )
 OPCODE(FPVectorRSqrtStepFused32, T::U128,   T::U128,   T::U128            )