emit_x64{_vector}_floating_point: Add unsafe optimizations for RSqrtEstimate and RecipEstimate
parent 761e95eec0
commit 82417da780
2 changed files with 106 additions and 20 deletions

emit_x64_floating_point.cpp
@@ -738,8 +738,29 @@ void EmitX64::EmitFPMulX64(EmitContext& ctx, IR::Inst* inst) {
     EmitFPMulX<64>(code, ctx, inst);
 }
 
-template<typename FPT>
+template<size_t fsize>
 static void EmitFPRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+
+    if constexpr (fsize != 16) {
+        if (ctx.UnsafeOptimizations()) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            if constexpr (fsize == 32) {
+                code.rcpss(result, operand);
+            } else {
+                code.cvtsd2ss(result, operand);
+                code.rcpss(result, result);
+                code.cvtss2sd(result, result);
+            }
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+    }
+
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     ctx.reg_alloc.HostCall(inst, args[0]);
     code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
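A note on the f64 branch above: x86 has no double-precision reciprocal-estimate instruction, which is why the emitter narrows the operand to single precision, takes the RCPSS estimate, and widens the result back. A minimal standalone sketch of the same sequence, written with SSE intrinsics rather than Xbyak (the function name is illustrative; this is not dynarmic code):

#include <immintrin.h>

// Same narrow -> estimate -> widen sequence as the emitted
// cvtsd2ss / rcpss / cvtss2sd above, as a plain C++ function.
double recip_estimate_unsafe(double x) {
    const __m128d src = _mm_set_sd(x);
    const __m128  f32 = _mm_cvtsd_ss(_mm_setzero_ps(), src); // cvtsd2ss
    const __m128  est = _mm_rcp_ss(f32);                     // rcpss
    const __m128d f64 = _mm_cvtss_sd(_mm_setzero_pd(), est); // cvtss2sd
    return _mm_cvtsd_f64(f64);
}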
@@ -748,19 +769,21 @@ static void EmitFPRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitFPRecipEstimate16(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRecipEstimate<u16>(code, ctx, inst);
+    EmitFPRecipEstimate<16>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPRecipEstimate32(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRecipEstimate<u32>(code, ctx, inst);
+    EmitFPRecipEstimate<32>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPRecipEstimate64(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRecipEstimate<u64>(code, ctx, inst);
+    EmitFPRecipEstimate<64>(code, ctx, inst);
 }
 
-template <typename FPT>
+template<size_t fsize>
 static void EmitFPRecipExponent(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     ctx.reg_alloc.HostCall(inst, args[0]);
     code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
@@ -769,15 +792,15 @@ static void EmitFPRecipExponent(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitFPRecipExponent16(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRecipExponent<u16>(code, ctx, inst);
+    EmitFPRecipExponent<16>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPRecipExponent32(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRecipExponent<u32>(code, ctx, inst);
+    EmitFPRecipExponent<32>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPRecipExponent64(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRecipExponent<u64>(code, ctx, inst);
+    EmitFPRecipExponent<64>(code, ctx, inst);
 }
 
 template<size_t fsize>
@@ -911,8 +934,29 @@ void EmitX64::EmitFPRoundInt64(EmitContext& ctx, IR::Inst* inst) {
     EmitFPRound(code, ctx, inst, 64);
 }
 
-template<typename FPT>
+template<size_t fsize>
 static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+
+    if constexpr (fsize != 16) {
+        if (ctx.UnsafeOptimizations()) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            if constexpr (fsize == 32) {
+                code.rsqrtss(result, operand);
+            } else {
+                code.cvtsd2ss(result, operand);
+                code.rsqrtss(result, result);
+                code.cvtss2sd(result, result);
+            }
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+    }
+
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     ctx.reg_alloc.HostCall(inst, args[0]);
     code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
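Why this fast path is gated behind UnsafeOptimizations(): RSQRTSS/RCPSS return estimates with relative error at most 1.5 * 2^-12 per Intel's documentation, while the A64 FRSQRTE/FRECPE results are architecturally specified via a table-based estimate, so the two generally differ in the low bits. A small illustrative comparison (not dynarmic code):

#include <cmath>
#include <cstdio>
#include <immintrin.h>

int main() {
    const float x = 2.0f;
    // rsqrtss: hardware reciprocal-square-root estimate, ~12 bits
    const float est = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));
    std::printf("rsqrtss estimate: %.9f\n", est);
    std::printf("exact 1/sqrt(x):  %.9f\n", 1.0f / std::sqrt(x));
}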
@@ -921,15 +965,15 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitFPRSqrtEstimate16(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRSqrtEstimate<u16>(code, ctx, inst);
+    EmitFPRSqrtEstimate<16>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPRSqrtEstimate32(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRSqrtEstimate<u32>(code, ctx, inst);
+    EmitFPRSqrtEstimate<32>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPRSqrtEstimate64(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRSqrtEstimate<u64>(code, ctx, inst);
+    EmitFPRSqrtEstimate<64>(code, ctx, inst);
 }
 
 template<size_t fsize>
emit_x64_vector_floating_point.cpp
@@ -1178,8 +1178,29 @@ void EmitX64::EmitFPVectorPairedAddLower64(EmitContext& ctx, IR::Inst* inst) {
     });
 }
 
-template<typename FPT>
+template<size_t fsize>
 static void EmitRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+
+    if constexpr (fsize != 16) {
+        if (ctx.UnsafeOptimizations()) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            if constexpr (fsize == 32) {
+                code.rcpps(result, operand);
+            } else {
+                code.cvtpd2ps(result, operand);
+                code.rcpps(result, result);
+                code.cvtps2pd(result, result);
+            }
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+    }
+
     EmitTwoOpFallback(code, ctx, inst, [](VectorArray<FPT>& result, const VectorArray<FPT>& operand, FP::FPCR fpcr, FP::FPSR& fpsr) {
         for (size_t i = 0; i < result.size(); i++) {
             result[i] = FP::FPRecipEstimate<FPT>(operand[i], fpcr, fpsr);
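The vector version is the same idea lane-wise: cvtpd2ps narrows both f64 lanes at once, a single rcpps produces packed estimates, and cvtps2pd widens back. A hypothetical intrinsics equivalent (not dynarmic code):

#include <immintrin.h>

// Packed-double variant of the unsafe path: both lanes estimated in
// one rcpps, mirroring the emitted cvtpd2ps / rcpps / cvtps2pd.
__m128d recip_estimate_unsafe_f64x2(__m128d x) {
    const __m128 f32 = _mm_cvtpd_ps(x);  // narrow both lanes to f32
    const __m128 est = _mm_rcp_ps(f32);  // packed ~12-bit estimate
    return _mm_cvtps_pd(est);            // widen back to f64x2
}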
@@ -1188,15 +1209,15 @@ static void EmitRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitFPVectorRecipEstimate16(EmitContext& ctx, IR::Inst* inst) {
-    EmitRecipEstimate<u16>(code, ctx, inst);
+    EmitRecipEstimate<16>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPVectorRecipEstimate32(EmitContext& ctx, IR::Inst* inst) {
-    EmitRecipEstimate<u32>(code, ctx, inst);
+    EmitRecipEstimate<32>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPVectorRecipEstimate64(EmitContext& ctx, IR::Inst* inst) {
-    EmitRecipEstimate<u64>(code, ctx, inst);
+    EmitRecipEstimate<64>(code, ctx, inst);
 }
 
 template<size_t fsize>
@@ -1337,8 +1358,29 @@ void EmitX64::EmitFPVectorRoundInt64(EmitContext& ctx, IR::Inst* inst) {
     EmitFPVectorRoundInt<64>(code, ctx, inst);
 }
 
-template<typename FPT>
+template<size_t fsize>
 static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+
+    if constexpr (fsize != 16) {
+        if (ctx.UnsafeOptimizations()) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            if constexpr (fsize == 32) {
+                code.rsqrtps(result, operand);
+            } else {
+                code.cvtpd2ps(result, operand);
+                code.rsqrtps(result, result);
+                code.cvtps2pd(result, result);
+            }
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+    }
+
     EmitTwoOpFallback(code, ctx, inst, [](VectorArray<FPT>& result, const VectorArray<FPT>& operand, FP::FPCR fpcr, FP::FPSR& fpsr) {
         for (size_t i = 0; i < result.size(); i++) {
             result[i] = FP::FPRSqrtEstimate<FPT>(operand[i], fpcr, fpsr);
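For context (not part of this commit): when full precision is wanted, RCPPS/RSQRTPS estimates are usually sharpened with a Newton-Raphson step. The unsafe path skips that deliberately, since the guest instruction being emulated is itself only an estimate. A sketch of one refinement iteration, assuming SSE intrinsics:

#include <immintrin.h>

// One Newton-Raphson step on the rsqrtps estimate:
// e' = e * (3 - a*e*e) / 2, roughly doubling the bits of precision.
__m128 rsqrt_refined(__m128 a) {
    const __m128 e     = _mm_rsqrt_ps(a);
    const __m128 aee   = _mm_mul_ps(a, _mm_mul_ps(e, e));
    const __m128 three = _mm_set1_ps(3.0f);
    const __m128 half  = _mm_set1_ps(0.5f);
    return _mm_mul_ps(_mm_mul_ps(half, e), _mm_sub_ps(three, aee));
}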
@@ -1347,15 +1389,15 @@ static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitFPVectorRSqrtEstimate16(EmitContext& ctx, IR::Inst* inst) {
-    EmitRSqrtEstimate<u16>(code, ctx, inst);
+    EmitRSqrtEstimate<16>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPVectorRSqrtEstimate32(EmitContext& ctx, IR::Inst* inst) {
-    EmitRSqrtEstimate<u32>(code, ctx, inst);
+    EmitRSqrtEstimate<32>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPVectorRSqrtEstimate64(EmitContext& ctx, IR::Inst* inst) {
-    EmitRSqrtEstimate<u64>(code, ctx, inst);
+    EmitRSqrtEstimate<64>(code, ctx, inst);
 }
 
 template<size_t fsize>