emit_x64_floating_point: SSE4.1 implementation of EmitFPRound

This commit is contained in:
MerryMage 2018-07-16 14:22:29 +01:00
parent a40127a054
commit 83be491875

View file

@ -794,10 +794,38 @@ void EmitX64::EmitFPMulAdd64(EmitContext& ctx, IR::Inst* inst) {
} }
static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, size_t fsize) { static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, size_t fsize) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
const bool exact = inst->GetArg(2).GetU1();
const auto rounding = static_cast<FP::RoundingMode>(args[1].GetImmediateU8()); if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) {
const bool exact = args[2].GetImmediateU1(); const int round_imm = [&]{
switch (rounding) {
case FP::RoundingMode::ToNearest_TieEven:
return 0b00;
case FP::RoundingMode::TowardsPlusInfinity:
return 0b10;
case FP::RoundingMode::TowardsMinusInfinity:
return 0b01;
case FP::RoundingMode::TowardsZero:
return 0b11;
default:
UNREACHABLE();
}
return 0;
}();
if (fsize == 64) {
FPTwoOp64(code, ctx, inst, [&](Xbyak::Xmm result) {
code.roundsd(result, result, round_imm);
});
} else {
FPTwoOp32(code, ctx, inst, [&](Xbyak::Xmm result) {
code.roundss(result, result, round_imm);
});
}
return;
}
using fsize_list = mp::list<mp::vlift<size_t(32)>, mp::vlift<size_t(64)>>; using fsize_list = mp::list<mp::vlift<size_t(32)>, mp::vlift<size_t(64)>>;
using rounding_list = mp::list< using rounding_list = mp::list<
@ -832,6 +860,7 @@ static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, siz
mp::cartesian_product<fsize_list, rounding_list, exact_list>{} mp::cartesian_product<fsize_list, rounding_list, exact_list>{}
); );
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(inst, args[0]); ctx.reg_alloc.HostCall(inst, args[0]);
code.lea(code.ABI_PARAM2, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); code.lea(code.ABI_PARAM2, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR()); code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR());