From 83be4918755d3d1788b5e8a782bc88586dfdd597 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Mon, 16 Jul 2018 14:22:29 +0100 Subject: [PATCH] emit_x64_floating_point: SSE4.1 implementation of EmitFPRound --- src/backend_x64/emit_x64_floating_point.cpp | 35 +++++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/src/backend_x64/emit_x64_floating_point.cpp b/src/backend_x64/emit_x64_floating_point.cpp index 90e3ef37..2d051a3e 100644 --- a/src/backend_x64/emit_x64_floating_point.cpp +++ b/src/backend_x64/emit_x64_floating_point.cpp @@ -794,10 +794,38 @@ void EmitX64::EmitFPMulAdd64(EmitContext& ctx, IR::Inst* inst) { } static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, size_t fsize) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto rounding = static_cast(inst->GetArg(1).GetU8()); + const bool exact = inst->GetArg(2).GetU1(); - const auto rounding = static_cast(args[1].GetImmediateU8()); - const bool exact = args[2].GetImmediateU1(); + if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) { + const int round_imm = [&]{ + switch (rounding) { + case FP::RoundingMode::ToNearest_TieEven: + return 0b00; + case FP::RoundingMode::TowardsPlusInfinity: + return 0b10; + case FP::RoundingMode::TowardsMinusInfinity: + return 0b01; + case FP::RoundingMode::TowardsZero: + return 0b11; + default: + UNREACHABLE(); + } + return 0; + }(); + + if (fsize == 64) { + FPTwoOp64(code, ctx, inst, [&](Xbyak::Xmm result) { + code.roundsd(result, result, round_imm); + }); + } else { + FPTwoOp32(code, ctx, inst, [&](Xbyak::Xmm result) { + code.roundss(result, result, round_imm); + }); + } + + return; + } using fsize_list = mp::list, mp::vlift>; using rounding_list = mp::list< @@ -832,6 +860,7 @@ static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, siz mp::cartesian_product{} ); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(inst, args[0]); code.lea(code.ABI_PARAM2, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR());