emit_x64_floating_point: SSE4.1 implementation of EmitFPRound

2018-07-16 14:22:29 +01:00 · 2018-07-16 14:22:29 +01:00 · 83be491875
commit 83be491875
parent a40127a054
1 changed file with 32 additions and 3 deletions
--- a/src/backend_x64/emit_x64_floating_point.cpp
+++ b/src/backend_x64/emit_x64_floating_point.cpp
@@ -794,10 +794,38 @@ void EmitX64::EmitFPMulAdd64(EmitContext& ctx, IR::Inst* inst) {
 }
 static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, size_t fsize) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
    const bool exact = inst->GetArg(2).GetU1();
-    const auto rounding = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) {
-    const bool exact = args[2].GetImmediateU1();
+        const int round_imm = [&]{
            switch (rounding) {
            case FP::RoundingMode::ToNearest_TieEven:
                return 0b00;
            case FP::RoundingMode::TowardsPlusInfinity:
                return 0b10;
            case FP::RoundingMode::TowardsMinusInfinity:
                return 0b01;
            case FP::RoundingMode::TowardsZero:
                return 0b11;
            default:
                UNREACHABLE();
            }
            return 0;
        }();
        if (fsize == 64) {
            FPTwoOp64(code, ctx, inst, [&](Xbyak::Xmm result) {
                code.roundsd(result, result, round_imm);
            });
        } else {
            FPTwoOp32(code, ctx, inst, [&](Xbyak::Xmm result) {
                code.roundss(result, result, round_imm);
            });
        }
        return;
    }
    using fsize_list = mp::list<mp::vlift<size_t(32)>, mp::vlift<size_t(64)>>;
    using rounding_list = mp::list<
@@ -832,6 +860,7 @@ static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, siz
        mp::cartesian_product<fsize_list, rounding_list, exact_list>{}
    );
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    ctx.reg_alloc.HostCall(inst, args[0]);
    code.lea(code.ABI_PARAM2, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
    code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR());