emit_x64_floating_point: Simplify EmitFP{Min,Max}{,Numeric}{32,64}

2018-08-02 20:16:03 +01:00 · 2018-08-02 20:16:03 +01:00 · 700088408d
commit 700088408d
parent 07e0585994
1 changed files with 124 additions and 269 deletions
--- a/src/backend_x64/emit_x64_floating_point.cpp
+++ b/src/backend_x64/emit_x64_floating_point.cpp
@ -108,75 +108,6 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch)
    code.pand(xmm_value, xmm_scratch);
 }
 // Emits a NaN pre-check for a two-operand floating point operation.
 //
 // If either `a` or `b` is a NaN, control is diverted to a far-code stub which calls
 // FP::ProcessNaNs on the raw bit patterns, stores the returned value in `a` and then
 // jumps to `end`. Otherwise execution falls through on the near-code path.
 template<size_t fsize>
 void PreProcessNaNs(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Label& end) {
    using FPT = mp::unsigned_integer_of_size<fsize>;
    using NaNFallback = FPT(*)(FPT, FPT);

    // `a` receives the fallback's result, so it must not be saved/restored around the call.
    const auto preserved = HostLocXmmIdx(a.getIdx());
    Xbyak::Label slow_path;

    // An unordered comparison (at least one NaN input) sets the parity flag.
    FCODE(ucomis)(a, b);
    code.jp(slow_path, code.T_NEAR);

    code.SwitchToFarCode();
    code.L(slow_path);
    // NOTE(review): the extra 8 bytes are presumably for stack alignment across the call - confirm.
    code.sub(rsp, 8);
    ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, preserved);
    code.movq(code.ABI_PARAM1, a);
    code.movq(code.ABI_PARAM2, b);
    code.CallFunction(static_cast<NaNFallback>([](FPT lhs, FPT rhs) -> FPT {
        return *FP::ProcessNaNs(lhs, rhs);
    }));
    code.movq(a, code.ABI_RETURN);
    ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, preserved);
    code.add(rsp, 8);
    code.jmp(end, code.T_NEAR);
    code.SwitchToNearCode();
 }
 // Emits a NaN pre-check for a three-operand floating point operation.
 //
 // If any of `a`, `b` or `c` is a NaN, control is diverted to a far-code stub which calls
 // `nan_handler` with the three raw bit patterns and the current FPCR, stores the returned
 // value in `a` and then jumps to `end`. Otherwise execution falls through on the near-code path.
 template<size_t fsize, typename NaNHandler>
 void PreProcessNaNs(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end, NaNHandler nan_handler) {
    using FPT = mp::unsigned_integer_of_size<fsize>;
    using HandlerFn = FPT(*)(FPT, FPT, FPT, FP::FPCR);

    // `a` receives the handler's result, so it must not be saved/restored around the call.
    const auto preserved = HostLocXmmIdx(a.getIdx());
    Xbyak::Label slow_path;

    // An unordered comparison sets the parity flag: the first test catches a NaN in `a` or `b`,
    // the self-comparison catches a NaN in `c`.
    FCODE(ucomis)(a, b);
    code.jp(slow_path, code.T_NEAR);
    FCODE(ucomis)(c, c);
    code.jp(slow_path, code.T_NEAR);

    code.SwitchToFarCode();
    code.L(slow_path);
    // NOTE(review): the extra 8 bytes are presumably for stack alignment across the call - confirm.
    code.sub(rsp, 8);
    ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, preserved);
    code.movq(code.ABI_PARAM1, a);
    code.movq(code.ABI_PARAM2, b);
    code.movq(code.ABI_PARAM3, c);
    code.mov(code.ABI_PARAM4, ctx.FPCR());
    code.CallFunction(static_cast<HandlerFn>(nan_handler));
    code.movq(a, code.ABI_RETURN);
    ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, preserved);
    code.add(rsp, 8);
    code.jmp(end, code.T_NEAR);
    code.SwitchToNearCode();
 }
 // Flips the sign bit of every NaN lane in `result`; `tmp` is clobbered as scratch.
 //
 // Comparing a register with itself using the "unordered" predicate yields an all-ones mask
 // exactly in the lanes that hold a NaN. Shifting that mask left by (lane width - 1) leaves
 // only the sign bit set, which is then XORed into `result`.
 template<size_t fsize>
 void PostProcessNaNs(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) {
    code.movaps(tmp, result);
    if constexpr (fsize == 32) {
        code.cmpunordps(tmp, tmp);
        code.pslld(tmp, 31);
    } else {
        code.cmpunordpd(tmp, tmp);
        code.psllq(tmp, 63);
    }
    code.xorps(result, tmp);
 }
 template<size_t fsize>
 void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) {
    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
@ -207,6 +138,21 @@ Xbyak::Label ProcessNaN(BlockOfCode& code, Xbyak::Xmm a) {
    return end;
 }
 // Toggles the sign bit of each lane of `result` that contains a NaN, leaving other lanes untouched.
 // `tmp` is overwritten: it first receives a copy of `result`, then the per-lane NaN mask
 // (all ones where the lane is unordered with itself), then that mask reduced to the sign bit.
 template<size_t fsize>
 void PostProcessNaN(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) {
    code.movaps(tmp, result);
    if constexpr (fsize != 32) {
        // Double-precision lanes: the sign bit is bit 63.
        code.cmpunordpd(tmp, tmp);
        code.psllq(tmp, 63);
    } else {
        // Single-precision lanes: the sign bit is bit 31.
        code.cmpunordps(tmp, tmp);
        code.pslld(tmp, 31);
    }
    code.xorps(result, tmp);
 }
 // This is necessary because x86 and ARM differ in the way they return NaNs from floating point operations
 //
 // ARM behaviour:
@ -372,47 +318,7 @@ void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
    if (ctx.FPSCR_DN()) {
        ForceToDefaultNaN<fsize>(code, result);
    } else if (ctx.AccurateNaN()) {
-        PostProcessNaNs<fsize>(code, result, ctx.reg_alloc.ScratchXmm());
+        PostProcessNaN<fsize>(code, result, ctx.reg_alloc.ScratchXmm());
    }
    code.L(end);
    ctx.reg_alloc.DefineValue(inst, result);
 }
 // Option passed to FPThreeOp: when Yes (and the context reports FPSCR_FTZ), FPThreeOp runs
 // DenormalsAreZero on both input operands before emitting the operation. FPThreeOp defaults to No.
 enum class CallDenormalsAreZero {
    Yes,
    No,
 };
 template <size_t fsize, typename PreprocessFunction, typename Function>
 void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, [[maybe_unused]] PreprocessFunction preprocess, Function fn, CallDenormalsAreZero call_denormals_are_zero = CallDenormalsAreZero::No) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    Xbyak::Label end;
    Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
    Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
    Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr();
    if (ctx.FPSCR_FTZ() && call_denormals_are_zero == CallDenormalsAreZero::Yes) {
        DenormalsAreZero<fsize>(code, result, gpr_scratch);
        DenormalsAreZero<fsize>(code, operand, gpr_scratch);
    }
    if constexpr(!std::is_same_v<PreprocessFunction, std::nullptr_t>) {
        preprocess(result, operand, gpr_scratch, end);
    }
    if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
        PreProcessNaNs<fsize>(code, result, operand, end);
    }
    if constexpr (std::is_member_function_pointer_v<Function>) {
        (code.*fn)(result, operand);
    } else {
        fn(result, operand);
    }
    if (ctx.FPSCR_DN()) {
        ForceToDefaultNaN<fsize>(code, result);
    } else if (ctx.AccurateNaN()) {
        PostProcessNaNs<fsize>(code, result, operand);
    }
    code.L(end);
@ -528,8 +434,8 @@ void EmitX64::EmitFPDiv64(EmitContext& ctx, IR::Inst* inst) {
    FPThreeOp<64>(code, ctx, inst, &Xbyak::CodeGenerator::divsd);
 }
-template<size_t fsize>
+template<size_t fsize, bool is_max>
-static void EmitFPMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+static void EmitFPMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -545,14 +451,22 @@ static void EmitFPMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    FCODE(ucomis)(result, operand);
    code.jz(equal, code.T_NEAR);
    if constexpr (is_max) {
        FCODE(maxs)(result, operand);
    } else {
        FCODE(mins)(result, operand);
    }
    code.L(end);
    code.SwitchToFarCode();
    code.L(equal);
    code.jp(nan);
    if constexpr (is_max) {
        code.andps(result, operand);
    } else {
        code.orps(result, operand);
    }
    code.jmp(end);
    code.L(nan);
@ -568,196 +482,137 @@ static void EmitFPMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    ctx.reg_alloc.DefineValue(inst, result);
 }
-void EmitX64::EmitFPMax32(EmitContext& ctx, IR::Inst* inst) {
+template<size_t fsize, bool is_max>
-    EmitFPMax<32>(code, ctx, inst);
+static void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
-}
+    using FPT = mp::unsigned_integer_of_size<fsize>;
    constexpr u8 mantissa_msb_bit = static_cast<u8>(FP::FPInfo<FPT>::explicit_mantissa_width - 1);
 void EmitX64::EmitFPMax64(EmitContext& ctx, IR::Inst* inst) {
    EmitFPMax<64>(code, ctx, inst);
 }
 // Emits code for the 32-bit FPMaxNumeric operation.
 //
 // The work is delegated to FPThreeOp<32> with maxss as the core instruction and denormal
 // flushing requested (CallDenormalsAreZero::Yes). The lambda is the preprocess hook that
 // FPThreeOp invokes before its own NaN handling and before maxss; it handles the cases where
 // one input is a quiet NaN and the case where the inputs compare equal.
 void EmitX64::EmitFPMaxNumeric32(EmitContext& ctx, IR::Inst* inst) {
    FPThreeOp<32>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand, Xbyak::Reg64 scratch, Xbyak::Label& end){
        Xbyak::Label normal, normal_or_equal, result_is_result;
        // Parity flag clear means the comparison was ordered, i.e. neither input is a NaN.
        code.ucomiss(result, operand);
        code.jnp(normal_or_equal);
        // If operand == QNaN, result = result.
        // Shifting left by one discards the sign bit; a value >= 0xff800000 then has an all-ones
        // exponent with the most significant mantissa bit set, which is a quiet NaN.
        code.movd(scratch.cvt32(), operand);
        code.shl(scratch.cvt32(), 1);
        code.cmp(scratch.cvt32(), 0xff800000u);
        code.jae(result_is_result);
        // If operand == SNaN, let usual NaN code handle it.
        // (Above 0xff000000 means all-ones exponent with a non-zero mantissa, i.e. any NaN.)
        code.cmp(scratch.cvt32(), 0xff000000u);
        code.ja(normal);
        // If result == SNaN, && operand != NaN, result = result.
        // (operand is not a NaN here, so the unordered compare implies result is the NaN.)
        code.movd(scratch.cvt32(), result);
        code.shl(scratch.cvt32(), 1);
        code.cmp(scratch.cvt32(), 0xff800000u);
        code.jnae(result_is_result);
        // If result == QNaN && operand != NaN, result = operand.
        // The final value is already known, so skip the rest of FPThreeOp by jumping to end.
        code.movaps(result, operand);
        code.jmp(end, code.T_NEAR);
        // Make both inputs identical to result, then continue on the common path.
        code.L(result_is_result);
        code.movaps(operand, result);
        code.jmp(normal);
        // Ordered inputs: zero flag clear means they differ and maxss can be used directly.
        code.L(normal_or_equal);
        code.jnz(normal);
        // Inputs compare equal: AND the bit patterns together.
        // NOTE(review): presumably this makes max(+0, -0) produce +0, since the two zeros
        // compare equal but differ in the sign bit - confirm.
        code.andps(operand, result);
        code.L(normal);
    }, &Xbyak::CodeGenerator::maxss, CallDenormalsAreZero::Yes);
 }
 // Emits code for the 64-bit FPMaxNumeric operation.
 //
 // The work is delegated to FPThreeOp<64> with maxsd as the core instruction and denormal
 // flushing requested (CallDenormalsAreZero::Yes). The lambda is the preprocess hook that
 // FPThreeOp invokes before its own NaN handling and before maxsd; it handles the cases where
 // one input is a quiet NaN and the case where the inputs compare equal.
 void EmitX64::EmitFPMaxNumeric64(EmitContext& ctx, IR::Inst* inst) {
    FPThreeOp<64>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand, Xbyak::Reg64 scratch, Xbyak::Label& end){
        Xbyak::Label normal, normal_or_equal, result_is_result;
        // Parity flag clear means the comparison was ordered, i.e. neither input is a NaN.
        code.ucomisd(result, operand);
        code.jnp(normal_or_equal);
        // If operand == QNaN, result = result.
        // Shifting left by one discards the sign bit; a value >= 0xfff0'0000'0000'0000 then has an
        // all-ones exponent with the most significant mantissa bit set, which is a quiet NaN.
        code.movq(scratch, operand);
        code.shl(scratch, 1);
        code.cmp(scratch, code.MConst(qword, 0xfff0'0000'0000'0000u));
        code.jae(result_is_result);
        // If operand == SNaN, let usual NaN code handle it.
        // (Above 0xffe0'0000'0000'0000 means all-ones exponent with a non-zero mantissa, i.e. any NaN.)
        code.cmp(scratch, code.MConst(qword, 0xffe0'0000'0000'0000u));
        code.ja(normal);
        // If result == SNaN, && operand != NaN, result = result.
        // (operand is not a NaN here, so the unordered compare implies result is the NaN.)
        code.movq(scratch, result);
        code.shl(scratch, 1);
        code.cmp(scratch, code.MConst(qword, 0xfff0'0000'0000'0000u));
        code.jnae(result_is_result);
        // If result == QNaN && operand != NaN, result = operand.
        // The final value is already known, so skip the rest of FPThreeOp by jumping to end.
        code.movaps(result, operand);
        code.jmp(end, code.T_NEAR);
        // Make both inputs identical to result, then continue on the common path.
        code.L(result_is_result);
        code.movaps(operand, result);
        code.jmp(normal);
        // Ordered inputs: zero flag clear means they differ and maxsd can be used directly.
        code.L(normal_or_equal);
        code.jnz(normal);
        // Inputs compare equal: AND the bit patterns together.
        // NOTE(review): presumably this makes max(+0, -0) produce +0, since the two zeros
        // compare equal but differ in the sign bit - confirm.
        code.andps(operand, result);
        code.L(normal);
    }, &Xbyak::CodeGenerator::maxsd, CallDenormalsAreZero::Yes);
 }
 template<size_t fsize>
 static void EmitFPMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm op1 = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
+    const Xbyak::Xmm op2 = ctx.reg_alloc.UseScratchXmm(args[1]); // Result stored here!
-    const Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr();
+    Xbyak::Reg tmp = ctx.reg_alloc.ScratchGpr();
    tmp.setBit(fsize);
    const auto move_to_tmp = [&](const Xbyak::Xmm& xmm) {
        if constexpr (fsize == 32) {
            code.movd(tmp.cvt32(), xmm);
        } else {
            code.movq(tmp.cvt64(), xmm);
        }
    };
    if (ctx.FPSCR_FTZ()) {
-        DenormalsAreZero<fsize>(code, result, gpr_scratch);
+        DenormalsAreZero<fsize>(code, op1, tmp.cvt64());
-        DenormalsAreZero<fsize>(code, operand, gpr_scratch);
+        DenormalsAreZero<fsize>(code, op2, tmp.cvt64());
    }
-    Xbyak::Label equal, end, nan;
+    Xbyak::Label end, z, nan, op2_is_nan, snan, maybe_both_nan, normal;
-    FCODE(ucomis)(result, operand);
+    FCODE(ucomis)(op1, op2);
-    code.jz(equal, code.T_NEAR);
+    code.jz(z, code.T_NEAR);
-    FCODE(mins)(result, operand);
+    code.L(normal);
    if constexpr (is_max) {
        FCODE(maxs)(op2, op1);
    } else {
        FCODE(mins)(op2, op1);
    }
    code.L(end);
    code.SwitchToFarCode();
-    code.L(equal);
+    code.L(z);
    code.jp(nan);
-    code.orps(result, operand);
+    if constexpr (is_max) {
        code.andps(op2, op1);
    } else {
        code.orps(op2, op1);
    }
    code.jmp(end);
    // NaN requirements:
    // op1     op2      result
    // SNaN    anything op1
    // !SNaN   SNaN     op2
    // QNaN    !NaN     op2
    // !NaN    QNaN     op1
    // QNaN    QNaN     op1
    code.L(nan);
    FCODE(ucomis)(op1, op1);
    code.jnp(op2_is_nan);
    // op1 is NaN
    move_to_tmp(op1);
    code.bt(tmp, mantissa_msb_bit);
    code.jc(maybe_both_nan);
    if (ctx.FPSCR_DN()) {
-        code.movaps(result, code.MConst(xword, fsize == 32 ? f32_nan : f64_nan));
+        code.L(snan);
        code.movaps(op2, code.MConst(xword, FP::FPInfo<FPT>::DefaultNaN()));
        code.jmp(end);
    } else {
-        EmitProcessNaNs<fsize>(code, result, result, operand, gpr_scratch, end);
+        code.movaps(op2, op1);
        code.L(snan);
        code.orps(op2, code.MConst(xword, FP::FPInfo<FPT>::mantissa_msb));
        code.jmp(end);
    }
    code.L(maybe_both_nan);
    FCODE(ucomis)(op2, op2);
    code.jnp(end, code.T_NEAR);
    if (ctx.FPSCR_DN()) {
        code.jmp(snan);
    } else {
        move_to_tmp(op2);
        code.bt(tmp.cvt64(), mantissa_msb_bit);
        code.jnc(snan);
        code.movaps(op2, op1);
        code.jmp(end);
    }
    // op2 is NaN
    code.L(op2_is_nan);
    move_to_tmp(op2);
    code.bt(tmp, mantissa_msb_bit);
    code.jnc(snan);
    code.movaps(op2, op1);
    code.jmp(end);
    code.SwitchToNearCode();
-    ctx.reg_alloc.DefineValue(inst, result);
+    ctx.reg_alloc.DefineValue(inst, op2);
 }
 // Emitter entry point for FPMax32: forwards to the shared EmitFPMinMax template
 // instantiated for 32-bit operands with is_max = true.
 void EmitX64::EmitFPMax32(EmitContext& ctx, IR::Inst* inst) {
    EmitFPMinMax<32, true>(code, ctx, inst);
 }
 // Emitter entry point for FPMax64: forwards to the shared EmitFPMinMax template
 // instantiated for 64-bit operands with is_max = true.
 void EmitX64::EmitFPMax64(EmitContext& ctx, IR::Inst* inst) {
    EmitFPMinMax<64, true>(code, ctx, inst);
 }
 // Emitter entry point for FPMaxNumeric32: forwards to the shared EmitFPMinMaxNumeric template
 // instantiated for 32-bit operands with is_max = true.
 void EmitX64::EmitFPMaxNumeric32(EmitContext& ctx, IR::Inst* inst) {
    EmitFPMinMaxNumeric<32, true>(code, ctx, inst);
 }
 // Emitter entry point for FPMaxNumeric64: forwards to the shared EmitFPMinMaxNumeric template
 // instantiated for 64-bit operands with is_max = true.
 void EmitX64::EmitFPMaxNumeric64(EmitContext& ctx, IR::Inst* inst) {
    EmitFPMinMaxNumeric<64, true>(code, ctx, inst);
 }
 void EmitX64::EmitFPMin32(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPMin<32>(code, ctx, inst);
+    EmitFPMinMax<32, false>(code, ctx, inst);
 }
 void EmitX64::EmitFPMin64(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPMin<64>(code, ctx, inst);
+    EmitFPMinMax<64, false>(code, ctx, inst);
 }
 void EmitX64::EmitFPMinNumeric32(EmitContext& ctx, IR::Inst* inst) {
-    FPThreeOp<32>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand, Xbyak::Reg64 scratch, Xbyak::Label& end){
+    EmitFPMinMaxNumeric<32, false>(code, ctx, inst);
        Xbyak::Label normal, normal_or_equal, result_is_result;
        code.ucomiss(result, operand);
        code.jnp(normal_or_equal);
        // If operand == QNaN, result = result.
        code.movd(scratch.cvt32(), operand);
        code.shl(scratch.cvt32(), 1);
        code.cmp(scratch.cvt32(), 0xff800000u);
        code.jae(result_is_result);
        // If operand == SNaN, let usual NaN code handle it.
        code.cmp(scratch.cvt32(), 0xff000000u);
        code.ja(normal);
        // If result == SNaN, && operand != NaN, result = result.
        code.movd(scratch.cvt32(), result);
        code.shl(scratch.cvt32(), 1);
        code.cmp(scratch.cvt32(), 0xff800000u);
        code.jnae(result_is_result);
        // If result == QNaN && operand != NaN, result = operand.
        code.movaps(result, operand);
        code.jmp(end, code.T_NEAR);
        code.L(result_is_result);
        code.movaps(operand, result);
        code.jmp(normal);
        code.L(normal_or_equal);
        code.jnz(normal);
        code.orps(operand, result);
        code.L(normal);
    }, &Xbyak::CodeGenerator::minss, CallDenormalsAreZero::Yes);
 }
 void EmitX64::EmitFPMinNumeric64(EmitContext& ctx, IR::Inst* inst) {
-    FPThreeOp<64>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand, Xbyak::Reg64 scratch, Xbyak::Label& end){
+    EmitFPMinMaxNumeric<64, false>(code, ctx, inst);
        Xbyak::Label normal, normal_or_equal, result_is_result;
        code.ucomisd(result, operand);
        code.jnp(normal_or_equal);
        // If operand == QNaN, result = result.
        code.movq(scratch, operand);
        code.shl(scratch, 1);
        code.cmp(scratch, code.MConst(qword, 0xfff0'0000'0000'0000u));
        code.jae(result_is_result);
        // If operand == SNaN, let usual NaN code handle it.
        code.cmp(scratch, code.MConst(qword, 0xffe0'0000'0000'0000u));
        code.ja(normal);
        // If result == SNaN, && operand != NaN, result = result.
        code.movq(scratch, result);
        code.shl(scratch, 1);
        code.cmp(scratch, code.MConst(qword, 0xfff0'0000'0000'0000u));
        code.jnae(result_is_result);
        // If result == QNaN && operand != NaN, result = operand.
        code.movaps(result, operand);
        code.jmp(end, code.T_NEAR);
        code.L(result_is_result);
        code.movaps(operand, result);
        code.jmp(normal);
        code.L(normal_or_equal);
        code.jnz(normal);
        code.orps(operand, result);
        code.L(normal);
    }, &Xbyak::CodeGenerator::minsd, CallDenormalsAreZero::Yes);
 }
 void EmitX64::EmitFPMul32(EmitContext& ctx, IR::Inst* inst) {