emit_x64_vector_floating_point: FPVectorMulAdd: Minimize full fallback
parent ceea80dd59 · commit adac93f12e
1 changed file with 59 additions and 10 deletions
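In outline: the fast path now always runs the host FMA, a cheap vector test flags lanes that might disagree with the ARM reference, and only those lanes are recomputed by the fallback, which receives the fast-path result instead of starting over. A rough host-side model in plain C++ (illustrative only; a scalar loop stands in for the emitted SIMD, and std::isnan stands in for the real suspicion tests):

    #include <array>
    #include <cmath>
    #include <cstddef>

    using Vec4 = std::array<float, 4>;

    // Defined later; models EmitFPVectorMulAddFallback from this commit.
    void correct_suspect_lanes(Vec4& result, const Vec4& addend,
                               const Vec4& op1, const Vec4& op2);

    Vec4 muladd(const Vec4& addend, const Vec4& op1, const Vec4& op2) {
        Vec4 result{};
        bool any_suspect = false;
        for (std::size_t i = 0; i < result.size(); ++i) {
            result[i] = std::fma(op1[i], op2[i], addend[i]);  // fast path: host FMA
            any_suspect |= std::isnan(result[i]);             // cheap whole-vector test
        }
        if (any_suspect) {
            // Out-of-line fallback: sees the fast-path result and fixes
            // only the lanes that need it.
            correct_suspect_lanes(result, addend, op1, op2);
        }
        return result;
    }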
@@ -381,7 +381,7 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
     ctx.reg_alloc.DefineValue(inst, result);
 }
 
-enum CheckInputNaN {
+enum class CheckInputNaN {
     Yes,
     No,
 };
@@ -540,7 +540,12 @@ void EmitThreeOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, La
     ctx.reg_alloc.DefineValue(inst, result);
 }
 
-template<typename Lambda>
+enum class LoadPreviousResult {
+    Yes,
+    No,
+};
+
+template<LoadPreviousResult load_previous_result = LoadPreviousResult::No, typename Lambda>
 void EmitFourOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, Xbyak::Xmm arg2, Xbyak::Xmm arg3, Lambda lambda, bool fpcr_controlled) {
     const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
 
@@ -565,6 +570,9 @@ void EmitFourOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbya
     code.lea(code.ABI_PARAM6, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
 #endif
 
+    if constexpr (load_previous_result == LoadPreviousResult::Yes) {
+        code.movaps(xword[code.ABI_PARAM1], result);
+    }
     code.movaps(xword[code.ABI_PARAM2], arg1);
     code.movaps(xword[code.ABI_PARAM3], arg2);
     code.movaps(xword[code.ABI_PARAM4], arg3);
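The store through ABI_PARAM1 above is what makes LoadPreviousResult::Yes observable from C++: ABI_PARAM1 points at the result buffer, so the fallback lambda's first parameter arrives pre-filled with the fast-path FMA output rather than scratch. Continuing the sketch from the top (hypothetical fallback, float lanes assumed):

    #include <array>
    #include <cmath>
    #include <cstddef>

    using Vec4 = std::array<float, 4>;

    // Models a lambda called via EmitFourOpFallbackWithoutRegAlloc with
    // LoadPreviousResult::Yes: 'result' holds the fast-path output on entry,
    // so lanes that pass the check keep their already-correct values.
    void correct_suspect_lanes(Vec4& result, const Vec4& addend,
                               const Vec4& op1, const Vec4& op2) {
        for (std::size_t i = 0; i < result.size(); ++i) {
            if (!std::isnan(result[i]))
                continue;  // fast-path value is fine; leave it untouched
            // Stand-in for the accurate FP::FPMulAdd recomputation.
            result[i] = std::fma(op1[i], op2[i], addend[i]);
        }
    }

With the default LoadPreviousResult::No, nothing is stored through ABI_PARAM1 and the callee must write every lane, which is the contract all existing callers keep.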
@@ -1290,6 +1298,31 @@ void EmitX64::EmitFPVectorMul64(EmitContext& ctx, IR::Inst* inst) {
     EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulpd);
 }
 
+template<typename FPT, bool needs_rounding_correction, bool needs_nan_correction>
+static void EmitFPVectorMulAddFallback(VectorArray<FPT>& result, const VectorArray<FPT>& addend, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, [[maybe_unused]] FP::FPSR& fpsr) {
+    for (size_t i = 0; i < result.size(); i++) {
+        if constexpr (needs_rounding_correction) {
+            constexpr FPT non_sign_mask = FP::FPInfo<FPT>::exponent_mask | FP::FPInfo<FPT>::mantissa_mask;
+            constexpr FPT smallest_normal_number = FP::FPValue<FPT, false, FP::FPInfo<FPT>::exponent_min, 1>();
+            if ((result[i] & non_sign_mask) == smallest_normal_number) {
+                result[i] = FP::FPMulAdd<FPT>(addend[i], op1[i], op2[i], fpcr, fpsr);
+                continue;
+            }
+        }
+        if constexpr (needs_nan_correction) {
+            if (FP::IsNaN(result[i])) {
+                if (FP::IsQNaN(addend[i]) && ((FP::IsZero(op1[i], fpcr) && FP::IsInf(op2[i])) || (FP::IsInf(op1[i]) && FP::IsZero(op2[i], fpcr)))) {
+                    result[i] = FP::FPInfo<FPT>::DefaultNaN();
+                } else if (auto r = FP::ProcessNaNs(addend[i], op1[i], op2[i])) {
+                    result[i] = *r;
+                } else {
+                    result[i] = FP::FPInfo<FPT>::DefaultNaN();
+                }
+            }
+        }
+    }
+}
+
 template<size_t fsize>
 void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     using FPT = mcl::unsigned_integer_of_size<fsize>;
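The smallest-normal test in the loop above is exact rather than a range check because (on the usual reading) x86 detects tininess after rounding while ARM detects it before rounding: under flush-to-zero the two can only disagree when the host FMA result lands on exactly ± the smallest normal, so only those lanes need the accurate FP::FPMulAdd. A self-contained check of the mask arithmetic for binary32 (bit patterns assumed; names mirror the diff):

    #include <cstdint>

    constexpr std::uint32_t exponent_mask = 0x7F800000;
    constexpr std::uint32_t mantissa_mask = 0x007FFFFF;
    constexpr std::uint32_t non_sign_mask = exponent_mask | mantissa_mask;
    constexpr std::uint32_t smallest_normal = 0x00800000;  // 1.0 * 2^-126

    // A lane is suspect only when its magnitude is exactly the smallest normal.
    constexpr bool needs_slow_path(std::uint32_t bits) {
        return (bits & non_sign_mask) == smallest_normal;
    }

    static_assert(needs_slow_path(0x00800000), "+2^-126 is suspect");
    static_assert(needs_slow_path(0x80800000), "-2^-126 is suspect");
    static_assert(!needs_slow_path(0x00800001), "anything larger is not");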
@@ -1301,9 +1334,12 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     };
 
     if constexpr (fsize != 16) {
-        if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+        const bool fpcr_controlled = inst->GetArg(3).GetU1();
+        const bool needs_rounding_correction = ctx.FPCR(fpcr_controlled).FZ();
+        const bool needs_nan_correction = !(ctx.FPCR(fpcr_controlled).DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN));
+
+        if (code.HasHostFeature(HostFeature::FMA) && !needs_rounding_correction && !needs_nan_correction) {
             auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-            const bool fpcr_controlled = args[3].GetImmediateU1();
 
             const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
             const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
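The two flags introduced here decide which correction machinery gets emitted at all; when both are false the plain FMA path needs no fallback, which is the commit's fully minimized case. A hedged restatement of the derivation (FpcrView is a stand-in for dynarmic's FP::FPCR accessors, not a type from the diff):

    // Stand-in for the relevant FPCR bits.
    struct FpcrView {
        bool FZ;  // flush-to-zero
        bool DN;  // default NaN
    };

    constexpr bool NeedsRoundingCorrection(FpcrView fpcr) {
        // Host FMA under FZ can disagree with the reference at the smallest normal.
        return fpcr.FZ;
    }

    constexpr bool NeedsNanCorrection(FpcrView fpcr, bool unsafe_inaccurate_nan) {
        // With DN set (or the unsafe optimization enabled), every NaN result is
        // forced to the default NaN afterwards, so exact NaN propagation no
        // longer matters; the next hunk adds that ForceToDefaultNaN step.
        return !(fpcr.DN || unsafe_inaccurate_nan);
    }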
@@ -1311,6 +1347,7 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
 
             MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
                 FCODE(vfmadd231p)(result, xmm_b, xmm_c);
+                ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
             });
 
             ctx.reg_alloc.DefineValue(inst, result);
@@ -1319,12 +1356,11 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
 
         if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
             auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-            const bool fpcr_controlled = args[3].GetImmediateU1();
 
-            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
             const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
             const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
             const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
             const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
 
             SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
@@ -1333,19 +1369,32 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
                 code.movaps(result, xmm_a);
                 FCODE(vfmadd231p)(result, xmm_b, xmm_c);
 
-                code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
-                code.andnps(tmp, result);
-                FCODE(vcmpeq_uqp)(tmp, tmp, GetSmallestNormalVector<fsize>(code));
+                if (needs_rounding_correction && needs_nan_correction) {
+                    code.vandps(tmp, result, GetNonSignMaskVector<fsize>(code));
+                    FCODE(vcmpeq_uqp)(tmp, tmp, GetSmallestNormalVector<fsize>(code));
+                } else if (needs_rounding_correction) {
+                    code.vandps(tmp, result, GetNonSignMaskVector<fsize>(code));
+                    ICODE(vpcmpeq)(tmp, tmp, GetSmallestNormalVector<fsize>(code));
+                } else if (needs_nan_correction) {
+                    FCODE(vcmpunordp)(tmp, result, result);
+                }
                 code.vptest(tmp, tmp);
                 code.jnz(*fallback, code.T_NEAR);
                 code.L(*end);
+                ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
             });
 
             ctx.deferred_emits.emplace_back([=, &code, &ctx] {
                 code.L(*fallback);
                 code.sub(rsp, 8);
                 ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
-                EmitFourOpFallbackWithoutRegAlloc(code, ctx, result, xmm_a, xmm_b, xmm_c, fallback_fn, fpcr_controlled);
+                if (needs_rounding_correction && needs_nan_correction) {
+                    EmitFourOpFallbackWithoutRegAlloc<LoadPreviousResult::Yes>(code, ctx, result, xmm_a, xmm_b, xmm_c, EmitFPVectorMulAddFallback<FPT, true, true>, fpcr_controlled);
+                } else if (needs_rounding_correction) {
+                    EmitFourOpFallbackWithoutRegAlloc<LoadPreviousResult::Yes>(code, ctx, result, xmm_a, xmm_b, xmm_c, EmitFPVectorMulAddFallback<FPT, true, false>, fpcr_controlled);
+                } else if (needs_nan_correction) {
+                    EmitFourOpFallbackWithoutRegAlloc<LoadPreviousResult::Yes>(code, ctx, result, xmm_a, xmm_b, xmm_c, EmitFPVectorMulAddFallback<FPT, false, true>, fpcr_controlled);
+                }
                 ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
                 code.add(rsp, 8);
                 code.jmp(*end, code.T_NEAR);
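The QNaN-addend special case in EmitFPVectorMulAddFallback exists because host and guest disagree on fma(0, inf, QNaN): typical x86 hosts propagate the quiet-NaN addend without signalling, whereas the ARM reference treats the 0 * inf product as InvalidOp and returns the default NaN. A small demonstration of the host half of that claim (host behaviour assumed; ARM semantics as stated in the diff):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float addend = std::nanf("");  // quiet NaN
        const float op1 = 0.0f;
        const float op2 = INFINITY;
        // On typical x86 hosts the FMA propagates the addend NaN...
        const float host = std::fma(op1, op2, addend);
        std::printf("host fma(0, inf, qnan) -> %f\n", host);  // prints nan
        // ...while FP::FPMulAdd yields the default NaN for this operand
        // combination, so the fallback overwrites such lanes.
        return 0;
    }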