x64 backend: add Unsafe_UnfuseFMA paths to the fused reciprocal-step emitters

Each hunk below adds a branch, taken when
ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA) is true, that is placed
after an existing early-return block in the emitter. The branch uses a separate
multiply and a separate subtract instead of a fused operation:

  * operand1 is taken as a scratch register and overwritten with
    operand1 * operand2;
  * result is a fresh scratch register that is first loaded with a constant
    and then has the product subtracted from it;
  * the RSqrt variants additionally multiply result by a second constant.

Emitters touched: EmitFPRecipStepFused and EmitFPRSqrtStepFused (scalar, muls /
subs), EmitRecipStepFused and EmitRSqrtStepFused (vector, mulp / subp).

Fix relative to the previous revision of this patch: the scalar
EmitFPRSqrtStepFused branch ended with DefineValue(inst, operand1). At that
point operand1 holds only the intermediate product, while the finished value
is in result, so the computed result was discarded. The branch now defines
result, matching the other three branches.

NOTE(review): FP::FPValue() and GetVectorOf(code) appear without template
argument lists in this text; they look like they were stripped in transit.
Confirm the constants against the upstream change before applying.
NOTE(review): indentation inside the hunks was reconstructed assuming 4-space
nesting because the original whitespace was lost; confirm it matches the target
files (or apply with whitespace-insensitive matching).
NOTE(review): the post-image blob hash in the first "index" line predates the
DefineValue fix and no longer describes the patched file.

diff --git a/src/backend/x64/emit_x64_floating_point.cpp b/src/backend/x64/emit_x64_floating_point.cpp
index 67142359..b63bcbba 100644
--- a/src/backend/x64/emit_x64_floating_point.cpp
+++ b/src/backend/x64/emit_x64_floating_point.cpp
@@ -845,6 +845,21 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
             ctx.reg_alloc.DefineValue(inst, result);
             return;
         }
+
+        if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
+            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            code.movaps(result, code.MConst(xword, FP::FPValue()));
+            FCODE(muls)(operand1, operand2);
+            FCODE(subs)(result, operand1);
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
     }
 
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -1030,6 +1045,22 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
             ctx.reg_alloc.DefineValue(inst, result);
             return;
         }
+
+        if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
+            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            code.movaps(result, code.MConst(xword, FP::FPValue()));
+            FCODE(muls)(operand1, operand2);
+            FCODE(subs)(result, operand1);
+            FCODE(muls)(result, code.MConst(xword, FP::FPValue()));
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
     }
 
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
diff --git a/src/backend/x64/emit_x64_vector_floating_point.cpp b/src/backend/x64/emit_x64_vector_floating_point.cpp
index d9b4f0a0..20b1c808 100644
--- a/src/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend/x64/emit_x64_vector_floating_point.cpp
@@ -1267,6 +1267,21 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
             ctx.reg_alloc.DefineValue(inst, result);
             return;
         }
+
+        if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
+            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            code.movaps(result, GetVectorOf(code));
+            FCODE(mulp)(operand1, operand2);
+            FCODE(subp)(result, operand1);
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
     }
 
     EmitThreeOpFallback(code, ctx, inst, fallback_fn);
@@ -1453,6 +1468,22 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
             ctx.reg_alloc.DefineValue(inst, result);
             return;
         }
+
+        if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
+            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            code.movaps(result, GetVectorOf(code));
+            FCODE(mulp)(operand1, operand2);
+            FCODE(subp)(result, operand1);
+            FCODE(mulp)(result, GetVectorOf(code));
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
     }
 
     EmitThreeOpFallback(code, ctx, inst, fallback_fn);