diff --git a/src/backend/x64/emit_x64_floating_point.cpp b/src/backend/x64/emit_x64_floating_point.cpp index f64a8b7e..32ed3734 100644 --- a/src/backend/x64/emit_x64_floating_point.cpp +++ b/src/backend/x64/emit_x64_floating_point.cpp @@ -283,7 +283,7 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) auto args = ctx.reg_alloc.GetArgumentInfo(inst); - if (ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + if (ctx.FPCR().DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]); @@ -293,22 +293,10 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) fn(result, operand); } - ctx.reg_alloc.DefineValue(inst, result); - return; - } - - if (ctx.FPCR().DN()) { - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); - const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]); - - if constexpr (std::is_member_function_pointer_v) { - (code.*fn)(result, operand); - } else { - fn(result, operand); + if (!ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + ForceToDefaultNaN(code, result); } - ForceToDefaultNaN(code, result); - ctx.reg_alloc.DefineValue(inst, result); return; } @@ -605,9 +593,9 @@ template static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { using FPT = mp::unsigned_integer_of_size; - if constexpr (fsize != 16) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if constexpr (fsize != 16) { if (code.HasFMA() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); @@ -680,7 +668,6 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { } } - auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(inst, args[0], args[1], args[2]); code.mov(code.ABI_PARAM4.cvt32(), ctx.FPCR().Value()); #ifdef _WIN32 @@ -834,10 +821,10 @@ template static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { using FPT = mp::unsigned_integer_of_size; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if constexpr (fsize != 16) { if (code.HasFMA() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - Xbyak::Label end, fallback; const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); @@ -852,8 +839,6 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* } if (code.HasFMA()) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - Xbyak::Label end, fallback; const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); @@ -888,8 +873,6 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* } if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); @@ -903,7 +886,6 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* } } - auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(inst, args[0], args[1]); code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value()); code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); @@ -1038,10 +1020,10 @@ template static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { using FPT = mp::unsigned_integer_of_size; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if constexpr (fsize != 16) { if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); @@ -1055,8 +1037,6 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* } if (code.HasFMA() && code.HasAVX()) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - Xbyak::Label end, fallback; const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); @@ -1103,8 +1083,6 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* } if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); @@ -1119,7 +1097,6 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* } } - auto args = ctx.reg_alloc.GetArgumentInfo(inst); ctx.reg_alloc.HostCall(inst, args[0], args[1]); code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value()); code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); diff --git a/src/backend/x64/emit_x64_vector_floating_point.cpp b/src/backend/x64/emit_x64_vector_floating_point.cpp index d09e7412..c7585436 100644 --- a/src/backend/x64/emit_x64_vector_floating_point.cpp +++ b/src/backend/x64/emit_x64_vector_floating_point.cpp @@ -989,10 +989,10 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { }; if constexpr (fsize != 16) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const bool fpcr_controlled = args[3].GetImmediateU1(); - if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[3].GetImmediateU1(); + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]); @@ -1006,6 +1006,9 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { } if (code.HasFMA() && code.HasAVX()) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[3].GetImmediateU1(); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); @@ -1041,6 +1044,8 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { } if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]); const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]); @@ -1247,10 +1252,10 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in }; if constexpr (fsize != 16) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const bool fpcr_controlled = args[2].GetImmediateU1(); - if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); @@ -1265,6 +1270,9 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in } if (code.HasFMA() && code.HasAVX()) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); @@ -1297,6 +1305,8 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in } if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); @@ -1454,10 +1464,10 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in }; if constexpr (fsize != 16) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const bool fpcr_controlled = args[2].GetImmediateU1(); - if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); @@ -1473,6 +1483,9 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in } if (code.HasFMA() && code.HasAVX()) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); @@ -1511,6 +1524,8 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in } if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();