// Patch summary: adds a 16-bit element variant of the FPVectorRecipStepFused IR opcode.
//
// * src/backend/x64/emit_x64_vector_floating_point.cpp
//     - In EmitRecipStepFused, the existing FMA+AVX fast path is moved inside
//       `if constexpr (fsize != 16)`. For fsize == 16 that branch is discarded at
//       compile time, so the function goes straight to
//       EmitThreeOpFallback(code, ctx, inst, fallback_fn).
//     - The fast path body itself is only re-indented: it loads a constant vector into
//       `result`, applies vfnmadd231p with the two operands, then does an unordered
//       self-compare of `result` (vcmpunordp + vptest) and jumps to the far-code
//       `fallback` label when any lane is flagged.
//     - Adds EmitX64::EmitFPVectorRecipStepFused16, which forwards to
//       EmitRecipStepFused<16>(code, ctx, inst).
// * src/frontend/ir/ir_emitter.cpp
//     - IREmitter::FPVectorRecipStepFused gains `case 16`, returning
//       Inst(Opcode::FPVectorRecipStepFused16, a, b).
// * src/frontend/ir/microinstruction.cpp
//     - Opcode::FPVectorRecipStepFused16 is added to the case list in
//       Inst::ReadsFromAndWritesToFPSRCumulativeExceptionBits.
// * src/frontend/ir/opcodes.inc
//     - Declares OPCODE(FPVectorRecipStepFused16, U128, U128, U128).
//
// NOTE(review): the diff's original line breaks have been collapsed, and some template
// argument lists look like they were lost in extraction (e.g. `GetVectorOf(code)`,
// `FCODE(...)` call sites) -- confirm against the upstream commit before applying.
diff --git a/src/backend/x64/emit_x64_vector_floating_point.cpp b/src/backend/x64/emit_x64_vector_floating_point.cpp index 0bf4f619..a9b85a7f 100644 --- a/src/backend/x64/emit_x64_vector_floating_point.cpp +++ b/src/backend/x64/emit_x64_vector_floating_point.cpp @@ -1110,41 +1110,47 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in } }; - if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if constexpr (fsize != 16) { + if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); - const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); - const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); - Xbyak::Label end, fallback; + Xbyak::Label end, fallback; - code.movaps(result, GetVectorOf(code)); - FCODE(vfnmadd231p)(result, operand1, operand2); + code.movaps(result, GetVectorOf(code)); + FCODE(vfnmadd231p)(result, operand1, operand2); - FCODE(vcmpunordp)(tmp, result, result); - code.vptest(tmp, tmp); - code.jnz(fallback, code.T_NEAR); - code.L(end); + FCODE(vcmpunordp)(tmp, result, result); + code.vptest(tmp, tmp); + code.jnz(fallback, code.T_NEAR); + code.L(end); - code.SwitchToFarCode(); - code.L(fallback); - code.sub(rsp, 8); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); - EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, operand1, operand2, fallback_fn); - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); - code.add(rsp, 8); 
- code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); + code.SwitchToFarCode(); + code.L(fallback); + code.sub(rsp, 8); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, operand1, operand2, fallback_fn); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + code.add(rsp, 8); + code.jmp(end, code.T_NEAR); + code.SwitchToNearCode(); - ctx.reg_alloc.DefineValue(inst, result); - return; + ctx.reg_alloc.DefineValue(inst, result); + return; + } } EmitThreeOpFallback(code, ctx, inst, fallback_fn); } +void EmitX64::EmitFPVectorRecipStepFused16(EmitContext& ctx, IR::Inst* inst) { + EmitRecipStepFused<16>(code, ctx, inst); +} + void EmitX64::EmitFPVectorRecipStepFused32(EmitContext& ctx, IR::Inst* inst) { EmitRecipStepFused<32>(code, ctx, inst); } diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp index 42032457..ea2d509a 100644 --- a/src/frontend/ir/ir_emitter.cpp +++ b/src/frontend/ir/ir_emitter.cpp @@ -2277,6 +2277,8 @@ U128 IREmitter::FPVectorRecipEstimate(size_t esize, const U128& a) { U128 IREmitter::FPVectorRecipStepFused(size_t esize, const U128& a, const U128& b) { switch (esize) { + case 16: + return Inst(Opcode::FPVectorRecipStepFused16, a, b); case 32: return Inst(Opcode::FPVectorRecipStepFused32, a, b); case 64: diff --git a/src/frontend/ir/microinstruction.cpp b/src/frontend/ir/microinstruction.cpp index d0ad05b8..873727a6 100644 --- a/src/frontend/ir/microinstruction.cpp +++ b/src/frontend/ir/microinstruction.cpp @@ -338,6 +338,7 @@ bool Inst::ReadsFromAndWritesToFPSRCumulativeExceptionBits() const { case Opcode::FPVectorPairedAdd64: case Opcode::FPVectorRecipEstimate32: case Opcode::FPVectorRecipEstimate64: + case Opcode::FPVectorRecipStepFused16: case Opcode::FPVectorRecipStepFused32: case Opcode::FPVectorRecipStepFused64: case Opcode::FPVectorRoundInt16: diff --git a/src/frontend/ir/opcodes.inc 
b/src/frontend/ir/opcodes.inc index d266190b..dcfcf1a8 100644 --- a/src/frontend/ir/opcodes.inc +++ b/src/frontend/ir/opcodes.inc @@ -573,6 +573,7 @@ OPCODE(FPVectorPairedAddLower32, U128, U128 OPCODE(FPVectorPairedAddLower64, U128, U128, U128 ) OPCODE(FPVectorRecipEstimate32, U128, U128 ) OPCODE(FPVectorRecipEstimate64, U128, U128 ) +OPCODE(FPVectorRecipStepFused16, U128, U128, U128 ) OPCODE(FPVectorRecipStepFused32, U128, U128, U128 ) OPCODE(FPVectorRecipStepFused64, U128, U128, U128 ) OPCODE(FPVectorRoundInt16, U128, U128, U8, U1 )