ir/frontend: Add half-precision opcode for FPVectorMulAdd

2019-04-13 01:42:35 -04:00 · 2019-04-13 01:42:35 -04:00 · ec6b3ae084
commit ec6b3ae084
parent 5f74d25bf7
4 changed files with 37 additions and 27 deletions
--- a/src/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend/x64/emit_x64_vector_floating_point.cpp
@ -908,44 +908,50 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
        }
    };

-    if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
-        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    if constexpr (fsize != 16) {
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);

-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
-        const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
-        const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+            const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
+            const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+            const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
+            const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();

-        Xbyak::Label end, fallback;
+            Xbyak::Label end, fallback;

-        code.movaps(result, xmm_a);
-        FCODE(vfmadd231p)(result, xmm_b, xmm_c);
+            code.movaps(result, xmm_a);
+            FCODE(vfmadd231p)(result, xmm_b, xmm_c);

-        code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
-        code.andnps(tmp, result);
-        FCODE(vcmpeq_uqp)(tmp, tmp, GetSmallestNormalVector<fsize>(code));
-        code.vptest(tmp, tmp);
-        code.jnz(fallback, code.T_NEAR);
-        code.L(end);
+            code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
+            code.andnps(tmp, result);
+            FCODE(vcmpeq_uqp)(tmp, tmp, GetSmallestNormalVector<fsize>(code));
+            code.vptest(tmp, tmp);
+            code.jnz(fallback, code.T_NEAR);
+            code.L(end);

-        code.SwitchToFarCode();
-        code.L(fallback);
-        code.sub(rsp, 8);
-        ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
-        EmitFourOpFallbackWithoutRegAlloc(code, ctx, result, xmm_a, xmm_b, xmm_c, fallback_fn);
-        ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
-        code.add(rsp, 8);
-        code.jmp(end, code.T_NEAR);
-        code.SwitchToNearCode();
+            code.SwitchToFarCode();
+            code.L(fallback);
+            code.sub(rsp, 8);
+            ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+            EmitFourOpFallbackWithoutRegAlloc(code, ctx, result, xmm_a, xmm_b, xmm_c, fallback_fn);
+            ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+            code.add(rsp, 8);
+            code.jmp(end, code.T_NEAR);
+            code.SwitchToNearCode();

-        ctx.reg_alloc.DefineValue(inst, result);
-        return;
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
    }

    EmitFourOpFallback(code, ctx, inst, fallback_fn);
 }

+void EmitX64::EmitFPVectorMulAdd16(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPVectorMulAdd<16>(code, ctx, inst);
+}
+
 void EmitX64::EmitFPVectorMulAdd32(EmitContext& ctx, IR::Inst* inst) {
    EmitFPVectorMulAdd<32>(code, ctx, inst);
 }
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@ -2173,6 +2173,8 @@ U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b) {

 U128 IREmitter::FPVectorMulAdd(size_t esize, const U128& a, const U128& b, const U128& c) {
    switch (esize) {
+    case 16:
+        return Inst<U128>(Opcode::FPVectorMulAdd16, a, b, c);
    case 32:
        return Inst<U128>(Opcode::FPVectorMulAdd32, a, b, c);
    case 64:
--- a/src/frontend/ir/microinstruction.cpp
+++ b/src/frontend/ir/microinstruction.cpp
@ -327,6 +327,7 @@ bool Inst::ReadsFromAndWritesToFPSRCumulativeExceptionBits() const {
    case Opcode::FPVectorGreaterEqual64:
    case Opcode::FPVectorMul32:
    case Opcode::FPVectorMul64:
+    case Opcode::FPVectorMulAdd16:
    case Opcode::FPVectorMulAdd32:
    case Opcode::FPVectorMulAdd64:
    case Opcode::FPVectorPairedAddLower32:
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@ -553,6 +553,7 @@ OPCODE(FPVectorMin32,                                       U128,           U128
 OPCODE(FPVectorMin64,                                       U128,           U128,           U128                                            )
 OPCODE(FPVectorMul32,                                       U128,           U128,           U128                                            )
 OPCODE(FPVectorMul64,                                       U128,           U128,           U128                                            )
+OPCODE(FPVectorMulAdd16,                                    U128,           U128,           U128,           U128                            )
 OPCODE(FPVectorMulAdd32,                                    U128,           U128,           U128,           U128                            )
 OPCODE(FPVectorMulAdd64,                                    U128,           U128,           U128,           U128                            )
 OPCODE(FPVectorMulX32,                                      U128,           U128,           U128                                            )