diff --git a/src/backend/x64/emit_x64_vector.cpp b/src/backend/x64/emit_x64_vector.cpp
index 27839e83..c835d3de 100644
--- a/src/backend/x64/emit_x64_vector.cpp
+++ b/src/backend/x64/emit_x64_vector.cpp
@@ -3262,6 +3262,85 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiply32(EmitContext& ctx, IR::
     }
 }
 
+void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+    code.punpcklwd(x, x);
+    code.punpcklwd(y, y);
+    code.pmaddwd(x, y);
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+        code.vpcmpeqd(y, x, code.MConst(xword, 0x8000000080000000, 0x8000000080000000));
+        code.vpxor(x, x, y);
+    } else {
+        code.movdqa(y, code.MConst(xword, 0x8000000080000000, 0x8000000080000000));
+        code.pcmpeqd(y, x);
+        code.pxor(x, y);
+    }
+
+    const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+    code.pmovmskb(bit, y);
+    code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+    ctx.reg_alloc.DefineValue(inst, x);
+}
+
+void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+        code.vpmovsxdq(x, x);
+        code.vpmovsxdq(y, y);
+        code.vpmuldq(x, x, y);
+        code.vpaddq(x, x, x);
+    } else {
+        const Xbyak::Reg64 a = ctx.reg_alloc.ScratchGpr();
+        const Xbyak::Reg64 b = ctx.reg_alloc.ScratchGpr();
+        const Xbyak::Reg64 c = ctx.reg_alloc.ScratchGpr();
+        const Xbyak::Reg64 d = ctx.reg_alloc.ScratchGpr();
+
+        code.movq(c, x);
+        code.movq(d, y);
+        code.movsxd(a, c.cvt32());
+        code.movsxd(b, d.cvt32());
+        code.sar(c, 32);
+        code.sar(d, 32);
+        code.imul(a, b);
+        code.imul(c, d);
+
+        code.movq(x, a);
+        code.movq(y, c);
+        code.punpcklqdq(x, y);
+        code.paddq(x, x);
+
+        ctx.reg_alloc.Release(a);
+        ctx.reg_alloc.Release(b);
+        ctx.reg_alloc.Release(c);
+        ctx.reg_alloc.Release(d);
+    }
+
+    const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+        code.vpcmpeqq(y, x, code.MConst(xword, 0x8000000000000000, 0x8000000000000000));
+        code.vpxor(x, x, y);
+        code.vpmovmskb(bit, y);
+    } else {
+        code.movdqa(y, code.MConst(xword, 0x8000000000000000, 0x8000000000000000));
+        code.pcmpeqd(y, x);
+        code.shufps(y, y, 0b11110101);
+        code.pxor(x, y);
+        code.pmovmskb(bit, y);
+    }
+    code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+    ctx.reg_alloc.DefineValue(inst, x);
+}
+
 static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(args[0]);
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 7b73efa2..e81b528b 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1594,6 +1594,17 @@ UpperAndLower IREmitter::VectorSignedSaturatedDoublingMultiply(size_t esize, con
     };
 }
 
+U128 IREmitter::VectorSignedSaturatedDoublingMultiplyLong(size_t esize, const U128& a, const U128& b) {
+    switch (esize) {
+    case 16:
+        return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyLong16, a, b);
+    case 32:
+        return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyLong32, a, b);
+    }
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a) {
     switch (original_esize) {
     case 16:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 9385f29c..016cda79 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -274,6 +274,7 @@ public:
     U128 VectorSignedSaturatedAbs(size_t esize, const U128& a);
     U128 VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b);
     UpperAndLower VectorSignedSaturatedDoublingMultiply(size_t esize, const U128& a, const U128& b);
+    U128 VectorSignedSaturatedDoublingMultiplyLong(size_t esize, const U128& a, const U128& b);
     U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a);
     U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a);
     U128 VectorSignedSaturatedNeg(size_t esize, const U128& a);
diff --git a/src/frontend/ir/microinstruction.cpp b/src/frontend/ir/microinstruction.cpp
index 299be55b..5a91ab92 100644
--- a/src/frontend/ir/microinstruction.cpp
+++ b/src/frontend/ir/microinstruction.cpp
@@ -355,6 +355,10 @@ bool Inst::WritesToFPSRCumulativeSaturationBit() const {
     case Opcode::VectorSignedSaturatedAccumulateUnsigned16:
     case Opcode::VectorSignedSaturatedAccumulateUnsigned32:
     case Opcode::VectorSignedSaturatedAccumulateUnsigned64:
+    case Opcode::VectorSignedSaturatedDoublingMultiply16:
+    case Opcode::VectorSignedSaturatedDoublingMultiply32:
+    case Opcode::VectorSignedSaturatedDoublingMultiplyLong16:
+    case Opcode::VectorSignedSaturatedDoublingMultiplyLong32:
     case Opcode::VectorSignedSaturatedNarrowToSigned16:
     case Opcode::VectorSignedSaturatedNarrowToSigned32:
     case Opcode::VectorSignedSaturatedNarrowToSigned64:
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 03279c31..504f5bd7 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -410,6 +410,8 @@ OPCODE(VectorSignedSaturatedAccumulateUnsigned32, U128, U128, U128 )
 OPCODE(VectorSignedSaturatedAccumulateUnsigned64, U128, U128, U128 )
 OPCODE(VectorSignedSaturatedDoublingMultiply16, Void, U128, U128 )
 OPCODE(VectorSignedSaturatedDoublingMultiply32, Void, U128, U128 )
+OPCODE(VectorSignedSaturatedDoublingMultiplyLong16, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedDoublingMultiplyLong32, U128, U128, U128 )
 OPCODE(VectorSignedSaturatedNarrowToSigned16, U128, U128 )
 OPCODE(VectorSignedSaturatedNarrowToSigned32, U128, U128 )
 OPCODE(VectorSignedSaturatedNarrowToSigned64, U128, U128 )