diff --git a/src/backend_x64/emit_x64_vector.cpp b/src/backend_x64/emit_x64_vector.cpp
index 953c8e84..f561e985 100644
--- a/src/backend_x64/emit_x64_vector.cpp
+++ b/src/backend_x64/emit_x64_vector.cpp
@@ -1735,18 +1735,65 @@ void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(inst, a);
 }
 
+void EmitX64::EmitVectorPairedAddSignedWiden8(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+
+    code.movdqa(c, a);
+    code.psllw(a, 8);
+    code.psraw(c, 8);
+    code.psraw(a, 8);
+    code.paddw(a, c);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorPairedAddSignedWiden16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+
+    code.movdqa(c, a);
+    code.pslld(a, 16);
+    code.psrad(c, 16);
+    code.psrad(a, 16);
+    code.paddd(a, c);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
 void EmitX64::EmitVectorPairedAddSignedWiden32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
-    Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
 
-    code.movdqa(c, a);
-    code.psllq(a, 32);
-    code.psraq(c, 32);
-    code.psraq(a, 32);
-    code.paddq(a, c);
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) {
+        code.vpsraq(c, a, 32);
+        code.vpsllq(a, a, 32);
+        code.vpsraq(a, a, 32);
+        code.vpaddq(a, a, c);
+    } else {
+        const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+        code.movdqa(c, a);
+        code.psllq(a, 32);
+        code.movdqa(tmp1, code.MConst(xword, 0x80000000'00000000, 0x80000000'00000000));
+        code.movdqa(tmp2, tmp1);
+        code.pand(tmp1, a);
+        code.pand(tmp2, c);
+        code.psrlq(a, 32);
+        code.psrlq(c, 32);
+        code.psrad(tmp1, 31);
+        code.psrad(tmp2, 31);
+        code.por(a, tmp1);
+        code.por(c, tmp2);
+        code.paddq(a, c);
+    }
 
     ctx.reg_alloc.DefineValue(inst, a);
 }
 
diff --git a/src/frontend/A64/decoder/a64.inc b/src/frontend/A64/decoder/a64.inc
index 8fd2b49f..00c51443 100644
--- a/src/frontend/A64/decoder/a64.inc
+++ b/src/frontend/A64/decoder/a64.inc
@@ -568,7 +568,7 @@ INST(INS_elt,                "INS (element)",                             "01101
 // Data Processing - FP and SIMD - SIMD Two-register misc
 INST(REV64_asimd,            "REV64",                                     "0Q001110zz100000000010nnnnnddddd")
 INST(REV16_asimd,            "REV16 (vector)",                            "0Q001110zz100000000110nnnnnddddd")
-//INST(SADDLP,               "SADDLP",                                    "0Q001110zz100000001010nnnnnddddd")
+INST(SADDLP,                 "SADDLP",                                    "0Q001110zz100000001010nnnnnddddd")
 //INST(SUQADD_2,             "SUQADD",                                    "0Q001110zz100000001110nnnnnddddd")
 //INST(CLS_asimd,            "CLS (vector)",                              "0Q001110zz100000010010nnnnnddddd")
 INST(CNT,                    "CNT",                                       "0Q001110zz100000010110nnnnnddddd")
diff --git a/src/frontend/A64/translate/impl/simd_two_register_misc.cpp b/src/frontend/A64/translate/impl/simd_two_register_misc.cpp
index 8e1f6678..fb825132 100644
--- a/src/frontend/A64/translate/impl/simd_two_register_misc.cpp
+++ b/src/frontend/A64/translate/impl/simd_two_register_misc.cpp
@@ -384,6 +384,25 @@ bool TranslatorVisitor::UADDLP(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
     return true;
 }
 
+bool TranslatorVisitor::SADDLP(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+    if (size == 0b11) {
+        return ReservedValue();
+    }
+
+    const size_t esize = 8 << size.ZeroExtend();
+    const size_t datasize = Q ? 128 : 64;
+
+    const IR::U128 operand = V(datasize, Vn);
+    IR::U128 result = ir.VectorPairedAddSignedWiden(esize, operand);
+
+    if (datasize == 64) {
+        result = ir.VectorZeroUpper(result);
+    }
+
+    V(datasize, Vd, result);
+    return true;
+}
+
 bool TranslatorVisitor::SCVTF_int_4(bool Q, bool sz, Vec Vn, Vec Vd) {
     return IntegerConvertToFloat(*this, Q, sz, Vn, Vd, Signedness::Signed);
 }
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 780f5523..142de470 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1168,6 +1168,19 @@ U128 IREmitter::VectorPairedAddLower(size_t esize, const U128& a, const U128& b)
     return {};
 }
 
+U128 IREmitter::VectorPairedAddSignedWiden(size_t original_esize, const U128& a) {
+    switch (original_esize) {
+    case 8:
+        return Inst<U128>(Opcode::VectorPairedAddSignedWiden8, a);
+    case 16:
+        return Inst<U128>(Opcode::VectorPairedAddSignedWiden16, a);
+    case 32:
+        return Inst<U128>(Opcode::VectorPairedAddSignedWiden32, a);
+    }
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::VectorPairedAddUnsignedWiden(size_t original_esize, const U128& a) {
     switch (original_esize) {
     case 8:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 0fb40827..00da94af 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -236,6 +236,7 @@ public:
     U128 VectorOr(const U128& a, const U128& b);
     U128 VectorPairedAdd(size_t esize, const U128& a, const U128& b);
     U128 VectorPairedAddLower(size_t esize, const U128& a, const U128& b);
+    U128 VectorPairedAddSignedWiden(size_t original_esize, const U128& a);
     U128 VectorPairedAddUnsignedWiden(size_t original_esize, const U128& a);
     U128 VectorPopulationCount(const U128& a);
     U128 VectorReverseBits(const U128& a);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 37a8d262..129fdaad 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -319,6 +319,9 @@ OPCODE(VectorOr,                                            T::U128,        T::U
 OPCODE(VectorPairedAddLower8,                               T::U128,        T::U128,        T::U128         )
 OPCODE(VectorPairedAddLower16,                              T::U128,        T::U128,        T::U128         )
 OPCODE(VectorPairedAddLower32,                              T::U128,        T::U128,        T::U128         )
+OPCODE(VectorPairedAddSignedWiden8,                         T::U128,        T::U128                         )
+OPCODE(VectorPairedAddSignedWiden16,                        T::U128,        T::U128                         )
+OPCODE(VectorPairedAddSignedWiden32,                        T::U128,        T::U128                         )
 OPCODE(VectorPairedAddUnsignedWiden8,                       T::U128,        T::U128                         )
 OPCODE(VectorPairedAddUnsignedWiden16,                      T::U128,        T::U128                         )
 OPCODE(VectorPairedAddUnsignedWiden32,                      T::U128,        T::U128                         )
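
Reviewer note, not part of the patch: the three new opcodes implement ARM's "signed add long pairwise" semantics. Each pair of adjacent signed elements is widened to twice its width and then summed, so the addition itself can never overflow. Below is a minimal scalar reference of those semantics, handy for cross-checking the emitted SSE sequences against random inputs; every name in it is illustrative and none of it is dynarmic API.

    // saddlp_reference.cpp - standalone scalar model of VectorPairedAddSignedWiden{8,16,32}.
    #include <array>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Each pair of adjacent signed Narrow elements (element order = memory order,
    // matching the little-endian lane layout) is widened to Wide, then summed.
    // Widening happens before the add, so the sum cannot overflow.
    template<typename Narrow, typename Wide>
    std::array<std::uint8_t, 16> PairedAddSignedWiden(const std::array<std::uint8_t, 16>& in) {
        static_assert(sizeof(Wide) == 2 * sizeof(Narrow), "result elements are twice as wide");
        std::array<std::uint8_t, 16> out{};
        for (std::size_t i = 0; i < 16 / sizeof(Wide); ++i) {
            Narrow lo;
            Narrow hi;
            std::memcpy(&lo, in.data() + i * sizeof(Wide), sizeof(Narrow));
            std::memcpy(&hi, in.data() + i * sizeof(Wide) + sizeof(Narrow), sizeof(Narrow));
            const Wide sum = static_cast<Wide>(lo) + static_cast<Wide>(hi);
            std::memcpy(out.data() + i * sizeof(Wide), &sum, sizeof(Wide));
        }
        return out;
    }

    int main() {
        std::array<std::uint8_t, 16> v{};
        const std::int32_t elems[4] = {-1, 2, INT32_MIN, 1};
        std::memcpy(v.data(), elems, sizeof(elems));

        const auto r = PairedAddSignedWiden<std::int32_t, std::int64_t>(v);
        std::int64_t lanes[2];
        std::memcpy(lanes, r.data(), sizeof(lanes));

        // Expected: 1 (= -1 + 2) and -2147483647 (= INT32_MIN + 1).
        std::printf("%lld %lld\n", static_cast<long long>(lanes[0]),
                    static_cast<long long>(lanes[1]));
    }

With Q == 0 the same model applies to the lower 64 bits only, which is why the SADDLP translator calls ir.VectorZeroUpper for the 64-bit datasize.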
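
A second note on the x64 backend, also not part of the patch: the else-branch in EmitVectorPairedAddSignedWiden32 is long because SSE2 has no 64-bit arithmetic right shift; psraq/vpsraq exist only under AVX-512, which is what the DoesCpuSupport(tAVX512VL) branch relies on. The fallback rebuilds the shift from a logical shift plus a sign mask: pand isolates bit 63 of each lane, psrad by 31 smears it across the high dword, and por writes it over the zero-filled psrlq result. Here is a standalone SSE2 intrinsics sketch of the same trick, assuming nothing beyond <emmintrin.h> (illustrative, not the emitted code itself):

    // sra64_emulation.cpp - SSE2-only sketch of the sign-extension trick.
    #include <emmintrin.h>
    #include <cstdint>
    #include <cstdio>

    // Arithmetic right shift of each 64-bit lane by 32, without psraq:
    // isolate bit 63, smear it across the high dword, then OR it over a
    // logical (zero-filling) 64-bit shift.
    static __m128i SraEpi64By32(__m128i x) {
        const __m128i sign_bit = _mm_set1_epi64x(INT64_MIN);     // 0x8000'0000'0000'0000 per lane
        __m128i sign = _mm_and_si128(x, sign_bit);               // keep only bit 63
        sign = _mm_srai_epi32(sign, 31);                         // high dword -> 0 or ~0; low dword stays 0
        return _mm_or_si128(_mm_srli_epi64(x, 32), sign);        // zero-fill shift, then sign-fill
    }

    // Pairwise add of sign-extended 32-bit halves, mirroring the fallback path.
    static __m128i PairedAddSignedWiden32(__m128i a) {
        const __m128i hi = SraEpi64By32(a);                      // odd-index elements, sign-extended
        const __m128i lo = SraEpi64By32(_mm_slli_epi64(a, 32));  // even-index elements, sign-extended
        return _mm_add_epi64(lo, hi);
    }

    int main() {
        const __m128i v = _mm_set_epi32(1, INT32_MIN, 2, -1);    // lanes pair up as {-1, 2} and {INT32_MIN, 1}
        alignas(16) std::int64_t out[2];
        _mm_store_si128(reinterpret_cast<__m128i*>(out), PairedAddSignedWiden32(v));
        // Expected: 1 and -2147483647.
        std::printf("%lld %lld\n", static_cast<long long>(out[0]),
                    static_cast<long long>(out[1]));
    }

The 8-bit and 16-bit cases avoid all of this because psraw and psrad do exist in SSE2, so two arithmetic shifts and one add suffice.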