// Patch summary (free text ahead of the first `diff --git` header).
//
// Files touched, per the headers below: src/backend_x64/emit_x64_vector.cpp,
// src/frontend/ir/ir_emitter.cpp, src/frontend/ir/ir_emitter.h and
// src/frontend/ir/opcodes.inc.
//
// Adds six IR opcodes, VectorRoundingHalvingAdd{S8,S16,S32,U8,U16,U32}. Each is
// declared in opcodes.inc with a T::U128 result and two T::U128 operands.
//
// IR emitter: IREmitter::VectorRoundingHalvingAddSigned / ...Unsigned switch on
// esize (8, 16 or 32) and return the matching opcode instruction; any other
// esize falls through to UNREACHABLE().
//
// x64 emitters (a and b are the two operands, the result is defined from a):
//   * Unsigned 8/16: forwarded to EmitVectorOperation with pavgb / pavgw.
//   * Signed 8/16: a constant holding 0x80 in every byte (0x8000 in every
//     16-bit lane) is added to both operands, pavgb / pavgw is applied, and the
//     same constant is added to the result. Presumably this maps signed lanes
//     onto the unsigned range that pavgb / pavgw average and then maps the
//     result back -- confirm against the PAVGB/PAVGW reference.
//   * 32-bit, signed and unsigned: emits (a >> 1) + (b >> 1) + ((a | b) & 1)
//     per lane. The halving shift is psrad in the signed helper and psrld in
//     the unsigned one; the low bit of (a | b) is isolated with pslld 31
//     followed by psrld 31. No 32-bit pavg instruction is used.
//
// NOTE(review): neither backend helper has a `default` case. For an esize other
// than 8/16/32 the signed helper emits no arithmetic and defines the result as
// operand a unchanged; the unsigned helper emits nothing and never calls
// DefineValue. The wrappers visible in this patch only pass 8, 16 and 32.
//
// NOTE(review): this copy of the patch has lost its line breaks, and
// `static_cast(esize - amount)` has no target type. Presumably `<...>` template
// arguments were stripped during extraction (possibly on the Inst(...) calls as
// well) -- verify against the upstream commit before applying.
diff --git a/src/backend_x64/emit_x64_vector.cpp b/src/backend_x64/emit_x64_vector.cpp index 610a340e..876ab899 100644 --- a/src/backend_x64/emit_x64_vector.cpp +++ b/src/backend_x64/emit_x64_vector.cpp @@ -1789,6 +1789,105 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(inst, data); } +static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + + switch (esize) { + case 8: { + const Xbyak::Xmm vec_128 = ctx.reg_alloc.ScratchXmm(); + code.movdqa(vec_128, code.MConst(xword, 0x8080808080808080, 0x8080808080808080)); + + code.paddb(a, vec_128); + code.paddb(b, vec_128); + code.pavgb(a, b); + code.paddb(a, vec_128); + break; + } + case 16: { + const Xbyak::Xmm vec_32768 = ctx.reg_alloc.ScratchXmm(); + code.movdqa(vec_32768, code.MConst(xword, 0x8000800080008000, 0x8000800080008000)); + + code.paddw(a, vec_32768); + code.paddw(b, vec_32768); + code.pavgw(a, b); + code.paddw(a, vec_32768); + break; + } + case 32: { + const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp1, a); + + code.por(a, b); + code.psrad(tmp1, 1); + code.psrad(b, 1); + code.pslld(a, 31); + code.paddd(b, tmp1); + code.psrld(a, 31); + code.paddd(a, b); + break; + } + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorRoundingHalvingAddS8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorRoundingHalvingAddSigned(8, ctx, inst, code); +} + +void EmitX64::EmitVectorRoundingHalvingAddS16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorRoundingHalvingAddSigned(16, ctx, inst, code); +} + +void EmitX64::EmitVectorRoundingHalvingAddS32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorRoundingHalvingAddSigned(32, ctx, inst, code); +} + +static void EmitVectorRoundingHalvingAddUnsigned(size_t esize, 
EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { + switch (esize) { + case 8: + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pavgb); + return; + case 16: + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pavgw); + return; + case 32: { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp1, a); + + code.por(a, b); + code.psrld(tmp1, 1); + code.psrld(b, 1); + code.pslld(a, 31); + code.paddd(b, tmp1); + code.psrld(a, 31); + code.paddd(a, b); + + ctx.reg_alloc.DefineValue(inst, a); + } + } +} + +void EmitX64::EmitVectorRoundingHalvingAddU8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorRoundingHalvingAddUnsigned(8, ctx, inst, code); +} + +void EmitX64::EmitVectorRoundingHalvingAddU16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorRoundingHalvingAddUnsigned(16, ctx, inst, code); +} + +void EmitX64::EmitVectorRoundingHalvingAddU32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorRoundingHalvingAddUnsigned(32, ctx, inst, code); +} + enum class ShuffleType { LowHalfwords, HighHalfwords, diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp index 07fe0d55..ba433755 100644 --- a/src/frontend/ir/ir_emitter.cpp +++ b/src/frontend/ir/ir_emitter.cpp @@ -1198,6 +1198,34 @@ U128 IREmitter::VectorRotateRight(size_t esize, const U128& a, u8 amount) { VectorLogicalShiftLeft(esize, a, static_cast(esize - amount))); } +U128 IREmitter::VectorRoundingHalvingAddSigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst(Opcode::VectorRoundingHalvingAddS8, a, b); + case 16: + return Inst(Opcode::VectorRoundingHalvingAddS16, a, b); + case 32: + return Inst(Opcode::VectorRoundingHalvingAddS32, a, b); + } + + UNREACHABLE(); + return {}; +} + +U128 IREmitter::VectorRoundingHalvingAddUnsigned(size_t esize, const 
U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst(Opcode::VectorRoundingHalvingAddU8, a, b); + case 16: + return Inst(Opcode::VectorRoundingHalvingAddU16, a, b); + case 32: + return Inst(Opcode::VectorRoundingHalvingAddU32, a, b); + } + + UNREACHABLE(); + return {}; +} + U128 IREmitter::VectorShuffleHighHalfwords(const U128& a, u8 mask) { return Inst(Opcode::VectorShuffleHighHalfwords, a, mask); } diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h index 5af45a7a..629e21e7 100644 --- a/src/frontend/ir/ir_emitter.h +++ b/src/frontend/ir/ir_emitter.h @@ -236,6 +236,8 @@ public: U128 VectorReverseBits(const U128& a); U128 VectorRotateLeft(size_t esize, const U128& a, u8 amount); U128 VectorRotateRight(size_t esize, const U128& a, u8 amount); + U128 VectorRoundingHalvingAddSigned(size_t esize, const U128& a, const U128& b); + U128 VectorRoundingHalvingAddUnsigned(size_t esize, const U128& a, const U128& b); U128 VectorShuffleHighHalfwords(const U128& a, u8 mask); U128 VectorShuffleLowHalfwords(const U128& a, u8 mask); U128 VectorShuffleWords(const U128& a, u8 mask); diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc index d0d20a29..0a9470bc 100644 --- a/src/frontend/ir/opcodes.inc +++ b/src/frontend/ir/opcodes.inc @@ -325,6 +325,12 @@ OPCODE(VectorPairedAdd32, T::U128, T::U128, T::U OPCODE(VectorPairedAdd64, T::U128, T::U128, T::U128 ) OPCODE(VectorPopulationCount, T::U128, T::U128 ) OPCODE(VectorReverseBits, T::U128, T::U128 ) +OPCODE(VectorRoundingHalvingAddS8, T::U128, T::U128, T::U128 ) +OPCODE(VectorRoundingHalvingAddS16, T::U128, T::U128, T::U128 ) +OPCODE(VectorRoundingHalvingAddS32, T::U128, T::U128, T::U128 ) +OPCODE(VectorRoundingHalvingAddU8, T::U128, T::U128, T::U128 ) +OPCODE(VectorRoundingHalvingAddU16, T::U128, T::U128, T::U128 ) +OPCODE(VectorRoundingHalvingAddU32, T::U128, T::U128, T::U128 ) OPCODE(VectorShuffleHighHalfwords, T::U128, T::U128, T::U8 ) OPCODE(VectorShuffleLowHalfwords, T::U128, 
T::U128, T::U8 ) OPCODE(VectorShuffleWords, T::U128, T::U128, T::U8 )