diff --git a/src/backend_x64/emit_x64_vector.cpp b/src/backend_x64/emit_x64_vector.cpp
index 23780db7..0145931f 100644
--- a/src/backend_x64/emit_x64_vector.cpp
+++ b/src/backend_x64/emit_x64_vector.cpp
@@ -849,6 +849,92 @@ void EmitX64::EmitVectorHalvingAddU32(EmitContext& ctx, IR::Inst* inst) {
     EmitVectorHalvingAddUnsigned(32, ctx, inst, code);
 }
 
+static void EmitVectorHalvingSubSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+    switch (esize) {
+    case 8: {
+        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+        code.movdqa(tmp, code.MConst(xword, 0x8080808080808080, 0x8080808080808080));
+        code.pxor(a, tmp);
+        code.pxor(b, tmp);
+        code.pavgb(b, a);
+        code.psubb(a, b);
+        break;
+    }
+    case 16: {
+        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+        code.movdqa(tmp, code.MConst(xword, 0x8000800080008000, 0x8000800080008000));
+        code.pxor(a, tmp);
+        code.pxor(b, tmp);
+        code.pavgw(b, a);
+        code.psubw(a, b);
+        break;
+    }
+    case 32:
+        code.pxor(a, b);
+        code.pand(b, a);
+        code.psrad(a, 1);
+        code.psubd(a, b);
+        break;
+    }
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorHalvingSubS8(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorHalvingSubSigned(8, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingSubS16(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorHalvingSubSigned(16, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingSubS32(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorHalvingSubSigned(32, ctx, inst, code);
+}
+
+static void EmitVectorHalvingSubUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+    switch (esize) {
+    case 8:
+        code.pavgb(b, a);
+        code.psubb(a, b);
+        break;
+    case 16:
+        code.pavgw(b, a);
+        code.psubw(a, b);
+        break;
+    case 32:
+        code.pxor(a, b);
+        code.pand(b, a);
+        code.psrld(a, 1);
+        code.psubd(a, b);
+        break;
+    }
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorHalvingSubU8(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorHalvingSubUnsigned(8, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingSubU16(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorHalvingSubUnsigned(16, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingSubU32(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorHalvingSubUnsigned(32, ctx, inst, code);
+}
+
 static void EmitVectorInterleaveLower(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int size) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 69acfd15..87a32049 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -912,6 +912,32 @@ U128 IREmitter::VectorHalvingAddUnsigned(size_t esize, const U128& a, const U128
     return {};
 }
 
+U128 IREmitter::VectorHalvingSubSigned(size_t esize, const U128& a, const U128& b) {
+    switch (esize) {
+    case 8:
+        return Inst(Opcode::VectorHalvingSubS8, a, b);
+    case 16:
+        return Inst(Opcode::VectorHalvingSubS16, a, b);
+    case 32:
+        return Inst(Opcode::VectorHalvingSubS32, a, b);
+    }
+    UNREACHABLE();
+    return {};
+}
+
+U128 IREmitter::VectorHalvingSubUnsigned(size_t esize, const U128& a, const U128& b) {
+    switch (esize) {
+    case 8:
+        return Inst(Opcode::VectorHalvingSubU8, a, b);
+    case 16:
+        return Inst(Opcode::VectorHalvingSubU16, a, b);
+    case 32:
+        return Inst(Opcode::VectorHalvingSubU32, a, b);
+    }
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::VectorInterleaveLower(size_t esize, const U128& a, const U128& b) {
     switch (esize) {
     case 8:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 8d2c86d5..a95c7010 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -210,6 +210,8 @@ public:
     U128 VectorGreaterUnsigned(size_t esize, const U128& a, const U128& b);
     U128 VectorHalvingAddSigned(size_t esize, const U128& a, const U128& b);
     U128 VectorHalvingAddUnsigned(size_t esize, const U128& a, const U128& b);
+    U128 VectorHalvingSubSigned(size_t esize, const U128& a, const U128& b);
+    U128 VectorHalvingSubUnsigned(size_t esize, const U128& a, const U128& b);
     U128 VectorInterleaveLower(size_t esize, const U128& a, const U128& b);
     U128 VectorInterleaveUpper(size_t esize, const U128& a, const U128& b);
     U128 VectorLessEqualSigned(size_t esize, const U128& a, const U128& b);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index ba2960d8..48901af0 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -261,6 +261,12 @@ OPCODE(VectorHalvingAddS32,                                 T::U128,        T::U128,        T::U
 OPCODE(VectorHalvingAddU8,                                  T::U128,        T::U128,        T::U128         )
 OPCODE(VectorHalvingAddU16,                                 T::U128,        T::U128,        T::U128         )
 OPCODE(VectorHalvingAddU32,                                 T::U128,        T::U128,        T::U128         )
+OPCODE(VectorHalvingSubS8,                                  T::U128,        T::U128,        T::U128         )
+OPCODE(VectorHalvingSubS16,                                 T::U128,        T::U128,        T::U128         )
+OPCODE(VectorHalvingSubS32,                                 T::U128,        T::U128,        T::U128         )
+OPCODE(VectorHalvingSubU8,                                  T::U128,        T::U128,        T::U128         )
+OPCODE(VectorHalvingSubU16,                                 T::U128,        T::U128,        T::U128         )
+OPCODE(VectorHalvingSubU32,                                 T::U128,        T::U128,        T::U128         )
 OPCODE(VectorInterleaveLower8,                              T::U128,        T::U128,        T::U128         )
 OPCODE(VectorInterleaveLower16,                             T::U128,        T::U128,        T::U128         )
 OPCODE(VectorInterleaveLower32,                             T::U128,        T::U128,        T::U128         )
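
Note on the 8- and 16-bit paths: a halving subtract (as in the guest SHSUB/UHSUB instructions) returns (a - b) >> 1 per lane. SSE has no such instruction, but pavgb/pavgw compute the rounding-up average (x + y + 1) >> 1 exactly, and a - avg(a, b) equals floor((a - b) / 2) modulo the lane width. The signed variants first bias both operands by the sign bit (pxor with 0x80../0x8000..), which maps signed order onto unsigned order so the unsigned pavg applies; the bias cancels in the final psub. Below is a minimal standalone sketch, separate from the patch, using a hypothetical scalar helper avg_round_up to model pavgb; it checks both claims exhaustively at 8-bit width.

#include <cassert>
#include <cstdint>

// Scalar model of pavgb: rounding-up average, exact in the 9-bit intermediate.
static uint8_t avg_round_up(uint8_t x, uint8_t y) {
    return static_cast<uint8_t>((x + y + 1) >> 1);
}

int main() {
    for (int a = 0; a < 256; ++a) {
        for (int b = 0; b < 256; ++b) {
            // Unsigned lanes: a - pavg(a, b) == (a - b) >> 1 (mod 256).
            const uint8_t want_u = static_cast<uint8_t>((a - b) >> 1);
            assert(static_cast<uint8_t>(a - avg_round_up(a, b)) == want_u);

            // Signed lanes: bias both operands by 0x80 first; the bias
            // cancels in the subtraction, so the same pavg trick yields the
            // signed result. (>> on a negative int is an arithmetic shift
            // here; guaranteed since C++20, universal in practice before.)
            const int8_t sa = static_cast<int8_t>(a);
            const int8_t sb = static_cast<int8_t>(b);
            const uint8_t want_s = static_cast<uint8_t>((sa - sb) >> 1);
            const uint8_t ba = static_cast<uint8_t>(a ^ 0x80);
            const uint8_t bb = static_cast<uint8_t>(b ^ 0x80);
            assert(static_cast<uint8_t>(ba - avg_round_up(ba, bb)) == want_s);
        }
    }
}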
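
Note on the 32-bit paths: SSE2 has no pavgd, so the 32-bit cases instead use the carry-free decomposition a - b == (a ^ b) - 2 * (~a & b), which halves to ((a ^ b) >> 1) - (~a & b). After pxor(a, b), the pand(b, a) computes b & (a ^ b), which equals ~a & b; the shift is arithmetic (psrad) for the signed variant and logical (psrld) for the unsigned one. Another standalone sketch, separate from the patch, verifying this exhaustively at 8-bit width for brevity; the 32-bit sequence is the same construction.

#include <cassert>
#include <cstdint>

int main() {
    for (int a = 0; a < 256; ++a) {
        for (int b = 0; b < 256; ++b) {
            const uint8_t x = static_cast<uint8_t>(a ^ b);       // pxor(a, b)
            const uint8_t borrow = static_cast<uint8_t>(b & x);  // pand(b, a), == ~a & b

            // Unsigned halving sub: logical shift of the xor term (psrld).
            assert(static_cast<uint8_t>((x >> 1) - borrow) ==
                   static_cast<uint8_t>((a - b) >> 1));

            // Signed halving sub: arithmetic shift of the xor term (psrad).
            const int8_t sa = static_cast<int8_t>(a);
            const int8_t sb = static_cast<int8_t>(b);
            assert(static_cast<uint8_t>((static_cast<int8_t>(x) >> 1) - borrow) ==
                   static_cast<uint8_t>((sa - sb) >> 1));
        }
    }
}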