// Patch summary (commentary only; everything from the "diff --git" header onward is the unmodified patch text).
//
// Files touched: src/backend/x64/emit_x64_vector.cpp, src/frontend/ir/ir_emitter.cpp,
// src/frontend/ir/ir_emitter.h and src/frontend/ir/opcodes.inc.
//
// x64 backend (emit_x64_vector.cpp):
//  - EmitVectorDeinterleaveEven32 / EmitVectorDeinterleaveOdd32: the two pshufd shuffles followed by
//    pblendw (when HasSSE41()) or punpcklqdq are replaced by a single shufps(lhs, rhs, imm) that reuses
//    the same immediates (0b10001000 for Even, 0b11011101 for Odd). rhs is no longer written by the
//    emitted code, so it is acquired with UseXmm instead of UseScratchXmm.
//  - EmitVectorDeinterleaveEven64: now emits shufpd(lhs, rhs, 0b00); the former movq / pslldq / por
//    sequence is removed.
//  - EmitVectorDeinterleaveOdd64: punpckhqdq is replaced by shufpd(lhs, rhs, 0b11).
//  - New emitters EmitVectorDeinterleaveEvenLower{8,16,32,64} and EmitVectorDeinterleaveOddLower{8,16,32,64}.
//    The 8- and 16-bit forms isolate the wanted half of every element (pand with a constant mask, or
//    shifts), pack both operands together, reorder dwords with pshufd 0b11011000 and finish with
//    movq(lhs, lhs). The 32-bit forms start from unpcklps(lhs, rhs); OddLower32 then uses unpckhpd
//    against a register cleared with xorps.
//    NOTE(review): movq xmm, xmm is documented to clear bits 127:64, which is presumably how the
//    "Lower" results get a zero upper half -- confirm against the intended IR contract.
//  - EvenLower64 / OddLower64 only emit a movq on one operand; the other operand is still acquired via
//    UseXmm and is marked [[maybe_unused]].
//
// IR frontend:
//  - IREmitter::VectorDeinterleaveEvenLower / VectorDeinterleaveOddLower switch on esize (8, 16, 32, 64)
//    to the matching opcode and fall through to UNREACHABLE() for any other value.
//  - ir_emitter.h declares both new member functions next to the existing Even / Odd declarations.
//  - opcodes.inc registers the eight new opcodes, each declared with three U128 type fields like the
//    existing non-Lower entries.
diff --git a/src/backend/x64/emit_x64_vector.cpp b/src/backend/x64/emit_x64_vector.cpp index b2d7e330..e3536bf1 100644 --- a/src/backend/x64/emit_x64_vector.cpp +++ b/src/backend/x64/emit_x64_vector.cpp @@ -895,28 +895,74 @@ void EmitX64::EmitVectorDeinterleaveEven16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorDeinterleaveEven32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); - code.pshufd(lhs, lhs, 0b10001000); - code.pshufd(rhs, rhs, 0b10001000); - - if (code.HasSSE41()) { - code.pblendw(lhs, rhs, 0b11110000); - } else { - code.punpcklqdq(lhs, rhs); - } + code.shufps(lhs, rhs, 0b10001000); ctx.reg_alloc.DefineValue(inst, lhs); } void EmitX64::EmitVectorDeinterleaveEven64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + + code.shufpd(lhs, rhs, 0b00); + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveEvenLower8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp, code.MConst(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); + code.pand(lhs, tmp); + code.pand(rhs, tmp); + code.packuswb(lhs, rhs); + code.pshufd(lhs, lhs, 0b11011000); + code.movq(lhs, lhs); + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveEvenLower16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm rhs = 
ctx.reg_alloc.UseScratchXmm(args[1]); + code.pslld(lhs, 16); + code.psrad(lhs, 16); + + code.pslld(rhs, 16); + code.psrad(rhs, 16); + + code.packssdw(lhs, rhs); + code.pshufd(lhs, lhs, 0b11011000); + code.movq(lhs, lhs); + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveEvenLower32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + + code.unpcklps(lhs, rhs); + code.movq(lhs, lhs); + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveEvenLower64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + [[maybe_unused]] const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + code.movq(lhs, lhs); - code.pslldq(rhs, 8); - code.por(lhs, rhs); ctx.reg_alloc.DefineValue(inst, lhs); } @@ -948,16 +994,9 @@ void EmitX64::EmitVectorDeinterleaveOdd16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorDeinterleaveOdd32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); - code.pshufd(lhs, lhs, 0b11011101); - code.pshufd(rhs, rhs, 0b11011101); - - if (code.HasSSE41()) { - code.pblendw(lhs, rhs, 0b11110000); - } else { - code.punpcklqdq(lhs, rhs); - } + code.shufps(lhs, rhs, 0b11011101); ctx.reg_alloc.DefineValue(inst, lhs); } @@ -965,13 +1004,64 @@ void EmitX64::EmitVectorDeinterleaveOdd32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorDeinterleaveOdd64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); - const Xbyak::Xmm rhs = 
ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); - code.punpckhqdq(lhs, rhs); + code.shufpd(lhs, rhs, 0b11); ctx.reg_alloc.DefineValue(inst, lhs); } +void EmitX64::EmitVectorDeinterleaveOddLower8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.psraw(lhs, 8); + code.psraw(rhs, 8); + code.packsswb(lhs, rhs); + code.pshufd(lhs, lhs, 0b11011000); + code.movq(lhs, lhs); + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveOddLower16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.psrad(lhs, 16); + code.psrad(rhs, 16); + code.packssdw(lhs, rhs); + code.pshufd(lhs, lhs, 0b11011000); + code.movq(lhs, lhs); + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveOddLower32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(); + + code.xorps(zero, zero); + code.unpcklps(lhs, rhs); + code.unpckhpd(lhs, zero); + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveOddLower64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + [[maybe_unused]] const Xbyak::Xmm lhs = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.movq(rhs, rhs); + + ctx.reg_alloc.DefineValue(inst, rhs); +} + void EmitX64::EmitVectorEor(EmitContext& ctx, IR::Inst* inst) { EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pxor); } diff 
--git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp index 7f19e4f8..563e8487 100644 --- a/src/frontend/ir/ir_emitter.cpp +++ b/src/frontend/ir/ir_emitter.cpp @@ -1065,6 +1065,34 @@ U128 IREmitter::VectorDeinterleaveOdd(size_t esize, const U128& a, const U128& b UNREACHABLE(); } +U128 IREmitter::VectorDeinterleaveEvenLower(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst(Opcode::VectorDeinterleaveEvenLower8, a, b); + case 16: + return Inst(Opcode::VectorDeinterleaveEvenLower16, a, b); + case 32: + return Inst(Opcode::VectorDeinterleaveEvenLower32, a, b); + case 64: + return Inst(Opcode::VectorDeinterleaveEvenLower64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorDeinterleaveOddLower(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst(Opcode::VectorDeinterleaveOddLower8, a, b); + case 16: + return Inst(Opcode::VectorDeinterleaveOddLower16, a, b); + case 32: + return Inst(Opcode::VectorDeinterleaveOddLower32, a, b); + case 64: + return Inst(Opcode::VectorDeinterleaveOddLower64, a, b); + } + UNREACHABLE(); +} + U128 IREmitter::VectorEor(const U128& a, const U128& b) { return Inst(Opcode::VectorEor, a, b); } diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h index 95051edb..87e8844f 100644 --- a/src/frontend/ir/ir_emitter.h +++ b/src/frontend/ir/ir_emitter.h @@ -234,7 +234,9 @@ public: U128 VectorCountLeadingZeros(size_t esize, const U128& a); U128 VectorEor(const U128& a, const U128& b); U128 VectorDeinterleaveEven(size_t esize, const U128& a, const U128& b); + U128 VectorDeinterleaveEvenLower(size_t esize, const U128& a, const U128& b); U128 VectorDeinterleaveOdd(size_t esize, const U128& a, const U128& b); + U128 VectorDeinterleaveOddLower(size_t esize, const U128& a, const U128& b); U128 VectorEqual(size_t esize, const U128& a, const U128& b); U128 VectorExtract(const U128& a, const U128& b, size_t position); U128 VectorExtractLower(const 
U128& a, const U128& b, size_t position); diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc index 4314d5a3..a81a792f 100644 --- a/src/frontend/ir/opcodes.inc +++ b/src/frontend/ir/opcodes.inc @@ -307,10 +307,18 @@ OPCODE(VectorDeinterleaveEven8, U128, U128 OPCODE(VectorDeinterleaveEven16, U128, U128, U128 ) OPCODE(VectorDeinterleaveEven32, U128, U128, U128 ) OPCODE(VectorDeinterleaveEven64, U128, U128, U128 ) +OPCODE(VectorDeinterleaveEvenLower8, U128, U128, U128 ) +OPCODE(VectorDeinterleaveEvenLower16, U128, U128, U128 ) +OPCODE(VectorDeinterleaveEvenLower32, U128, U128, U128 ) +OPCODE(VectorDeinterleaveEvenLower64, U128, U128, U128 ) OPCODE(VectorDeinterleaveOdd8, U128, U128, U128 ) OPCODE(VectorDeinterleaveOdd16, U128, U128, U128 ) OPCODE(VectorDeinterleaveOdd32, U128, U128, U128 ) OPCODE(VectorDeinterleaveOdd64, U128, U128, U128 ) +OPCODE(VectorDeinterleaveOddLower8, U128, U128, U128 ) +OPCODE(VectorDeinterleaveOddLower16, U128, U128, U128 ) +OPCODE(VectorDeinterleaveOddLower32, U128, U128, U128 ) +OPCODE(VectorDeinterleaveOddLower64, U128, U128, U128 ) OPCODE(VectorEor, U128, U128, U128 ) OPCODE(VectorEqual8, U128, U128, U128 ) OPCODE(VectorEqual16, U128, U128, U128 )