diff --git a/src/backend_x64/emit_x64_vector.cpp b/src/backend_x64/emit_x64_vector.cpp
index 3dd7f150..f086b545 100644
--- a/src/backend_x64/emit_x64_vector.cpp
+++ b/src/backend_x64/emit_x64_vector.cpp
@@ -17,6 +17,18 @@ namespace BackendX64 {
 
 using namespace Xbyak::util;
 
+template <typename Function>
+static void EmitVectorOperation(BlockOfCode* code, EmitContext& ctx, IR::Inst* inst, Function fn) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+    (code->*fn)(xmm_a, xmm_b);
+
+    ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
 void EmitX64::EmitVectorGetElement8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     ASSERT(args[1].IsImmediate());
@@ -90,17 +102,6 @@ void EmitX64::EmitVectorGetElement64(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(inst, dest);
 }
 
-static void EmitVectorOperation(BlockOfCode* code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
-    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
-
-    (code->*fn)(xmm_a, xmm_b);
-
-    ctx.reg_alloc.DefineValue(inst, xmm_a);
-}
-
 void EmitX64::EmitVectorAdd8(EmitContext& ctx, IR::Inst* inst) {
     EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddb);
 }
@@ -121,6 +122,95 @@ void EmitX64::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) {
     EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pand);
 }
 
+void EmitX64::EmitVectorLowerBroadcast8(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
+        Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+        code->pxor(tmp, tmp);
+        code->pshufb(a, tmp);
+        code->movq(a, a);
+    } else {
+        code->punpcklbw(a, a);
+        code->pshuflw(a, a, 0);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorLowerBroadcast16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+    code->pshuflw(a, a, 0);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorLowerBroadcast32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+    code->pshuflw(a, a, 0b01000100);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
+        Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+        code->pxor(tmp, tmp);
+        code->pshufb(a, tmp);
+    } else {
+        code->punpcklbw(a, a);
+        code->pshuflw(a, a, 0);
+        code->punpcklqdq(a, a);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+    code->pshuflw(a, a, 0);
+    code->punpcklqdq(a, a);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+    code->pshufd(a, a, 0);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+    code->punpcklqdq(a, a);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
 void EmitX64::EmitVectorOr(EmitContext& ctx, IR::Inst* inst) {
     EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::por);
 }
@@ -141,6 +231,65 @@ void EmitX64::EmitVectorNot(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(inst, xmm_a);
 }
 
+void EmitX64::EmitVectorEqual8(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqb);
+}
+
+void EmitX64::EmitVectorEqual16(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqw);
+}
+
+void EmitX64::EmitVectorEqual32(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqd);
+}
+
+void EmitX64::EmitVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
+    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+        EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqq);
+        return;
+    }
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+    Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    code->pcmpeqd(xmm_a, xmm_b);
+    code->pshufd(tmp, xmm_a, 0b10110001);
+    code->pand(xmm_a, tmp);
+
+    ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitVectorEqual128(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+        Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+        Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+        Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+        code->pcmpeqq(xmm_a, xmm_b);
+        code->pshufd(tmp, xmm_a, 0b01001110);
+        code->pand(xmm_a, tmp);
+
+        ctx.reg_alloc.DefineValue(inst, xmm_a);
+    } else {
+        Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+        Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+        Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+        code->pcmpeqd(xmm_a, xmm_b);
+        code->pshufd(tmp, xmm_a, 0b10110001);
+        code->pand(xmm_a, tmp);
+        code->pshufd(tmp, xmm_a, 0b01001110);
+        code->pand(xmm_a, tmp);
+
+        ctx.reg_alloc.DefineValue(inst, xmm_a);
+    }
+}
+
 void EmitX64::EmitVectorLowerPairedAdd8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
@@ -298,96 +447,6 @@ void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(inst, a);
 }
 
-void EmitX64::EmitVectorLowerBroadcast8(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
-
-    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
-        Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
-
-        code->pxor(tmp, tmp);
-        code->pshufb(a, tmp);
-        code->movq(a, a);
-    } else {
-        code->punpcklbw(a, a);
-        code->pshuflw(a, a, 0);
-    }
-
-    ctx.reg_alloc.DefineValue(inst, a);
-}
-
-void EmitX64::EmitVectorLowerBroadcast16(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
-
-    code->pshuflw(a, a, 0);
-
-    ctx.reg_alloc.DefineValue(inst, a);
-}
-
-void EmitX64::EmitVectorLowerBroadcast32(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
-
-    code->pshuflw(a, a, 0b01000100);
-
-    ctx.reg_alloc.DefineValue(inst, a);
-}
-
-void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
-
-    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
-        Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
-
-        code->pxor(tmp, tmp);
-        code->pshufb(a, tmp);
-    } else {
-        code->punpcklbw(a, a);
-        code->pshuflw(a, a, 0);
-        code->punpcklqdq(a, a);
-    }
-
-    ctx.reg_alloc.DefineValue(inst, a);
-}
-
-void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
-
-    code->pshuflw(a, a, 0);
-    code->punpcklqdq(a, a);
-
-    ctx.reg_alloc.DefineValue(inst, a);
-}
-
-void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
-
-    code->pshufd(a, a, 0);
-
-    ctx.reg_alloc.DefineValue(inst, a);
-}
-
-void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
-
-    code->punpcklqdq(a, a);
-
-    ctx.reg_alloc.DefineValue(inst, a);
-}
-
-
 void EmitX64::EmitVectorZeroUpper(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 60fef7c0..f52b7ccd 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -745,6 +745,26 @@ U128 IREmitter::VectorBroadcast64(const U64& a) {
     return Inst<U128>(Opcode::VectorBroadcast64, a);
 }
 
+U128 IREmitter::VectorEqual8(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorEqual8, a, b);
+}
+
+U128 IREmitter::VectorEqual16(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorEqual16, a, b);
+}
+
+U128 IREmitter::VectorEqual32(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorEqual32, a, b);
+}
+
+U128 IREmitter::VectorEqual64(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorEqual64, a, b);
+}
+
+U128 IREmitter::VectorEqual128(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorEqual128, a, b);
+}
+
 U128 IREmitter::VectorLowerPairedAdd8(const U128& a, const U128& b) {
     return Inst<U128>(Opcode::VectorLowerPairedAdd8, a, b);
 }
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index f1b116c1..ae11212b 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -200,6 +200,11 @@ public:
     U128 VectorBroadcast16(const U16& a);
     U128 VectorBroadcast32(const U32& a);
     U128 VectorBroadcast64(const U64& a);
+    U128 VectorEqual8(const U128& a, const U128& b);
+    U128 VectorEqual16(const U128& a, const U128& b);
+    U128 VectorEqual32(const U128& a, const U128& b);
+    U128 VectorEqual64(const U128& a, const U128& b);
+    U128 VectorEqual128(const U128& a, const U128& b);
     U128 VectorLowerPairedAdd8(const U128& a, const U128& b);
     U128 VectorLowerPairedAdd16(const U128& a, const U128& b);
     U128 VectorLowerPairedAdd32(const U128& a, const U128& b);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index b2218159..5678d34f 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -185,6 +185,11 @@ OPCODE(VectorBroadcast8, T::U128, T::U8 )
 OPCODE(VectorBroadcast16, T::U128, T::U16 )
 OPCODE(VectorBroadcast32, T::U128, T::U32 )
 OPCODE(VectorBroadcast64, T::U128, T::U64 )
+OPCODE(VectorEqual8, T::U128, T::U128, T::U128 )
+OPCODE(VectorEqual16, T::U128, T::U128, T::U128 )
+OPCODE(VectorEqual32, T::U128, T::U128, T::U128 )
+OPCODE(VectorEqual64, T::U128, T::U128, T::U128 )
+OPCODE(VectorEqual128, T::U128, T::U128, T::U128 )
 OPCODE(VectorLowerPairedAdd8, T::U128, T::U128, T::U128 )
 OPCODE(VectorLowerPairedAdd16, T::U128, T::U128, T::U128 )
 OPCODE(VectorLowerPairedAdd32, T::U128, T::U128, T::U128 )