From da2e7fad873d940f9b7243f06c927acbf8d231c5 Mon Sep 17 00:00:00 2001 From: Lioncash Date: Sun, 23 Sep 2018 00:44:45 -0400 Subject: [PATCH] emit_x64_vector: SSSE3 variant of EmitVectorCountLeadingZeros8() pshufb lyfe --- src/backend/x64/emit_x64_vector.cpp | 49 +++++++++++------------------ 1 file changed, 18 insertions(+), 31 deletions(-) diff --git a/src/backend/x64/emit_x64_vector.cpp b/src/backend/x64/emit_x64_vector.cpp index 9571ade9..92629510 100644 --- a/src/backend/x64/emit_x64_vector.cpp +++ b/src/backend/x64/emit_x64_vector.cpp @@ -632,41 +632,28 @@ static void EmitVectorCountLeadingZeros(VectorArray& result, const VectorArra } void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) { - if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { + if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data_hi = ctx.reg_alloc.UseScratchXmm(args[0]); - const Xbyak::Xmm data_lo = ctx.reg_alloc.ScratchXmm(); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); - const Xbyak::Xmm tmp_hi = ctx.reg_alloc.ScratchXmm(); - const Xbyak::Xmm tmp_lo = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); - code.vpsllw(data_lo, data_hi, 8); - code.vpsrlw(tmp_hi, data_hi, 1); - code.vpsrlw(tmp_lo, data_lo, 1); - code.vpor(data_hi, data_hi, tmp_hi); - code.vpor(data_lo, data_lo, tmp_lo); - code.vpsrlw(tmp_hi, data_hi, 2); - code.vpsrlw(tmp_lo, data_lo, 2); - code.vpor(data_hi, data_hi, tmp_hi); - code.vpor(data_lo, data_lo, tmp_lo); - code.vpsrlw(tmp_hi, data_hi, 4); - code.vpsrlw(tmp_lo, data_lo, 4); - code.vpor(data_hi, data_hi, tmp_hi); - code.vpor(data_lo, data_lo, tmp_lo); - code.vpsrlw(data_hi, data_hi, 8); - code.vpsrlw(data_lo, data_lo, 8); - code.vmovdqa(result, code.MConst(xword, 0xe700e700e700e700, 0xe700e700e700e700)); - code.vpmullw(data_hi, data_hi, result); - code.vpmullw(data_lo, data_lo, result); - code.vpsrlw(data_hi, data_hi, 12); - code.vpsrlw(data_lo, data_lo, 12); - code.vpsllw(data_hi, data_hi, 8); - code.vpor(data_hi, data_hi, data_lo); - code.vmovdqa(result, code.MConst(xword, 0xaaaa05aaaaaa0008, 0x030702aa06aa0104)); - code.vpshufb(result, result, data_hi); + code.movdqa(tmp1, code.MConst(xword, 0x0101010102020304, 0x0000000000000000)); + code.movdqa(tmp2, tmp1); - ctx.reg_alloc.DefineValue(inst, result); + code.pshufb(tmp2, data); + code.psrlw(data, 4); + code.pand(data, code.MConst(xword, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F)); + code.pshufb(tmp1, data); + + code.movdqa(data, code.MConst(xword, 0x0404040404040404, 0x0404040404040404)); + + code.pcmpeqb(data, tmp1); + code.pand(data, tmp2); + code.paddb(data, tmp1); + + ctx.reg_alloc.DefineValue(inst, data); return; }