EmitPackedHalvingAddU8: Add SSE2 implementation

2017-12-12 16:11:22 +00:00 · 2017-12-12 16:11:22 +00:00 · fc885ac80f
commit fc885ac80f
parent 4682211729
1 changed file with 35 additions and 16 deletions
--- a/src/backend_x64/emit_x64.cpp
+++ b/src/backend_x64/emit_x64.cpp
@@ -1737,6 +1737,24 @@ void EmitX64::EmitPackedSubS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst*
 void EmitX64::EmitPackedHalvingAddU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
    auto args = reg_alloc.GetArgumentInfo(inst);

+    if (args[0].IsInXmm() || args[1].IsInXmm()) {
+        Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]);
+        Xbyak::Xmm xmm_b = reg_alloc.UseScratchXmm(args[1]);
+        Xbyak::Xmm ones = reg_alloc.ScratchXmm();
+
+        // Since,
+        //   pavg(a, b) == (a + b + 1) >> 1
+        // Therefore,
+        //   ~pavg(~a, ~b) == (a + b) >> 1
+
+        code->pcmpeqb(ones, ones);
+        code->pxor(xmm_a, ones);
+        code->pxor(xmm_b, ones);
+        code->pavgb(xmm_a, xmm_b);
+        code->pxor(xmm_a, ones);
+
+        reg_alloc.DefineValue(inst, xmm_a);
+    } else {
        Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(args[0]).cvt32();
        Xbyak::Reg32 reg_b = reg_alloc.UseGpr(args[1]).cvt32();
        Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32();
@@ -1756,6 +1774,7 @@ void EmitX64::EmitPackedHalvingAddU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst*
        code->add(result, xor_a_b);

        reg_alloc.DefineValue(inst, result);
+    }
 }

 void EmitX64::EmitPackedHalvingAddU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {