EmitPackedHalvingAddU8: Add SSE2 implementation

This commit is contained in:
MerryMage 2017-12-12 16:11:22 +00:00
parent 4682211729
commit fc885ac80f

View file

@ -1737,25 +1737,44 @@ void EmitX64::EmitPackedSubS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst*
void EmitX64::EmitPackedHalvingAddU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { void EmitX64::EmitPackedHalvingAddU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
auto args = reg_alloc.GetArgumentInfo(inst); auto args = reg_alloc.GetArgumentInfo(inst);
Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(args[0]).cvt32(); if (args[0].IsInXmm() || args[1].IsInXmm()) {
Xbyak::Reg32 reg_b = reg_alloc.UseGpr(args[1]).cvt32(); Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]);
Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32(); Xbyak::Xmm xmm_b = reg_alloc.UseScratchXmm(args[1]);
Xbyak::Reg32 and_a_b = reg_a; Xbyak::Xmm ones = reg_alloc.ScratchXmm();
Xbyak::Reg32 result = reg_a;
// This relies on the equality x+y == ((x&y) << 1) + (x^y). // Since,
// Note that x^y always contains the LSB of the result. // pavg(a, b) == (a + b + 1) >> 1
// Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1). // Therefore,
// We mask by 0x7F to remove the LSB so that it doesn't leak into the field below. // ~pavg(~a, ~b) == (a + b) >> 1
code->mov(xor_a_b, reg_a); code->pcmpeqb(ones, ones);
code->and_(and_a_b, reg_b); code->pxor(xmm_a, ones);
code->xor_(xor_a_b, reg_b); code->pxor(xmm_b, ones);
code->shr(xor_a_b, 1); code->pavgb(xmm_a, xmm_b);
code->and_(xor_a_b, 0x7F7F7F7F); code->pxor(xmm_a, ones);
code->add(result, xor_a_b);
reg_alloc.DefineValue(inst, result); reg_alloc.DefineValue(inst, xmm_a);
} else {
Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(args[0]).cvt32();
Xbyak::Reg32 reg_b = reg_alloc.UseGpr(args[1]).cvt32();
Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32();
Xbyak::Reg32 and_a_b = reg_a;
Xbyak::Reg32 result = reg_a;
// This relies on the equality x+y == ((x&y) << 1) + (x^y).
// Note that x^y always contains the LSB of the result.
// Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
// We mask by 0x7F to remove the LSB so that it doesn't leak into the field below.
code->mov(xor_a_b, reg_a);
code->and_(and_a_b, reg_b);
code->xor_(xor_a_b, reg_b);
code->shr(xor_a_b, 1);
code->and_(xor_a_b, 0x7F7F7F7F);
code->add(result, xor_a_b);
reg_alloc.DefineValue(inst, result);
}
} }
void EmitX64::EmitPackedHalvingAddU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { void EmitX64::EmitPackedHalvingAddU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {