EmitPackedHalvingAdd{U,S}16: Add SSE2 implementation
This commit is contained in:
parent
9ac1c87a51
commit
4682211729
1 changed file with 41 additions and 35 deletions
|
@ -1761,25 +1761,39 @@ void EmitX64::EmitPackedHalvingAddU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst*
|
||||||
void EmitX64::EmitPackedHalvingAddU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
void EmitX64::EmitPackedHalvingAddU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
||||||
auto args = reg_alloc.GetArgumentInfo(inst);
|
auto args = reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(args[0]).cvt32();
|
if (args[0].IsInXmm() || args[1].IsInXmm()) {
|
||||||
Xbyak::Reg32 reg_b = reg_alloc.UseGpr(args[1]).cvt32();
|
Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]);
|
||||||
Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32();
|
Xbyak::Xmm xmm_b = reg_alloc.UseXmm(args[1]);
|
||||||
Xbyak::Reg32 and_a_b = reg_a;
|
Xbyak::Xmm tmp = reg_alloc.ScratchXmm();
|
||||||
Xbyak::Reg32 result = reg_a;
|
|
||||||
|
|
||||||
// This relies on the equality x+y == ((x&y) << 1) + (x^y).
|
code->movdqa(tmp, xmm_a);
|
||||||
// Note that x^y always contains the LSB of the result.
|
code->pand(xmm_a, xmm_b);
|
||||||
// Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
|
code->pxor(tmp, xmm_b);
|
||||||
// We mask by 0x7FFF to remove the LSB so that it doesn't leak into the field below.
|
code->psrlw(tmp, 1);
|
||||||
|
code->paddw(xmm_a, tmp);
|
||||||
|
|
||||||
code->mov(xor_a_b, reg_a);
|
reg_alloc.DefineValue(inst, xmm_a);
|
||||||
code->and_(and_a_b, reg_b);
|
} else {
|
||||||
code->xor_(xor_a_b, reg_b);
|
Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(args[0]).cvt32();
|
||||||
code->shr(xor_a_b, 1);
|
Xbyak::Reg32 reg_b = reg_alloc.UseGpr(args[1]).cvt32();
|
||||||
code->and_(xor_a_b, 0x7FFF7FFF);
|
Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32();
|
||||||
code->add(result, xor_a_b);
|
Xbyak::Reg32 and_a_b = reg_a;
|
||||||
|
Xbyak::Reg32 result = reg_a;
|
||||||
|
|
||||||
reg_alloc.DefineValue(inst, result);
|
// This relies on the equality x+y == ((x&y) << 1) + (x^y).
|
||||||
|
// Note that x^y always contains the LSB of the result.
|
||||||
|
// Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
|
||||||
|
// We mask by 0x7FFF to remove the LSB so that it doesn't leak into the field below.
|
||||||
|
|
||||||
|
code->mov(xor_a_b, reg_a);
|
||||||
|
code->and_(and_a_b, reg_b);
|
||||||
|
code->xor_(xor_a_b, reg_b);
|
||||||
|
code->shr(xor_a_b, 1);
|
||||||
|
code->and_(xor_a_b, 0x7FFF7FFF);
|
||||||
|
code->add(result, xor_a_b);
|
||||||
|
|
||||||
|
reg_alloc.DefineValue(inst, result);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void EmitX64::EmitPackedHalvingAddS8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
void EmitX64::EmitPackedHalvingAddS8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
||||||
|
@ -1814,30 +1828,22 @@ void EmitX64::EmitPackedHalvingAddS8(RegAlloc& reg_alloc, IR::Block&, IR::Inst*
|
||||||
void EmitX64::EmitPackedHalvingAddS16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
void EmitX64::EmitPackedHalvingAddS16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
||||||
auto args = reg_alloc.GetArgumentInfo(inst);
|
auto args = reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(args[0]).cvt32();
|
Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]);
|
||||||
Xbyak::Reg32 reg_b = reg_alloc.UseGpr(args[1]).cvt32();
|
Xbyak::Xmm xmm_b = reg_alloc.UseXmm(args[1]);
|
||||||
Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32();
|
Xbyak::Xmm tmp = reg_alloc.ScratchXmm();
|
||||||
Xbyak::Reg32 and_a_b = reg_a;
|
|
||||||
Xbyak::Reg32 result = reg_a;
|
|
||||||
Xbyak::Reg32 carry = reg_alloc.ScratchGpr().cvt32();
|
|
||||||
|
|
||||||
// This relies on the equality x+y == ((x&y) << 1) + (x^y).
|
// This relies on the equality x+y == ((x&y) << 1) + (x^y).
|
||||||
// Note that x^y always contains the LSB of the result.
|
// Note that x^y always contains the LSB of the result.
|
||||||
// Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
|
// Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>>1).
|
||||||
// We mask by 0x7FFF to remove the LSB so that it doesn't leak into the field below.
|
// The arithmetic shift right makes this signed.
|
||||||
// carry propagates the sign bit from (x^y)>>1 upwards by one.
|
|
||||||
|
|
||||||
code->mov(xor_a_b, reg_a);
|
code->movdqa(tmp, xmm_a);
|
||||||
code->and_(and_a_b, reg_b);
|
code->pand(xmm_a, xmm_b);
|
||||||
code->xor_(xor_a_b, reg_b);
|
code->pxor(tmp, xmm_b);
|
||||||
code->mov(carry, xor_a_b);
|
code->psraw(tmp, 1);
|
||||||
code->and_(carry, 0x80008000);
|
code->paddw(xmm_a, tmp);
|
||||||
code->shr(xor_a_b, 1);
|
|
||||||
code->and_(xor_a_b, 0x7FFF7FFF);
|
|
||||||
code->add(result, xor_a_b);
|
|
||||||
code->xor_(result, carry);
|
|
||||||
|
|
||||||
reg_alloc.DefineValue(inst, result);
|
reg_alloc.DefineValue(inst, xmm_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
void EmitX64::EmitPackedHalvingSubU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
void EmitX64::EmitPackedHalvingSubU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
|
||||||
|
|
Loading…
Reference in a new issue