emit_x64: Remove SSSE3 implementation of PackedHalvingAddU8
It is much slower than the SSE2 implementation, so there's no point keeping it around.
commit 18f11972c6
parent c4b40909f7
1 changed file with 16 additions and 47 deletions
@@ -1742,47 +1742,17 @@ void EmitX64::EmitPackedSubS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst*
 void EmitX64::EmitPackedHalvingAddU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
     auto args = reg_alloc.GetArgumentInfo(inst);
 
-    // This code path requires SSSE3 because of the PSHUFB instruction.
-    // A fallback implementation is provided below.
-    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
-        Xbyak::Xmm xmm_a = reg_alloc.UseScratchXmm(args[0]);
-        Xbyak::Xmm xmm_b = reg_alloc.UseScratchXmm(args[1]);
-
-        Xbyak::Xmm xmm_mask = reg_alloc.ScratchXmm();
-        Xbyak::Reg64 mask = reg_alloc.ScratchGpr();
-
-        // Set the mask to expand the values
-        // 0xAABBCCDD becomes 0x00AA00BB00CC00DD
-        code->mov(mask, 0x8003800280018000);
-        code->movq(xmm_mask, mask);
-
-        // Expand each 8-bit value to 16-bit
-        code->pshufb(xmm_a, xmm_mask);
-        code->pshufb(xmm_b, xmm_mask);
-
-        // Add the individual 16-bit values
-        code->paddw(xmm_a, xmm_b);
-
-        // Shift the 16-bit values to the right to halve them
-        code->psrlw(xmm_a, 1);
-
-        // Set the mask to pack the values again
-        // 0x00AA00BB00CC00DD becomes 0xAABBCCDD
-        code->mov(mask, 0x06040200);
-        code->movq(xmm_mask, mask);
-
-        // Shuffle them back to 8-bit values
-        code->pshufb(xmm_a, xmm_mask);
-
-        reg_alloc.DefineValue(inst, xmm_a);
-    } else {
-        // Fallback implementation in case the CPU doesn't support SSSE3
-        Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(args[0]).cvt32();
-        Xbyak::Reg32 reg_b = reg_alloc.UseGpr(args[1]).cvt32();
-        Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32();
-        Xbyak::Reg32 and_a_b = reg_a;
-        Xbyak::Reg32 result = reg_a;
-
-        code->mov(xor_a_b, reg_a);
-        code->and_(and_a_b, reg_b);
-        code->xor_(xor_a_b, reg_b);
+    Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(args[0]).cvt32();
+    Xbyak::Reg32 reg_b = reg_alloc.UseGpr(args[1]).cvt32();
+    Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32();
+    Xbyak::Reg32 and_a_b = reg_a;
+    Xbyak::Reg32 result = reg_a;
+
+    // This relies on the equality x+y == ((x&y) << 1) + (x^y).
+    // Note that x^y always contains the LSB of the result.
+    // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
+    // We mask by 0x7F to remove the LSB so that it doesn't leak into the field below.
+
+    code->mov(xor_a_b, reg_a);
+    code->and_(and_a_b, reg_b);
+    code->xor_(xor_a_b, reg_b);
@@ -1791,7 +1761,6 @@ void EmitX64::EmitPackedHalvingAddU8(RegAlloc& reg_alloc, IR::Block&, IR::Inst*
-        code->add(result, xor_a_b);
+    code->add(result, xor_a_b);
 
-        reg_alloc.DefineValue(inst, result);
-    }
+    reg_alloc.DefineValue(inst, result);
 }
 
 void EmitX64::EmitPackedHalvingAddU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) {
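The surviving code path leans on a carry-save identity that the comments state but do not demonstrate. Below is a minimal scalar sketch in plain C++ of the computation the emitted instructions perform on a 32-bit word holding four packed u8 lanes; the function name and test values are illustrative, not part of dynarmic.

#include <cassert>
#include <cstdint>

// Per-byte halving add on four packed u8 lanes, with no carries between lanes.
// Uses x + y == ((x & y) << 1) + (x ^ y): the AND captures the carry bits,
// the XOR captures the sum bits (including the LSB of each lane's sum).
// So (x + y) / 2 == (x & y) + ((x ^ y) >> 1); masking the shifted XOR with
// 0x7F7F7F7F clears the bit that crossed each byte boundary during the shift.
uint32_t packed_halving_add_u8(uint32_t a, uint32_t b) {
    const uint32_t xor_a_b = a ^ b;
    const uint32_t and_a_b = a & b;
    return and_a_b + ((xor_a_b >> 1) & 0x7F7F7F7Fu);
}

int main() {
    // Lanes: (0xFF+0x01)/2=0x80, (0x10+0x20)/2=0x18, (0x00+0x01)/2=0x00 (truncated), (0x7F+0x7F)/2=0x7F
    assert(packed_halving_add_u8(0xFF10007Fu, 0x0120017Fu) == 0x8018007Fu);
}

Note that this works entirely in general-purpose registers, which is why the fallback path requires no SIMD support at all.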
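For reference, the removed SSSE3 path hinged on the two PSHUFB masks above, whose comments ("0xAABBCCDD becomes 0x00AA00BB00CC00DD") are easiest to verify with a small emulation. The sketch below is a hypothetical plain-C++ model of PSHUFB over one 64-bit lane, written only to illustrate the masks; it is not dynarmic code.

#include <cassert>
#include <cstdint>

// Models PSHUFB over a 64-bit lane: each mask byte indexes a source byte,
// and a mask byte with its high bit set (0x80) produces zero instead.
uint64_t pshufb64(uint64_t src, uint64_t mask) {
    uint64_t result = 0;
    for (int i = 0; i < 8; ++i) {
        const uint8_t m = static_cast<uint8_t>(mask >> (8 * i));
        const uint8_t b = (m & 0x80) ? 0 : static_cast<uint8_t>(src >> (8 * (m & 7)));
        result |= static_cast<uint64_t>(b) << (8 * i);
    }
    return result;
}

int main() {
    // Expand mask: zero-extends each of the four low bytes to 16 bits.
    assert(pshufb64(0xAABBCCDDull, 0x8003800280018000ull) == 0x00AA00BB00CC00DDull);
    // Pack mask: gathers the low byte of each 16-bit lane back together.
    // Only the low 32 bits matter; the zero mask bytes above them replicate byte 0.
    assert((pshufb64(0x00AA00BB00CC00DDull, 0x06040200ull) & 0xFFFFFFFFull) == 0xAABBCCDDull);
}

After the expand step, PADDW adds the widened lanes without overflow and PSRLW halves them, before the second shuffle packs the result back down to bytes. That round trip costs several SIMD instructions plus two mask materializations, which fits the commit's rationale for dropping the path in favour of the cheaper implementation.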