ir: Add opcodes for vector CLZ operations
We can optimize these cases further with the use of a fair bit of shuffling via pshufb and the use of masks, but given the uncommon use of this instruction, I wouldn't consider the amount of code required to be worth it over a simple, manageable naive solution like this. If we ever do hit a case where vectorized CLZ happens to be a bottleneck, then we can revisit this. At least with AVX-512CD, this can be done with a single instruction for the 32-bit word case.
This commit is contained in:
parent
d4c37a68a8
commit
e739624296
4 changed files with 54 additions and 0 deletions
|
@ -616,6 +616,43 @@ void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
ctx.reg_alloc.DefineValue(inst, a);
|
ctx.reg_alloc.DefineValue(inst, a);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
static void EmitVectorCountLeadingZeros(VectorArray<T>& result, const VectorArray<T>& data) {
|
||||||
|
for (size_t i = 0; i < result.size(); i++) {
|
||||||
|
T element = data[i];
|
||||||
|
|
||||||
|
size_t count = Common::BitSize<T>();
|
||||||
|
while (element != 0) {
|
||||||
|
element >>= 1;
|
||||||
|
--count;
|
||||||
|
}
|
||||||
|
|
||||||
|
result[i] = static_cast<T>(count);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u8>);
|
||||||
|
}
|
||||||
|
|
||||||
|
void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u16>);
|
||||||
|
}
|
||||||
|
|
||||||
|
void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512CD) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) {
|
||||||
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
|
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
|
code.vplzcntd(data, data);
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, data);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u32>);
|
||||||
|
}
|
||||||
|
|
||||||
void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) {
|
void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) {
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
|
const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
|
|
|
@ -916,6 +916,19 @@ U128 IREmitter::VectorBroadcast(size_t esize, const UAny& a) {
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
U128 IREmitter::VectorCountLeadingZeros(size_t esize, const U128& a) {
|
||||||
|
switch (esize) {
|
||||||
|
case 8:
|
||||||
|
return Inst<U128>(Opcode::VectorCountLeadingZeros8, a);
|
||||||
|
case 16:
|
||||||
|
return Inst<U128>(Opcode::VectorCountLeadingZeros16, a);
|
||||||
|
case 32:
|
||||||
|
return Inst<U128>(Opcode::VectorCountLeadingZeros32, a);
|
||||||
|
}
|
||||||
|
UNREACHABLE();
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
U128 IREmitter::VectorDeinterleaveEven(size_t esize, const U128& a, const U128& b) {
|
U128 IREmitter::VectorDeinterleaveEven(size_t esize, const U128& a, const U128& b) {
|
||||||
switch (esize) {
|
switch (esize) {
|
||||||
case 8:
|
case 8:
|
||||||
|
|
|
@ -209,6 +209,7 @@ public:
|
||||||
U128 VectorArithmeticShiftRight(size_t esize, const U128& a, u8 shift_amount);
|
U128 VectorArithmeticShiftRight(size_t esize, const U128& a, u8 shift_amount);
|
||||||
U128 VectorBroadcast(size_t esize, const UAny& a);
|
U128 VectorBroadcast(size_t esize, const UAny& a);
|
||||||
U128 VectorBroadcastLower(size_t esize, const UAny& a);
|
U128 VectorBroadcastLower(size_t esize, const UAny& a);
|
||||||
|
U128 VectorCountLeadingZeros(size_t esize, const U128& a);
|
||||||
U128 VectorEor(const U128& a, const U128& b);
|
U128 VectorEor(const U128& a, const U128& b);
|
||||||
U128 VectorDeinterleaveEven(size_t esize, const U128& a, const U128& b);
|
U128 VectorDeinterleaveEven(size_t esize, const U128& a, const U128& b);
|
||||||
U128 VectorDeinterleaveOdd(size_t esize, const U128& a, const U128& b);
|
U128 VectorDeinterleaveOdd(size_t esize, const U128& a, const U128& b);
|
||||||
|
|
|
@ -258,6 +258,9 @@ OPCODE(VectorBroadcast8, U128, U8
|
||||||
OPCODE(VectorBroadcast16, U128, U16 )
|
OPCODE(VectorBroadcast16, U128, U16 )
|
||||||
OPCODE(VectorBroadcast32, U128, U32 )
|
OPCODE(VectorBroadcast32, U128, U32 )
|
||||||
OPCODE(VectorBroadcast64, U128, U64 )
|
OPCODE(VectorBroadcast64, U128, U64 )
|
||||||
|
OPCODE(VectorCountLeadingZeros8, U128, U128 )
|
||||||
|
OPCODE(VectorCountLeadingZeros16, U128, U128 )
|
||||||
|
OPCODE(VectorCountLeadingZeros32, U128, U128 )
|
||||||
OPCODE(VectorDeinterleaveEven8, U128, U128, U128 )
|
OPCODE(VectorDeinterleaveEven8, U128, U128, U128 )
|
||||||
OPCODE(VectorDeinterleaveEven16, U128, U128, U128 )
|
OPCODE(VectorDeinterleaveEven16, U128, U128, U128 )
|
||||||
OPCODE(VectorDeinterleaveEven32, U128, U128, U128 )
|
OPCODE(VectorDeinterleaveEven32, U128, U128, U128 )
|
||||||
|
|
Loading…
Reference in a new issue