emit_x64_vector: AVX512VBMI implementation of EmitVectorTableLookup128
Also adds AVX512VBMI detection to host_feature
This commit is contained in:
parent
37b24ee29e
commit
3c693f2576
4 changed files with 111 additions and 50 deletions
|
@ -104,6 +104,8 @@ HostFeature GetHostFeatures() {
|
||||||
features |= HostFeature::AVX512DQ;
|
features |= HostFeature::AVX512DQ;
|
||||||
if (cpu_info.has(Cpu::tAVX512_BITALG))
|
if (cpu_info.has(Cpu::tAVX512_BITALG))
|
||||||
features |= HostFeature::AVX512BITALG;
|
features |= HostFeature::AVX512BITALG;
|
||||||
|
if (cpu_info.has(Cpu::tAVX512VBMI))
|
||||||
|
features |= HostFeature::AVX512VBMI;
|
||||||
if (cpu_info.has(Cpu::tPCLMULQDQ))
|
if (cpu_info.has(Cpu::tPCLMULQDQ))
|
||||||
features |= HostFeature::PCLMULQDQ;
|
features |= HostFeature::PCLMULQDQ;
|
||||||
if (cpu_info.has(Cpu::tF16C))
|
if (cpu_info.has(Cpu::tF16C))
|
||||||
|
|
|
@ -4333,6 +4333,64 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const size_t table_size = std::count_if(table.begin(), table.end(), [](const auto& elem) { return !elem.IsVoid(); });
|
const size_t table_size = std::count_if(table.begin(), table.end(), [](const auto& elem) { return !elem.IsVoid(); });
|
||||||
const bool is_defaults_zero = !inst->GetArg(0).IsImmediate() && inst->GetArg(0).GetInst()->GetOpcode() == IR::Opcode::ZeroVector;
|
const bool is_defaults_zero = !inst->GetArg(0).IsImmediate() && inst->GetArg(0).GetInst()->GetOpcode() == IR::Opcode::ZeroVector;
|
||||||
|
|
||||||
|
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 4) {
|
||||||
|
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
|
||||||
|
const Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
|
|
||||||
|
const Xbyak::Opmask write_mask = k1;
|
||||||
|
const Xbyak::Opmask upper_mask = k2;
|
||||||
|
|
||||||
|
// Handle vector-table 0,1
|
||||||
|
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
|
||||||
|
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]);
|
||||||
|
|
||||||
|
code.vptestnmb(write_mask, indicies, code.MConst(xword, 0xE0E0E0E0E0E0E0E0, 0xE0E0E0E0E0E0E0E0));
|
||||||
|
code.vpermi2b(indicies | write_mask, xmm_table0, xmm_table1);
|
||||||
|
|
||||||
|
ctx.reg_alloc.Release(xmm_table0);
|
||||||
|
ctx.reg_alloc.Release(xmm_table1);
|
||||||
|
|
||||||
|
if (is_defaults_zero) {
|
||||||
|
code.vmovdqu8(defaults | write_mask | T_z, indicies);
|
||||||
|
} else {
|
||||||
|
code.vmovdqu8(defaults | write_mask, indicies);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle vector-table 2,3
|
||||||
|
// vpcmpleub: vpcmpub with predicate 2 (unsigned less-than-or-equal)
|
||||||
|
code.vpcmpub(upper_mask, indicies, code.MConst(xword, 0x3F3F3F3F3F3F3F3F, 0x3F3F3F3F3F3F3F3F), 2);
|
||||||
|
code.kandnw(write_mask, write_mask, upper_mask);
|
||||||
|
|
||||||
|
const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseScratchXmm(table[2]);
|
||||||
|
const Xbyak::Xmm xmm_table3 = ctx.reg_alloc.UseScratchXmm(table[3]);
|
||||||
|
|
||||||
|
code.vpermi2b(indicies, xmm_table2, xmm_table3);
|
||||||
|
code.vmovdqu8(defaults | write_mask, indicies);
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, defaults);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) {
|
||||||
|
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
|
||||||
|
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
|
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
|
||||||
|
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]);
|
||||||
|
const Xbyak::Opmask write_mask = k1;
|
||||||
|
|
||||||
|
code.vptestnmb(write_mask, indicies, code.MConst(xword, 0xE0E0E0E0E0E0E0E0, 0xE0E0E0E0E0E0E0E0));
|
||||||
|
code.vpermi2b(indicies, xmm_table0, xmm_table1);
|
||||||
|
|
||||||
|
if (is_defaults_zero) {
|
||||||
|
code.vmovdqu8(result | write_mask | T_z, indicies);
|
||||||
|
} else {
|
||||||
|
code.vmovdqu8(result | write_mask, indicies);
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
|
if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
|
||||||
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
|
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
|
||||||
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
|
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
|
||||||
|
|
|
@ -21,18 +21,19 @@ enum class HostFeature : u64 {
|
||||||
AVX512BW = 1ULL << 8,
|
AVX512BW = 1ULL << 8,
|
||||||
AVX512DQ = 1ULL << 9,
|
AVX512DQ = 1ULL << 9,
|
||||||
AVX512BITALG = 1ULL << 10,
|
AVX512BITALG = 1ULL << 10,
|
||||||
PCLMULQDQ = 1ULL << 11,
|
AVX512VBMI = 1ULL << 11,
|
||||||
F16C = 1ULL << 12,
|
PCLMULQDQ = 1ULL << 12,
|
||||||
FMA = 1ULL << 13,
|
F16C = 1ULL << 13,
|
||||||
AES = 1ULL << 14,
|
FMA = 1ULL << 14,
|
||||||
POPCNT = 1ULL << 15,
|
AES = 1ULL << 15,
|
||||||
BMI1 = 1ULL << 16,
|
POPCNT = 1ULL << 16,
|
||||||
BMI2 = 1ULL << 17,
|
BMI1 = 1ULL << 17,
|
||||||
LZCNT = 1ULL << 18,
|
BMI2 = 1ULL << 18,
|
||||||
GFNI = 1ULL << 19,
|
LZCNT = 1ULL << 19,
|
||||||
|
GFNI = 1ULL << 20,
|
||||||
|
|
||||||
// Zen-based BMI2
|
// Zen-based BMI2
|
||||||
FastBMI2 = 1ULL << 20,
|
FastBMI2 = 1ULL << 21,
|
||||||
|
|
||||||
// Orthogonal AVX512 features on 128 and 256 vectors
|
// Orthogonal AVX512 features on 128 and 256 vectors
|
||||||
AVX512_Ortho = AVX512F | AVX512VL,
|
AVX512_Ortho = AVX512F | AVX512VL,
|
||||||
|
|
Loading…
Reference in a new issue