From 3c693f2576c9f3b0afe9c14d13118a6aadbc8a59 Mon Sep 17 00:00:00 2001 From: Wunkolo Date: Sat, 15 May 2021 21:54:33 -0700 Subject: [PATCH] emit_x64_vector: AVX512VBMI implementation of EmitVectorTableLookup128 Also adds AVX512VBMI detection to host_feature --- src/dynarmic/backend/x64/block_of_code.cpp | 2 + src/dynarmic/backend/x64/emit_x64_vector.cpp | 60 ++++++++++++++- src/dynarmic/backend/x64/host_feature.h | 21 +++--- tests/A64/a64.cpp | 78 ++++++++++---------- 4 files changed, 111 insertions(+), 50 deletions(-) diff --git a/src/dynarmic/backend/x64/block_of_code.cpp b/src/dynarmic/backend/x64/block_of_code.cpp index 9e03143f..43683d2a 100644 --- a/src/dynarmic/backend/x64/block_of_code.cpp +++ b/src/dynarmic/backend/x64/block_of_code.cpp @@ -104,6 +104,8 @@ HostFeature GetHostFeatures() { features |= HostFeature::AVX512DQ; if (cpu_info.has(Cpu::tAVX512_BITALG)) features |= HostFeature::AVX512BITALG; + if (cpu_info.has(Cpu::tAVX512VBMI)) + features |= HostFeature::AVX512VBMI; if (cpu_info.has(Cpu::tPCLMULQDQ)) features |= HostFeature::PCLMULQDQ; if (cpu_info.has(Cpu::tF16C)) diff --git a/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/backend/x64/emit_x64_vector.cpp index 9d8340b7..f07b52a0 100644 --- a/src/dynarmic/backend/x64/emit_x64_vector.cpp +++ b/src/dynarmic/backend/x64/emit_x64_vector.cpp @@ -4333,6 +4333,64 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { const size_t table_size = std::count_if(table.begin(), table.end(), [](const auto& elem) { return !elem.IsVoid(); }); const bool is_defaults_zero = !inst->GetArg(0).IsImmediate() && inst->GetArg(0).GetInst()->GetOpcode() == IR::Opcode::ZeroVector; + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 4) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); + const Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(args[0]); + + const Xbyak::Opmask write_mask = k1; + const Xbyak::Opmask upper_mask = k2; + + // Handle vector-table 0,1 + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]); + + code.vptestnmb(write_mask, indicies, code.MConst(xword, 0xE0E0E0E0E0E0E0E0, 0xE0E0E0E0E0E0E0E0)); + code.vpermi2b(indicies | write_mask, xmm_table0, xmm_table1); + + ctx.reg_alloc.Release(xmm_table0); + ctx.reg_alloc.Release(xmm_table1); + + if (is_defaults_zero) { + code.vmovdqu8(defaults | write_mask | T_z, indicies); + } else { + code.vmovdqu8(defaults | write_mask, indicies); + } + + // Handle vector-table 2,3 + // vpcmpuble + code.vpcmpub(upper_mask, indicies, code.MConst(xword, 0x3F3F3F3F3F3F3F3F, 0x3F3F3F3F3F3F3F3F), 2); + code.kandnw(write_mask, write_mask, upper_mask); + + const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseScratchXmm(table[2]); + const Xbyak::Xmm xmm_table3 = ctx.reg_alloc.UseScratchXmm(table[3]); + + code.vpermi2b(indicies, xmm_table2, xmm_table3); + code.vmovdqu8(defaults | write_mask, indicies); + + ctx.reg_alloc.DefineValue(inst, defaults); + return; + } + + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]); + const Xbyak::Opmask write_mask = k1; + + code.vptestnmb(write_mask, indicies, code.MConst(xword, 0xE0E0E0E0E0E0E0E0, 0xE0E0E0E0E0E0E0E0)); + code.vpermi2b(indicies, xmm_table0, xmm_table1); + + if (is_defaults_zero) { + code.vmovdqu8(result | write_mask | T_z, indicies); + } else { + code.vmovdqu8(result | write_mask, indicies); + } + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) { const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); @@ -4408,7 +4466,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(inst, result); return; } - + if (code.HasHostFeature(HostFeature::SSE41)) { const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); diff --git a/src/dynarmic/backend/x64/host_feature.h b/src/dynarmic/backend/x64/host_feature.h index 135a4906..dc47d7af 100644 --- a/src/dynarmic/backend/x64/host_feature.h +++ b/src/dynarmic/backend/x64/host_feature.h @@ -21,18 +21,19 @@ enum class HostFeature : u64 { AVX512BW = 1ULL << 8, AVX512DQ = 1ULL << 9, AVX512BITALG = 1ULL << 10, - PCLMULQDQ = 1ULL << 11, - F16C = 1ULL << 12, - FMA = 1ULL << 13, - AES = 1ULL << 14, - POPCNT = 1ULL << 15, - BMI1 = 1ULL << 16, - BMI2 = 1ULL << 17, - LZCNT = 1ULL << 18, - GFNI = 1ULL << 19, + AVX512VBMI = 1ULL << 11, + PCLMULQDQ = 1ULL << 12, + F16C = 1ULL << 13, + FMA = 1ULL << 14, + AES = 1ULL << 15, + POPCNT = 1ULL << 16, + BMI1 = 1ULL << 17, + BMI2 = 1ULL << 18, + LZCNT = 1ULL << 19, + GFNI = 1ULL << 20, // Zen-based BMI2 - FastBMI2 = 1ULL << 20, + FastBMI2 = 1ULL << 21, // Orthographic AVX512 features on 128 and 256 vectors AVX512_Ortho = AVX512F | AVX512VL, diff --git a/tests/A64/a64.cpp b/tests/A64/a64.cpp index e958f2cc..e351b23f 100644 --- a/tests/A64/a64.cpp +++ b/tests/A64/a64.cpp @@ -113,31 +113,31 @@ TEST_CASE("A64: XTN", "[a64]") { TEST_CASE("A64: TBL", "[a64]") { A64TestEnv env; A64::Jit jit{A64::UserConfig{&env}}; - - env.code_mem.emplace_back(0x0e000100); // TBL v0.8b, { v8.16b }, v0.8b - env.code_mem.emplace_back(0x4e010101); // TBL v1.16b, { v8.16b }, v1.16b - env.code_mem.emplace_back(0x0e022102); // TBL v2.8b, { v8.16b, v9.16b }, v2.8b - env.code_mem.emplace_back(0x4e032103); // TBL v3.16b, { v8.16b, v9.16b }, v3.16b - env.code_mem.emplace_back(0x0e044104); // TBL v4.8b, { v8.16b, v9.16b, v10.16b }, v4.8b - env.code_mem.emplace_back(0x4e054105); // TBL v5.16b, { v8.16b, v9.16b, v10.16b }, v5.16b - env.code_mem.emplace_back(0x0e066106); // TBL v6.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v6.8b - env.code_mem.emplace_back(0x4e076107); // TBL v7.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v7.16b - env.code_mem.emplace_back(0x14000000); // B . + + env.code_mem.emplace_back(0x0e000100); // TBL v0.8b, { v8.16b }, v0.8b + env.code_mem.emplace_back(0x4e010101); // TBL v1.16b, { v8.16b }, v1.16b + env.code_mem.emplace_back(0x0e022102); // TBL v2.8b, { v8.16b, v9.16b }, v2.8b + env.code_mem.emplace_back(0x4e032103); // TBL v3.16b, { v8.16b, v9.16b }, v3.16b + env.code_mem.emplace_back(0x0e044104); // TBL v4.8b, { v8.16b, v9.16b, v10.16b }, v4.8b + env.code_mem.emplace_back(0x4e054105); // TBL v5.16b, { v8.16b, v9.16b, v10.16b }, v5.16b + env.code_mem.emplace_back(0x0e066106); // TBL v6.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v6.8b + env.code_mem.emplace_back(0x4e076107); // TBL v7.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v7.16b + env.code_mem.emplace_back(0x14000000); // B . // Indices // 'FF' intended to test out-of-index - jit.SetVector( 0, {0x000102030405'FF'07, 0x08090a0b0c0d0e0f}); - jit.SetVector( 1, {0x000102030405'FF'07, 0x08090a0b0c0d0e0f}); - jit.SetVector( 2, {0x100011011202'FF'03, 0x1404150516061707}); - jit.SetVector( 3, {0x100011011202'FF'03, 0x1404150516061707}); - jit.SetVector( 4, {0x201000211101'FF'12, 0x0233231303241404}); - jit.SetVector( 5, {0x201000211101'FF'12, 0x0233231303241404}); - jit.SetVector( 6, {0x403010004131'FF'01, 0x4232120243332303}); - jit.SetVector( 7, {0x403010004131'FF'01, 0x4232120243332303}); + jit.SetVector(0, {0x000102030405'FF'07, 0x08090a0b0c0d0e0f}); + jit.SetVector(1, {0x000102030405'FF'07, 0x08090a0b0c0d0e0f}); + jit.SetVector(2, {0x100011011202'FF'03, 0x1404150516061707}); + jit.SetVector(3, {0x100011011202'FF'03, 0x1404150516061707}); + jit.SetVector(4, {0x201000211101'FF'12, 0x0233231303241404}); + jit.SetVector(5, {0x201000211101'FF'12, 0x0233231303241404}); + jit.SetVector(6, {0x403010004131'FF'01, 0x4232120243332303}); + jit.SetVector(7, {0x403010004131'FF'01, 0x4232120243332303}); // Table - jit.SetVector( 8, {0x7766554433221100, 0xffeeddccbbaa9988}); - jit.SetVector( 9, {0xffffffffffffffff, 0xffffffffffffffff}); + jit.SetVector(8, {0x7766554433221100, 0xffeeddccbbaa9988}); + jit.SetVector(9, {0xffffffffffffffff, 0xffffffffffffffff}); jit.SetVector(10, {0xeeeeeeeeeeeeeeee, 0xeeeeeeeeeeeeeeee}); jit.SetVector(11, {0xdddddddddddddddd, 0xdddddddddddddddd}); @@ -160,30 +160,30 @@ TEST_CASE("A64: TBX", "[a64]") { A64TestEnv env; A64::Jit jit{A64::UserConfig{&env}}; - env.code_mem.emplace_back(0x0e001100); // TBX v0.8b, { v8.16b }, v0.8b - env.code_mem.emplace_back(0x4e011101); // TBX v1.16b, { v8.16b }, v1.16b - env.code_mem.emplace_back(0x0e023102); // TBX v2.8b, { v8.16b, v9.16b }, v2.8b - env.code_mem.emplace_back(0x4e033103); // TBX v3.16b, { v8.16b, v9.16b }, v3.16b - env.code_mem.emplace_back(0x0e045104); // TBX v4.8b, { v8.16b, v9.16b, v10.16b }, v4.8b - env.code_mem.emplace_back(0x4e055105); // TBX v5.16b, { v8.16b, v9.16b, v10.16b }, v5.16b - env.code_mem.emplace_back(0x0e067106); // TBX v6.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v6.8b - env.code_mem.emplace_back(0x4e077107); // TBX v7.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v7.16b - env.code_mem.emplace_back(0x14000000); // B . + env.code_mem.emplace_back(0x0e001100); // TBX v0.8b, { v8.16b }, v0.8b + env.code_mem.emplace_back(0x4e011101); // TBX v1.16b, { v8.16b }, v1.16b + env.code_mem.emplace_back(0x0e023102); // TBX v2.8b, { v8.16b, v9.16b }, v2.8b + env.code_mem.emplace_back(0x4e033103); // TBX v3.16b, { v8.16b, v9.16b }, v3.16b + env.code_mem.emplace_back(0x0e045104); // TBX v4.8b, { v8.16b, v9.16b, v10.16b }, v4.8b + env.code_mem.emplace_back(0x4e055105); // TBX v5.16b, { v8.16b, v9.16b, v10.16b }, v5.16b + env.code_mem.emplace_back(0x0e067106); // TBX v6.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v6.8b + env.code_mem.emplace_back(0x4e077107); // TBX v7.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v7.16b + env.code_mem.emplace_back(0x14000000); // B . // Indices // 'FF' intended to test out-of-index - jit.SetVector( 0, {0x000102030405'FF'07, 0x08090a0b0c0d0e0f}); - jit.SetVector( 1, {0x000102030405'FF'07, 0x08090a0b0c0d0e0f}); - jit.SetVector( 2, {0x100011011202'FF'03, 0x1404150516061707}); - jit.SetVector( 3, {0x100011011202'FF'03, 0x1404150516061707}); - jit.SetVector( 4, {0x201000211101'FF'12, 0x0233231303241404}); - jit.SetVector( 5, {0x201000211101'FF'12, 0x0233231303241404}); - jit.SetVector( 6, {0x403010004131'FF'01, 0x4232120243332303}); - jit.SetVector( 7, {0x403010004131'FF'01, 0x4232120243332303}); + jit.SetVector(0, {0x000102030405'FF'07, 0x08090a0b0c0d0e0f}); + jit.SetVector(1, {0x000102030405'FF'07, 0x08090a0b0c0d0e0f}); + jit.SetVector(2, {0x100011011202'FF'03, 0x1404150516061707}); + jit.SetVector(3, {0x100011011202'FF'03, 0x1404150516061707}); + jit.SetVector(4, {0x201000211101'FF'12, 0x0233231303241404}); + jit.SetVector(5, {0x201000211101'FF'12, 0x0233231303241404}); + jit.SetVector(6, {0x403010004131'FF'01, 0x4232120243332303}); + jit.SetVector(7, {0x403010004131'FF'01, 0x4232120243332303}); // Table - jit.SetVector( 8, {0x7766554433221100, 0xffeeddccbbaa9988}); - jit.SetVector( 9, {0xffffffffffffffff, 0xffffffffffffffff}); + jit.SetVector(8, {0x7766554433221100, 0xffeeddccbbaa9988}); + jit.SetVector(9, {0xffffffffffffffff, 0xffffffffffffffff}); jit.SetVector(10, {0xeeeeeeeeeeeeeeee, 0xeeeeeeeeeeeeeeee}); jit.SetVector(11, {0xdddddddddddddddd, 0xdddddddddddddddd});