From 8bbc9fdbb6fb8a4e0a8b5fa7ff350f3aab72cae0 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Sat, 20 Jun 2020 22:34:55 +0100 Subject: [PATCH] A32: Implement ASIMD VTBX --- src/backend/x64/emit_x64.h | 3 + src/backend/x64/emit_x64_vector.cpp | 169 +++++++++++++++++- src/frontend/A32/decoder/asimd.inc | 2 +- .../A32/translate/impl/asimd_misc.cpp | 50 +++--- .../A32/translate/impl/translate_arm.h | 1 + src/frontend/ir/ir_emitter.cpp | 14 +- src/frontend/ir/ir_emitter.h | 2 + src/frontend/ir/opcodes.inc | 5 +- 8 files changed, 220 insertions(+), 26 deletions(-) diff --git a/src/backend/x64/emit_x64.h b/src/backend/x64/emit_x64.h index 427db234..23a0f957 100644 --- a/src/backend/x64/emit_x64.h +++ b/src/backend/x64/emit_x64.h @@ -40,6 +40,9 @@ using A64FullVectorWidth = std::integral_constant; template using VectorArray = std::array()>; +template +using HalfVectorArray = std::array() / 2>; + struct EmitContext { EmitContext(RegAlloc& reg_alloc, IR::Block& block); diff --git a/src/backend/x64/emit_x64_vector.cpp b/src/backend/x64/emit_x64_vector.cpp index 9b40e65b..f1892a89 100644 --- a/src/backend/x64/emit_x64_vector.cpp +++ b/src/backend/x64/emit_x64_vector.cpp @@ -4029,7 +4029,174 @@ void EmitX64::EmitVectorTable(EmitContext&, IR::Inst* inst) { ASSERT_MSG(inst->UseCount() == 1, "Table cannot be used multiple times"); } -void EmitX64::EmitVectorTableLookup(EmitContext& ctx, IR::Inst* inst) { +void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) { + ASSERT(inst->GetArg(1).GetInst()->GetOpcode() == IR::Opcode::VectorTable); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto table = ctx.reg_alloc.GetArgumentInfo(inst->GetArg(1).GetInst()); + + const size_t table_size = std::count_if(table.begin(), table.end(), [](const auto& elem){ return !elem.IsVoid(); }); + const bool is_defaults_zero = inst->GetArg(0).IsZero(); + + // TODO: AVX512VL implementation when available (VPERMB / VPERMI2B / VPERMT2B) + + const std::array sat_const{ + 0, + 0x7878787878787878, + 0x7070707070707070, + 0x6868686868686868, + 0x6060606060606060, + }; + + if (code.HasSSSE3() && is_defaults_zero && table_size <= 2) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + + if (table_size == 2) { + const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]); + code.punpcklqdq(xmm_table0, xmm_table0_upper); + ctx.reg_alloc.Release(xmm_table0_upper); + } + + code.paddusb(indicies, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF)); + code.pshufb(xmm_table0, indicies); + + ctx.reg_alloc.DefineValue(inst, xmm_table0); + return; + } + + if (code.HasSSE41() && table_size <= 2) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]); + const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + + if (table_size == 2) { + const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]); + code.punpcklqdq(xmm_table0, xmm_table0_upper); + ctx.reg_alloc.Release(xmm_table0_upper); + } + + if (code.HasAVX()) { + code.vpaddusb(xmm0, indicies, code.MConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF)); + } else { + code.movaps(xmm0, indicies); + code.paddusb(xmm0, code.MConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF)); + } + code.pshufb(xmm_table0, indicies); + code.pblendvb(xmm_table0, defaults); + + ctx.reg_alloc.DefineValue(inst, xmm_table0); + return; + } + + if (code.HasSSE41() && is_defaults_zero) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[2]); + + { + const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]); + code.punpcklqdq(xmm_table0, xmm_table0_upper); + ctx.reg_alloc.Release(xmm_table0_upper); + } + if (table_size == 4) { + const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(table[3]); + code.punpcklqdq(xmm_table1, xmm_table1_upper); + ctx.reg_alloc.Release(xmm_table1_upper); + } + + if (code.HasAVX()) { + code.vpaddusb(xmm0, indicies, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF)); + } else { + code.movaps(xmm0, indicies); + code.paddusb(xmm0, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF)); + } + code.paddusb(indicies, code.MConst(xword, 0x6060606060606060, 0xFFFFFFFFFFFFFFFF)); + code.pshufb(xmm_table0, xmm0); + code.pshufb(xmm_table1, indicies); + code.pblendvb(xmm_table0, xmm_table1); + + ctx.reg_alloc.DefineValue(inst, xmm_table0); + return; + } + + if (code.HasSSE41()) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); + const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[2]); + + { + const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]); + code.punpcklqdq(xmm_table0, xmm_table0_upper); + ctx.reg_alloc.Release(xmm_table0_upper); + } + if (table_size == 4) { + const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(table[3]); + code.punpcklqdq(xmm_table1, xmm_table1_upper); + ctx.reg_alloc.Release(xmm_table1_upper); + } + + if (code.HasAVX()) { + code.vpaddusb(xmm0, indicies, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF)); + } else { + code.movaps(xmm0, indicies); + code.paddusb(xmm0, code.MConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF)); + } + code.pshufb(xmm_table0, indicies); + code.pshufb(xmm_table1, indicies); + code.pblendvb(xmm_table0, xmm_table1); + if (code.HasAVX()) { + code.vpaddusb(xmm0, indicies, code.MConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF)); + } else { + code.movaps(xmm0, indicies); + code.paddusb(xmm0, code.MConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF)); + } + code.pblendvb(xmm_table0, defaults); + + ctx.reg_alloc.DefineValue(inst, xmm_table0); + return; + } + + const u32 stack_space = static_cast(6 * 8); + code.sub(rsp, stack_space + ABI_SHADOW_SPACE); + for (size_t i = 0; i < table_size; ++i) { + const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(table[i]); + code.movq(qword[rsp + ABI_SHADOW_SPACE + i * 8], table_value); + ctx.reg_alloc.Release(table_value); + } + const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(nullptr); + + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 4 * 8]); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 5 * 8]); + code.mov(code.ABI_PARAM4.cvt32(), table_size); + code.movq(qword[code.ABI_PARAM2], defaults); + code.movq(qword[code.ABI_PARAM3], indicies); + + code.CallLambda( + [](const HalfVectorArray* table, HalfVectorArray& result, const HalfVectorArray& indicies, size_t table_size) { + for (size_t i = 0; i < result.size(); ++i) { + const size_t index = indicies[i] / table[0].size(); + const size_t elem = indicies[i] % table[0].size(); + if (index < table_size) { + result[i] = table[index][elem]; + } + } + } + ); + + code.movq(result, qword[rsp + ABI_SHADOW_SPACE + 4 * 8]); + code.add(rsp, stack_space + ABI_SHADOW_SPACE); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { ASSERT(inst->GetArg(1).GetInst()->GetOpcode() == IR::Opcode::VectorTable); auto args = ctx.reg_alloc.GetArgumentInfo(inst); diff --git a/src/frontend/A32/decoder/asimd.inc b/src/frontend/A32/decoder/asimd.inc index 642a2a62..f4b0f939 100644 --- a/src/frontend/A32/decoder/asimd.inc +++ b/src/frontend/A32/decoder/asimd.inc @@ -112,7 +112,7 @@ INST(asimd_VRSQRTE, "VRSQRTE", "111100111D11zz11dddd010 // Miscellaneous INST(asimd_VEXT, "VEXT", "111100101D11nnnnddddiiiiNQM0mmmm") // ASIMD INST(asimd_VTBL, "VTBL", "111100111D11nnnndddd10zzN0M0mmmm") // ASIMD -//INST(asimd_VTBX, "VTBX", "111100111D11nnnndddd10zzN1M0mmmm") // ASIMD +INST(asimd_VTBX, "VTBX", "111100111D11nnnndddd10zzN1M0mmmm") // ASIMD //INST(asimd_VDUP_scalar, "VDUP (scalar)", "111100111D11iiiidddd11000QM0mmmm") // ASIMD // One register and modified immediate diff --git a/src/frontend/A32/translate/impl/asimd_misc.cpp b/src/frontend/A32/translate/impl/asimd_misc.cpp index 72371856..1518ce48 100644 --- a/src/frontend/A32/translate/impl/asimd_misc.cpp +++ b/src/frontend/A32/translate/impl/asimd_misc.cpp @@ -10,6 +10,31 @@ namespace Dynarmic::A32 { +static bool TableLookup(ArmTranslatorVisitor& v, bool is_vtbl, bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) { + const size_t length = len + 1; + const auto d = ToVector(false, Vd, D); + const auto m = ToVector(false, Vm, M); + const auto n = ToVector(false, Vn, N); + + if (RegNumber(n) + length > 32) { + return v.UnpredictableInstruction(); + } + + const IR::Table table = v.ir.VectorTable([&]{ + std::vector result; + for (size_t i = 0; i < length; ++i) { + result.emplace_back(v.ir.GetExtendedRegister(n + i)); + } + return result; + }()); + const IR::U64 indicies = v.ir.GetExtendedRegister(m); + const IR::U64 defaults = is_vtbl ? v.ir.Imm64(0) : IR::U64{v.ir.GetExtendedRegister(d)}; + const IR::U64 result = v.ir.VectorTableLookup(defaults, table, indicies); + + v.ir.SetExtendedRegister(d, result); + return true; +} + bool ArmTranslatorVisitor::asimd_VEXT(bool D, size_t Vn, size_t Vd, Imm<4> imm4, bool N, bool Q, bool M, size_t Vm) { if (Q && (Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm))) { return UndefinedInstruction(); @@ -33,28 +58,11 @@ bool ArmTranslatorVisitor::asimd_VEXT(bool D, size_t Vn, size_t Vd, Imm<4> imm4, } bool ArmTranslatorVisitor::asimd_VTBL(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) { - const size_t length = len + 1; - const auto d = ToVector(false, Vd, D); - const auto m = ToVector(false, Vm, M); - const auto n = ToVector(false, Vn, N); + return TableLookup(*this, true, D, Vn, Vd, len, N, M, Vm); +} - if (RegNumber(n) + length > 32) { - return UnpredictableInstruction(); - } - - const IR::U64 table0 = ir.GetExtendedRegister(n); - const IR::U64 table1 = length >= 2 ? IR::U64{ir.GetExtendedRegister(n + 1)} : ir.Imm64(0); - const IR::U64 table2 = length >= 3 ? IR::U64{ir.GetExtendedRegister(n + 2)} : ir.Imm64(0); - const IR::U64 table3 = length == 4 ? IR::U64{ir.GetExtendedRegister(n + 3)} : ir.Imm64(0); - - const IR::Table table = ir.VectorTable(length <= 2 - ? std::vector{ir.Pack2x64To1x128(table0, table1)} - : std::vector{ir.Pack2x64To1x128(table0, table1), ir.Pack2x64To1x128(table2, table3)}); - const IR::U128 indicies = ir.GetVector(m); - const IR::U128 result = ir.VectorTableLookup(ir.ZeroVector(), table, indicies); - - ir.SetVector(d, result); - return true; +bool ArmTranslatorVisitor::asimd_VTBX(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) { + return TableLookup(*this, false, D, Vn, Vd, len, N, M, Vm); } } // namespace Dynarmic::A32 diff --git a/src/frontend/A32/translate/impl/translate_arm.h b/src/frontend/A32/translate/impl/translate_arm.h index 4735497b..216abcef 100644 --- a/src/frontend/A32/translate/impl/translate_arm.h +++ b/src/frontend/A32/translate/impl/translate_arm.h @@ -514,6 +514,7 @@ struct ArmTranslatorVisitor final { // Advanced SIMD miscellaneous bool asimd_VEXT(bool D, size_t Vn, size_t Vd, Imm<4> imm4, bool N, bool Q, bool M, size_t Vm); bool asimd_VTBL(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm); + bool asimd_VTBX(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm); // Advanced SIMD load/store structures bool v8_VST_multiple(bool D, Reg n, size_t Vd, Imm<4> type, size_t sz, size_t align, Reg m); diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp index 69684973..16b9371e 100644 --- a/src/frontend/ir/ir_emitter.cpp +++ b/src/frontend/ir/ir_emitter.cpp @@ -1739,14 +1739,26 @@ U128 IREmitter::VectorSub(size_t esize, const U128& a, const U128& b) { UNREACHABLE(); } +Table IREmitter::VectorTable(std::vector values) { + ASSERT(values.size() >= 1 && values.size() <= 4); + values.resize(4); + return Inst(Opcode::VectorTable, values[0], values[1], values[2], values[3]); +} + Table IREmitter::VectorTable(std::vector values) { ASSERT(values.size() >= 1 && values.size() <= 4); values.resize(4); return Inst
(Opcode::VectorTable, values[0], values[1], values[2], values[3]); } +U64 IREmitter::VectorTableLookup(const U64& defaults, const Table& table, const U64& indices) { + ASSERT(table.GetInst()->GetArg(0).GetType() == Type::U64); + return Inst(Opcode::VectorTableLookup64, defaults, table, indices); +} + U128 IREmitter::VectorTableLookup(const U128& defaults, const Table& table, const U128& indices) { - return Inst(Opcode::VectorTableLookup, defaults, table, indices); + ASSERT(table.GetInst()->GetArg(0).GetType() == Type::U128); + return Inst(Opcode::VectorTableLookup128, defaults, table, indices); } U128 IREmitter::VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b) { diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h index 0311d057..03607636 100644 --- a/src/frontend/ir/ir_emitter.h +++ b/src/frontend/ir/ir_emitter.h @@ -297,7 +297,9 @@ public: U128 VectorSignedSaturatedShiftLeft(size_t esize, const U128& a, const U128& b); U128 VectorSignedSaturatedShiftLeftUnsigned(size_t esize, const U128& a, const U128& b); U128 VectorSub(size_t esize, const U128& a, const U128& b); + Table VectorTable(std::vector values); Table VectorTable(std::vector values); + U64 VectorTableLookup(const U64& defaults, const Table& table, const U64& indices); U128 VectorTableLookup(const U128& defaults, const Table& table, const U128& indices); U128 VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b); U128 VectorUnsignedRecipEstimate(const U128& a); diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc index d94fe7bd..1ede0e8b 100644 --- a/src/frontend/ir/opcodes.inc +++ b/src/frontend/ir/opcodes.inc @@ -470,8 +470,9 @@ OPCODE(VectorSub8, U128, U128 OPCODE(VectorSub16, U128, U128, U128 ) OPCODE(VectorSub32, U128, U128, U128 ) OPCODE(VectorSub64, U128, U128, U128 ) -OPCODE(VectorTable, Table, U128, Opaque, Opaque, Opaque ) -OPCODE(VectorTableLookup, U128, U128, Table, U128 ) +OPCODE(VectorTable, Table, Opaque, Opaque, Opaque, Opaque ) +OPCODE(VectorTableLookup64, U64, U64, Table, U64 ) +OPCODE(VectorTableLookup128, U128, U128, Table, U128 ) OPCODE(VectorUnsignedAbsoluteDifference8, U128, U128, U128 ) OPCODE(VectorUnsignedAbsoluteDifference16, U128, U128, U128 ) OPCODE(VectorUnsignedAbsoluteDifference32, U128, U128, U128 )