From 6c2c68bce6c3589a576c461bfb343185db42f68d Mon Sep 17 00:00:00 2001
From: Lioncash
Date: Thu, 11 Apr 2019 16:44:15 -0400
Subject: [PATCH] A64: Implement FCMLA's indexed element variant

With this, all of the instructions introduced with ARMv8.3-CompNum have
an implementation.
---
 src/frontend/A64/decoder/a64.inc              |   2 +-
 .../impl/simd_vector_x_indexed_element.cpp    | 105 ++++++++++++++++++
 2 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/src/frontend/A64/decoder/a64.inc b/src/frontend/A64/decoder/a64.inc
index 5bdca7e8..6d0ae98e 100644
--- a/src/frontend/A64/decoder/a64.inc
+++ b/src/frontend/A64/decoder/a64.inc
@@ -896,7 +896,7 @@ INST(UDOT_elt, "UDOT (by element)", "0Q101
 //INST(SQRDMLSH_elt_2, "SQRDMLSH (by element)", "0Q101111zzLMmmmm1111H0nnnnnddddd")
 //INST(FMULX_elt_3, "FMULX (by element)", "0Q10111100LMmmmm1001H0nnnnnddddd")
 INST(FMULX_elt_4, "FMULX (by element)", "0Q1011111zLMmmmm1001H0nnnnnddddd")
-//INST(FCMLA_elt, "FCMLA (by element)", "0Q101111zzLMmmmm0rr1H0nnnnnddddd")
+INST(FCMLA_elt, "FCMLA (by element)", "0Q101111zzLMmmmm0rr1H0nnnnnddddd")
 
 // Data Processing - FP and SIMD - Cryptographic three register
 INST(SM3TT1A, "SM3TT1A", "11001110010mmmmm10ii00nnnnnddddd")
diff --git a/src/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp b/src/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp
index 2fc62bab..8eda641d 100644
--- a/src/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp
+++ b/src/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp
@@ -187,6 +187,111 @@ bool TranslatorVisitor::MUL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4>
     return MultiplyByElement(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::None);
 }
 
+// FCMLA (by element): multiply-accumulates pairs of elements of Vn with one
+// indexed pair of elements of Vm into Vd, with the operands selected and
+// negated according to the rotation field rot (0, 90, 180 or 270 degrees).
+//
+// Returns ReservedValue() for reserved encodings and InterpretThisInstruction()
+// for the half-precision form; otherwise emits the IR and returns true.
+bool TranslatorVisitor::FCMLA_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<2> rot, Imm<1> H, Vec Vn, Vec Vd) {
+    // Only size 0b01 (16-bit elements) and 0b10 (32-bit elements) are accepted.
+    if (size == 0b00 || size == 0b11) {
+        return ReservedValue();
+    }
+
+    // 16-bit elements: a 64-bit vector holds only two pairs, so an index with H set is reserved.
+    if (size == 0b01 && H == 1 && Q == 0) {
+        return ReservedValue();
+    }
+
+    // 32-bit elements: the index is H alone, so L must be clear; the 64-bit form is reserved too.
+    if (size == 0b10 && (L == 1 || Q == 0)) {
+        return ReservedValue();
+    }
+
+    const size_t esize = 8U << size.ZeroExtend();
+
+    // TODO: We don't support the half-precision floating point variant yet.
+    if (esize == 16) {
+        return InterpretThisInstruction();
+    }
+
+    // Index of the element pair within Vm: H:L for 16-bit elements, H otherwise.
+    const size_t index = [=] {
+        if (size == 0b01) {
+            return concatenate(H, L).ZeroExtend();
+        }
+        return H.ZeroExtend();
+    }();
+
+    // The register number M:Vmlo is converted to a Vec explicitly; a plain ZeroExtend()
+    // yields an integer (as used for esize and index above), not a Vec.
+    const Vec Vm = concatenate(M, Vmlo).ZeroExtend<Vec>();
+
+    const size_t datasize = Q ? 128 : 64;
+    const size_t num_elements = datasize / esize;
+    const size_t num_iterations = num_elements / 2;
+
+    // Positions of the two halves of the indexed pair in Vm; identical for every iteration.
+    const size_t index_first = index * 2;
+    const size_t index_second = index_first + 1;
+
+    const IR::U128 operand1 = V(datasize, Vn);
+    const IR::U128 operand2 = V(datasize, Vm);
+    const IR::U128 operand3 = V(datasize, Vd);
+    IR::U128 result = ir.ZeroVector();
+
+    IR::U32U64 element1;
+    IR::U32U64 element2;
+    IR::U32U64 element3;
+    IR::U32U64 element4;
+    for (size_t e = 0; e < num_iterations; ++e) {
+        const size_t first = e * 2;
+        const size_t second = first + 1;
+
+        // element2/element4 are read from Vn; element1/element3 from the indexed pair of Vm.
+        switch (rot.ZeroExtend()) {
+        case 0b00: // 0 degrees
+            element1 = ir.VectorGetElement(esize, operand2, index_first);
+            element2 = ir.VectorGetElement(esize, operand1, first);
+            element3 = ir.VectorGetElement(esize, operand2, index_second);
+            element4 = ir.VectorGetElement(esize, operand1, first);
+            break;
+        case 0b01: // 90 degrees
+            element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_second));
+            element2 = ir.VectorGetElement(esize, operand1, second);
+            element3 = ir.VectorGetElement(esize, operand2, index_first);
+            element4 = ir.VectorGetElement(esize, operand1, second);
+            break;
+        case 0b10: // 180 degrees
+            element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_first));
+            element2 = ir.VectorGetElement(esize, operand1, first);
+            element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_second));
+            element4 = ir.VectorGetElement(esize, operand1, first);
+            break;
+        case 0b11: // 270 degrees
+            element1 = ir.VectorGetElement(esize, operand2, index_second);
+            element2 = ir.VectorGetElement(esize, operand1, second);
+            element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_first));
+            element4 = ir.VectorGetElement(esize, operand1, second);
+            break;
+        }
+
+        const IR::U32U64 operand3_elem1 = ir.VectorGetElement(esize, operand3, first);
+        const IR::U32U64 operand3_elem2 = ir.VectorGetElement(esize, operand3, second);
+
+        // Fuse each selected product with the matching existing element of Vd (passed first).
+        result = ir.VectorSetElement(esize, result, first,
+                                     ir.FPMulAdd(operand3_elem1, element2, element1, true));
+        result = ir.VectorSetElement(esize, result, second,
+                                     ir.FPMulAdd(operand3_elem2, element4, element3, true));
+    }
+
+    // result started as a zero vector, so any elements the loop did not write stay zero.
+    ir.SetQ(Vd, result);
+    return true;
+}
+
 bool TranslatorVisitor::FMLA_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
     return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate);
 }