Merge pull request #466 from lioncash/fcmla
A64: Implement FCMLA's indexed element variant
This commit is contained in:
commit
0b794cbcea
2 changed files with 90 additions and 1 deletions
|
@ -896,7 +896,7 @@ INST(UDOT_elt, "UDOT (by element)", "0Q101
|
|||
//INST(SQRDMLSH_elt_2, "SQRDMLSH (by element)", "0Q101111zzLMmmmm1111H0nnnnnddddd")
|
||||
//INST(FMULX_elt_3, "FMULX (by element)", "0Q10111100LMmmmm1001H0nnnnnddddd")
|
||||
INST(FMULX_elt_4, "FMULX (by element)", "0Q1011111zLMmmmm1001H0nnnnnddddd")
|
||||
//INST(FCMLA_elt, "FCMLA (by element)", "0Q101111zzLMmmmm0rr1H0nnnnnddddd")
|
||||
INST(FCMLA_elt, "FCMLA (by element)", "0Q101111zzLMmmmm0rr1H0nnnnnddddd")
|
||||
|
||||
// Data Processing - FP and SIMD - Cryptographic three register
|
||||
INST(SM3TT1A, "SM3TT1A", "11001110010mmmmm10ii00nnnnnddddd")
|
||||
|
|
|
@ -187,6 +187,95 @@ bool TranslatorVisitor::MUL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4>
|
|||
return MultiplyByElement(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::None);
|
||||
}
|
||||
|
||||
bool TranslatorVisitor::FCMLA_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<2> rot, Imm<1> H, Vec Vn, Vec Vd) {
|
||||
if (size == 0b00 || size == 0b11) {
|
||||
return ReservedValue();
|
||||
}
|
||||
|
||||
if (size == 0b01 && H == 1 && Q == 0) {
|
||||
return ReservedValue();
|
||||
}
|
||||
|
||||
if (size == 0b10 && (L == 1 || Q == 0)) {
|
||||
return ReservedValue();
|
||||
}
|
||||
|
||||
const size_t esize = 8U << size.ZeroExtend();
|
||||
|
||||
// TODO: We don't support the half-precision floating point variant yet.
|
||||
if (esize == 16) {
|
||||
return InterpretThisInstruction();
|
||||
}
|
||||
|
||||
const size_t index = [=] {
|
||||
if (size == 0b01) {
|
||||
return concatenate(H, L).ZeroExtend();
|
||||
}
|
||||
return H.ZeroExtend();
|
||||
}();
|
||||
|
||||
const Vec Vm = concatenate(M, Vmlo).ZeroExtend<Vec>();
|
||||
|
||||
const size_t datasize = Q ? 128 : 64;
|
||||
const size_t num_elements = datasize / esize;
|
||||
const size_t num_iterations = num_elements / 2;
|
||||
|
||||
const IR::U128 operand1 = V(datasize, Vn);
|
||||
const IR::U128 operand2 = V(datasize, Vm);
|
||||
const IR::U128 operand3 = V(datasize, Vd);
|
||||
IR::U128 result = ir.ZeroVector();
|
||||
|
||||
IR::U32U64 element1;
|
||||
IR::U32U64 element2;
|
||||
IR::U32U64 element3;
|
||||
IR::U32U64 element4;
|
||||
for (size_t e = 0; e < num_iterations; ++e) {
|
||||
const size_t first = e * 2;
|
||||
const size_t second = first + 1;
|
||||
|
||||
const size_t index_first = index * 2;
|
||||
const size_t index_second = index_first + 1;
|
||||
|
||||
switch (rot.ZeroExtend()) {
|
||||
case 0b00: // 0 degrees
|
||||
element1 = ir.VectorGetElement(esize, operand2, index_first);
|
||||
element2 = ir.VectorGetElement(esize, operand1, first);
|
||||
element3 = ir.VectorGetElement(esize, operand2, index_second);
|
||||
element4 = ir.VectorGetElement(esize, operand1, first);
|
||||
break;
|
||||
case 0b01: // 90 degrees
|
||||
element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_second));
|
||||
element2 = ir.VectorGetElement(esize, operand1, second);
|
||||
element3 = ir.VectorGetElement(esize, operand2, index_first);
|
||||
element4 = ir.VectorGetElement(esize, operand1, second);
|
||||
break;
|
||||
case 0b10: // 180 degrees
|
||||
element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_first));
|
||||
element2 = ir.VectorGetElement(esize, operand1, first);
|
||||
element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_second));
|
||||
element4 = ir.VectorGetElement(esize, operand1, first);
|
||||
break;
|
||||
case 0b11: // 270 degrees
|
||||
element1 = ir.VectorGetElement(esize, operand2, index_second);
|
||||
element2 = ir.VectorGetElement(esize, operand1, second);
|
||||
element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_first));
|
||||
element4 = ir.VectorGetElement(esize, operand1, second);
|
||||
break;
|
||||
}
|
||||
|
||||
const IR::U32U64 operand3_elem1 = ir.VectorGetElement(esize, operand3, first);
|
||||
const IR::U32U64 operand3_elem2 = ir.VectorGetElement(esize, operand3, second);
|
||||
|
||||
result = ir.VectorSetElement(esize, result, first,
|
||||
ir.FPMulAdd(operand3_elem1, element2, element1, true));
|
||||
result = ir.VectorSetElement(esize, result, second,
|
||||
ir.FPMulAdd(operand3_elem2, element4, element3, true));
|
||||
}
|
||||
|
||||
ir.SetQ(Vd, result);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TranslatorVisitor::FMLA_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
|
||||
return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue