A64: Implement PMULL{2}
This commit is contained in:
parent
5ebf496d4e
commit
7fdd8b0197
6 changed files with 64 additions and 5 deletions
|
@ -1866,12 +1866,12 @@ void EmitX64::EmitVectorPairedAddUnsignedWiden32(EmitContext& ctx, IR::Inst* ins
|
||||||
ctx.reg_alloc.DefineValue(inst, a);
|
ctx.reg_alloc.DefineValue(inst, a);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename D, typename T>
|
||||||
static T PolynomialMultiply(T lhs, T rhs) {
|
static D PolynomialMultiply(T lhs, T rhs) {
|
||||||
constexpr size_t bit_size = Common::BitSize<T>();
|
constexpr size_t bit_size = Common::BitSize<T>();
|
||||||
const std::bitset<bit_size> operand(lhs);
|
const std::bitset<bit_size> operand(lhs);
|
||||||
|
|
||||||
T res = 0;
|
D res = 0;
|
||||||
for (size_t i = 0; i < bit_size; i++) {
|
for (size_t i = 0; i < bit_size; i++) {
|
||||||
if (operand[i]) {
|
if (operand[i]) {
|
||||||
res ^= rhs << i;
|
res ^= rhs << i;
|
||||||
|
@ -1883,7 +1883,35 @@ static T PolynomialMultiply(T lhs, T rhs) {
|
||||||
|
|
||||||
// Emits the 8-bit vector polynomial multiply through the generic two-argument
// fallback: every result lane is PolynomialMultiply<u8, u8> of the matching
// lanes of a and b.
void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
        // All three arrays share the same element type, so they are walked in lock-step.
        const size_t lane_count = result.size();
        for (size_t lane = 0; lane < lane_count; ++lane) {
            result[lane] = PolynomialMultiply<u8, u8>(a[lane], b[lane]);
        }
    });
}
|
||||||
|
|
||||||
|
// Emits the widening 8-bit polynomial multiply through the generic two-argument
// fallback: u8 source lanes are multiplied into u16 destination lanes.
void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst) {
    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
        // Only as many source lanes as there are destination lanes are read;
        // any further lanes of a and b are ignored.
        size_t lane = 0;
        for (auto& product : result) {
            product = PolynomialMultiply<u16, u8>(a[lane], b[lane]);
            ++lane;
        }
    });
}
|
||||||
|
|
||||||
|
// Emits the 64-bit widening polynomial multiply through the generic
// two-argument fallback. Only lane 0 of each source is used; the product is
// written as two u64 halves into result[0] and result[1].
void EmitX64::EmitVectorPolynomialMultiplyLong64(EmitContext& ctx, IR::Inst* inst) {
    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) {
        // Collects the bits of (multiplicand << bit) that fall off the top of a
        // u64, for every set bit 1..63 of multiplier. Bit 0 is skipped because
        // a shift by zero loses nothing.
        const auto upper_half = [](u64 multiplier, u64 multiplicand) {
            constexpr size_t width = Common::BitSize<u64>();

            u64 spilled = 0;
            // Iterate by right-shift amount; the matching multiplier bit is
            // (width - shift). XOR is order-independent, so direction is free.
            for (size_t shift = width - 1; shift > 0; --shift) {
                if (Common::Bit(width - shift, multiplier)) {
                    spilled ^= multiplicand >> shift;
                }
            }

            return spilled;
        };

        // NOTE(review): PolynomialMultiply<u64, u64> presumably yields the low
        // 64 bits of the product - confirm against its full definition.
        result[0] = PolynomialMultiply<u64, u64>(a[0], b[0]);
        result[1] = upper_half(a[0], b[0]);
    });
}
|
||||||
|
|
||||||
|
|
|
@ -687,7 +687,7 @@ INST(SABDL, "SABDL, SABDL2", "0Q001
|
||||||
INST(SMLAL_vec, "SMLAL, SMLAL2 (vector)", "0Q001110zz1mmmmm100000nnnnnddddd")
|
INST(SMLAL_vec, "SMLAL, SMLAL2 (vector)", "0Q001110zz1mmmmm100000nnnnnddddd")
|
||||||
INST(SMLSL_vec, "SMLSL, SMLSL2 (vector)", "0Q001110zz1mmmmm101000nnnnnddddd")
|
INST(SMLSL_vec, "SMLSL, SMLSL2 (vector)", "0Q001110zz1mmmmm101000nnnnnddddd")
|
||||||
INST(SMULL_vec, "SMULL, SMULL2 (vector)", "0Q001110zz1mmmmm110000nnnnnddddd")
|
INST(SMULL_vec, "SMULL, SMULL2 (vector)", "0Q001110zz1mmmmm110000nnnnnddddd")
|
||||||
//INST(PMULL, "PMULL, PMULL2", "0Q001110zz1mmmmm111000nnnnnddddd")
|
INST(PMULL, "PMULL, PMULL2", "0Q001110zz1mmmmm111000nnnnnddddd")
|
||||||
INST(UADDL, "UADDL, UADDL2", "0Q101110zz1mmmmm000000nnnnnddddd")
|
INST(UADDL, "UADDL, UADDL2", "0Q101110zz1mmmmm000000nnnnnddddd")
|
||||||
INST(UADDW, "UADDW, UADDW2", "0Q101110zz1mmmmm000100nnnnnddddd")
|
INST(UADDW, "UADDW, UADDW2", "0Q101110zz1mmmmm000100nnnnnddddd")
|
||||||
INST(USUBL, "USUBL, USUBL2", "0Q101110zz1mmmmm001000nnnnnddddd")
|
INST(USUBL, "USUBL, USUBL2", "0Q101110zz1mmmmm001000nnnnnddddd")
|
||||||
|
|
|
@ -161,6 +161,22 @@ bool WideOperation(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Ve
|
||||||
}
|
}
|
||||||
} // Anonymous namespace
|
} // Anonymous namespace
|
||||||
|
|
||||||
|
bool TranslatorVisitor::PMULL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
|
||||||
|
if (size == 0b01 || size == 0b10) {
|
||||||
|
return ReservedValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t esize = 8 << size.ZeroExtend();
|
||||||
|
const size_t datasize = 64;
|
||||||
|
|
||||||
|
const IR::U128 operand1 = Vpart(datasize, Vn, Q);
|
||||||
|
const IR::U128 operand2 = Vpart(datasize, Vm, Q);
|
||||||
|
const IR::U128 result = ir.VectorPolynomialMultiplyLong(esize, operand1, operand2);
|
||||||
|
|
||||||
|
V(128, Vd, result);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool TranslatorVisitor::SABAL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
|
bool TranslatorVisitor::SABAL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
|
||||||
return AbsoluteDifferenceLong(*this, Q, size, Vm, Vn, Vd, AbsoluteDifferenceBehavior::Accumulate, Signedness::Signed);
|
return AbsoluteDifferenceLong(*this, Q, size, Vm, Vn, Vd, AbsoluteDifferenceBehavior::Accumulate, Signedness::Signed);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1198,6 +1198,18 @@ U128 IREmitter::VectorPolynomialMultiply(const U128& a, const U128& b) {
|
||||||
return Inst<U128>(Opcode::VectorPolynomialMultiply8, a, b);
|
return Inst<U128>(Opcode::VectorPolynomialMultiply8, a, b);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emits the polynomial-multiply-long IR instruction matching esize.
// esize must be 8 or 64; any other value reaches UNREACHABLE().
U128 IREmitter::VectorPolynomialMultiplyLong(size_t esize, const U128& a, const U128& b) {
    if (esize == 8) {
        return Inst<U128>(Opcode::VectorPolynomialMultiplyLong8, a, b);
    }
    if (esize == 64) {
        return Inst<U128>(Opcode::VectorPolynomialMultiplyLong64, a, b);
    }

    // No opcode exists for any other element size.
    UNREACHABLE();
    return {};
}
|
||||||
|
|
||||||
// Emits a VectorPopulationCount IR instruction over a and returns its value.
U128 IREmitter::VectorPopulationCount(const U128& a) {
    const auto opcode = Opcode::VectorPopulationCount;
    return Inst<U128>(opcode, a);
}
|
||||||
|
|
|
@ -239,6 +239,7 @@ public:
|
||||||
U128 VectorPairedAddSignedWiden(size_t original_esize, const U128& a);
|
U128 VectorPairedAddSignedWiden(size_t original_esize, const U128& a);
|
||||||
U128 VectorPairedAddUnsignedWiden(size_t original_esize, const U128& a);
|
U128 VectorPairedAddUnsignedWiden(size_t original_esize, const U128& a);
|
||||||
U128 VectorPolynomialMultiply(const U128& a, const U128& b);
|
U128 VectorPolynomialMultiply(const U128& a, const U128& b);
|
||||||
|
// Polynomial multiply long of a and b. esize selects the opcode and must be 8 or 64.
U128 VectorPolynomialMultiplyLong(size_t esize, const U128& a, const U128& b);
|
||||||
U128 VectorPopulationCount(const U128& a);
|
U128 VectorPopulationCount(const U128& a);
|
||||||
U128 VectorReverseBits(const U128& a);
|
U128 VectorReverseBits(const U128& a);
|
||||||
U128 VectorRotateLeft(size_t esize, const U128& a, u8 amount);
|
U128 VectorRotateLeft(size_t esize, const U128& a, u8 amount);
|
||||||
|
|
|
@ -331,6 +331,8 @@ OPCODE(VectorPairedAdd16, T::U128, T::U128,
|
||||||
OPCODE(VectorPairedAdd32, T::U128, T::U128, T::U128 )
|
OPCODE(VectorPairedAdd32, T::U128, T::U128, T::U128 )
|
||||||
OPCODE(VectorPairedAdd64, T::U128, T::U128, T::U128 )
|
OPCODE(VectorPairedAdd64, T::U128, T::U128, T::U128 )
|
||||||
OPCODE(VectorPolynomialMultiply8, T::U128, T::U128, T::U128 )
|
OPCODE(VectorPolynomialMultiply8, T::U128, T::U128, T::U128 )
|
||||||
|
OPCODE(VectorPolynomialMultiplyLong8, T::U128, T::U128, T::U128 )
|
||||||
|
OPCODE(VectorPolynomialMultiplyLong64, T::U128, T::U128, T::U128 )
|
||||||
OPCODE(VectorPopulationCount, T::U128, T::U128 )
|
OPCODE(VectorPopulationCount, T::U128, T::U128 )
|
||||||
OPCODE(VectorReverseBits, T::U128, T::U128 )
|
OPCODE(VectorReverseBits, T::U128, T::U128 )
|
||||||
OPCODE(VectorRoundingHalvingAddS8, T::U128, T::U128, T::U128 )
|
OPCODE(VectorRoundingHalvingAddS8, T::U128, T::U128, T::U128 )
|
||||||
|
|
Loading…
Reference in a new issue