Merge pull request #475 from lioncash/muladd

A64: Enable half-precision variants of floating-point multiply-add instructions
This commit is contained in:
Merry 2019-04-13 14:30:03 +01:00 committed by MerryMage
commit cb9a1b18b6
13 changed files with 241 additions and 102 deletions

View file

@@ -608,6 +608,7 @@ template<size_t fsize>
static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
using FPT = mp::unsigned_integer_of_size<fsize>; using FPT = mp::unsigned_integer_of_size<fsize>;
if constexpr (fsize != 16) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) { if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -657,6 +658,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(inst, result); ctx.reg_alloc.DefineValue(inst, result);
return; return;
} }
}
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ctx.reg_alloc.HostCall(inst, args[0], args[1], args[2]); ctx.reg_alloc.HostCall(inst, args[0], args[1], args[2]);
@@ -673,6 +675,10 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
#endif #endif
} }
void EmitX64::EmitFPMulAdd16(EmitContext& ctx, IR::Inst* inst) {
EmitFPMulAdd<16>(code, ctx, inst);
}
void EmitX64::EmitFPMulAdd32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitFPMulAdd32(EmitContext& ctx, IR::Inst* inst) {
EmitFPMulAdd<32>(code, ctx, inst); EmitFPMulAdd<32>(code, ctx, inst);
} }

View file

@@ -908,6 +908,7 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
} }
}; };
if constexpr (fsize != 16) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -942,10 +943,15 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(inst, result); ctx.reg_alloc.DefineValue(inst, result);
return; return;
} }
}
EmitFourOpFallback(code, ctx, inst, fallback_fn); EmitFourOpFallback(code, ctx, inst, fallback_fn);
} }
void EmitX64::EmitFPVectorMulAdd16(EmitContext& ctx, IR::Inst* inst) {
EmitFPVectorMulAdd<16>(code, ctx, inst);
}
void EmitX64::EmitFPVectorMulAdd32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitFPVectorMulAdd32(EmitContext& ctx, IR::Inst* inst) {
EmitFPVectorMulAdd<32>(code, ctx, inst); EmitFPVectorMulAdd<32>(code, ctx, inst);
} }

View file

@@ -35,7 +35,7 @@ FPT FPMulAdd(FPT addend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
if (typeA == FPType::QNaN && ((inf1 && zero2) || (zero1 && inf2))) { if (typeA == FPType::QNaN && ((inf1 && zero2) || (zero1 && inf2))) {
FPProcessException(FPExc::InvalidOp, fpcr, fpsr); FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
return FPInfo<FPT>::DefaultNaN(); return FPT(FPInfo<FPT>::DefaultNaN());
} }
if (maybe_nan) { if (maybe_nan) {
@@ -50,29 +50,30 @@ FPT FPMulAdd(FPT addend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
// Raise NaN on (inf * inf) of opposite signs or (inf * zero). // Raise NaN on (inf * inf) of opposite signs or (inf * zero).
if ((inf1 && zero2) || (zero1 && inf2) || (infA && infP && signA != signP)) { if ((inf1 && zero2) || (zero1 && inf2) || (infA && infP && signA != signP)) {
FPProcessException(FPExc::InvalidOp, fpcr, fpsr); FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
return FPInfo<FPT>::DefaultNaN(); return FPT(FPInfo<FPT>::DefaultNaN());
} }
// Handle infinities // Handle infinities
if ((infA && !signA) || (infP && !signP)) { if ((infA && !signA) || (infP && !signP)) {
return FPInfo<FPT>::Infinity(false); return FPT(FPInfo<FPT>::Infinity(false));
} }
if ((infA && signA) || (infP && signP)) { if ((infA && signA) || (infP && signP)) {
return FPInfo<FPT>::Infinity(true); return FPT(FPInfo<FPT>::Infinity(true));
} }
// Result is exactly zero // Result is exactly zero
if (zeroA && zeroP && signA == signP) { if (zeroA && zeroP && signA == signP) {
return FPInfo<FPT>::Zero(signA); return FPT(FPInfo<FPT>::Zero(signA));
} }
const FPUnpacked result_value = FusedMulAdd(valueA, value1, value2); const FPUnpacked result_value = FusedMulAdd(valueA, value1, value2);
if (result_value.mantissa == 0) { if (result_value.mantissa == 0) {
return FPInfo<FPT>::Zero(rounding == RoundingMode::TowardsMinusInfinity); return FPT(FPInfo<FPT>::Zero(rounding == RoundingMode::TowardsMinusInfinity));
} }
return FPRound<FPT>(result_value, fpcr, fpsr); return FPRound<FPT>(result_value, fpcr, fpsr);
} }
template u16 FPMulAdd<u16>(u16 addend, u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr);
template u32 FPMulAdd<u32>(u32 addend, u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr); template u32 FPMulAdd<u32>(u32 addend, u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr);
template u64 FPMulAdd<u64>(u64 addend, u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr); template u64 FPMulAdd<u64>(u64 addend, u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr);

View file

@@ -538,9 +538,9 @@ INST(FCVTZU_fix_1, "FCVTZU (vector, fixed-point)", "01111
INST(SQDMULL_elt_1, "SQDMULL, SQDMULL2 (by element)", "01011111zzLMmmmm1011H0nnnnnddddd") INST(SQDMULL_elt_1, "SQDMULL, SQDMULL2 (by element)", "01011111zzLMmmmm1011H0nnnnnddddd")
INST(SQDMULH_elt_1, "SQDMULH (by element)", "01011111zzLMmmmm1100H0nnnnnddddd") INST(SQDMULH_elt_1, "SQDMULH (by element)", "01011111zzLMmmmm1100H0nnnnnddddd")
INST(SQRDMULH_elt_1, "SQRDMULH (by element)", "01011111zzLMmmmm1101H0nnnnnddddd") INST(SQRDMULH_elt_1, "SQRDMULH (by element)", "01011111zzLMmmmm1101H0nnnnnddddd")
//INST(FMLA_elt_1, "FMLA (by element)", "0101111100LMmmmm0001H0nnnnnddddd") INST(FMLA_elt_1, "FMLA (by element)", "0101111100LMmmmm0001H0nnnnnddddd")
INST(FMLA_elt_2, "FMLA (by element)", "010111111zLMmmmm0001H0nnnnnddddd") INST(FMLA_elt_2, "FMLA (by element)", "010111111zLMmmmm0001H0nnnnnddddd")
//INST(FMLS_elt_1, "FMLS (by element)", "0101111100LMmmmm0101H0nnnnnddddd") INST(FMLS_elt_1, "FMLS (by element)", "0101111100LMmmmm0101H0nnnnnddddd")
INST(FMLS_elt_2, "FMLS (by element)", "010111111zLMmmmm0101H0nnnnnddddd") INST(FMLS_elt_2, "FMLS (by element)", "010111111zLMmmmm0101H0nnnnnddddd")
//INST(FMUL_elt_1, "FMUL (by element)", "0101111100LMmmmm1001H0nnnnnddddd") //INST(FMUL_elt_1, "FMUL (by element)", "0101111100LMmmmm1001H0nnnnnddddd")
INST(FMUL_elt_2, "FMUL (by element)", "010111111zLMmmmm1001H0nnnnnddddd") INST(FMUL_elt_2, "FMUL (by element)", "010111111zLMmmmm1001H0nnnnnddddd")
@@ -583,11 +583,11 @@ INST(INS_elt, "INS (element)", "01101
//INST(FCMGT_reg_3, "FCMGT (register)", "0Q101110110mmmmm001001nnnnnddddd") //INST(FCMGT_reg_3, "FCMGT (register)", "0Q101110110mmmmm001001nnnnnddddd")
//INST(FACGT_3, "FACGT", "0Q101110110mmmmm001011nnnnnddddd") //INST(FACGT_3, "FACGT", "0Q101110110mmmmm001011nnnnnddddd")
//INST(FMAXNM_1, "FMAXNM (vector)", "0Q001110010mmmmm000001nnnnnddddd") //INST(FMAXNM_1, "FMAXNM (vector)", "0Q001110010mmmmm000001nnnnnddddd")
//INST(FMLA_vec_1, "FMLA (vector)", "0Q001110010mmmmm000011nnnnnddddd") INST(FMLA_vec_1, "FMLA (vector)", "0Q001110010mmmmm000011nnnnnddddd")
//INST(FADD_1, "FADD (vector)", "0Q001110010mmmmm000101nnnnnddddd") //INST(FADD_1, "FADD (vector)", "0Q001110010mmmmm000101nnnnnddddd")
//INST(FMAX_1, "FMAX (vector)", "0Q001110010mmmmm001101nnnnnddddd") //INST(FMAX_1, "FMAX (vector)", "0Q001110010mmmmm001101nnnnnddddd")
//INST(FMINNM_1, "FMINNM (vector)", "0Q001110110mmmmm000001nnnnnddddd") //INST(FMINNM_1, "FMINNM (vector)", "0Q001110110mmmmm000001nnnnnddddd")
//INST(FMLS_vec_1, "FMLS (vector)", "0Q001110110mmmmm000011nnnnnddddd") INST(FMLS_vec_1, "FMLS (vector)", "0Q001110110mmmmm000011nnnnnddddd")
//INST(FSUB_1, "FSUB (vector)", "0Q001110110mmmmm000101nnnnnddddd") //INST(FSUB_1, "FSUB (vector)", "0Q001110110mmmmm000101nnnnnddddd")
//INST(FMIN_1, "FMIN (vector)", "0Q001110110mmmmm001101nnnnnddddd") //INST(FMIN_1, "FMIN (vector)", "0Q001110110mmmmm001101nnnnnddddd")
//INST(FMAXNMP_vec_1, "FMAXNMP (vector)", "0Q101110010mmmmm000001nnnnnddddd") //INST(FMAXNMP_vec_1, "FMAXNMP (vector)", "0Q101110010mmmmm000001nnnnnddddd")
@@ -876,9 +876,9 @@ INST(SQDMULL_elt_2, "SQDMULL, SQDMULL2 (by element)", "0Q001
INST(SQDMULH_elt_2, "SQDMULH (by element)", "0Q001111zzLMmmmm1100H0nnnnnddddd") INST(SQDMULH_elt_2, "SQDMULH (by element)", "0Q001111zzLMmmmm1100H0nnnnnddddd")
INST(SQRDMULH_elt_2, "SQRDMULH (by element)", "0Q001111zzLMmmmm1101H0nnnnnddddd") INST(SQRDMULH_elt_2, "SQRDMULH (by element)", "0Q001111zzLMmmmm1101H0nnnnnddddd")
INST(SDOT_elt, "SDOT (by element)", "0Q001111zzLMmmmm1110H0nnnnnddddd") INST(SDOT_elt, "SDOT (by element)", "0Q001111zzLMmmmm1110H0nnnnnddddd")
//INST(FMLA_elt_3, "FMLA (by element)", "0Q00111100LMmmmm0001H0nnnnnddddd") INST(FMLA_elt_3, "FMLA (by element)", "0Q00111100LMmmmm0001H0nnnnnddddd")
INST(FMLA_elt_4, "FMLA (by element)", "0Q0011111zLMmmmm0001H0nnnnnddddd") INST(FMLA_elt_4, "FMLA (by element)", "0Q0011111zLMmmmm0001H0nnnnnddddd")
//INST(FMLS_elt_3, "FMLS (by element)", "0Q00111100LMmmmm0101H0nnnnnddddd") INST(FMLS_elt_3, "FMLS (by element)", "0Q00111100LMmmmm0101H0nnnnnddddd")
INST(FMLS_elt_4, "FMLS (by element)", "0Q0011111zLMmmmm0101H0nnnnnddddd") INST(FMLS_elt_4, "FMLS (by element)", "0Q0011111zLMmmmm0101H0nnnnnddddd")
//INST(FMUL_elt_3, "FMUL (by element)", "0Q00111100LMmmmm1001H0nnnnnddddd") //INST(FMUL_elt_3, "FMUL (by element)", "0Q00111100LMmmmm1001H0nnnnnddddd")
INST(FMUL_elt_4, "FMUL (by element)", "0Q0011111zLMmmmm1001H0nnnnnddddd") INST(FMUL_elt_4, "FMUL (by element)", "0Q0011111zLMmmmm1001H0nnnnnddddd")

View file

@@ -25,7 +25,7 @@ namespace Dynarmic::A64 {
class LocationDescriptor { class LocationDescriptor {
public: public:
static constexpr u64 PC_MASK = 0x00FF'FFFF'FFFF'FFFFull; static constexpr u64 PC_MASK = 0x00FF'FFFF'FFFF'FFFFull;
static constexpr u32 FPCR_MASK = 0x07C0'0000; static constexpr u32 FPCR_MASK = 0x07C8'0000;
LocationDescriptor(u64 pc, FP::FPCR fpcr) : pc(pc & PC_MASK), fpcr(fpcr.Value() & FPCR_MASK) {} LocationDescriptor(u64 pc, FP::FPCR fpcr) : pc(pc & PC_MASK), fpcr(fpcr.Value() & FPCR_MASK) {}

View file

@@ -12,56 +12,56 @@ namespace Dynarmic::A64 {
bool TranslatorVisitor::FMADD_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) { bool TranslatorVisitor::FMADD_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) {
const auto datasize = FPGetDataSize(type); const auto datasize = FPGetDataSize(type);
if (!datasize || *datasize == 16) { if (!datasize) {
return UnallocatedEncoding(); return UnallocatedEncoding();
} }
const IR::U32U64 operanda = V_scalar(*datasize, Va); const IR::U16U32U64 operanda = V_scalar(*datasize, Va);
const IR::U32U64 operand1 = V_scalar(*datasize, Vn); const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn);
const IR::U32U64 operand2 = V_scalar(*datasize, Vm); const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm);
const IR::U32U64 result = ir.FPMulAdd(operanda, operand1, operand2, true); const IR::U16U32U64 result = ir.FPMulAdd(operanda, operand1, operand2, true);
V_scalar(*datasize, Vd, result); V_scalar(*datasize, Vd, result);
return true; return true;
} }
bool TranslatorVisitor::FMSUB_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) { bool TranslatorVisitor::FMSUB_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) {
const auto datasize = FPGetDataSize(type); const auto datasize = FPGetDataSize(type);
if (!datasize || *datasize == 16) { if (!datasize) {
return UnallocatedEncoding(); return UnallocatedEncoding();
} }
const IR::U32U64 operanda = V_scalar(*datasize, Va); const IR::U16U32U64 operanda = V_scalar(*datasize, Va);
const IR::U32U64 operand1 = V_scalar(*datasize, Vn); const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn);
const IR::U32U64 operand2 = V_scalar(*datasize, Vm); const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm);
const IR::U32U64 result = ir.FPMulAdd(operanda, ir.FPNeg(operand1), operand2, true); const IR::U16U32U64 result = ir.FPMulAdd(operanda, ir.FPNeg(operand1), operand2, true);
V_scalar(*datasize, Vd, result); V_scalar(*datasize, Vd, result);
return true; return true;
} }
bool TranslatorVisitor::FNMADD_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) { bool TranslatorVisitor::FNMADD_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) {
const auto datasize = FPGetDataSize(type); const auto datasize = FPGetDataSize(type);
if (!datasize || *datasize == 16) { if (!datasize) {
return UnallocatedEncoding(); return UnallocatedEncoding();
} }
const IR::U32U64 operanda = V_scalar(*datasize, Va); const IR::U16U32U64 operanda = V_scalar(*datasize, Va);
const IR::U32U64 operand1 = V_scalar(*datasize, Vn); const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn);
const IR::U32U64 operand2 = V_scalar(*datasize, Vm); const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm);
const IR::U32U64 result = ir.FPMulAdd(ir.FPNeg(operanda), ir.FPNeg(operand1), operand2, true); const IR::U16U32U64 result = ir.FPMulAdd(ir.FPNeg(operanda), ir.FPNeg(operand1), operand2, true);
V_scalar(*datasize, Vd, result); V_scalar(*datasize, Vd, result);
return true; return true;
} }
bool TranslatorVisitor::FNMSUB_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) { bool TranslatorVisitor::FNMSUB_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) {
const auto datasize = FPGetDataSize(type); const auto datasize = FPGetDataSize(type);
if (!datasize || *datasize == 16) { if (!datasize) {
return UnallocatedEncoding(); return UnallocatedEncoding();
} }
const IR::U32U64 operanda = V_scalar(*datasize, Va); const IR::U16U32U64 operanda = V_scalar(*datasize, Va);
const IR::U32U64 operand1 = V_scalar(*datasize, Vn); const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn);
const IR::U32U64 operand2 = V_scalar(*datasize, Vm); const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm);
const IR::U32U64 result = ir.FPMulAdd(ir.FPNeg(operanda), operand1, operand2, true); const IR::U16U32U64 result = ir.FPMulAdd(ir.FPNeg(operanda), operand1, operand2, true);
V_scalar(*datasize, Vd, result); V_scalar(*datasize, Vd, result);
return true; return true;
} }

View file

@@ -36,7 +36,7 @@ bool MultiplyByElement(TranslatorVisitor& v, bool sz, Imm<1> L, Imm<1> M, Imm<4>
const size_t esize = sz ? 64 : 32; const size_t esize = sz ? 64 : 32;
const IR::U32U64 element = v.ir.VectorGetElement(esize, v.V(idxdsize, Vm), index); const IR::U32U64 element = v.ir.VectorGetElement(esize, v.V(idxdsize, Vm), index);
const IR::U32U64 result = [&] { const IR::U32U64 result = [&]() -> IR::U32U64 {
IR::U32U64 operand1 = v.V_scalar(esize, Vn); IR::U32U64 operand1 = v.V_scalar(esize, Vn);
if (extra_behavior == ExtraBehavior::None) { if (extra_behavior == ExtraBehavior::None) {
@@ -58,12 +58,54 @@ bool MultiplyByElement(TranslatorVisitor& v, bool sz, Imm<1> L, Imm<1> M, Imm<4>
v.V_scalar(esize, Vd, result); v.V_scalar(esize, Vd, result);
return true; return true;
} }
bool MultiplyByElementHalfPrecision(TranslatorVisitor& v, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H,
Vec Vn, Vec Vd, ExtraBehavior extra_behavior) {
const size_t esize = 16;
const size_t idxsize = H == 1 ? 128 : 64;
const size_t index = concatenate(H, L, M).ZeroExtend();
const auto Vm = Vmlo.ZeroExtend<Vec>();
const IR::U16 element = v.ir.VectorGetElement(esize, v.V(idxsize, Vm), index);
const IR::U16 result = [&]() -> IR::U16 {
IR::U16 operand1 = v.V_scalar(esize, Vn);
// TODO: Currently we don't implement half-precision paths
// for regular multiplication and extended multiplication.
if (extra_behavior == ExtraBehavior::None) {
UNIMPLEMENTED();
}
if (extra_behavior == ExtraBehavior::MultiplyExtended) {
UNIMPLEMENTED();
}
if (extra_behavior == ExtraBehavior::Subtract) {
operand1 = v.ir.FPNeg(operand1);
}
const IR::U16 operand2 = v.V_scalar(esize, Vd);
return v.ir.FPMulAdd(operand2, operand1, element, true);
}();
v.V_scalar(esize, Vd, result);
return true;
}
} // Anonymous namespace } // Anonymous namespace
bool TranslatorVisitor::FMLA_elt_1(Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
return MultiplyByElementHalfPrecision(*this, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate);
}
bool TranslatorVisitor::FMLA_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { bool TranslatorVisitor::FMLA_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
return MultiplyByElement(*this, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate); return MultiplyByElement(*this, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate);
} }
bool TranslatorVisitor::FMLS_elt_1(Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
return MultiplyByElementHalfPrecision(*this, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract);
}
bool TranslatorVisitor::FMLS_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { bool TranslatorVisitor::FMLS_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
return MultiplyByElement(*this, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract); return MultiplyByElement(*this, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract);
} }

View file

@@ -680,10 +680,24 @@ bool TranslatorVisitor::FADD_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
return true; return true;
} }
bool TranslatorVisitor::FMLA_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd) {
const size_t datasize = Q ? 128 : 64;
const size_t esize = 16;
const IR::U128 operand1 = V(datasize, Vn);
const IR::U128 operand2 = V(datasize, Vm);
const IR::U128 operand3 = V(datasize, Vd);
const IR::U128 result = ir.FPVectorMulAdd(esize, operand3, operand1, operand2);
V(datasize, Vd, result);
return true;
}
bool TranslatorVisitor::FMLA_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { bool TranslatorVisitor::FMLA_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
if (sz && !Q) { if (sz && !Q) {
return ReservedValue(); return ReservedValue();
} }
const size_t esize = sz ? 64 : 32; const size_t esize = sz ? 64 : 32;
const size_t datasize = Q ? 128 : 64; const size_t datasize = Q ? 128 : 64;
@@ -691,6 +705,20 @@ bool TranslatorVisitor::FMLA_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
const IR::U128 operand2 = V(datasize, Vm); const IR::U128 operand2 = V(datasize, Vm);
const IR::U128 operand3 = V(datasize, Vd); const IR::U128 operand3 = V(datasize, Vd);
const IR::U128 result = ir.FPVectorMulAdd(esize, operand3, operand1, operand2); const IR::U128 result = ir.FPVectorMulAdd(esize, operand3, operand1, operand2);
V(datasize, Vd, result);
return true;
}
bool TranslatorVisitor::FMLS_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd) {
const size_t datasize = Q ? 128 : 64;
const size_t esize = 16;
const IR::U128 operand1 = V(datasize, Vn);
const IR::U128 operand2 = V(datasize, Vm);
const IR::U128 operand3 = V(datasize, Vd);
const IR::U128 result = ir.FPVectorMulAdd(esize, operand3, ir.FPVectorNeg(esize, operand1), operand2);
V(datasize, Vd, result); V(datasize, Vd, result);
return true; return true;
} }
@@ -699,6 +727,7 @@ bool TranslatorVisitor::FMLS_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
if (sz && !Q) { if (sz && !Q) {
return ReservedValue(); return ReservedValue();
} }
const size_t esize = sz ? 64 : 32; const size_t esize = sz ? 64 : 32;
const size_t datasize = Q ? 128 : 64; const size_t datasize = Q ? 128 : 64;
@@ -706,6 +735,7 @@ bool TranslatorVisitor::FMLS_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
const IR::U128 operand2 = V(datasize, Vm); const IR::U128 operand2 = V(datasize, Vm);
const IR::U128 operand3 = V(datasize, Vd); const IR::U128 operand3 = V(datasize, Vd);
const IR::U128 result = ir.FPVectorMulAdd(esize, operand3, ir.FPVectorNeg(esize, operand1), operand2); const IR::U128 result = ir.FPVectorMulAdd(esize, operand3, ir.FPVectorNeg(esize, operand1), operand2);
V(datasize, Vd, result); V(datasize, Vd, result);
return true; return true;
} }

View file

@@ -89,6 +89,39 @@ bool FPMultiplyByElement(TranslatorVisitor& v, bool Q, bool sz, Imm<1> L, Imm<1>
return true; return true;
} }
bool FPMultiplyByElementHalfPrecision(TranslatorVisitor& v, bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H,
Vec Vn, Vec Vd, ExtraBehavior extra_behavior) {
const size_t idxdsize = H == 1 ? 128 : 64;
const size_t index = concatenate(H, L, M).ZeroExtend();
const Vec Vm = Vmlo.ZeroExtend<Vec>();
const size_t esize = 16;
const size_t datasize = Q ? 128 : 64;
const IR::UAny element2 = v.ir.VectorGetElement(esize, v.V(idxdsize, Vm), index);
const IR::U128 operand1 = v.V(datasize, Vn);
const IR::U128 operand2 = Q ? v.ir.VectorBroadcast(esize, element2) : v.ir.VectorBroadcastLower(esize, element2);
const IR::U128 operand3 = v.V(datasize, Vd);
// TODO: We currently don't implement half-precision paths for
// regular multiplies and extended multiplies.
const IR::U128 result = [&]{
switch (extra_behavior) {
case ExtraBehavior::None:
break;
case ExtraBehavior::Extended:
break;
case ExtraBehavior::Accumulate:
return v.ir.FPVectorMulAdd(esize, operand3, operand1, operand2);
case ExtraBehavior::Subtract:
return v.ir.FPVectorMulAdd(esize, operand3, v.ir.FPVectorNeg(esize, operand1), operand2);
}
UNREACHABLE();
return IR::U128{};
}();
v.V(datasize, Vd, result);
return true;
}
using ExtensionFunction = IR::U32 (IREmitter::*)(const IR::UAny&); using ExtensionFunction = IR::U32 (IREmitter::*)(const IR::UAny&);
bool DotProduct(TranslatorVisitor& v, bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, bool DotProduct(TranslatorVisitor& v, bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H,
@@ -276,10 +309,18 @@ bool TranslatorVisitor::FCMLA_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4
return true; return true;
} }
bool TranslatorVisitor::FMLA_elt_3(bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
return FPMultiplyByElementHalfPrecision(*this, Q, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate);
}
bool TranslatorVisitor::FMLA_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { bool TranslatorVisitor::FMLA_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate); return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate);
} }
bool TranslatorVisitor::FMLS_elt_3(bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
return FPMultiplyByElementHalfPrecision(*this, Q, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract);
}
bool TranslatorVisitor::FMLS_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { bool TranslatorVisitor::FMLS_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract); return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract);
} }

View file

@@ -1882,13 +1882,20 @@ U32U64 IREmitter::FPMul(const U32U64& a, const U32U64& b, bool fpcr_controlled)
} }
} }
U32U64 IREmitter::FPMulAdd(const U32U64& a, const U32U64& b, const U32U64& c, bool fpcr_controlled) { U16U32U64 IREmitter::FPMulAdd(const U16U32U64& a, const U16U32U64& b, const U16U32U64& c, bool fpcr_controlled) {
ASSERT(fpcr_controlled); ASSERT(fpcr_controlled);
ASSERT(a.GetType() == b.GetType()); ASSERT(a.GetType() == b.GetType());
if (a.GetType() == Type::U32) {
switch (a.GetType()) {
case Type::U16:
return Inst<U16>(Opcode::FPMulAdd16, a, b, c);
case Type::U32:
return Inst<U32>(Opcode::FPMulAdd32, a, b, c); return Inst<U32>(Opcode::FPMulAdd32, a, b, c);
} else { case Type::U64:
return Inst<U64>(Opcode::FPMulAdd64, a, b, c); return Inst<U64>(Opcode::FPMulAdd64, a, b, c);
default:
UNREACHABLE();
return U16U32U64{};
} }
} }
@@ -2181,6 +2188,8 @@ U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b) {
U128 IREmitter::FPVectorMulAdd(size_t esize, const U128& a, const U128& b, const U128& c) { U128 IREmitter::FPVectorMulAdd(size_t esize, const U128& a, const U128& b, const U128& c) {
switch (esize) { switch (esize) {
case 16:
return Inst<U128>(Opcode::FPVectorMulAdd16, a, b, c);
case 32: case 32:
return Inst<U128>(Opcode::FPVectorMulAdd32, a, b, c); return Inst<U128>(Opcode::FPVectorMulAdd32, a, b, c);
case 64: case 64:

View file

@@ -302,7 +302,7 @@ public:
U32U64 FPMin(const U32U64& a, const U32U64& b, bool fpcr_controlled); U32U64 FPMin(const U32U64& a, const U32U64& b, bool fpcr_controlled);
U32U64 FPMinNumeric(const U32U64& a, const U32U64& b, bool fpcr_controlled); U32U64 FPMinNumeric(const U32U64& a, const U32U64& b, bool fpcr_controlled);
U32U64 FPMul(const U32U64& a, const U32U64& b, bool fpcr_controlled); U32U64 FPMul(const U32U64& a, const U32U64& b, bool fpcr_controlled);
U32U64 FPMulAdd(const U32U64& addend, const U32U64& op1, const U32U64& op2, bool fpcr_controlled); U16U32U64 FPMulAdd(const U16U32U64& addend, const U16U32U64& op1, const U16U32U64& op2, bool fpcr_controlled);
U32U64 FPMulX(const U32U64& a, const U32U64& b); U32U64 FPMulX(const U32U64& a, const U32U64& b);
U16U32U64 FPNeg(const U16U32U64& a); U16U32U64 FPNeg(const U16U32U64& a);
U32U64 FPRecipEstimate(const U32U64& a); U32U64 FPRecipEstimate(const U32U64& a);

View file

@@ -269,6 +269,7 @@ bool Inst::ReadsFromAndWritesToFPSRCumulativeExceptionBits() const {
case Opcode::FPMinNumeric64: case Opcode::FPMinNumeric64:
case Opcode::FPMul32: case Opcode::FPMul32:
case Opcode::FPMul64: case Opcode::FPMul64:
case Opcode::FPMulAdd16:
case Opcode::FPMulAdd32: case Opcode::FPMulAdd32:
case Opcode::FPMulAdd64: case Opcode::FPMulAdd64:
case Opcode::FPRecipEstimate32: case Opcode::FPRecipEstimate32:
@@ -326,6 +327,7 @@ bool Inst::ReadsFromAndWritesToFPSRCumulativeExceptionBits() const {
case Opcode::FPVectorGreaterEqual64: case Opcode::FPVectorGreaterEqual64:
case Opcode::FPVectorMul32: case Opcode::FPVectorMul32:
case Opcode::FPVectorMul64: case Opcode::FPVectorMul64:
case Opcode::FPVectorMulAdd16:
case Opcode::FPVectorMulAdd32: case Opcode::FPVectorMulAdd32:
case Opcode::FPVectorMulAdd64: case Opcode::FPVectorMulAdd64:
case Opcode::FPVectorPairedAddLower32: case Opcode::FPVectorPairedAddLower32:

View file

@@ -483,6 +483,7 @@ OPCODE(FPMinNumeric32, U32, U32,
OPCODE(FPMinNumeric64, U64, U64, U64 ) OPCODE(FPMinNumeric64, U64, U64, U64 )
OPCODE(FPMul32, U32, U32, U32 ) OPCODE(FPMul32, U32, U32, U32 )
OPCODE(FPMul64, U64, U64, U64 ) OPCODE(FPMul64, U64, U64, U64 )
OPCODE(FPMulAdd16, U16, U16, U16, U16 )
OPCODE(FPMulAdd32, U32, U32, U32, U32 ) OPCODE(FPMulAdd32, U32, U32, U32, U32 )
OPCODE(FPMulAdd64, U64, U64, U64, U64 ) OPCODE(FPMulAdd64, U64, U64, U64, U64 )
OPCODE(FPMulX32, U32, U32, U32 ) OPCODE(FPMulX32, U32, U32, U32 )
@@ -556,6 +557,7 @@ OPCODE(FPVectorMin32, U128, U128
OPCODE(FPVectorMin64, U128, U128, U128 ) OPCODE(FPVectorMin64, U128, U128, U128 )
OPCODE(FPVectorMul32, U128, U128, U128 ) OPCODE(FPVectorMul32, U128, U128, U128 )
OPCODE(FPVectorMul64, U128, U128, U128 ) OPCODE(FPVectorMul64, U128, U128, U128 )
OPCODE(FPVectorMulAdd16, U128, U128, U128, U128 )
OPCODE(FPVectorMulAdd32, U128, U128, U128, U128 ) OPCODE(FPVectorMulAdd32, U128, U128, U128, U128 )
OPCODE(FPVectorMulAdd64, U128, U128, U128, U128 ) OPCODE(FPVectorMulAdd64, U128, U128, U128, U128 )
OPCODE(FPVectorMulX32, U128, U128, U128 ) OPCODE(FPVectorMulX32, U128, U128, U128 )