A32: Implement ASIMD VMAX, VMIN (floating-point)

This commit is contained in:
MerryMage 2020-06-20 01:18:17 +01:00
parent 8d067d5d60
commit bb4f3aa407
7 changed files with 100 additions and 52 deletions

View file

@ -828,52 +828,56 @@ void EmitX64::EmitFPVectorGreaterEqual64(EmitContext& ctx, IR::Inst* inst) {
template<size_t fsize, bool is_max> template<size_t fsize, bool is_max>
static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
if (ctx.FPCR().DN()) { const bool fpcr_controlled = inst->GetArg(2).GetU1();
if (ctx.FPCR(fpcr_controlled).DN()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.FPCR().FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm xmm_b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm mask = xmm0; const Xbyak::Xmm mask = xmm0;
const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
DenormalsAreZero<fsize>(code, ctx.FPCR(), {result, xmm_b}, mask); MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {result, xmm_b}, mask);
if (code.HasAVX()) { if (code.HasAVX()) {
FCODE(vcmpeqp)(mask, result, xmm_b); FCODE(vcmpeqp)(mask, result, xmm_b);
FCODE(vcmpunordp)(nan_mask, result, xmm_b); FCODE(vcmpunordp)(nan_mask, result, xmm_b);
if constexpr (is_max) { if constexpr (is_max) {
FCODE(vandp)(eq, result, xmm_b); FCODE(vandp)(eq, result, xmm_b);
FCODE(vmaxp)(result, result, xmm_b); FCODE(vmaxp)(result, result, xmm_b);
} else {
FCODE(vorp)(eq, result, xmm_b);
FCODE(vminp)(result, result, xmm_b);
}
FCODE(blendvp)(result, eq);
FCODE(vblendvp)(result, result, GetNaNVector<fsize>(code), nan_mask);
} else { } else {
FCODE(vorp)(eq, result, xmm_b); code.movaps(mask, result);
FCODE(vminp)(result, result, xmm_b); code.movaps(eq, result);
code.movaps(nan_mask, result);
FCODE(cmpneqp)(mask, xmm_b);
FCODE(cmpordp)(nan_mask, xmm_b);
if constexpr (is_max) {
code.andps(eq, xmm_b);
FCODE(maxp)(result, xmm_b);
} else {
code.orps(eq, xmm_b);
FCODE(minp)(result, xmm_b);
}
code.andps(result, mask);
code.andnps(mask, eq);
code.orps(result, mask);
code.andps(result, nan_mask);
code.andnps(nan_mask, GetNaNVector<fsize>(code));
code.orps(result, nan_mask);
} }
FCODE(blendvp)(result, eq); });
FCODE(vblendvp)(result, result, GetNaNVector<fsize>(code), nan_mask);
} else {
code.movaps(mask, result);
code.movaps(eq, result);
code.movaps(nan_mask, result);
FCODE(cmpneqp)(mask, xmm_b);
FCODE(cmpordp)(nan_mask, xmm_b);
if constexpr (is_max) {
code.andps(eq, xmm_b);
FCODE(maxp)(result, xmm_b);
} else {
code.orps(eq, xmm_b);
FCODE(minp)(result, xmm_b);
}
code.andps(result, mask);
code.andnps(mask, eq);
code.orps(result, mask);
code.andps(result, nan_mask);
code.andnps(nan_mask, GetNaNVector<fsize>(code));
code.orps(result, nan_mask);
}
ctx.reg_alloc.DefineValue(inst, result); ctx.reg_alloc.DefineValue(inst, result);
@ -884,11 +888,11 @@ static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
const Xbyak::Xmm mask = xmm0; const Xbyak::Xmm mask = xmm0;
const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm();
if (ctx.FPCR().FZ()) { if (ctx.FPCR(fpcr_controlled).FZ()) {
const Xbyak::Xmm prev_xmm_b = xmm_b; const Xbyak::Xmm prev_xmm_b = xmm_b;
xmm_b = ctx.reg_alloc.ScratchXmm(); xmm_b = ctx.reg_alloc.ScratchXmm();
code.movaps(xmm_b, prev_xmm_b); code.movaps(xmm_b, prev_xmm_b);
DenormalsAreZero<fsize>(code, ctx.FPCR(), {result, xmm_b}, mask); DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {result, xmm_b}, mask);
} }
// What we are doing here is handling the case when the inputs are differently signed zeros. // What we are doing here is handling the case when the inputs are differently signed zeros.

View file

@ -43,8 +43,8 @@ INST(asimd_VMUL, "VMUL", "1111001P0Dzznnnndddd100
//INST(asimd_VCGE_reg, "VCGE (register)", "111100110-0C--------1110---0----") // ASIMD //INST(asimd_VCGE_reg, "VCGE (register)", "111100110-0C--------1110---0----") // ASIMD
//INST(asimd_VCGT_reg, "VCGT (register)", "111100110-1C--------1110---0----") // ASIMD //INST(asimd_VCGT_reg, "VCGT (register)", "111100110-1C--------1110---0----") // ASIMD
//INST(asimd_VACGE, "VACGE", "111100110-CC--------1110---1----") // ASIMD //INST(asimd_VACGE, "VACGE", "111100110-CC--------1110---1----") // ASIMD
//INST(asimd_VMAX_float, "VMAX (floating-point)", "111100100-CC--------1111---0----") // ASIMD INST(asimd_VMAX_float, "VMAX (floating-point)", "111100100D0znnnndddd1111NQM0mmmm") // ASIMD
//INST(asimd_VPMAX_float, "VMIN (floating-point)", "111100110-CC--------1111---0----") // ASIMD INST(asimd_VMIN_float, "VMIN (floating-point)", "111100100D1znnnndddd1111NQM0mmmm") // ASIMD
//INST(asimd_VRECPS, "VRECPS", "111100100-0C--------1111---1----") // ASIMD //INST(asimd_VRECPS, "VRECPS", "111100100-0C--------1111---1----") // ASIMD
//INST(asimd_VRSQRTS, "VRSQRTS", "111100100-1C--------1111---1----") // ASIMD //INST(asimd_VRSQRTS, "VRSQRTS", "111100100-1C--------1111---1----") // ASIMD

View file

@ -333,4 +333,46 @@ bool ArmTranslatorVisitor::asimd_VMUL(bool P, bool D, size_t sz, size_t Vn, size
return true; return true;
} }
// ASIMD VMAX (floating-point): stores FPVectorMax of the m and n source vectors into vector d.
// Returns the result of UndefinedInstruction() for rejected encodings, otherwise true.
bool ArmTranslatorVisitor::asimd_VMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
    // With Q set, none of the three register fields may have bit 0 set.
    const bool misaligned_quad = Q && (Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm));
    // sz set is rejected as well; only a 32-bit element operation is emitted below.
    if (misaligned_quad || sz) {
        return UndefinedInstruction();
    }

    const auto dest = ToVector(Q, Vd, D);
    const auto src_m = ToVector(Q, Vm, M);
    const auto src_n = ToVector(Q, Vn, N);

    // The n operand is read before the m operand, but m is passed as the first IR operand.
    const auto operand_n = ir.GetVector(src_n);
    const auto operand_m = ir.GetVector(src_m);

    // Arguments: esize = 32, fpcr_controlled = false.
    // NOTE(review): presumably false selects the standard FPSCR value rather than the guest FPSCR - confirm.
    ir.SetVector(dest, ir.FPVectorMax(32, operand_m, operand_n, false));
    return true;
}
// ASIMD VMIN (floating-point): stores FPVectorMin of the m and n source vectors into vector d.
// Returns the result of UndefinedInstruction() for rejected encodings, otherwise true.
bool ArmTranslatorVisitor::asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
    // With Q set, none of the three register fields may have bit 0 set.
    const bool misaligned_quad = Q && (Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm));
    // sz set is rejected as well; only a 32-bit element operation is emitted below.
    if (misaligned_quad || sz) {
        return UndefinedInstruction();
    }

    const auto dest = ToVector(Q, Vd, D);
    const auto src_m = ToVector(Q, Vm, M);
    const auto src_n = ToVector(Q, Vn, N);

    // The n operand is read before the m operand, but m is passed as the first IR operand.
    const auto operand_n = ir.GetVector(src_n);
    const auto operand_m = ir.GetVector(src_m);

    // Arguments: esize = 32, fpcr_controlled = false.
    // NOTE(review): presumably false selects the standard FPSCR value rather than the guest FPSCR - confirm.
    ir.SetVector(dest, ir.FPVectorMin(32, operand_m, operand_n, false));
    return true;
}
} // namespace Dynarmic::A32 } // namespace Dynarmic::A32

View file

@ -462,6 +462,8 @@ struct ArmTranslatorVisitor final {
bool asimd_VRSHL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); bool asimd_VRSHL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
bool asimd_VTST(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); bool asimd_VTST(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
bool asimd_VMUL(bool P, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); bool asimd_VMUL(bool P, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
bool asimd_VMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
bool asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
// Two registers and a shift amount // Two registers and a shift amount
bool asimd_SHR(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm); bool asimd_SHR(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm);

View file

@ -2356,22 +2356,22 @@ U128 IREmitter::FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b,
UNREACHABLE(); UNREACHABLE();
} }
U128 IREmitter::FPVectorMax(size_t esize, const U128& a, const U128& b) { U128 IREmitter::FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
switch (esize) { switch (esize) {
case 32: case 32:
return Inst<U128>(Opcode::FPVectorMax32, a, b); return Inst<U128>(Opcode::FPVectorMax32, a, b, Imm1(fpcr_controlled));
case 64: case 64:
return Inst<U128>(Opcode::FPVectorMax64, a, b); return Inst<U128>(Opcode::FPVectorMax64, a, b, Imm1(fpcr_controlled));
} }
UNREACHABLE(); UNREACHABLE();
} }
U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b) { U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
switch (esize) { switch (esize) {
case 32: case 32:
return Inst<U128>(Opcode::FPVectorMin32, a, b); return Inst<U128>(Opcode::FPVectorMin32, a, b, Imm1(fpcr_controlled));
case 64: case 64:
return Inst<U128>(Opcode::FPVectorMin64, a, b); return Inst<U128>(Opcode::FPVectorMin64, a, b, Imm1(fpcr_controlled));
} }
UNREACHABLE(); UNREACHABLE();
} }

View file

@ -352,8 +352,8 @@ public:
U128 FPVectorFromUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding); U128 FPVectorFromUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding);
U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
U128 FPVectorMax(size_t esize, const U128& a, const U128& b); U128 FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
U128 FPVectorMin(size_t esize, const U128& a, const U128& b); U128 FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
U128 FPVectorMul(size_t esize, const U128& a, const U128& b); U128 FPVectorMul(size_t esize, const U128& a, const U128& b);
U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2); U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2);
U128 FPVectorMulX(size_t esize, const U128& a, const U128& b); U128 FPVectorMulX(size_t esize, const U128& a, const U128& b);

View file

@ -595,10 +595,10 @@ OPCODE(FPVectorGreater32, U128, U128
OPCODE(FPVectorGreater64, U128, U128, U128, U1 ) OPCODE(FPVectorGreater64, U128, U128, U128, U1 )
OPCODE(FPVectorGreaterEqual32, U128, U128, U128, U1 ) OPCODE(FPVectorGreaterEqual32, U128, U128, U128, U1 )
OPCODE(FPVectorGreaterEqual64, U128, U128, U128, U1 ) OPCODE(FPVectorGreaterEqual64, U128, U128, U128, U1 )
OPCODE(FPVectorMax32, U128, U128, U128 ) OPCODE(FPVectorMax32, U128, U128, U128, U1 )
OPCODE(FPVectorMax64, U128, U128, U128 ) OPCODE(FPVectorMax64, U128, U128, U128, U1 )
OPCODE(FPVectorMin32, U128, U128, U128 ) OPCODE(FPVectorMin32, U128, U128, U128, U1 )
OPCODE(FPVectorMin64, U128, U128, U128 ) OPCODE(FPVectorMin64, U128, U128, U128, U1 )
OPCODE(FPVectorMul32, U128, U128, U128 ) OPCODE(FPVectorMul32, U128, U128, U128 )
OPCODE(FPVectorMul64, U128, U128, U128 ) OPCODE(FPVectorMul64, U128, U128, U128 )
OPCODE(FPVectorMulAdd16, U128, U128, U128, U128 ) OPCODE(FPVectorMulAdd16, U128, U128, U128, U128 )