A32: Implement ASIMD VMAX, VMIN (floating-point)
This commit is contained in:
parent
8d067d5d60
commit
bb4f3aa407
7 changed files with 100 additions and 52 deletions
|
@ -828,52 +828,56 @@ void EmitX64::EmitFPVectorGreaterEqual64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
template<size_t fsize, bool is_max>
|
template<size_t fsize, bool is_max>
|
||||||
static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
if (ctx.FPCR().DN()) {
|
const bool fpcr_controlled = inst->GetArg(2).GetU1();
|
||||||
|
|
||||||
|
if (ctx.FPCR(fpcr_controlled).DN()) {
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
const Xbyak::Xmm xmm_b = ctx.FPCR().FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm xmm_b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
|
||||||
|
|
||||||
const Xbyak::Xmm mask = xmm0;
|
const Xbyak::Xmm mask = xmm0;
|
||||||
const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm();
|
||||||
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
DenormalsAreZero<fsize>(code, ctx.FPCR(), {result, xmm_b}, mask);
|
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
|
||||||
|
DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {result, xmm_b}, mask);
|
||||||
|
|
||||||
if (code.HasAVX()) {
|
if (code.HasAVX()) {
|
||||||
FCODE(vcmpeqp)(mask, result, xmm_b);
|
FCODE(vcmpeqp)(mask, result, xmm_b);
|
||||||
FCODE(vcmpunordp)(nan_mask, result, xmm_b);
|
FCODE(vcmpunordp)(nan_mask, result, xmm_b);
|
||||||
if constexpr (is_max) {
|
if constexpr (is_max) {
|
||||||
FCODE(vandp)(eq, result, xmm_b);
|
FCODE(vandp)(eq, result, xmm_b);
|
||||||
FCODE(vmaxp)(result, result, xmm_b);
|
FCODE(vmaxp)(result, result, xmm_b);
|
||||||
|
} else {
|
||||||
|
FCODE(vorp)(eq, result, xmm_b);
|
||||||
|
FCODE(vminp)(result, result, xmm_b);
|
||||||
|
}
|
||||||
|
FCODE(blendvp)(result, eq);
|
||||||
|
FCODE(vblendvp)(result, result, GetNaNVector<fsize>(code), nan_mask);
|
||||||
} else {
|
} else {
|
||||||
FCODE(vorp)(eq, result, xmm_b);
|
code.movaps(mask, result);
|
||||||
FCODE(vminp)(result, result, xmm_b);
|
code.movaps(eq, result);
|
||||||
|
code.movaps(nan_mask, result);
|
||||||
|
FCODE(cmpneqp)(mask, xmm_b);
|
||||||
|
FCODE(cmpordp)(nan_mask, xmm_b);
|
||||||
|
|
||||||
|
if constexpr (is_max) {
|
||||||
|
code.andps(eq, xmm_b);
|
||||||
|
FCODE(maxp)(result, xmm_b);
|
||||||
|
} else {
|
||||||
|
code.orps(eq, xmm_b);
|
||||||
|
FCODE(minp)(result, xmm_b);
|
||||||
|
}
|
||||||
|
|
||||||
|
code.andps(result, mask);
|
||||||
|
code.andnps(mask, eq);
|
||||||
|
code.orps(result, mask);
|
||||||
|
|
||||||
|
code.andps(result, nan_mask);
|
||||||
|
code.andnps(nan_mask, GetNaNVector<fsize>(code));
|
||||||
|
code.orps(result, nan_mask);
|
||||||
}
|
}
|
||||||
FCODE(blendvp)(result, eq);
|
});
|
||||||
FCODE(vblendvp)(result, result, GetNaNVector<fsize>(code), nan_mask);
|
|
||||||
} else {
|
|
||||||
code.movaps(mask, result);
|
|
||||||
code.movaps(eq, result);
|
|
||||||
code.movaps(nan_mask, result);
|
|
||||||
FCODE(cmpneqp)(mask, xmm_b);
|
|
||||||
FCODE(cmpordp)(nan_mask, xmm_b);
|
|
||||||
|
|
||||||
if constexpr (is_max) {
|
|
||||||
code.andps(eq, xmm_b);
|
|
||||||
FCODE(maxp)(result, xmm_b);
|
|
||||||
} else {
|
|
||||||
code.orps(eq, xmm_b);
|
|
||||||
FCODE(minp)(result, xmm_b);
|
|
||||||
}
|
|
||||||
|
|
||||||
code.andps(result, mask);
|
|
||||||
code.andnps(mask, eq);
|
|
||||||
code.orps(result, mask);
|
|
||||||
|
|
||||||
code.andps(result, nan_mask);
|
|
||||||
code.andnps(nan_mask, GetNaNVector<fsize>(code));
|
|
||||||
code.orps(result, nan_mask);
|
|
||||||
}
|
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
|
||||||
|
@ -884,11 +888,11 @@ static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
|
||||||
const Xbyak::Xmm mask = xmm0;
|
const Xbyak::Xmm mask = xmm0;
|
||||||
const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
if (ctx.FPCR().FZ()) {
|
if (ctx.FPCR(fpcr_controlled).FZ()) {
|
||||||
const Xbyak::Xmm prev_xmm_b = xmm_b;
|
const Xbyak::Xmm prev_xmm_b = xmm_b;
|
||||||
xmm_b = ctx.reg_alloc.ScratchXmm();
|
xmm_b = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movaps(xmm_b, prev_xmm_b);
|
code.movaps(xmm_b, prev_xmm_b);
|
||||||
DenormalsAreZero<fsize>(code, ctx.FPCR(), {result, xmm_b}, mask);
|
DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {result, xmm_b}, mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
// What we are doing here is handling the case when the inputs are differently signed zeros.
|
// What we are doing here is handling the case when the inputs are differently signed zeros.
|
||||||
|
|
|
@ -43,8 +43,8 @@ INST(asimd_VMUL, "VMUL", "1111001P0Dzznnnndddd100
|
||||||
//INST(asimd_VCGE_reg, "VCGE (register)", "111100110-0C--------1110---0----") // ASIMD
|
//INST(asimd_VCGE_reg, "VCGE (register)", "111100110-0C--------1110---0----") // ASIMD
|
||||||
//INST(asimd_VCGT_reg, "VCGT (register)", "111100110-1C--------1110---0----") // ASIMD
|
//INST(asimd_VCGT_reg, "VCGT (register)", "111100110-1C--------1110---0----") // ASIMD
|
||||||
//INST(asimd_VACGE, "VACGE", "111100110-CC--------1110---1----") // ASIMD
|
//INST(asimd_VACGE, "VACGE", "111100110-CC--------1110---1----") // ASIMD
|
||||||
//INST(asimd_VMAX_float, "VMAX (floating-point)", "111100100-CC--------1111---0----") // ASIMD
|
INST(asimd_VMAX_float, "VMAX (floating-point)", "111100100D0znnnndddd1111NQM0mmmm") // ASIMD
|
||||||
//INST(asimd_VPMAX_float, "VMIN (floating-point)", "111100110-CC--------1111---0----") // ASIMD
|
INST(asimd_VMIN_float, "VMIN (floating-point)", "111100100D1znnnndddd1111NQM0mmmm") // ASIMD
|
||||||
//INST(asimd_VRECPS, "VRECPS", "111100100-0C--------1111---1----") // ASIMD
|
//INST(asimd_VRECPS, "VRECPS", "111100100-0C--------1111---1----") // ASIMD
|
||||||
//INST(asimd_VRSQRTS, "VRSQRTS", "111100100-1C--------1111---1----") // ASIMD
|
//INST(asimd_VRSQRTS, "VRSQRTS", "111100100-1C--------1111---1----") // ASIMD
|
||||||
|
|
||||||
|
|
|
@ -333,4 +333,46 @@ bool ArmTranslatorVisitor::asimd_VMUL(bool P, bool D, size_t sz, size_t Vn, size
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool ArmTranslatorVisitor::asimd_VMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
|
||||||
|
if (Q && (Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm))) {
|
||||||
|
return UndefinedInstruction();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sz == 0b1) {
|
||||||
|
return UndefinedInstruction();
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto d = ToVector(Q, Vd, D);
|
||||||
|
const auto m = ToVector(Q, Vm, M);
|
||||||
|
const auto n = ToVector(Q, Vn, N);
|
||||||
|
|
||||||
|
const auto reg_n = ir.GetVector(n);
|
||||||
|
const auto reg_m = ir.GetVector(m);
|
||||||
|
const auto result = ir.FPVectorMax(32, reg_m, reg_n, false);
|
||||||
|
|
||||||
|
ir.SetVector(d, result);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ArmTranslatorVisitor::asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
|
||||||
|
if (Q && (Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm))) {
|
||||||
|
return UndefinedInstruction();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sz == 0b1) {
|
||||||
|
return UndefinedInstruction();
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto d = ToVector(Q, Vd, D);
|
||||||
|
const auto m = ToVector(Q, Vm, M);
|
||||||
|
const auto n = ToVector(Q, Vn, N);
|
||||||
|
|
||||||
|
const auto reg_n = ir.GetVector(n);
|
||||||
|
const auto reg_m = ir.GetVector(m);
|
||||||
|
const auto result = ir.FPVectorMin(32, reg_m, reg_n, false);
|
||||||
|
|
||||||
|
ir.SetVector(d, result);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace Dynarmic::A32
|
} // namespace Dynarmic::A32
|
||||||
|
|
|
@ -462,6 +462,8 @@ struct ArmTranslatorVisitor final {
|
||||||
bool asimd_VRSHL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
bool asimd_VRSHL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||||
bool asimd_VTST(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
bool asimd_VTST(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||||
bool asimd_VMUL(bool P, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
bool asimd_VMUL(bool P, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||||
|
bool asimd_VMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||||
|
bool asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||||
|
|
||||||
// Two registers and a shift amount
|
// Two registers and a shift amount
|
||||||
bool asimd_SHR(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm);
|
bool asimd_SHR(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm);
|
||||||
|
|
|
@ -2356,22 +2356,22 @@ U128 IREmitter::FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b,
|
||||||
UNREACHABLE();
|
UNREACHABLE();
|
||||||
}
|
}
|
||||||
|
|
||||||
U128 IREmitter::FPVectorMax(size_t esize, const U128& a, const U128& b) {
|
U128 IREmitter::FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
|
||||||
switch (esize) {
|
switch (esize) {
|
||||||
case 32:
|
case 32:
|
||||||
return Inst<U128>(Opcode::FPVectorMax32, a, b);
|
return Inst<U128>(Opcode::FPVectorMax32, a, b, Imm1(fpcr_controlled));
|
||||||
case 64:
|
case 64:
|
||||||
return Inst<U128>(Opcode::FPVectorMax64, a, b);
|
return Inst<U128>(Opcode::FPVectorMax64, a, b, Imm1(fpcr_controlled));
|
||||||
}
|
}
|
||||||
UNREACHABLE();
|
UNREACHABLE();
|
||||||
}
|
}
|
||||||
|
|
||||||
U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b) {
|
U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
|
||||||
switch (esize) {
|
switch (esize) {
|
||||||
case 32:
|
case 32:
|
||||||
return Inst<U128>(Opcode::FPVectorMin32, a, b);
|
return Inst<U128>(Opcode::FPVectorMin32, a, b, Imm1(fpcr_controlled));
|
||||||
case 64:
|
case 64:
|
||||||
return Inst<U128>(Opcode::FPVectorMin64, a, b);
|
return Inst<U128>(Opcode::FPVectorMin64, a, b, Imm1(fpcr_controlled));
|
||||||
}
|
}
|
||||||
UNREACHABLE();
|
UNREACHABLE();
|
||||||
}
|
}
|
||||||
|
|
|
@ -352,8 +352,8 @@ public:
|
||||||
U128 FPVectorFromUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding);
|
U128 FPVectorFromUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding);
|
||||||
U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||||
U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||||
U128 FPVectorMax(size_t esize, const U128& a, const U128& b);
|
U128 FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||||
U128 FPVectorMin(size_t esize, const U128& a, const U128& b);
|
U128 FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||||
U128 FPVectorMul(size_t esize, const U128& a, const U128& b);
|
U128 FPVectorMul(size_t esize, const U128& a, const U128& b);
|
||||||
U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2);
|
U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2);
|
||||||
U128 FPVectorMulX(size_t esize, const U128& a, const U128& b);
|
U128 FPVectorMulX(size_t esize, const U128& a, const U128& b);
|
||||||
|
|
|
@ -595,10 +595,10 @@ OPCODE(FPVectorGreater32, U128, U128
|
||||||
OPCODE(FPVectorGreater64, U128, U128, U128, U1 )
|
OPCODE(FPVectorGreater64, U128, U128, U128, U1 )
|
||||||
OPCODE(FPVectorGreaterEqual32, U128, U128, U128, U1 )
|
OPCODE(FPVectorGreaterEqual32, U128, U128, U128, U1 )
|
||||||
OPCODE(FPVectorGreaterEqual64, U128, U128, U128, U1 )
|
OPCODE(FPVectorGreaterEqual64, U128, U128, U128, U1 )
|
||||||
OPCODE(FPVectorMax32, U128, U128, U128 )
|
OPCODE(FPVectorMax32, U128, U128, U128, U1 )
|
||||||
OPCODE(FPVectorMax64, U128, U128, U128 )
|
OPCODE(FPVectorMax64, U128, U128, U128, U1 )
|
||||||
OPCODE(FPVectorMin32, U128, U128, U128 )
|
OPCODE(FPVectorMin32, U128, U128, U128, U1 )
|
||||||
OPCODE(FPVectorMin64, U128, U128, U128 )
|
OPCODE(FPVectorMin64, U128, U128, U128, U1 )
|
||||||
OPCODE(FPVectorMul32, U128, U128, U128 )
|
OPCODE(FPVectorMul32, U128, U128, U128 )
|
||||||
OPCODE(FPVectorMul64, U128, U128, U128 )
|
OPCODE(FPVectorMul64, U128, U128, U128 )
|
||||||
OPCODE(FPVectorMulAdd16, U128, U128, U128, U128 )
|
OPCODE(FPVectorMulAdd16, U128, U128, U128, U128 )
|
||||||
|
|
Loading…
Reference in a new issue