ir: Add fpcr_controlled argument to FPVector{Equal,Greater,GreaterEqual}

This commit is contained in:
MerryMage 2020-06-20 00:01:10 +01:00
parent 1b3a70a83c
commit 656419286c
13 changed files with 102 additions and 53 deletions

View file

@ -71,8 +71,9 @@ bool A32EmitContext::IsSingleStep() const {
return Location().SingleStepping();
}
// NOTE(review): this span comes from a diff hunk; the zero-argument definition directly below
// appears to be the removed version and the fpcr_controlled overload after it the replacement.
FP::FPCR A32EmitContext::FPCR() const {
return FP::FPCR{Location().FPSCR().Value()};
// Builds an FP::FPCR from the FPSCR value stored in this block's location descriptor.
// Returns it unchanged when fpcr_controlled is true; otherwise returns its ASIMDStandardValue().
FP::FPCR A32EmitContext::FPCR(bool fpcr_controlled) const {
const FP::FPCR fpcr = FP::FPCR{Location().FPSCR().Value()};
return fpcr_controlled ? fpcr : fpcr.ASIMDStandardValue();
}
A32EmitX64::A32EmitX64(BlockOfCode& code, A32::UserConfig conf, A32::Jit* jit_interface)

View file

@ -30,7 +30,7 @@ struct A32EmitContext final : public EmitContext {
A32::LocationDescriptor Location() const;
bool IsSingleStep() const;
FP::FPCR FPCR() const override;
FP::FPCR FPCR(bool fpcr_controlled = true) const override;
const A32::UserConfig& conf;
};

View file

@ -185,7 +185,7 @@ void A32JitState::SetFpscr(u32 FPSCR) {
fpsr_nzcv = FPSCR & FPSCR_NZCV_MASK;
guest_MXCSR = 0x00001f80;
asimd_MXCSR = 0x00001f80;
asimd_MXCSR = 0x00009fc0;
// RMode
const std::array<u32, 4> MXCSR_RMode {0x0, 0x4000, 0x2000, 0x6000};

View file

@ -48,7 +48,7 @@ struct A32JitState {
// For internal use (See: BlockOfCode::RunCode)
u32 guest_MXCSR = 0x00001f80;
u32 asimd_MXCSR = 0x00001f80;
u32 asimd_MXCSR = 0x00009fc0;
u32 save_host_MXCSR = 0;
s64 cycles_to_run = 0;
s64 cycles_remaining = 0;

View file

@ -48,8 +48,8 @@ bool A64EmitContext::IsSingleStep() const {
return Location().SingleStepping();
}
// NOTE(review): this span comes from a diff hunk; the zero-argument definition directly below
// appears to be the removed version and the fpcr_controlled overload after it the replacement.
FP::FPCR A64EmitContext::FPCR() const {
return Location().FPCR();
// Returns the location descriptor's FPCR when fpcr_controlled is true, and the
// ASIMDStandardValue() derived from that same FPCR otherwise.
FP::FPCR A64EmitContext::FPCR(bool fpcr_controlled) const {
return fpcr_controlled ? Location().FPCR() : Location().FPCR().ASIMDStandardValue();
}
bool A64EmitContext::AccurateNaN() const {

View file

@ -27,7 +27,7 @@ struct A64EmitContext final : public EmitContext {
A64::LocationDescriptor Location() const;
bool IsSingleStep() const;
FP::FPCR FPCR() const override;
FP::FPCR FPCR(bool fpcr_controlled = true) const override;
bool AccurateNaN() const override;
const A64::UserConfig& conf;

View file

@ -51,7 +51,7 @@ struct A64JitState {
// For internal use (See: BlockOfCode::RunCode)
u32 guest_MXCSR = 0x00001f80;
u32 asimd_MXCSR = 0x00001f80;
u32 asimd_MXCSR = 0x00009fc0;
u32 save_host_MXCSR = 0;
s64 cycles_to_run = 0;
s64 cycles_remaining = 0;

View file

@ -46,7 +46,7 @@ struct EmitContext {
size_t GetInstOffset(IR::Inst* inst) const;
void EraseInstruction(IR::Inst* inst);
virtual FP::FPCR FPCR() const = 0;
virtual FP::FPCR FPCR(bool fpcr_controlled = true) const = 0;
virtual bool AccurateNaN() const { return true; }
RegAlloc& reg_alloc;

View file

@ -35,6 +35,11 @@ using namespace Xbyak::util;
namespace {
// Template switch for EmitThreeOpFallback: Present means the IR instruction carries an
// fpcr_controlled immediate as its third argument, Absent means it has no such argument
// (the fallback then behaves as if fpcr_controlled were true).
// Scoped so the enumerators do not leak into the enclosing namespace or convert
// implicitly to integers; all uses spell the qualified name.
enum class FpcrControlledArgument {
    Present,
    Absent,
};
template<size_t fsize, typename T>
T ChooseOnFsize([[maybe_unused]] T f32, [[maybe_unused]] T f64) {
static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
@ -196,9 +201,9 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) {
}
template<size_t fsize>
void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
if (ctx.FPCR().FZ()) {
if (ctx.FPCR().RMode() != FP::RoundingMode::TowardsMinusInfinity) {
void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
if (fpcr.FZ()) {
if (fpcr.RMode() != FP::RoundingMode::TowardsMinusInfinity) {
code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
} else {
code.xorps(tmp, tmp);
@ -383,16 +388,18 @@ void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lamb
}
template<typename Lambda>
void EmitThreeOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, Xbyak::Xmm arg2, Lambda lambda) {
void EmitThreeOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, Xbyak::Xmm arg2, Lambda lambda, bool fpcr_controlled = true) {
const auto fn = static_cast<mp::equivalent_function_type<Lambda>*>(lambda);
const u32 fpcr = ctx.FPCR(fpcr_controlled).Value();
#ifdef _WIN32
constexpr u32 stack_space = 4 * 16;
code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 3 * 16]);
code.mov(code.ABI_PARAM4.cvt32(), ctx.FPCR().Value());
code.mov(code.ABI_PARAM4.cvt32(), fpcr);
code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
code.mov(qword[rsp + ABI_SHADOW_SPACE + 0], rax);
#else
@ -401,7 +408,7 @@ void EmitThreeOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xby
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
code.mov(code.ABI_PARAM4.cvt32(), ctx.FPCR().Value());
code.mov(code.ABI_PARAM4.cvt32(), fpcr);
code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
#endif
@ -418,7 +425,7 @@ void EmitThreeOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xby
code.add(rsp, stack_space + ABI_SHADOW_SPACE);
}
template<typename Lambda>
template<FpcrControlledArgument fcarg = FpcrControlledArgument::Absent, typename Lambda>
void EmitThreeOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
@ -427,7 +434,9 @@ void EmitThreeOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, La
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(nullptr);
EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, arg1, arg2, lambda);
const bool fpcr_controlled = fcarg == FpcrControlledArgument::Absent || args[2].GetImmediateU1();
EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, arg1, arg2, lambda, fpcr_controlled);
ctx.reg_alloc.DefineValue(inst, result);
}
@ -486,6 +495,19 @@ void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lam
ctx.reg_alloc.DefineValue(inst, result);
}
// Invokes `lambda` to emit code. When the FPCR selected by `fpcr_controlled` differs from
// ctx.FPCR(), the emitted code is bracketed by code.EnterStandardASIMD() and
// code.LeaveStandardASIMD(); otherwise `lambda` is invoked on its own.
template<typename Lambda>
void MaybeStandardFPSCRValue(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, Lambda lambda) {
    const bool needs_standard_asimd = ctx.FPCR(fpcr_controlled) != ctx.FPCR();

    if (needs_standard_asimd) {
        code.EnterStandardASIMD();
    }

    lambda();

    if (needs_standard_asimd) {
        code.LeaveStandardASIMD();
    }
}
} // anonymous namespace
void EmitX64::EmitFPVectorAbs16(EmitContext& ctx, IR::Inst* inst) {
@ -538,7 +560,7 @@ void EmitX64::EmitFPVectorDiv64(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitFPVectorEqual16(EmitContext& ctx, IR::Inst* inst) {
EmitThreeOpFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& op1, const VectorArray<u16>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) {
EmitThreeOpFallback<FpcrControlledArgument::Present>(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& op1, const VectorArray<u16>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) {
for (size_t i = 0; i < result.size(); i++) {
result[i] = FP::FPCompareEQ(op1[i], op2[i], fpcr, fpsr) ? 0xFFFF : 0;
}
@ -548,9 +570,13 @@ void EmitX64::EmitFPVectorEqual16(EmitContext& ctx, IR::Inst* inst) {
// Emits a packed single-precision equality compare (cmpeqps) of args[0] against args[1];
// the instruction's value is defined from register `a`.
// NOTE(review): this span comes from a diff hunk, so what appear to be the removed lines
// (the UseXmm request for `b` and the bare cmpeqps) sit next to their replacements.
void EmitX64::EmitFPVectorEqual32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
// The third IR argument is the U1 immediate carrying fpcr_controlled.
const bool fpcr_controlled = args[2].GetImmediateU1();
code.cmpeqps(a, b);
// The compare is emitted under the FPCR selected by fpcr_controlled.
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
// Both operands are handed to DenormalsAreZero with xmm0 as its temporary;
// presumably this is why `b` is requested as a scratch register.
DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
code.cmpeqps(a, b);
});
ctx.reg_alloc.DefineValue(inst, a);
}
@ -558,9 +584,13 @@ void EmitX64::EmitFPVectorEqual32(EmitContext& ctx, IR::Inst* inst) {
// Emits a packed double-precision equality compare (cmpeqpd) of args[0] against args[1];
// the instruction's value is defined from register `a`.
// NOTE(review): this span comes from a diff hunk, so what appear to be the removed lines
// (the UseXmm request for `b` and the bare cmpeqpd) sit next to their replacements.
void EmitX64::EmitFPVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
// The third IR argument is the U1 immediate carrying fpcr_controlled.
const bool fpcr_controlled = args[2].GetImmediateU1();
code.cmpeqpd(a, b);
// The compare is emitted under the FPCR selected by fpcr_controlled.
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
// Both operands are handed to DenormalsAreZero with xmm0 as its temporary;
// presumably this is why `b` is requested as a scratch register.
DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
code.cmpeqpd(a, b);
});
ctx.reg_alloc.DefineValue(inst, a);
}
@ -742,40 +772,56 @@ void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst)
// Emits a packed single-precision "a > b" compare. It is expressed as "b < a" via
// cmpltps(b, a), so the resulting mask lands in `b`, which defines the instruction's value.
// NOTE(review): this span comes from a diff hunk, so what appear to be the removed lines
// (the UseXmm request for `a` and the bare cmpltps) sit next to their replacements.
void EmitX64::EmitFPVectorGreater32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
// The third IR argument is the U1 immediate carrying fpcr_controlled.
const bool fpcr_controlled = args[2].GetImmediateU1();
code.cmpltps(b, a);
// The compare is emitted under the FPCR selected by fpcr_controlled.
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
// Both operands are handed to DenormalsAreZero with xmm0 as its temporary;
// presumably this is why `a` is requested as a scratch register.
DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
code.cmpltps(b, a);
});
ctx.reg_alloc.DefineValue(inst, b);
}
// Emits a packed double-precision "a > b" compare. It is expressed as "b < a" via
// cmpltpd(b, a), so the resulting mask lands in `b`, which defines the instruction's value.
// NOTE(review): this span comes from a diff hunk, so what appear to be the removed lines
// (the UseXmm request for `a` and the bare cmpltpd) sit next to their replacements.
void EmitX64::EmitFPVectorGreater64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
// The third IR argument is the U1 immediate carrying fpcr_controlled.
const bool fpcr_controlled = args[2].GetImmediateU1();
code.cmpltpd(b, a);
// The compare is emitted under the FPCR selected by fpcr_controlled.
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
// Both operands are handed to DenormalsAreZero with xmm0 as its temporary;
// presumably this is why `a` is requested as a scratch register.
DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
code.cmpltpd(b, a);
});
ctx.reg_alloc.DefineValue(inst, b);
}
// Emits a packed single-precision "a >= b" compare. It is expressed as "b <= a" via
// cmpleps(b, a), so the resulting mask lands in `b`, which defines the instruction's value.
// NOTE(review): this span comes from a diff hunk, so what appear to be the removed lines
// (the UseXmm request for `a` and the bare cmpleps) sit next to their replacements.
void EmitX64::EmitFPVectorGreaterEqual32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
// The third IR argument is the U1 immediate carrying fpcr_controlled.
const bool fpcr_controlled = args[2].GetImmediateU1();
code.cmpleps(b, a);
// The compare is emitted under the FPCR selected by fpcr_controlled.
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
// Both operands are handed to DenormalsAreZero with xmm0 as its temporary;
// presumably this is why `a` is requested as a scratch register.
DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
code.cmpleps(b, a);
});
ctx.reg_alloc.DefineValue(inst, b);
}
// Emits a packed double-precision "a >= b" compare. It is expressed as "b <= a" via
// cmplepd(b, a), so the resulting mask lands in `b`, which defines the instruction's value.
// NOTE(review): this span comes from a diff hunk, so what appear to be the removed lines
// (the UseXmm request for `a` and the bare cmplepd) sit next to their replacements.
void EmitX64::EmitFPVectorGreaterEqual64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
// The third IR argument is the U1 immediate carrying fpcr_controlled.
const bool fpcr_controlled = args[2].GetImmediateU1();
code.cmplepd(b, a);
// The compare is emitted under the FPCR selected by fpcr_controlled.
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
// Both operands are handed to DenormalsAreZero with xmm0 as its temporary;
// presumably this is why `a` is requested as a scratch register.
DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
code.cmplepd(b, a);
});
ctx.reg_alloc.DefineValue(inst, b);
}
@ -791,7 +837,7 @@ static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
DenormalsAreZero<fsize>(code, ctx.FPCR(), {result, xmm_b}, mask);
if (code.HasAVX()) {
FCODE(vcmpeqp)(mask, result, xmm_b);
@ -842,7 +888,7 @@ static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
const Xbyak::Xmm prev_xmm_b = xmm_b;
xmm_b = ctx.reg_alloc.ScratchXmm();
code.movaps(xmm_b, prev_xmm_b);
DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
DenormalsAreZero<fsize>(code, ctx.FPCR(), {result, xmm_b}, mask);
}
// What we are doing here is handling the case when the inputs are differently signed zeros.

View file

@ -185,6 +185,8 @@ public:
FPCR stdvalue;
stdvalue.AHP(AHP());
stdvalue.FZ16(FZ16());
stdvalue.FZ(true);
stdvalue.DN(true);
return stdvalue;
}

View file

@ -2302,14 +2302,14 @@ U128 IREmitter::FPVectorDiv(size_t esize, const U128& a, const U128& b) {
UNREACHABLE();
}
// Builds the FPVectorEqual{16,32,64} IR instruction matching esize, passing fpcr_controlled
// as a one-bit immediate third operand. Any other esize falls through to UNREACHABLE().
// NOTE(review): this span comes from a diff hunk; the first signature and the first return
// under each case appear to be the removed lines, followed by their replacements.
U128 IREmitter::FPVectorEqual(size_t esize, const U128& a, const U128& b) {
U128 IREmitter::FPVectorEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
switch (esize) {
case 16:
return Inst<U128>(Opcode::FPVectorEqual16, a, b);
return Inst<U128>(Opcode::FPVectorEqual16, a, b, Imm1(fpcr_controlled));
case 32:
return Inst<U128>(Opcode::FPVectorEqual32, a, b);
return Inst<U128>(Opcode::FPVectorEqual32, a, b, Imm1(fpcr_controlled));
case 64:
return Inst<U128>(Opcode::FPVectorEqual64, a, b);
return Inst<U128>(Opcode::FPVectorEqual64, a, b, Imm1(fpcr_controlled));
}
UNREACHABLE();
}
@ -2336,22 +2336,22 @@ U128 IREmitter::FPVectorFromUnsignedFixed(size_t esize, const U128& a, size_t fb
UNREACHABLE();
}
// Builds the FPVectorGreater{32,64} IR instruction matching esize, passing fpcr_controlled
// as a one-bit immediate third operand. Any other esize falls through to UNREACHABLE().
// NOTE(review): this span comes from a diff hunk; the first signature and the first return
// under each case appear to be the removed lines, followed by their replacements.
U128 IREmitter::FPVectorGreater(size_t esize, const U128& a, const U128& b) {
U128 IREmitter::FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
switch (esize) {
case 32:
return Inst<U128>(Opcode::FPVectorGreater32, a, b);
return Inst<U128>(Opcode::FPVectorGreater32, a, b, Imm1(fpcr_controlled));
case 64:
return Inst<U128>(Opcode::FPVectorGreater64, a, b);
return Inst<U128>(Opcode::FPVectorGreater64, a, b, Imm1(fpcr_controlled));
}
UNREACHABLE();
}
// Builds the FPVectorGreaterEqual{32,64} IR instruction matching esize, passing fpcr_controlled
// as a one-bit immediate third operand. Any other esize falls through to UNREACHABLE().
// NOTE(review): this span comes from a diff hunk; the first signature and the first return
// under each case appear to be the removed lines, followed by their replacements.
U128 IREmitter::FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b) {
U128 IREmitter::FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
switch (esize) {
case 32:
return Inst<U128>(Opcode::FPVectorGreaterEqual32, a, b);
return Inst<U128>(Opcode::FPVectorGreaterEqual32, a, b, Imm1(fpcr_controlled));
case 64:
return Inst<U128>(Opcode::FPVectorGreaterEqual64, a, b);
return Inst<U128>(Opcode::FPVectorGreaterEqual64, a, b, Imm1(fpcr_controlled));
}
UNREACHABLE();
}

View file

@ -347,11 +347,11 @@ public:
U128 FPVectorAbs(size_t esize, const U128& a);
U128 FPVectorAdd(size_t esize, const U128& a, const U128& b);
U128 FPVectorDiv(size_t esize, const U128& a, const U128& b);
U128 FPVectorEqual(size_t esize, const U128& a, const U128& b);
U128 FPVectorEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
U128 FPVectorFromSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding);
U128 FPVectorFromUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding);
U128 FPVectorGreater(size_t esize, const U128& a, const U128& b);
U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b);
U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
U128 FPVectorMax(size_t esize, const U128& a, const U128& b);
U128 FPVectorMin(size_t esize, const U128& a, const U128& b);
U128 FPVectorMul(size_t esize, const U128& a, const U128& b);

View file

@ -584,17 +584,17 @@ OPCODE(FPVectorAdd32, U128, U128
OPCODE(FPVectorAdd64, U128, U128, U128 )
OPCODE(FPVectorDiv32, U128, U128, U128 )
OPCODE(FPVectorDiv64, U128, U128, U128 )
OPCODE(FPVectorEqual16, U128, U128, U128 )
OPCODE(FPVectorEqual32, U128, U128, U128 )
OPCODE(FPVectorEqual64, U128, U128, U128 )
OPCODE(FPVectorEqual16, U128, U128, U128, U1 )
OPCODE(FPVectorEqual32, U128, U128, U128, U1 )
OPCODE(FPVectorEqual64, U128, U128, U128, U1 )
OPCODE(FPVectorFromSignedFixed32, U128, U128, U8, U8 )
OPCODE(FPVectorFromSignedFixed64, U128, U128, U8, U8 )
OPCODE(FPVectorFromUnsignedFixed32, U128, U128, U8, U8 )
OPCODE(FPVectorFromUnsignedFixed64, U128, U128, U8, U8 )
OPCODE(FPVectorGreater32, U128, U128, U128 )
OPCODE(FPVectorGreater64, U128, U128, U128 )
OPCODE(FPVectorGreaterEqual32, U128, U128, U128 )
OPCODE(FPVectorGreaterEqual64, U128, U128, U128 )
OPCODE(FPVectorGreater32, U128, U128, U128, U1 )
OPCODE(FPVectorGreater64, U128, U128, U128, U1 )
OPCODE(FPVectorGreaterEqual32, U128, U128, U128, U1 )
OPCODE(FPVectorGreaterEqual64, U128, U128, U128, U1 )
OPCODE(FPVectorMax32, U128, U128, U128 )
OPCODE(FPVectorMax64, U128, U128, U128 )
OPCODE(FPVectorMin32, U128, U128, U128 )