emit_x64_floating_point: Fixup special NaN case in FMA FPMulAdd implementation
This commit is contained in:
parent
070637e0f6
commit
66bb05fc0a
2 changed files with 42 additions and 14 deletions
|
@ -147,7 +147,8 @@ static void PreProcessNaNs32(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbya
|
||||||
code.SwitchToNearCode();
|
code.SwitchToNearCode();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void PreProcessNaNs32(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end) {
|
template<typename NaNHandler>
|
||||||
|
static void PreProcessNaNs32(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end, NaNHandler nan_handler) {
|
||||||
Xbyak::Label nan;
|
Xbyak::Label nan;
|
||||||
|
|
||||||
code.ucomiss(a, b);
|
code.ucomiss(a, b);
|
||||||
|
@ -165,9 +166,7 @@ static void PreProcessNaNs32(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbya
|
||||||
code.movd(code.ABI_PARAM1.cvt32(), a);
|
code.movd(code.ABI_PARAM1.cvt32(), a);
|
||||||
code.movd(code.ABI_PARAM2.cvt32(), b);
|
code.movd(code.ABI_PARAM2.cvt32(), b);
|
||||||
code.movd(code.ABI_PARAM3.cvt32(), c);
|
code.movd(code.ABI_PARAM3.cvt32(), c);
|
||||||
code.CallFunction(static_cast<u32(*)(u32, u32, u32)>([](u32 a, u32 b, u32 c) -> u32 {
|
code.CallFunction(static_cast<u32(*)(u32, u32, u32)>(nan_handler));
|
||||||
return *FP::ProcessNaNs(a, b, c);
|
|
||||||
}));
|
|
||||||
code.movd(a, code.ABI_RETURN.cvt32());
|
code.movd(a, code.ABI_RETURN.cvt32());
|
||||||
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
|
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
|
||||||
code.add(rsp, 8);
|
code.add(rsp, 8);
|
||||||
|
@ -214,7 +213,8 @@ static void PreProcessNaNs64(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbya
|
||||||
code.SwitchToNearCode();
|
code.SwitchToNearCode();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void PreProcessNaNs64(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end) {
|
template<typename NaNHandler>
|
||||||
|
static void PreProcessNaNs64(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end, NaNHandler nan_handler) {
|
||||||
Xbyak::Label nan;
|
Xbyak::Label nan;
|
||||||
|
|
||||||
code.ucomisd(a, b);
|
code.ucomisd(a, b);
|
||||||
|
@ -229,9 +229,7 @@ static void PreProcessNaNs64(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbya
|
||||||
code.movq(code.ABI_PARAM1, a);
|
code.movq(code.ABI_PARAM1, a);
|
||||||
code.movq(code.ABI_PARAM2, b);
|
code.movq(code.ABI_PARAM2, b);
|
||||||
code.movq(code.ABI_PARAM3, c);
|
code.movq(code.ABI_PARAM3, c);
|
||||||
code.CallFunction(static_cast<u64(*)(u64, u64, u64)>([](u64 a, u64 b, u64 c) -> u64 {
|
code.CallFunction(static_cast<u64(*)(u64, u64, u64)>(nan_handler));
|
||||||
return *FP::ProcessNaNs(a, b, c);
|
|
||||||
}));
|
|
||||||
code.movq(a, code.ABI_RETURN);
|
code.movq(a, code.ABI_RETURN);
|
||||||
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
|
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
|
||||||
code.add(rsp, 8);
|
code.add(rsp, 8);
|
||||||
|
@ -437,8 +435,8 @@ static void FPTwoOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Funct
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Function>
|
template <typename Function, typename NaNHandler>
|
||||||
static void FPFourOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
|
static void FPFourOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, NaNHandler nan_handler) {
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
Xbyak::Label end;
|
Xbyak::Label end;
|
||||||
|
@ -454,7 +452,7 @@ static void FPFourOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Func
|
||||||
DenormalsAreZero32(code, operand3, gpr_scratch);
|
DenormalsAreZero32(code, operand3, gpr_scratch);
|
||||||
}
|
}
|
||||||
if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
|
if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
|
||||||
PreProcessNaNs32(code, result, operand2, operand3, end);
|
PreProcessNaNs32(code, result, operand2, operand3, end, nan_handler);
|
||||||
}
|
}
|
||||||
fn(result, operand2, operand3);
|
fn(result, operand2, operand3);
|
||||||
if (ctx.FPSCR_FTZ()) {
|
if (ctx.FPSCR_FTZ()) {
|
||||||
|
@ -470,8 +468,8 @@ static void FPFourOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Func
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Function>
|
template <typename Function, typename NaNHandler>
|
||||||
static void FPFourOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
|
static void FPFourOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, NaNHandler nan_handler) {
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
Xbyak::Label end;
|
Xbyak::Label end;
|
||||||
|
@ -487,7 +485,7 @@ static void FPFourOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Func
|
||||||
DenormalsAreZero64(code, operand3, gpr_scratch);
|
DenormalsAreZero64(code, operand3, gpr_scratch);
|
||||||
}
|
}
|
||||||
if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
|
if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
|
||||||
PreProcessNaNs64(code, result, operand2, operand3, end);
|
PreProcessNaNs64(code, result, operand2, operand3, end, nan_handler);
|
||||||
}
|
}
|
||||||
fn(result, operand2, operand3);
|
fn(result, operand2, operand3);
|
||||||
if (ctx.FPSCR_FTZ()) {
|
if (ctx.FPSCR_FTZ()) {
|
||||||
|
@ -787,6 +785,11 @@ void EmitX64::EmitFPMulAdd32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
|
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
|
||||||
FPFourOp32(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
|
FPFourOp32(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
|
||||||
code.vfmadd231ss(result, operand2, operand3);
|
code.vfmadd231ss(result, operand2, operand3);
|
||||||
|
}, [](u32 a, u32 b, u32 c) -> u32 {
|
||||||
|
if (FP::IsQNaN(a) && ((FP::IsInf(b) && FP::IsZero(c)) || (FP::IsZero(b) && FP::IsInf(c)))) {
|
||||||
|
return f32_nan;
|
||||||
|
}
|
||||||
|
return *FP::ProcessNaNs(a, b, c);
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -798,6 +801,11 @@ void EmitX64::EmitFPMulAdd64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
|
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
|
||||||
FPFourOp64(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
|
FPFourOp64(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
|
||||||
code.vfmadd231sd(result, operand2, operand3);
|
code.vfmadd231sd(result, operand2, operand3);
|
||||||
|
}, [](u64 a, u64 b, u64 c) -> u64 {
|
||||||
|
if (FP::IsQNaN(a) && ((FP::IsInf(b) && FP::IsZero(c)) || (FP::IsZero(b) && FP::IsInf(c)))) {
|
||||||
|
return f64_nan;
|
||||||
|
}
|
||||||
|
return *FP::ProcessNaNs(a, b, c);
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,6 +11,16 @@
|
||||||
|
|
||||||
namespace Dynarmic::FP {
|
namespace Dynarmic::FP {
|
||||||
|
|
||||||
|
/// Returns true when the 32-bit floating point bit pattern is a zero.
/// The sign bit is masked off, so both +0.0 and -0.0 match.
constexpr bool IsZero(u32 value) {
    constexpr u32 non_sign_bits = 0x7fffffff;
    return (value & non_sign_bits) == 0;
}
|
||||||
|
|
||||||
|
/// Returns true when the 32-bit floating point bit pattern is an infinity.
/// The sign bit is masked off, so both +inf and -inf match.
constexpr bool IsInf(u32 value) {
    constexpr u32 non_sign_bits = 0x7fffffff;
    constexpr u32 positive_infinity = 0x7f800000;
    return (value & non_sign_bits) == positive_infinity;
}
|
||||||
|
|
||||||
/// Is 32-bit floating point value a QNaN?
|
/// Is 32-bit floating point value a QNaN?
|
||||||
constexpr bool IsQNaN(u32 value) {
|
constexpr bool IsQNaN(u32 value) {
|
||||||
return (value & 0x7fc00000) == 0x7fc00000;
|
return (value & 0x7fc00000) == 0x7fc00000;
|
||||||
|
@ -60,6 +70,16 @@ inline boost::optional<u32> ProcessNaNs(u32 a, u32 b, u32 c) {
|
||||||
return boost::none;
|
return boost::none;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns true when the 64-bit floating point bit pattern is a zero.
/// The sign bit is masked off, so both +0.0 and -0.0 match.
constexpr bool IsZero(u64 value) {
    constexpr u64 non_sign_bits = 0x7FFF'FFFF'FFFF'FFFF;
    return (value & non_sign_bits) == 0;
}
|
||||||
|
|
||||||
|
/// Returns true when the 64-bit floating point bit pattern is an infinity.
/// The sign bit is masked off, so both +inf and -inf match.
constexpr bool IsInf(u64 value) {
    constexpr u64 non_sign_bits = 0x7FFF'FFFF'FFFF'FFFF;
    // All 16 hex digits are required: dropping the final digit yields
    // 0x07FF'0000'0000'0000, which is a finite value, not infinity.
    constexpr u64 positive_infinity = 0x7FF0'0000'0000'0000;
    return (value & non_sign_bits) == positive_infinity;
}
|
||||||
|
|
||||||
/// Is 64-bit floating point value a QNaN?
|
/// Is 64-bit floating point value a QNaN?
|
||||||
constexpr bool IsQNaN(u64 value) {
|
constexpr bool IsQNaN(u64 value) {
|
||||||
return (value & 0x7FF8'0000'0000'0000) == 0x7FF8'0000'0000'0000;
|
return (value & 0x7FF8'0000'0000'0000) == 0x7FF8'0000'0000'0000;
|
||||||
|
|
Loading…
Reference in a new issue