backend_x64: Accurately handle NaNs

This commit is contained in:
MerryMage 2018-02-18 12:54:39 +00:00
parent e97581d063
commit 07520f32c3
4 changed files with 383 additions and 28 deletions

View file

@ -36,6 +36,7 @@ struct EmitContext {
virtual bool FPSCR_RoundTowardsZero() const = 0;
virtual bool FPSCR_FTZ() const = 0;
virtual bool FPSCR_DN() const = 0;
virtual bool AccurateNaN() const { return true; }
RegAlloc& reg_alloc;
IR::Block& block;

View file

@ -4,10 +4,12 @@
* General Public License version 2 or any later version.
*/
#include "backend_x64/abi.h"
#include "backend_x64/block_of_code.h"
#include "backend_x64/emit_x64.h"
#include "common/assert.h"
#include "common/common_types.h"
#include "common/fp_util.h"
#include "frontend/ir/basic_block.h"
#include "frontend/ir/microinstruction.h"
#include "frontend/ir/opcodes.h"
@ -95,33 +97,127 @@ static void FlushToZero64(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Reg64
code.L(end);
}
/// Emits code that overwrites xmm_value with the f32_nan constant whenever the
/// value compares unordered against itself (i.e. whenever it holds a NaN).
static void DefaultNaN32(BlockOfCode& code, Xbyak::Xmm xmm_value) {
    Xbyak::Label not_nan;

    // ucomiss of a register with itself sets the parity flag only for NaN.
    code.ucomiss(xmm_value, xmm_value);
    code.jnp(not_nan);
    code.movaps(xmm_value, code.MConst(f32_nan));

    code.L(not_nan);
}
/// Emits code that overwrites xmm_value with the f64_nan constant whenever the
/// value compares unordered against itself (i.e. whenever it holds a NaN).
static void DefaultNaN64(BlockOfCode& code, Xbyak::Xmm xmm_value) {
    Xbyak::Label not_nan;

    // ucomisd of a register with itself sets the parity flag only for NaN.
    code.ucomisd(xmm_value, xmm_value);
    code.jnp(not_nan);
    code.movaps(xmm_value, code.MConst(f64_nan));

    code.L(not_nan);
}
// Emits code that replaces the low double of xmm_value with +0.0 when it is a NaN.
// xmm_scratch is clobbered: it ends up holding the comparison mask.
// Note that cmpordsd only writes the low 64 bits of the mask, so the upper 64 bits of
// xmm_scratch stay zero and the final pand also clears the upper half of xmm_value.
static void ZeroIfNaN64(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch) {
// Start from zero so the scratch operand itself is always ordered.
code.pxor(xmm_scratch, xmm_scratch);
code.cmpordsd(xmm_scratch, xmm_value); // true mask when ordered (i.e.: when not an NaN)
// Keep the value where the mask is all-ones, zero it where the mask is clear.
code.pand(xmm_value, xmm_scratch);
}
/// Emits a NaN check that runs ahead of a two-operand single-precision operation.
///
/// When `a` and `b` compare unordered, execution branches to a far-code stub that
/// passes both raw bit patterns to Common::ProcessNaNs, stores the returned value
/// in `a`, and then jumps to the label returned from this function.
///
/// @return A label that is still unbound: the caller has to bind it with code.L()
///         at the point where execution should continue after the stub.
static Xbyak::Label PreProcessNaNs32(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b) {
    using ProcessFn = u32(*)(u32, u32);

    Xbyak::Label nan_path, resume;

    // Parity flag set <=> at least one of the operands is a NaN.
    code.ucomiss(a, b);
    code.jp(nan_path, code.T_NEAR);

    code.SwitchToFarCode();
    code.L(nan_path);

    code.sub(rsp, 8);
    // `a` receives the result, so it is excluded from the save/restore set.
    ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));

    code.xor_(code.ABI_PARAM1.cvt32(), code.ABI_PARAM1.cvt32());
    code.xor_(code.ABI_PARAM2.cvt32(), code.ABI_PARAM2.cvt32());
    code.movd(code.ABI_PARAM1.cvt32(), a);
    code.movd(code.ABI_PARAM2.cvt32(), b);
    code.CallFunction(static_cast<ProcessFn>([](u32 lhs, u32 rhs) -> u32 {
        // This stub is only reached when an operand is a NaN, so the optional is dereferenced directly.
        return *Common::ProcessNaNs(lhs, rhs);
    }));
    code.movd(a, code.ABI_RETURN.cvt32());

    ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
    code.add(rsp, 8);
    code.jmp(resume, code.T_NEAR);

    code.SwitchToNearCode();
    return resume;
}
// Emits code that flips the sign bit of every 32-bit lane of `result` that holds a NaN.
// `tmp` is clobbered (it is used to build the per-lane mask).
static void PostProcessNaNs32(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) {
code.movaps(tmp, result);
// Self-comparison: lanes holding a NaN become all-ones, all other lanes become zero.
code.cmpunordps(tmp, tmp);
// Shift each lane so only bit 31 (the sign bit) survives in the NaN lanes.
code.pslld(tmp, 31);
// XOR toggles the sign bit of exactly those lanes.
code.xorps(result, tmp);
}
/// Emits code that overwrites xmm_value with the f32_nan constant whenever the
/// value compares unordered against itself (i.e. whenever it holds a NaN).
static void DefaultNaN32(BlockOfCode& code, Xbyak::Xmm xmm_value) {
    Xbyak::Label not_nan;

    // ucomiss of a register with itself sets the parity flag only for NaN.
    code.ucomiss(xmm_value, xmm_value);
    code.jnp(not_nan);
    code.movaps(xmm_value, code.MConst(f32_nan));

    code.L(not_nan);
}
/// Emits a NaN check that runs ahead of a two-operand double-precision operation.
///
/// When `a` and `b` compare unordered, execution branches to a far-code stub that
/// passes both raw bit patterns to Common::ProcessNaNs, stores the returned value
/// in `a`, and then jumps to the label returned from this function.
///
/// @return A label that is still unbound: the caller has to bind it with code.L()
///         at the point where execution should continue after the stub.
static Xbyak::Label PreProcessNaNs64(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b) {
    using ProcessFn = u64(*)(u64, u64);

    Xbyak::Label nan_path, resume;

    // Parity flag set <=> at least one of the operands is a NaN.
    code.ucomisd(a, b);
    code.jp(nan_path, code.T_NEAR);

    code.SwitchToFarCode();
    code.L(nan_path);

    code.sub(rsp, 8);
    // `a` receives the result, so it is excluded from the save/restore set.
    ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));

    code.movq(code.ABI_PARAM1, a);
    code.movq(code.ABI_PARAM2, b);
    code.CallFunction(static_cast<ProcessFn>([](u64 lhs, u64 rhs) -> u64 {
        // This stub is only reached when an operand is a NaN, so the optional is dereferenced directly.
        return *Common::ProcessNaNs(lhs, rhs);
    }));
    code.movq(a, code.ABI_RETURN);

    ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
    code.add(rsp, 8);
    code.jmp(resume, code.T_NEAR);

    code.SwitchToNearCode();
    return resume;
}
// Emits code that flips the sign bit of every 64-bit lane of `result` that holds a NaN.
// `tmp` is clobbered (it is used to build the per-lane mask).
static void PostProcessNaNs64(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) {
code.movaps(tmp, result);
// Self-comparison: lanes holding a NaN become all-ones, all other lanes become zero.
code.cmpunordpd(tmp, tmp);
// Shift each lane so only bit 63 (the sign bit) survives in the NaN lanes.
code.psllq(tmp, 63);
// XOR toggles the sign bit of exactly those lanes.
code.xorps(result, tmp);
}
/// Emits code that overwrites xmm_value with the f64_nan constant whenever the
/// value compares unordered against itself (i.e. whenever it holds a NaN).
static void DefaultNaN64(BlockOfCode& code, Xbyak::Xmm xmm_value) {
    Xbyak::Label not_nan;

    // ucomisd of a register with itself sets the parity flag only for NaN.
    code.ucomisd(xmm_value, xmm_value);
    code.jnp(not_nan);
    code.movaps(xmm_value, code.MConst(f64_nan));

    code.L(not_nan);
}
/// Emits a NaN check that runs ahead of a one-operand single-precision operation.
///
/// When `a` is a NaN, execution branches to a far-code stub that ORs 0x00400000
/// (bit 22) into it and then jumps to the label returned from this function.
///
/// @return A label that is still unbound: the caller has to bind it with code.L()
///         at the point where execution should continue after the stub.
static Xbyak::Label ProcessNaN32(BlockOfCode& code, Xbyak::Xmm a) {
    Xbyak::Label nan_path, resume;

    // Parity flag set <=> `a` is a NaN.
    code.ucomiss(a, a);
    code.jp(nan_path, code.T_NEAR);

    code.SwitchToFarCode();
    code.L(nan_path);
    code.orps(a, code.MConst(0x00400000));
    code.jmp(resume, code.T_NEAR);
    code.SwitchToNearCode();

    return resume;
}
/// Emits a NaN check that runs ahead of a one-operand double-precision operation.
///
/// When `a` is a NaN, execution branches to a far-code stub that ORs
/// 0x0008'0000'0000'0000 (bit 51) into it and then jumps to the label returned
/// from this function.
///
/// @return A label that is still unbound: the caller has to bind it with code.L()
///         at the point where execution should continue after the stub.
static Xbyak::Label ProcessNaN64(BlockOfCode& code, Xbyak::Xmm a) {
    Xbyak::Label nan_path, resume;

    // Parity flag set <=> `a` is a NaN.
    code.ucomisd(a, a);
    code.jp(nan_path, code.T_NEAR);

    code.SwitchToFarCode();
    code.L(nan_path);
    code.orps(a, code.MConst(0x0008'0000'0000'0000));
    code.jmp(resume, code.T_NEAR);
    code.SwitchToNearCode();

    return resume;
}
static void FPThreeOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
Xbyak::Label end;
Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
Xbyak::Reg32 gpr_scratch = ctx.reg_alloc.ScratchGpr().cvt32();
@ -130,13 +226,19 @@ static void FPThreeOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, voi
DenormalsAreZero32(code, result, gpr_scratch);
DenormalsAreZero32(code, operand, gpr_scratch);
}
if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
end = PreProcessNaNs32(code, result, operand);
}
(code.*fn)(result, operand);
if (ctx.FPSCR_FTZ()) {
FlushToZero32(code, result, gpr_scratch);
}
if (ctx.FPSCR_DN()) {
DefaultNaN32(code, result);
} else if (ctx.AccurateNaN()) {
PostProcessNaNs32(code, result, operand);
}
code.L(end);
ctx.reg_alloc.DefineValue(inst, result);
}
@ -144,6 +246,8 @@ static void FPThreeOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, voi
static void FPThreeOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
Xbyak::Label end;
Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr();
@ -152,13 +256,19 @@ static void FPThreeOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, voi
DenormalsAreZero64(code, result, gpr_scratch);
DenormalsAreZero64(code, operand, gpr_scratch);
}
if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
end = PreProcessNaNs64(code, result, operand);
}
(code.*fn)(result, operand);
if (ctx.FPSCR_FTZ()) {
FlushToZero64(code, result, gpr_scratch);
}
if (ctx.FPSCR_DN()) {
DefaultNaN64(code, result);
} else if (ctx.AccurateNaN()) {
PostProcessNaNs64(code, result, operand);
}
code.L(end);
ctx.reg_alloc.DefineValue(inst, result);
}
@ -166,20 +276,27 @@ static void FPThreeOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, voi
/// Emits a one-input single-precision operation: `fn` is applied with the first
/// argument of `inst` as both destination and source.
///
/// Around the operation this emits, depending on the context:
///  - FPSCR_FTZ: DenormalsAreZero32 on the input and FlushToZero32 on the output;
///  - AccurateNaN without FPSCR_DN: ProcessNaN32 before and PostProcessNaNs32 after;
///  - FPSCR_DN: DefaultNaN32 on the output.
static void FPTwoOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
    auto arg_info = ctx.reg_alloc.GetArgumentInfo(inst);

    // Bound at the very end; only ever jumped to by the ProcessNaN32 far-code stub.
    Xbyak::Label done;

    Xbyak::Xmm value = ctx.reg_alloc.UseScratchXmm(arg_info[0]);
    Xbyak::Reg32 scratch32 = ctx.reg_alloc.ScratchGpr().cvt32();

    if (ctx.FPSCR_FTZ()) {
        DenormalsAreZero32(code, value, scratch32);
    }
    if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
        done = ProcessNaN32(code, value);
    }

    (code.*fn)(value, value);

    if (ctx.FPSCR_FTZ()) {
        FlushToZero32(code, value, scratch32);
    }
    if (ctx.FPSCR_DN()) {
        DefaultNaN32(code, value);
    } else if (ctx.AccurateNaN()) {
        PostProcessNaNs32(code, value, ctx.reg_alloc.ScratchXmm());
    }

    code.L(done);
    ctx.reg_alloc.DefineValue(inst, value);
}
@ -187,20 +304,27 @@ static void FPTwoOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void
/// Emits a one-input double-precision operation: `fn` is applied with the first
/// argument of `inst` as both destination and source.
///
/// Around the operation this emits, depending on the context:
///  - FPSCR_FTZ: DenormalsAreZero64 on the input and FlushToZero64 on the output;
///  - AccurateNaN without FPSCR_DN: ProcessNaN64 before and PostProcessNaNs64 after;
///  - FPSCR_DN: DefaultNaN64 on the output.
static void FPTwoOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
    auto arg_info = ctx.reg_alloc.GetArgumentInfo(inst);

    // Bound at the very end; only ever jumped to by the ProcessNaN64 far-code stub.
    Xbyak::Label done;

    Xbyak::Xmm value = ctx.reg_alloc.UseScratchXmm(arg_info[0]);
    Xbyak::Reg64 scratch64 = ctx.reg_alloc.ScratchGpr();

    if (ctx.FPSCR_FTZ()) {
        DenormalsAreZero64(code, value, scratch64);
    }
    if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
        done = ProcessNaN64(code, value);
    }

    (code.*fn)(value, value);

    if (ctx.FPSCR_FTZ()) {
        FlushToZero64(code, value, scratch64);
    }
    if (ctx.FPSCR_DN()) {
        DefaultNaN64(code, value);
    } else if (ctx.AccurateNaN()) {
        PostProcessNaNs64(code, value, ctx.reg_alloc.ScratchXmm());
    }

    code.L(done);
    ctx.reg_alloc.DefineValue(inst, value);
}

View file

@ -4,8 +4,10 @@
* General Public License version 2 or any later version.
*/
#include "backend_x64/abi.h"
#include "backend_x64/block_of_code.h"
#include "backend_x64/emit_x64.h"
#include "common/fp_util.h"
#include "frontend/ir/basic_block.h"
#include "frontend/ir/microinstruction.h"
@ -14,31 +16,183 @@ namespace Dynarmic::BackendX64 {
using namespace Xbyak::util;
// Emits a packed single-precision two-operand vector operation (`fn`) for `inst`.
//
// Fast path (AccurateNaN disabled or FPSCR_DN set): the operation is emitted directly
// and, when FPSCR_DN is set, every NaN lane of the result is replaced with 0x7fc00000.
// Accurate path: the operation is emitted inline and a far-code fallback, taken when
// any lane of an input or of the result is a NaN, fixes the lanes up via
// Common::ProcessNaNs.
//
// NOTE(review): this block appears to interleave pre-change and post-change lines of a
// diff whose +/- markers were lost (two signature lines, two xmm_a declarations, two
// DefineValue calls). The suspected stale lines are flagged below - confirm against the
// real source before relying on this text.
template <typename Function>
// NOTE(review): presumably the old (pre-rename) signature line.
static void EmitVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
static void EmitVectorOperation32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
(code.*fn)(xmm_a, xmm_b);
if (ctx.FPSCR_DN()) {
Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
// tmp = all-ones.
code.pcmpeqw(tmp, tmp);
code.movaps(nan_mask, xmm_a);
// nan_mask = all-ones in lanes that are NOT NaN.
code.cmpordps(nan_mask, nan_mask);
// Clear the NaN lanes of the result.
code.andps(xmm_a, nan_mask);
// Invert the mask: all-ones in lanes that ARE NaN.
code.xorps(nan_mask, tmp);
// Put 0x7fc00000 into exactly those lanes and merge it into the result.
code.andps(nan_mask, code.MConst(0x7fc0'0000'7fc0'0000, 0x7fc0'0000'7fc0'0000));
code.orps(xmm_a, nan_mask);
}
ctx.reg_alloc.DefineValue(inst, xmm_a);
return;
}
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
// NOTE(review): presumably a removed line - xmm_a is declared again three lines below.
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
Xbyak::Label end, nan;
Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
// NOTE(review): presumably a removed line - the operation is emitted on `result` below.
(code.*fn)(xmm_a, xmm_b);
code.movaps(nan_mask, xmm_b);
code.movaps(result, xmm_a);
// nan_mask lanes become all-ones where either input lane is a NaN.
code.cmpunordps(nan_mask, xmm_a);
(code.*fn)(result, xmm_b);
// Second compare against the freshly computed result.
code.cmpunordps(nan_mask, result);
// Set ZF when the mask is entirely zero (no NaN lane was flagged).
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
code.ptest(nan_mask, nan_mask);
} else {
Xbyak::Reg32 bitmask = ctx.reg_alloc.ScratchGpr().cvt32();
code.movmskps(bitmask, nan_mask);
code.cmp(bitmask, 0);
}
code.jz(end);
code.jmp(nan, code.T_NEAR);
code.L(end);
// NOTE(review): presumably a removed line - the value is defined from `result` at the end.
ctx.reg_alloc.DefineValue(inst, xmm_a);
code.SwitchToFarCode();
code.L(nan);
code.sub(rsp, 8);
// `result` is reloaded from the stack after the call, so it is excluded from save/restore.
ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
// Three 16-byte slots above the shadow space: result, a, b (in that order).
const size_t stack_space = 3 * 16;
code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
code.movaps(xword[code.ABI_PARAM1], result);
code.movaps(xword[code.ABI_PARAM2], xmm_a);
code.movaps(xword[code.ABI_PARAM3], xmm_b);
code.CallFunction(static_cast<void(*)(std::array<u32, 4>&, const std::array<u32, 4>&, const std::array<u32, 4>&)>(
[](std::array<u32, 4>& result, const std::array<u32, 4>& a, const std::array<u32, 4>& b) {
// Per lane: a NaN input yields the value chosen by ProcessNaNs; otherwise a NaN
// produced by the operation itself is replaced with 0x7fc00000.
for (size_t i = 0; i < 4; ++i) {
if (auto r = Common::ProcessNaNs(a[i], b[i])) {
result[i] = *r;
} else if (Common::IsNaN(result[i])) {
result[i] = 0x7fc00000;
}
}
}
));
// Reload the fixed-up result from the first stack slot.
code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
code.add(rsp, stack_space + ABI_SHADOW_SPACE);
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
code.add(rsp, 8);
code.jmp(end, code.T_NEAR);
code.SwitchToNearCode();
ctx.reg_alloc.DefineValue(inst, result);
}
/// Emits a packed double-precision two-operand vector operation (`fn`) for `inst`.
///
/// Fast path (AccurateNaN disabled or FPSCR_DN set): the operation is emitted directly
/// and, when FPSCR_DN is set, every NaN lane of the result is replaced with
/// 0x7ff8'0000'0000'0000.
///
/// Accurate path: the operation is emitted inline together with a check for NaN lanes
/// in the inputs or the result. When any is found, a far-code fallback spills the
/// result and both operands to the stack and fixes the result up lane by lane using
/// Common::ProcessNaNs.
template <typename Function>
static void EmitVectorOperation64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
    if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
        Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
        Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);

        (code.*fn)(xmm_a, xmm_b);

        if (ctx.FPSCR_DN()) {
            Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
            Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
            // tmp = all-ones.
            code.pcmpeqw(tmp, tmp);
            code.movaps(nan_mask, xmm_a);
            // nan_mask = all-ones in lanes that are NOT NaN.
            code.cmpordpd(nan_mask, nan_mask);
            // Clear the NaN lanes of the result.
            code.andps(xmm_a, nan_mask);
            // Invert the mask: all-ones in lanes that ARE NaN.
            code.xorps(nan_mask, tmp);
            // Put 0x7ff8'0000'0000'0000 into exactly those lanes and merge it in.
            code.andps(nan_mask, code.MConst(0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000));
            code.orps(xmm_a, nan_mask);
        }

        ctx.reg_alloc.DefineValue(inst, xmm_a);
        return;
    }

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    Xbyak::Label end, nan;
    Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
    Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();

    code.movaps(nan_mask, xmm_b);
    code.movaps(result, xmm_a);
    // nan_mask lanes become all-ones where either input lane is a NaN.
    code.cmpunordpd(nan_mask, xmm_a);
    (code.*fn)(result, xmm_b);
    // Second compare against the freshly computed result.
    code.cmpunordpd(nan_mask, result);

    // Set ZF when the mask is entirely zero (no NaN lane was flagged).
    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
        code.ptest(nan_mask, nan_mask);
    } else {
        Xbyak::Reg32 bitmask = ctx.reg_alloc.ScratchGpr().cvt32();
        code.movmskps(bitmask, nan_mask);
        code.cmp(bitmask, 0);
    }
    code.jz(end);
    code.jmp(nan, code.T_NEAR);
    code.L(end);

    code.SwitchToFarCode();
    code.L(nan);

    code.sub(rsp, 8);
    // `result` is reloaded from the stack after the call, so it is excluded from save/restore.
    ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));

    // Three 16-byte slots above the shadow space: result, a, b (in that order).
    const size_t stack_space = 3 * 16;
    code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
    code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
    code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
    code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
    code.movaps(xword[code.ABI_PARAM1], result);
    code.movaps(xword[code.ABI_PARAM2], xmm_a);
    code.movaps(xword[code.ABI_PARAM3], xmm_b);

    code.CallFunction(static_cast<void(*)(std::array<u64, 2>&, const std::array<u64, 2>&, const std::array<u64, 2>&)>(
        [](std::array<u64, 2>& result, const std::array<u64, 2>& a, const std::array<u64, 2>& b) {
            // A 128-bit vector holds only two doubles. Bounding the loop by the array
            // size (rather than a hard-coded 4) keeps the accesses inside each 16-byte
            // stack slot instead of spilling into the neighbouring slots.
            for (size_t i = 0; i < result.size(); ++i) {
                if (auto r = Common::ProcessNaNs(a[i], b[i])) {
                    result[i] = *r;
                } else if (Common::IsNaN(result[i])) {
                    result[i] = 0x7ff8'0000'0000'0000;
                }
            }
        }
    ));

    // Reload the fixed-up result from the first stack slot.
    code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
    code.add(rsp, stack_space + ABI_SHADOW_SPACE);
    ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
    code.add(rsp, 8);
    code.jmp(end, code.T_NEAR);

    code.SwitchToNearCode();

    ctx.reg_alloc.DefineValue(inst, result);
}
// Emits a packed single-precision vector add (addps) for `inst`.
// NOTE(review): this looks like a diff rendering whose +/- markers were lost; the
// EmitVectorOperation call is presumably the removed line and the EmitVectorOperation32
// call its replacement - confirm against the real source.
void EmitX64::EmitFPVectorAdd32(EmitContext& ctx, IR::Inst* inst) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::addps);
EmitVectorOperation32(code, ctx, inst, &Xbyak::CodeGenerator::addps);
}
// Emits a packed double-precision vector add (addpd) for `inst`.
// NOTE(review): this looks like a diff rendering whose +/- markers were lost; the
// EmitVectorOperation call is presumably the removed line and the EmitVectorOperation64
// call its replacement - confirm against the real source.
void EmitX64::EmitFPVectorAdd64(EmitContext& ctx, IR::Inst* inst) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::addpd);
EmitVectorOperation64(code, ctx, inst, &Xbyak::CodeGenerator::addpd);
}
// Emits a packed single-precision vector subtract (subps) for `inst`.
// NOTE(review): this looks like a diff rendering whose +/- markers were lost; the
// EmitVectorOperation call is presumably the removed line and the EmitVectorOperation32
// call its replacement - confirm against the real source.
void EmitX64::EmitFPVectorSub32(EmitContext& ctx, IR::Inst* inst) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::subps);
EmitVectorOperation32(code, ctx, inst, &Xbyak::CodeGenerator::subps);
}
// Emits a packed double-precision vector subtract (subpd) for `inst`.
// NOTE(review): this looks like a diff rendering whose +/- markers were lost; the
// EmitVectorOperation call is presumably the removed line and the EmitVectorOperation64
// call its replacement - confirm against the real source.
void EmitX64::EmitFPVectorSub64(EmitContext& ctx, IR::Inst* inst) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::subpd);
EmitVectorOperation64(code, ctx, inst, &Xbyak::CodeGenerator::subpd);
}
} // namespace Dynarmic::BackendX64

76
src/common/fp_util.h Normal file
View file

@ -0,0 +1,76 @@
/* This file is part of the dynarmic project.
* Copyright (c) 2018 MerryMage
* This software may be used and distributed according to the terms of the GNU
* General Public License version 2 or any later version.
*/
#pragma once
#include <boost/optional.hpp>
namespace Dynarmic {
namespace Common {
/// Returns true when the 32-bit pattern encodes a quiet NaN, i.e. all exponent
/// bits and the top mantissa bit are set. The sign bit is ignored.
constexpr bool IsQNaN(u32 value) {
    constexpr u32 quiet_nan_bits = 0x7fc00000;
    return (value & quiet_nan_bits) == quiet_nan_bits;
}
/// Returns true when the 32-bit pattern encodes a signalling NaN: all exponent
/// bits set, top mantissa bit clear, and a non-zero mantissa. The sign bit is ignored.
constexpr bool IsSNaN(u32 value) {
    const bool exponent_set_and_quiet_bit_clear = (value & 0x7fc00000) == 0x7f800000;
    const bool mantissa_non_zero = (value & 0x007fffff) != 0;
    return exponent_set_and_quiet_bit_clear && mantissa_non_zero;
}
/// Returns true when the 32-bit pattern encodes any NaN, signalling or quiet.
constexpr bool IsNaN(u32 value) {
    return IsSNaN(value) || IsQNaN(value);
}
/// Selects the NaN an ARM processor would return for a pair of 32-bit operands.
///
/// Priority order: a signalling NaN in `a`, then a signalling NaN in `b` (either one
/// is returned with bit 22 set), then a quiet NaN in `a`, then a quiet NaN in `b`
/// (returned unchanged).
///
/// @return The selected NaN, or boost::none when neither operand is a NaN.
inline boost::optional<u32> ProcessNaNs(u32 a, u32 b) {
    constexpr u32 quiet_bit = 0x00400000;

    if (IsSNaN(a)) {
        return a | quiet_bit;
    }
    if (IsSNaN(b)) {
        return b | quiet_bit;
    }
    if (IsQNaN(a)) {
        return a;
    }
    if (IsQNaN(b)) {
        return b;
    }
    return boost::none;
}
/// Returns true when the 64-bit pattern encodes a quiet NaN, i.e. all exponent
/// bits and the top mantissa bit are set. The sign bit is ignored.
constexpr bool IsQNaN(u64 value) {
    constexpr u64 quiet_nan_bits = 0x7FF8'0000'0000'0000;
    return (value & quiet_nan_bits) == quiet_nan_bits;
}
/// Returns true when the 64-bit pattern encodes a signalling NaN: all exponent
/// bits set, top mantissa bit clear, and a non-zero mantissa. The sign bit is ignored.
constexpr bool IsSNaN(u64 value) {
    const bool exponent_set_and_quiet_bit_clear =
        (value & 0x7FF8'0000'0000'0000) == 0x7FF0'0000'0000'0000;
    const bool mantissa_non_zero = (value & 0x0007'FFFF'FFFF'FFFF) != 0;
    return exponent_set_and_quiet_bit_clear && mantissa_non_zero;
}
/// Returns true when the 64-bit pattern encodes any NaN, signalling or quiet.
constexpr bool IsNaN(u64 value) {
    return IsSNaN(value) || IsQNaN(value);
}
/// Selects the NaN an ARM processor would return for a pair of 64-bit operands.
///
/// Priority order: a signalling NaN in `a`, then a signalling NaN in `b` (either one
/// is returned with bit 51 set), then a quiet NaN in `a`, then a quiet NaN in `b`
/// (returned unchanged).
///
/// @return The selected NaN, or boost::none when neither operand is a NaN.
inline boost::optional<u64> ProcessNaNs(u64 a, u64 b) {
    constexpr u64 quiet_bit = 0x0008'0000'0000'0000;

    if (IsSNaN(a)) {
        return a | quiet_bit;
    }
    if (IsSNaN(b)) {
        return b | quiet_bit;
    }
    if (IsQNaN(a)) {
        return a;
    }
    if (IsQNaN(b)) {
        return b;
    }
    return boost::none;
}
} // namespace Common
} // namespace Dynarmic