backend_x64: Accurately handle NaNs
This commit is contained in:
parent
e97581d063
commit
07520f32c3
4 changed files with 383 additions and 28 deletions
|
@ -36,6 +36,7 @@ struct EmitContext {
|
||||||
virtual bool FPSCR_RoundTowardsZero() const = 0;
|
virtual bool FPSCR_RoundTowardsZero() const = 0;
|
||||||
virtual bool FPSCR_FTZ() const = 0;
|
virtual bool FPSCR_FTZ() const = 0;
|
||||||
virtual bool FPSCR_DN() const = 0;
|
virtual bool FPSCR_DN() const = 0;
|
||||||
|
virtual bool AccurateNaN() const { return true; }
|
||||||
|
|
||||||
RegAlloc& reg_alloc;
|
RegAlloc& reg_alloc;
|
||||||
IR::Block& block;
|
IR::Block& block;
|
||||||
|
|
|
@ -4,10 +4,12 @@
|
||||||
* General Public License version 2 or any later version.
|
* General Public License version 2 or any later version.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "backend_x64/abi.h"
|
||||||
#include "backend_x64/block_of_code.h"
|
#include "backend_x64/block_of_code.h"
|
||||||
#include "backend_x64/emit_x64.h"
|
#include "backend_x64/emit_x64.h"
|
||||||
#include "common/assert.h"
|
#include "common/assert.h"
|
||||||
#include "common/common_types.h"
|
#include "common/common_types.h"
|
||||||
|
#include "common/fp_util.h"
|
||||||
#include "frontend/ir/basic_block.h"
|
#include "frontend/ir/basic_block.h"
|
||||||
#include "frontend/ir/microinstruction.h"
|
#include "frontend/ir/microinstruction.h"
|
||||||
#include "frontend/ir/opcodes.h"
|
#include "frontend/ir/opcodes.h"
|
||||||
|
@ -95,33 +97,127 @@ static void FlushToZero64(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Reg64
|
||||||
code.L(end);
|
code.L(end);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Emits code that overwrites xmm_value with the constant built from f32_nan
/// when its low single-precision lane holds a NaN; otherwise xmm_value is untouched.
static void DefaultNaN32(BlockOfCode& code, Xbyak::Xmm xmm_value) {
    Xbyak::Label not_nan;

    // A value compared with itself is unordered (parity flag set) only when it is a NaN.
    code.ucomiss(xmm_value, xmm_value);
    code.jnp(not_nan);
    code.movaps(xmm_value, code.MConst(f32_nan));
    code.L(not_nan);
}
|
|
||||||
|
|
||||||
/// Emits code that overwrites xmm_value with the constant built from f64_nan
/// when its low double-precision lane holds a NaN; otherwise xmm_value is untouched.
static void DefaultNaN64(BlockOfCode& code, Xbyak::Xmm xmm_value) {
    Xbyak::Label not_nan;

    // A value compared with itself is unordered (parity flag set) only when it is a NaN.
    code.ucomisd(xmm_value, xmm_value);
    code.jnp(not_nan);
    code.movaps(xmm_value, code.MConst(f64_nan));
    code.L(not_nan);
}
|
|
||||||
|
|
||||||
/// Emits code that zeroes the low double of xmm_value when it is a NaN.
/// xmm_scratch is clobbered.
static void ZeroIfNaN64(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch) {
    // Start from +0.0 so that the comparison below can only be unordered because of xmm_value.
    code.pxor(xmm_scratch, xmm_scratch);
    // Low 64 bits of the scratch become an all-ones mask when xmm_value is ordered (not a NaN)
    // and stay zero when it is a NaN.
    code.cmpordsd(xmm_scratch, xmm_value);
    // Keep the value under the mask; a NaN is therefore replaced by zero bits.
    code.pand(xmm_value, xmm_scratch);
}
|
||||||
|
|
||||||
|
/// Emits the NaN pre-check for a two-operand single-precision operation.
///
/// If either `a` or `b` is a NaN, execution is diverted to far code that calls a host helper
/// applying Common::ProcessNaNs to the raw 32-bit patterns, writes the helper's answer into `a`
/// and then jumps to the returned label.
///
/// @return An unbound label; the caller binds it (code.L) after the code for the non-NaN path.
static Xbyak::Label PreProcessNaNs32(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b) {
    Xbyak::Label nan_path, resume;

    // Host-side helper. It is only reached when at least one operand is a NaN, which is the
    // case in which ProcessNaNs yields a value to dereference.
    const auto pick_nan = static_cast<u32(*)(u32, u32)>([](u32 lhs, u32 rhs) -> u32 {
        return *Common::ProcessNaNs(lhs, rhs);
    });

    // ucomiss reports "unordered" through the parity flag when an operand is a NaN.
    code.ucomiss(a, b);
    code.jp(nan_path, code.T_NEAR);

    code.SwitchToFarCode();
    code.L(nan_path);

    // NOTE(review): the extra 8 bytes presumably keep the stack aligned for the call - confirm.
    code.sub(rsp, 8);
    ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));

    // Clear both parameter registers, then load the raw bit patterns of the operands.
    code.xor_(code.ABI_PARAM1.cvt32(), code.ABI_PARAM1.cvt32());
    code.xor_(code.ABI_PARAM2.cvt32(), code.ABI_PARAM2.cvt32());
    code.movd(code.ABI_PARAM1.cvt32(), a);
    code.movd(code.ABI_PARAM2.cvt32(), b);
    code.CallFunction(pick_nan);

    // `a` was excluded from the save/restore set, so the value written here survives the pop.
    code.movd(a, code.ABI_RETURN.cvt32());
    ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
    code.add(rsp, 8);

    code.jmp(resume, code.T_NEAR);
    code.SwitchToNearCode();

    return resume;
}
|
||||||
|
|
||||||
|
/// Emits code that toggles the sign bit of every single-precision lane of `result` that holds
/// a NaN. `tmp` is clobbered.
/// NOTE(review): presumably this turns the negative default NaN produced by x86 into the
/// positive default NaN expected by the guest - confirm.
static void PostProcessNaNs32(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) {
    // tmp := all-ones in each lane where result is a NaN, zero elsewhere.
    code.movaps(tmp, result);
    code.cmpunordps(tmp, tmp);
    // Reduce each mask to just bit 31 (the sign bit).
    code.pslld(tmp, 31);
    // Flip the sign bit of the NaN lanes only.
    code.xorps(result, tmp);
}
|
||||||
|
|
||||||
|
/// Emits code that overwrites xmm_value with the constant built from f32_nan
/// when its low single-precision lane holds a NaN; otherwise xmm_value is untouched.
static void DefaultNaN32(BlockOfCode& code, Xbyak::Xmm xmm_value) {
    Xbyak::Label not_nan;

    // A value compared with itself is unordered (parity flag set) only when it is a NaN.
    code.ucomiss(xmm_value, xmm_value);
    code.jnp(not_nan);
    code.movaps(xmm_value, code.MConst(f32_nan));
    code.L(not_nan);
}
|
||||||
|
|
||||||
|
/// Emits the NaN pre-check for a two-operand double-precision operation.
///
/// If either `a` or `b` is a NaN, execution is diverted to far code that calls a host helper
/// applying Common::ProcessNaNs to the raw 64-bit patterns, writes the helper's answer into `a`
/// and then jumps to the returned label.
///
/// @return An unbound label; the caller binds it (code.L) after the code for the non-NaN path.
static Xbyak::Label PreProcessNaNs64(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b) {
    Xbyak::Label nan_path, resume;

    // Host-side helper. It is only reached when at least one operand is a NaN, which is the
    // case in which ProcessNaNs yields a value to dereference.
    const auto pick_nan = static_cast<u64(*)(u64, u64)>([](u64 lhs, u64 rhs) -> u64 {
        return *Common::ProcessNaNs(lhs, rhs);
    });

    // ucomisd reports "unordered" through the parity flag when an operand is a NaN.
    code.ucomisd(a, b);
    code.jp(nan_path, code.T_NEAR);

    code.SwitchToFarCode();
    code.L(nan_path);

    // NOTE(review): the extra 8 bytes presumably keep the stack aligned for the call - confirm.
    code.sub(rsp, 8);
    ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));

    // Hand the raw bit patterns of the operands to the helper.
    code.movq(code.ABI_PARAM1, a);
    code.movq(code.ABI_PARAM2, b);
    code.CallFunction(pick_nan);

    // `a` was excluded from the save/restore set, so the value written here survives the pop.
    code.movq(a, code.ABI_RETURN);
    ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
    code.add(rsp, 8);

    code.jmp(resume, code.T_NEAR);
    code.SwitchToNearCode();

    return resume;
}
|
||||||
|
|
||||||
|
/// Emits code that toggles the sign bit of every double-precision lane of `result` that holds
/// a NaN. `tmp` is clobbered.
/// NOTE(review): presumably this turns the negative default NaN produced by x86 into the
/// positive default NaN expected by the guest - confirm.
static void PostProcessNaNs64(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) {
    // tmp := all-ones in each lane where result is a NaN, zero elsewhere.
    code.movaps(tmp, result);
    code.cmpunordpd(tmp, tmp);
    // Reduce each mask to just bit 63 (the sign bit).
    code.psllq(tmp, 63);
    // Flip the sign bit of the NaN lanes only (a bitwise xor, so the ps form is sufficient).
    code.xorps(result, tmp);
}
|
||||||
|
|
||||||
|
/// Emits code that overwrites xmm_value with the constant built from f64_nan
/// when its low double-precision lane holds a NaN; otherwise xmm_value is untouched.
static void DefaultNaN64(BlockOfCode& code, Xbyak::Xmm xmm_value) {
    Xbyak::Label not_nan;

    // A value compared with itself is unordered (parity flag set) only when it is a NaN.
    code.ucomisd(xmm_value, xmm_value);
    code.jnp(not_nan);
    code.movaps(xmm_value, code.MConst(f64_nan));
    code.L(not_nan);
}
|
||||||
|
|
||||||
|
/// Emits the NaN pre-check for a one-operand single-precision operation.
///
/// If `a` is a NaN, execution is diverted to far code that ORs 0x00400000 (the top mantissa
/// bit, i.e. the quiet bit) into `a` and then jumps to the returned label.
///
/// @return An unbound label; the caller binds it (code.L) after the code for the non-NaN path.
static Xbyak::Label ProcessNaN32(BlockOfCode& code, Xbyak::Xmm a) {
    Xbyak::Label nan_path, resume;

    // A value compared with itself is unordered (parity flag set) only when it is a NaN.
    code.ucomiss(a, a);
    code.jp(nan_path, code.T_NEAR);

    code.SwitchToFarCode();
    code.L(nan_path);

    // Quieten the NaN in place; the rest of its bits are kept.
    code.orps(a, code.MConst(0x00400000));

    code.jmp(resume, code.T_NEAR);
    code.SwitchToNearCode();

    return resume;
}
|
||||||
|
|
||||||
|
/// Emits the NaN pre-check for a one-operand double-precision operation.
///
/// If `a` is a NaN, execution is diverted to far code that ORs 0x0008'0000'0000'0000 (the top
/// mantissa bit, i.e. the quiet bit) into `a` and then jumps to the returned label.
///
/// @return An unbound label; the caller binds it (code.L) after the code for the non-NaN path.
static Xbyak::Label ProcessNaN64(BlockOfCode& code, Xbyak::Xmm a) {
    Xbyak::Label nan_path, resume;

    // A value compared with itself is unordered (parity flag set) only when it is a NaN.
    code.ucomisd(a, a);
    code.jp(nan_path, code.T_NEAR);

    code.SwitchToFarCode();
    code.L(nan_path);

    // Quieten the NaN in place; the rest of its bits are kept.
    code.orps(a, code.MConst(0x0008'0000'0000'0000));

    code.jmp(resume, code.T_NEAR);
    code.SwitchToNearCode();

    return resume;
}
|
||||||
|
|
||||||
static void FPThreeOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
|
static void FPThreeOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
|
Xbyak::Label end;
|
||||||
|
|
||||||
Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
|
Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||||
Xbyak::Reg32 gpr_scratch = ctx.reg_alloc.ScratchGpr().cvt32();
|
Xbyak::Reg32 gpr_scratch = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
@ -130,13 +226,19 @@ static void FPThreeOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, voi
|
||||||
DenormalsAreZero32(code, result, gpr_scratch);
|
DenormalsAreZero32(code, result, gpr_scratch);
|
||||||
DenormalsAreZero32(code, operand, gpr_scratch);
|
DenormalsAreZero32(code, operand, gpr_scratch);
|
||||||
}
|
}
|
||||||
|
if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
|
||||||
|
end = PreProcessNaNs32(code, result, operand);
|
||||||
|
}
|
||||||
(code.*fn)(result, operand);
|
(code.*fn)(result, operand);
|
||||||
if (ctx.FPSCR_FTZ()) {
|
if (ctx.FPSCR_FTZ()) {
|
||||||
FlushToZero32(code, result, gpr_scratch);
|
FlushToZero32(code, result, gpr_scratch);
|
||||||
}
|
}
|
||||||
if (ctx.FPSCR_DN()) {
|
if (ctx.FPSCR_DN()) {
|
||||||
DefaultNaN32(code, result);
|
DefaultNaN32(code, result);
|
||||||
|
} else if (ctx.AccurateNaN()) {
|
||||||
|
PostProcessNaNs32(code, result, operand);
|
||||||
}
|
}
|
||||||
|
code.L(end);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
}
|
}
|
||||||
|
@ -144,6 +246,8 @@ static void FPThreeOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, voi
|
||||||
static void FPThreeOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
|
static void FPThreeOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
|
Xbyak::Label end;
|
||||||
|
|
||||||
Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
|
Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||||
Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr();
|
Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr();
|
||||||
|
@ -152,13 +256,19 @@ static void FPThreeOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, voi
|
||||||
DenormalsAreZero64(code, result, gpr_scratch);
|
DenormalsAreZero64(code, result, gpr_scratch);
|
||||||
DenormalsAreZero64(code, operand, gpr_scratch);
|
DenormalsAreZero64(code, operand, gpr_scratch);
|
||||||
}
|
}
|
||||||
|
if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
|
||||||
|
end = PreProcessNaNs64(code, result, operand);
|
||||||
|
}
|
||||||
(code.*fn)(result, operand);
|
(code.*fn)(result, operand);
|
||||||
if (ctx.FPSCR_FTZ()) {
|
if (ctx.FPSCR_FTZ()) {
|
||||||
FlushToZero64(code, result, gpr_scratch);
|
FlushToZero64(code, result, gpr_scratch);
|
||||||
}
|
}
|
||||||
if (ctx.FPSCR_DN()) {
|
if (ctx.FPSCR_DN()) {
|
||||||
DefaultNaN64(code, result);
|
DefaultNaN64(code, result);
|
||||||
|
} else if (ctx.AccurateNaN()) {
|
||||||
|
PostProcessNaNs64(code, result, operand);
|
||||||
}
|
}
|
||||||
|
code.L(end);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
}
|
}
|
||||||
|
@ -166,20 +276,27 @@ static void FPThreeOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, voi
|
||||||
/// Emits a one-operand single-precision operation: `fn` is applied with the argument register
/// as both destination and source, surrounded by the FPSCR-dependent fix-ups.
///
/// - FTZ: denormal inputs are zeroed before `fn` and the result is flushed afterwards.
/// - DN: a NaN result is replaced via DefaultNaN32.
/// - Accurate NaN mode without DN: a NaN input is handled by ProcessNaN32 (which bypasses
///   `fn` and jumps straight to the end label); a NaN result gets PostProcessNaNs32.
static void FPTwoOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
    auto arg_info = ctx.reg_alloc.GetArgumentInfo(inst);

    // Stays unbound-and-unused unless ProcessNaN32 hands back its own label below.
    Xbyak::Label done;

    Xbyak::Xmm value = ctx.reg_alloc.UseScratchXmm(arg_info[0]);
    Xbyak::Reg32 scratch_gpr = ctx.reg_alloc.ScratchGpr().cvt32();

    if (ctx.FPSCR_FTZ()) {
        DenormalsAreZero32(code, value, scratch_gpr);
    }

    if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
        done = ProcessNaN32(code, value);
    }

    (code.*fn)(value, value);

    if (ctx.FPSCR_FTZ()) {
        FlushToZero32(code, value, scratch_gpr);
    }

    if (ctx.FPSCR_DN()) {
        DefaultNaN32(code, value);
    } else if (ctx.AccurateNaN()) {
        // NOTE(review): this register is requested after the jump emitted by ProcessNaN32.
        // If the allocator can emit spill code here, the NaN path would skip it - confirm.
        Xbyak::Xmm nan_tmp = ctx.reg_alloc.ScratchXmm();
        PostProcessNaNs32(code, value, nan_tmp);
    }

    // Landing point for the far-code NaN path.
    code.L(done);

    ctx.reg_alloc.DefineValue(inst, value);
}
|
||||||
|
@ -187,20 +304,27 @@ static void FPTwoOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void
|
||||||
/// Emits a one-operand double-precision operation: `fn` is applied with the argument register
/// as both destination and source, surrounded by the FPSCR-dependent fix-ups.
///
/// - FTZ: denormal inputs are zeroed before `fn` and the result is flushed afterwards.
/// - DN: a NaN result is replaced via DefaultNaN64.
/// - Accurate NaN mode without DN: a NaN input is handled by ProcessNaN64 (which bypasses
///   `fn` and jumps straight to the end label); a NaN result gets PostProcessNaNs64.
static void FPTwoOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
    auto arg_info = ctx.reg_alloc.GetArgumentInfo(inst);

    // Stays unbound-and-unused unless ProcessNaN64 hands back its own label below.
    Xbyak::Label done;

    Xbyak::Xmm value = ctx.reg_alloc.UseScratchXmm(arg_info[0]);
    Xbyak::Reg64 scratch_gpr = ctx.reg_alloc.ScratchGpr();

    if (ctx.FPSCR_FTZ()) {
        DenormalsAreZero64(code, value, scratch_gpr);
    }

    if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
        done = ProcessNaN64(code, value);
    }

    (code.*fn)(value, value);

    if (ctx.FPSCR_FTZ()) {
        FlushToZero64(code, value, scratch_gpr);
    }

    if (ctx.FPSCR_DN()) {
        DefaultNaN64(code, value);
    } else if (ctx.AccurateNaN()) {
        // NOTE(review): this register is requested after the jump emitted by ProcessNaN64.
        // If the allocator can emit spill code here, the NaN path would skip it - confirm.
        Xbyak::Xmm nan_tmp = ctx.reg_alloc.ScratchXmm();
        PostProcessNaNs64(code, value, nan_tmp);
    }

    // Landing point for the far-code NaN path.
    code.L(done);

    ctx.reg_alloc.DefineValue(inst, value);
}
|
||||||
|
|
|
@ -4,8 +4,10 @@
|
||||||
* General Public License version 2 or any later version.
|
* General Public License version 2 or any later version.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "backend_x64/abi.h"
|
||||||
#include "backend_x64/block_of_code.h"
|
#include "backend_x64/block_of_code.h"
|
||||||
#include "backend_x64/emit_x64.h"
|
#include "backend_x64/emit_x64.h"
|
||||||
|
#include "common/fp_util.h"
|
||||||
#include "frontend/ir/basic_block.h"
|
#include "frontend/ir/basic_block.h"
|
||||||
#include "frontend/ir/microinstruction.h"
|
#include "frontend/ir/microinstruction.h"
|
||||||
|
|
||||||
|
@ -14,31 +16,183 @@ namespace Dynarmic::BackendX64 {
|
||||||
using namespace Xbyak::util;
|
using namespace Xbyak::util;
|
||||||
|
|
||||||
template <typename Function>
|
template <typename Function>
|
||||||
static void EmitVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
|
static void EmitVectorOperation32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
|
||||||
|
if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
|
|
||||||
(code.*fn)(xmm_a, xmm_b);
|
(code.*fn)(xmm_a, xmm_b);
|
||||||
|
|
||||||
|
if (ctx.FPSCR_DN()) {
|
||||||
|
Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
|
||||||
|
Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
|
code.pcmpeqw(tmp, tmp);
|
||||||
|
code.movaps(nan_mask, xmm_a);
|
||||||
|
code.cmpordps(nan_mask, nan_mask);
|
||||||
|
code.andps(xmm_a, nan_mask);
|
||||||
|
code.xorps(nan_mask, tmp);
|
||||||
|
code.andps(nan_mask, code.MConst(0x7fc0'0000'7fc0'0000, 0x7fc0'0000'7fc0'0000));
|
||||||
|
code.orps(xmm_a, nan_mask);
|
||||||
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
|
Xbyak::Label end, nan;
|
||||||
|
Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||||
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
|
||||||
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
|
Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
|
code.movaps(nan_mask, xmm_b);
|
||||||
|
code.movaps(result, xmm_a);
|
||||||
|
code.cmpunordps(nan_mask, xmm_a);
|
||||||
|
(code.*fn)(result, xmm_b);
|
||||||
|
code.cmpunordps(nan_mask, result);
|
||||||
|
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
|
||||||
|
code.ptest(nan_mask, nan_mask);
|
||||||
|
} else {
|
||||||
|
Xbyak::Reg32 bitmask = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
code.movmskps(bitmask, nan_mask);
|
||||||
|
code.cmp(bitmask, 0);
|
||||||
|
}
|
||||||
|
code.jz(end);
|
||||||
|
code.jmp(nan, code.T_NEAR);
|
||||||
|
code.L(end);
|
||||||
|
|
||||||
|
code.SwitchToFarCode();
|
||||||
|
code.L(nan);
|
||||||
|
code.sub(rsp, 8);
|
||||||
|
ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
|
||||||
|
const size_t stack_space = 3 * 16;
|
||||||
|
code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
|
||||||
|
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
|
||||||
|
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
|
||||||
|
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
|
||||||
|
code.movaps(xword[code.ABI_PARAM1], result);
|
||||||
|
code.movaps(xword[code.ABI_PARAM2], xmm_a);
|
||||||
|
code.movaps(xword[code.ABI_PARAM3], xmm_b);
|
||||||
|
code.CallFunction(static_cast<void(*)(std::array<u32, 4>&, const std::array<u32, 4>&, const std::array<u32, 4>&)>(
|
||||||
|
[](std::array<u32, 4>& result, const std::array<u32, 4>& a, const std::array<u32, 4>& b) {
|
||||||
|
for (size_t i = 0; i < 4; ++i) {
|
||||||
|
if (auto r = Common::ProcessNaNs(a[i], b[i])) {
|
||||||
|
result[i] = *r;
|
||||||
|
} else if (Common::IsNaN(result[i])) {
|
||||||
|
result[i] = 0x7fc00000;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
));
|
||||||
|
code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
|
||||||
|
code.add(rsp, stack_space + ABI_SHADOW_SPACE);
|
||||||
|
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
|
||||||
|
code.add(rsp, 8);
|
||||||
|
code.jmp(end, code.T_NEAR);
|
||||||
|
code.SwitchToNearCode();
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Function>
|
||||||
|
static void EmitVectorOperation64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
|
||||||
|
if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
|
||||||
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
|
|
||||||
|
(code.*fn)(xmm_a, xmm_b);
|
||||||
|
|
||||||
|
if (ctx.FPSCR_DN()) {
|
||||||
|
Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
|
||||||
|
Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
|
code.pcmpeqw(tmp, tmp);
|
||||||
|
code.movaps(nan_mask, xmm_a);
|
||||||
|
code.cmpordpd(nan_mask, nan_mask);
|
||||||
|
code.andps(xmm_a, nan_mask);
|
||||||
|
code.xorps(nan_mask, tmp);
|
||||||
|
code.andps(nan_mask, code.MConst(0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000));
|
||||||
|
code.orps(xmm_a, nan_mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
|
Xbyak::Label end, nan;
|
||||||
|
Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||||
|
Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
|
||||||
|
Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
|
Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
|
code.movaps(nan_mask, xmm_b);
|
||||||
|
code.movaps(result, xmm_a);
|
||||||
|
code.cmpunordpd(nan_mask, xmm_a);
|
||||||
|
(code.*fn)(result, xmm_b);
|
||||||
|
code.cmpunordpd(nan_mask, result);
|
||||||
|
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
|
||||||
|
code.ptest(nan_mask, nan_mask);
|
||||||
|
} else {
|
||||||
|
Xbyak::Reg32 bitmask = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
code.movmskps(bitmask, nan_mask);
|
||||||
|
code.cmp(bitmask, 0);
|
||||||
|
}
|
||||||
|
code.jz(end);
|
||||||
|
code.jmp(nan, code.T_NEAR);
|
||||||
|
code.L(end);
|
||||||
|
|
||||||
|
code.SwitchToFarCode();
|
||||||
|
code.L(nan);
|
||||||
|
code.sub(rsp, 8);
|
||||||
|
ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
|
||||||
|
const size_t stack_space = 3 * 16;
|
||||||
|
code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
|
||||||
|
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
|
||||||
|
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
|
||||||
|
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
|
||||||
|
code.movaps(xword[code.ABI_PARAM1], result);
|
||||||
|
code.movaps(xword[code.ABI_PARAM2], xmm_a);
|
||||||
|
code.movaps(xword[code.ABI_PARAM3], xmm_b);
|
||||||
|
code.CallFunction(static_cast<void(*)(std::array<u64, 2>&, const std::array<u64, 2>&, const std::array<u64, 2>&)>(
|
||||||
|
[](std::array<u64, 2>& result, const std::array<u64, 2>& a, const std::array<u64, 2>& b) {
|
||||||
|
for (size_t i = 0; i < 4; ++i) {
|
||||||
|
if (auto r = Common::ProcessNaNs(a[i], b[i])) {
|
||||||
|
result[i] = *r;
|
||||||
|
} else if (Common::IsNaN(result[i])) {
|
||||||
|
result[i] = 0x7ff8'0000'0000'0000;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
));
|
||||||
|
code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
|
||||||
|
code.add(rsp, stack_space + ABI_SHADOW_SPACE);
|
||||||
|
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
|
||||||
|
code.add(rsp, 8);
|
||||||
|
code.jmp(end, code.T_NEAR);
|
||||||
|
code.SwitchToNearCode();
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Emits a packed single-precision addition (addps) for `inst`.
void EmitX64::EmitFPVectorAdd32(EmitContext& ctx, IR::Inst* inst) {
    const auto packed_op = &Xbyak::CodeGenerator::addps;
    EmitVectorOperation32(code, ctx, inst, packed_op);
}
|
||||||
|
|
||||||
/// Emits a packed double-precision addition (addpd) for `inst`.
void EmitX64::EmitFPVectorAdd64(EmitContext& ctx, IR::Inst* inst) {
    const auto packed_op = &Xbyak::CodeGenerator::addpd;
    EmitVectorOperation64(code, ctx, inst, packed_op);
}
|
||||||
|
|
||||||
/// Emits a packed single-precision subtraction (subps) for `inst`.
void EmitX64::EmitFPVectorSub32(EmitContext& ctx, IR::Inst* inst) {
    const auto packed_op = &Xbyak::CodeGenerator::subps;
    EmitVectorOperation32(code, ctx, inst, packed_op);
}
|
||||||
|
|
||||||
/// Emits a packed double-precision subtraction (subpd) for `inst`.
void EmitX64::EmitFPVectorSub64(EmitContext& ctx, IR::Inst* inst) {
    const auto packed_op = &Xbyak::CodeGenerator::subpd;
    EmitVectorOperation64(code, ctx, inst, packed_op);
}
|
||||||
|
|
||||||
} // namespace Dynarmic::BackendX64
|
} // namespace Dynarmic::BackendX64
|
||||||
|
|
76
src/common/fp_util.h
Normal file
76
src/common/fp_util.h
Normal file
|
@ -0,0 +1,76 @@
|
||||||
|
/* This file is part of the dynarmic project.
|
||||||
|
* Copyright (c) 2018 MerryMage
|
||||||
|
* This software may be used and distributed according to the terms of the GNU
|
||||||
|
* General Public License version 2 or any later version.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <boost/optional.hpp>

#include "common/common_types.h"
|
||||||
|
|
||||||
|
namespace Dynarmic {
|
||||||
|
namespace Common {
|
||||||
|
|
||||||
|
/// Is 32-bit floating point value a QNaN?
/// True when all exponent bits and the top mantissa bit are set; the sign bit is ignored.
constexpr bool IsQNaN(u32 value) {
    constexpr u32 quiet_nan_bits = 0x7fc00000;
    return (value & quiet_nan_bits) == quiet_nan_bits;
}
|
||||||
|
|
||||||
|
/// Is 32-bit floating point value a SNaN?
/// True when all exponent bits are set, the top mantissa bit is clear and the remaining
/// mantissa bits are not all zero; the sign bit is ignored.
constexpr bool IsSNaN(u32 value) {
    const bool exponent_set_quiet_clear = (value & 0x7fc00000) == 0x7f800000;
    const bool has_payload = (value & 0x007fffff) != 0;
    return exponent_set_quiet_clear && has_payload;
}
|
||||||
|
|
||||||
|
/// Is 32-bit floating point value a NaN?
/// A value is a NaN (quiet or signalling) exactly when all exponent bits are set and the
/// mantissa is non-zero; the sign bit is ignored.
constexpr bool IsNaN(u32 value) {
    return (value & 0x7f800000) == 0x7f800000 && (value & 0x007fffff) != 0;
}
|
||||||
|
|
||||||
|
/// Given a pair of arguments, return the NaN value which would be returned by an ARM processor.
|
||||||
|
/// If neither argument is a NaN, returns boost::none.
|
||||||
|
inline boost::optional<u32> ProcessNaNs(u32 a, u32 b) {
|
||||||
|
if (IsSNaN(a)) {
|
||||||
|
return a | 0x00400000;
|
||||||
|
} else if (IsSNaN(b)) {
|
||||||
|
return b | 0x00400000;
|
||||||
|
} else if (IsQNaN(a)) {
|
||||||
|
return a;
|
||||||
|
} else if (IsQNaN(b)) {
|
||||||
|
return b;
|
||||||
|
}
|
||||||
|
return boost::none;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Is 64-bit floating point value a QNaN?
/// True when all exponent bits and the top mantissa bit are set; the sign bit is ignored.
constexpr bool IsQNaN(u64 value) {
    constexpr u64 quiet_nan_bits = 0x7FF8'0000'0000'0000;
    return (value & quiet_nan_bits) == quiet_nan_bits;
}
|
||||||
|
|
||||||
|
/// Is 64-bit floating point value a SNaN?
/// True when all exponent bits are set, the top mantissa bit is clear and the remaining
/// mantissa bits are not all zero; the sign bit is ignored.
constexpr bool IsSNaN(u64 value) {
    const bool exponent_set_quiet_clear = (value & 0x7FF8'0000'0000'0000) == 0x7FF0'0000'0000'0000;
    const bool has_payload = (value & 0x0007'FFFF'FFFF'FFFF) != 0;
    return exponent_set_quiet_clear && has_payload;
}
|
||||||
|
|
||||||
|
/// Is 64-bit floating point value a NaN?
/// A value is a NaN (quiet or signalling) exactly when all exponent bits are set and the
/// mantissa is non-zero; the sign bit is ignored.
constexpr bool IsNaN(u64 value) {
    return (value & 0x7FF0'0000'0000'0000) == 0x7FF0'0000'0000'0000
        && (value & 0x000F'FFFF'FFFF'FFFF) != 0;
}
|
||||||
|
|
||||||
|
/// Given a pair of arguments, return the NaN value which would be returned by an ARM processor.
|
||||||
|
/// If neither argument is a NaN, returns boost::none.
|
||||||
|
inline boost::optional<u64> ProcessNaNs(u64 a, u64 b) {
|
||||||
|
if (IsSNaN(a)) {
|
||||||
|
return a | 0x0008'0000'0000'0000;
|
||||||
|
} else if (IsSNaN(b)) {
|
||||||
|
return b | 0x0008'0000'0000'0000;
|
||||||
|
} else if (IsQNaN(a)) {
|
||||||
|
return a;
|
||||||
|
} else if (IsQNaN(b)) {
|
||||||
|
return b;
|
||||||
|
}
|
||||||
|
return boost::none;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace Common
|
||||||
|
} // namespace Dynarmic
|
Loading…
Reference in a new issue