emit_x64_vector_floating_point: Correct FMA in FTZ mode
x64 rounds before flushing to zero; AArch64 rounds after flushing to zero. This difference in behaviour is noticeable if something would round to the smallest normalized number.
parent 8ef195db3c
commit bb93353f94
3 changed files with 130 additions and 92 deletions
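Not part of the commit itself: a minimal standalone C++ sketch of the behavioural difference the message describes, assuming an x64 host compiled with FMA support (e.g. -mfma). The exact product 2^-126 * (1 - 2^-24) lies just below the smallest normal; x64 rounds it up to the smallest normal before applying flush-to-zero, while AArch64 with FPCR.FZ set flushes the pre-rounded subnormal to zero.

// Hypothetical demonstration program, not from the dynarmic sources.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <immintrin.h>

int main() {
    // x64 semantics: round first, then flush denormal *results* to zero.
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);

    const float op1 = 0x1p-126f;        // smallest normal (0x00800000)
    const float op2 = 1.0f - 0x1p-24f;  // largest float below 1.0
    const float addend = 0.0f;

    // Exact result: 2^-126 - 2^-150, which rounds up to 2^-126.
    const __m128 r = _mm_fmadd_ss(_mm_set_ss(op1), _mm_set_ss(op2), _mm_set_ss(addend));
    float result;
    _mm_store_ss(&result, r);

    std::uint32_t bits;
    std::memcpy(&bits, &result, sizeof(bits));
    std::printf("x64 FMA result: 0x%08x\n", bits);  // prints 0x00800000 (not flushed)
    // An AArch64 FMLA with FPCR.FZ=1 flushes before rounding and yields +0.0 instead.
}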
@@ -39,10 +39,12 @@ namespace {
 constexpr u64 f32_negative_zero = 0x80000000u;
 constexpr u64 f32_nan = 0x7fc00000u;
 constexpr u64 f32_non_sign_mask = 0x7fffffffu;
+constexpr u64 f32_smallest_normal = 0x00800000u;
 
 constexpr u64 f64_negative_zero = 0x8000000000000000u;
 constexpr u64 f64_nan = 0x7ff8000000000000u;
 constexpr u64 f64_non_sign_mask = 0x7fffffffffffffffu;
+constexpr u64 f64_smallest_normal = 0x0010000000000000u;
 
 constexpr u64 f64_penultimate_positive_denormal = 0x000ffffffffffffeu;
 constexpr u64 f64_min_s32 = 0xc1e0000000000000u; // -2147483648 as a double
@@ -590,14 +592,52 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     using FPT = mp::unsigned_integer_of_size<fsize>;
 
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
-        FPFourOp<fsize>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
-            FCODE(vfmadd231s)(result, operand2, operand3);
-        }, [](FPT a, FPT b, FPT c, FP::FPCR fpcr) -> FPT {
-            if (FP::IsQNaN(a) && ((FP::IsInf(b) && FP::IsZero(c, fpcr)) || (FP::IsZero(b, fpcr) && FP::IsInf(c)))) {
-                return FP::FPInfo<FPT>::DefaultNaN();
-            }
-            return *FP::ProcessNaNs(a, b, c);
-        });
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+        Xbyak::Label end, fallback;
+
+        const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+        const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+        const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
+        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+        code.movaps(result, operand1);
+        FCODE(vfmadd231s)(result, operand2, operand3);
+
+        code.movaps(tmp, code.MConst(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
+        code.andps(tmp, result);
+        FCODE(ucomis)(tmp, code.MConst(xword, fsize == 32 ? f32_smallest_normal : f64_smallest_normal));
+        code.jz(fallback, code.T_NEAR);
+        code.L(end);
+
+        code.SwitchToFarCode();
+        code.L(fallback);
+
+        code.sub(rsp, 8);
+        ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+        code.movq(code.ABI_PARAM1, operand1);
+        code.movq(code.ABI_PARAM2, operand2);
+        code.movq(code.ABI_PARAM3, operand3);
+        code.mov(code.ABI_PARAM4.cvt32(), ctx.FPCR());
+#ifdef _WIN32
+        code.sub(rsp, 16 + ABI_SHADOW_SPACE);
+        code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+        code.mov(qword[rsp + ABI_SHADOW_SPACE], rax);
+        code.CallFunction(&FP::FPMulAdd<FPT>);
+        code.add(rsp, 16 + ABI_SHADOW_SPACE);
+#else
+        code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+        code.CallFunction(&FP::FPMulAdd<FPT>);
+#endif
+        code.movq(result, code.ABI_RETURN);
+        ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+        code.add(rsp, 8);
+
+        code.jmp(end, code.T_NEAR);
+        code.SwitchToNearCode();
+
+        ctx.reg_alloc.DefineValue(inst, result);
         return;
     }
 
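For reference (not from the commit), the scalar check above can be read as the bit-level predicate below; the helper name is hypothetical. ucomiss/ucomisd set ZF both on equality and on an unordered comparison, so the far-code fallback is taken when the magnitude of the FMA result is exactly the smallest normal (it may have been rounded up from a value AArch64 would flush) and also when the result is a NaN.

#include <cstdint>

// Sketch of what the emitted andps + ucomiss sequence tests for fsize == 32.
// (Hypothetical helper, not part of dynarmic.)
bool TakesScalarFallback(std::uint32_t result_bits) {
    constexpr std::uint32_t f32_non_sign_mask = 0x7fffffffu;
    constexpr std::uint32_t f32_smallest_normal = 0x00800000u;
    constexpr std::uint32_t f32_positive_infinity = 0x7f800000u;

    const std::uint32_t magnitude = result_bits & f32_non_sign_mask;  // andps
    const bool is_nan = magnitude > f32_positive_infinity;            // unordered case of ucomiss
    return magnitude == f32_smallest_normal || is_nan;                // ZF set -> jz fallback
}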
@@ -141,6 +141,15 @@ Xbyak::Address GetNegativeZeroVector(BlockOfCode& code) {
     }
 }
 
+template<size_t fsize>
+Xbyak::Address GetSmallestNormalVector(BlockOfCode& code) {
+    if constexpr (fsize == 32) {
+        return code.MConst(xword, 0x0080'0000'0080'0000, 0x0080'0000'0080'0000);
+    } else {
+        return code.MConst(xword, 0x0010'0000'0000'0000, 0x0010'0000'0000'0000);
+    }
+}
+
 template<size_t fsize>
 void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
     if (ctx.FPSCR_DN()) {
@@ -310,52 +319,6 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
     ctx.reg_alloc.DefineValue(inst, result);
 }
 
-template<size_t fsize, template<typename> class Indexer, typename Function>
-void EmitFourOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler<fsize, Indexer, 4>::function_type nan_handler = NaNHandler<fsize, Indexer, 4>::GetDefault()) {
-    static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
-
-    if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
-        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
-        const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
-
-        if constexpr (std::is_member_function_pointer_v<Function>) {
-            (code.*fn)(xmm_a, xmm_b, xmm_c);
-        } else {
-            fn(xmm_a, xmm_b, xmm_c);
-        }
-
-        ForceToDefaultNaN<fsize>(code, ctx, xmm_a);
-
-        ctx.reg_alloc.DefineValue(inst, xmm_a);
-        return;
-    }
-
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
-    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
-    const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
-    const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
-    const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
-
-    code.movaps(nan_mask, xmm_b);
-    code.movaps(result, xmm_a);
-    FCODE(cmpunordp)(nan_mask, xmm_a);
-    FCODE(cmpunordp)(nan_mask, xmm_c);
-    if constexpr (std::is_member_function_pointer_v<Function>) {
-        (code.*fn)(result, xmm_b, xmm_c);
-    } else {
-        fn(result, xmm_b, xmm_c);
-    }
-    FCODE(cmpunordp)(nan_mask, result);
-
-    HandleNaNs<fsize, 3>(code, ctx, {result, xmm_a, xmm_b, xmm_c}, nan_mask, nan_handler);
-
-    ctx.reg_alloc.DefineValue(inst, result);
-}
-
 template<typename Lambda>
 void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
     const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
@@ -426,16 +389,9 @@ void EmitThreeOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, La
 }
 
 template<typename Lambda>
-void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+void EmitFourOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, Xbyak::Xmm arg2, Xbyak::Xmm arg3, Lambda lambda) {
     const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
 
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
-    const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]);
-    const Xbyak::Xmm arg3 = ctx.reg_alloc.UseXmm(args[2]);
-    ctx.reg_alloc.EndOfAllocScope();
-    ctx.reg_alloc.HostCall(nullptr);
-
 #ifdef _WIN32
     constexpr u32 stack_space = 5 * 16;
     code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
@@ -463,12 +419,24 @@ void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lam
     code.CallFunction(fn);
 
 #ifdef _WIN32
-    code.movaps(xmm0, xword[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+    code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 1 * 16]);
 #else
-    code.movaps(xmm0, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+    code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
 #endif
 
     code.add(rsp, stack_space + ABI_SHADOW_SPACE);
+}
+
+template<typename Lambda>
+void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm arg3 = ctx.reg_alloc.UseXmm(args[2]);
+    ctx.reg_alloc.EndOfAllocScope();
+    ctx.reg_alloc.HostCall(nullptr);
+
+    EmitFourOpFallbackWithoutRegAlloc(code, ctx, xmm0, arg1, arg2, arg3, lambda);
 
     ctx.reg_alloc.DefineValue(inst, xmm0);
 }
@@ -770,37 +738,48 @@ template<size_t fsize>
 void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     using FPT = mp::unsigned_integer_of_size<fsize>;
 
-    if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
-        const auto x64_instruction = fsize == 32 ? &Xbyak::CodeGenerator::vfmadd231ps : &Xbyak::CodeGenerator::vfmadd231pd;
-        EmitFourOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, x64_instruction,
-            static_cast<void(*)(std::array<VectorArray<FPT>, 4>& values, FP::FPCR fpcr)>(
-                [](std::array<VectorArray<FPT>, 4>& values, FP::FPCR fpcr) {
-                    VectorArray<FPT>& result = values[0];
-                    const VectorArray<FPT>& a = values[1];
-                    const VectorArray<FPT>& b = values[2];
-                    const VectorArray<FPT>& c = values[3];
-                    for (size_t i = 0; i < result.size(); i++) {
-                        if (FP::IsQNaN(a[i]) && ((FP::IsInf(b[i]) && FP::IsZero(c[i], fpcr)) || (FP::IsZero(b[i], fpcr) && FP::IsInf(c[i])))) {
-                            result[i] = FP::FPInfo<FPT>::DefaultNaN();
-                        } else if (auto r = FP::ProcessNaNs(a[i], b[i], c[i])) {
-                            result[i] = *r;
-                        } else if (FP::IsNaN(result[i])) {
-                            result[i] = FP::FPInfo<FPT>::DefaultNaN();
-                        }
-                    }
-                }
-            )
-        );
-        return;
-    }
-
-    EmitFourOpFallback(code, ctx, inst,
-        [](VectorArray<FPT>& result, const VectorArray<FPT>& addend, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) {
+    const auto fallback_fn = [](VectorArray<FPT>& result, const VectorArray<FPT>& addend, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) {
         for (size_t i = 0; i < result.size(); i++) {
             result[i] = FP::FPMulAdd<FPT>(addend[i], op1[i], op2[i], fpcr, fpsr);
         }
-        }
-    );
+    };
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
+        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+        const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
+        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+        Xbyak::Label end, fallback;
+
+        code.movaps(result, xmm_a);
+        FCODE(vfmadd231p)(result, xmm_b, xmm_c);
+
+        code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
+        code.andnps(tmp, result);
+        FCODE(vcmpeq_uqp)(tmp, tmp, GetSmallestNormalVector<fsize>(code));
+        code.vptest(tmp, tmp);
+        code.jnz(fallback, code.T_NEAR);
+        code.L(end);
+
+        code.SwitchToFarCode();
+        code.L(fallback);
+        code.sub(rsp, 8);
+        ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+        EmitFourOpFallbackWithoutRegAlloc(code, ctx, result, xmm_a, xmm_b, xmm_c, fallback_fn);
+        ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+        code.add(rsp, 8);
+        code.jmp(end, code.T_NEAR);
+        code.SwitchToNearCode();
+
+        ctx.reg_alloc.DefineValue(inst, result);
+        return;
+    }
+
+    EmitFourOpFallback(code, ctx, inst, fallback_fn);
 }
 
 void EmitX64::EmitFPVectorMulAdd32(EmitContext& ctx, IR::Inst* inst) {
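The vector path makes the same decision per lane before falling back for the whole vector. As a rough reference (hypothetical helper, not from the commit), the andnps / vcmpeq_uqps / vptest sequence above amounts to the following for fsize == 32:

#include <array>
#include <cstdint>

// Per-lane view of the emitted test: clear the sign bit (andnps with the
// negative-zero vector), compare "equal or unordered" against the smallest
// normal (vcmpeq_uqps), then vptest checks whether any lane matched. A single
// matching lane sends the whole vector to the soft-float fallback.
bool AnyLaneTakesFallback(const std::array<std::uint32_t, 4>& lanes) {
    constexpr std::uint32_t f32_sign_bit = 0x80000000u;
    constexpr std::uint32_t f32_smallest_normal = 0x00800000u;
    constexpr std::uint32_t f32_positive_infinity = 0x7f800000u;

    for (const std::uint32_t lane : lanes) {
        const std::uint32_t magnitude = lane & ~f32_sign_bit;
        const bool is_nan = magnitude > f32_positive_infinity;
        if (magnitude == f32_smallest_normal || is_nan) {
            return true;
        }
    }
    return false;
}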
@@ -409,3 +409,22 @@ TEST_CASE("A64: FMLA.4S (denormal)", "[a64]") {
 
     REQUIRE(jit.GetVector(12) == Vector{0x7ff800007fc00000, 0xbff0000068e8e581});
 }
+
+TEST_CASE("A64: FMLA.4S (0x80800000)", "[a64]") {
+    TestEnv env;
+    Dynarmic::A64::Jit jit{Dynarmic::A64::UserConfig{&env}};
+
+    env.code_mem[0] = 0x4e38cc2b; // FMLA.4S V11, V1, V24
+    env.code_mem[1] = 0x14000000; // B .
+
+    jit.SetPC(0);
+    jit.SetVector(11, {0xc79b271efff05678, 0xffc0000080800000});
+    jit.SetVector(1, {0x00636d2400800000, 0x0966320bb26bddee});
+    jit.SetVector(24, {0x460e8c84fff00000, 0x8ba98d2780800002});
+    jit.SetFpcr(0x03000000);
+
+    env.ticks_left = 2;
+    jit.Run();
+
+    REQUIRE(jit.GetVector(11) == Vector{0xc79b271e7fc00000, 0x7fc0000080000000});
+}