From 8c90fcf58e90199eb5de7e480b2b02cec1e9032c Mon Sep 17 00:00:00 2001
From: MerryMage
Date: Wed, 6 Jun 2018 20:03:12 +0100
Subject: [PATCH] IR: Implement FPMulAdd

---
 src/CMakeLists.txt                          |   1 +
 src/backend_x64/emit_x64_floating_point.cpp | 151 ++++++++++++++++++++
 src/common/fp_util.h                        |  38 +++++
 src/frontend/ir/ir_emitter.cpp              |  10 ++
 src/frontend/ir/ir_emitter.h                |   1 +
 src/frontend/ir/opcodes.inc                 |   2 +
 tests/A64/fuzz_with_unicorn.cpp             |   5 +-
 7 files changed, 207 insertions(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9876a6f0..88cce036 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -89,6 +89,7 @@ add_library(dynarmic
     frontend/A64/translate/impl/floating_point_conditional_compare.cpp
     frontend/A64/translate/impl/floating_point_conditional_select.cpp
     frontend/A64/translate/impl/floating_point_data_processing_one_register.cpp
+    frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp
     frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp
     frontend/A64/translate/impl/impl.cpp
     frontend/A64/translate/impl/impl.h
diff --git a/src/backend_x64/emit_x64_floating_point.cpp b/src/backend_x64/emit_x64_floating_point.cpp
index ccfcf4d7..7b074fa0 100644
--- a/src/backend_x64/emit_x64_floating_point.cpp
+++ b/src/backend_x64/emit_x64_floating_point.cpp
@@ -130,6 +130,35 @@ static void PreProcessNaNs32(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbya
     code.SwitchToNearCode();
 }
 
+static void PreProcessNaNs32(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end) {
+    Xbyak::Label nan;
+
+    code.ucomiss(a, b);
+    code.jp(nan, code.T_NEAR);
+    code.ucomiss(c, c);
+    code.jp(nan, code.T_NEAR);
+    code.SwitchToFarCode();
+    code.L(nan);
+
+    code.sub(rsp, 8);
+    ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
+    code.xor_(code.ABI_PARAM1.cvt32(), code.ABI_PARAM1.cvt32());
+    code.xor_(code.ABI_PARAM2.cvt32(), code.ABI_PARAM2.cvt32());
+    code.xor_(code.ABI_PARAM3.cvt32(), code.ABI_PARAM3.cvt32());
+    code.movd(code.ABI_PARAM1.cvt32(), a);
+    code.movd(code.ABI_PARAM2.cvt32(), b);
+    code.movd(code.ABI_PARAM3.cvt32(), c);
+    code.CallFunction(static_cast<u32 (*)(u32, u32, u32)>([](u32 a, u32 b, u32 c) -> u32 {
+        return *Common::ProcessNaNs(a, b, c);
+    }));
+    code.movd(a, code.ABI_RETURN.cvt32());
+    ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
+    code.add(rsp, 8);
+
+    code.jmp(end, code.T_NEAR);
+    code.SwitchToNearCode();
+}
+
 static void PostProcessNaNs32(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) {
     code.movaps(tmp, result);
     code.cmpunordps(tmp, tmp);
@@ -168,6 +197,32 @@ static void PreProcessNaNs64(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbya
     code.SwitchToNearCode();
 }
 
+static void PreProcessNaNs64(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end) {
+    Xbyak::Label nan;
+
+    code.ucomisd(a, b);
+    code.jp(nan, code.T_NEAR);
+    code.ucomisd(c, c);
+    code.jp(nan, code.T_NEAR);
+    code.SwitchToFarCode();
+    code.L(nan);
+
+    code.sub(rsp, 8);
+    ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
+    code.movq(code.ABI_PARAM1, a);
+    code.movq(code.ABI_PARAM2, b);
+    code.movq(code.ABI_PARAM3, c);
+    code.CallFunction(static_cast<u64 (*)(u64, u64, u64)>([](u64 a, u64 b, u64 c) -> u64 {
+        return *Common::ProcessNaNs(a, b, c);
+    }));
+    code.movq(a, code.ABI_RETURN);
+    ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
+    code.add(rsp, 8);
+
+    code.jmp(end, code.T_NEAR);
+    code.SwitchToNearCode();
+}
+
 static void PostProcessNaNs64(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) {
     code.movaps(tmp, result);
     code.cmpunordpd(tmp, tmp);
@@ -365,6 +420,72 @@ static void FPTwoOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Funct
     ctx.reg_alloc.DefineValue(inst, result);
 }
 
+template <typename Function>
+static void FPFourOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Label end;
+
+    Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
+    Xbyak::Xmm operand3 = ctx.reg_alloc.UseScratchXmm(args[2]);
+    Xbyak::Reg32 gpr_scratch = ctx.reg_alloc.ScratchGpr().cvt32();
+
+    if (ctx.FPSCR_FTZ()) {
+        DenormalsAreZero32(code, result, gpr_scratch);
+        DenormalsAreZero32(code, operand2, gpr_scratch);
+        DenormalsAreZero32(code, operand3, gpr_scratch);
+    }
+    if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
+        PreProcessNaNs32(code, result, operand2, operand3, end);
+    }
+    fn(result, operand2, operand3);
+    if (ctx.FPSCR_FTZ()) {
+        FlushToZero32(code, result, gpr_scratch);
+    }
+    if (ctx.FPSCR_DN()) {
+        DefaultNaN32(code, result);
+    } else if (ctx.AccurateNaN()) {
+        PostProcessNaNs32(code, result, operand2);
+    }
+    code.L(end);
+
+    ctx.reg_alloc.DefineValue(inst, result);
+}
+
+template <typename Function>
+static void FPFourOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Label end;
+
+    Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
+    Xbyak::Xmm operand3 = ctx.reg_alloc.UseScratchXmm(args[2]);
+    Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr();
+
+    if (ctx.FPSCR_FTZ()) {
+        DenormalsAreZero64(code, result, gpr_scratch);
+        DenormalsAreZero64(code, operand2, gpr_scratch);
+        DenormalsAreZero64(code, operand3, gpr_scratch);
+    }
+    if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
+        PreProcessNaNs64(code, result, operand2, operand3, end);
+    }
+    fn(result, operand2, operand3);
+    if (ctx.FPSCR_FTZ()) {
+        FlushToZero64(code, result, gpr_scratch);
+    }
+    if (ctx.FPSCR_DN()) {
+        DefaultNaN64(code, result);
+    } else if (ctx.AccurateNaN()) {
+        PostProcessNaNs64(code, result, operand2);
+    }
+    code.L(end);
+
+    ctx.reg_alloc.DefineValue(inst, result);
+}
+
 void EmitX64::EmitFPAbs32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
@@ -628,6 +749,36 @@ void EmitX64::EmitFPMul64(EmitContext& ctx, IR::Inst* inst) {
     FPThreeOp64(code, ctx, inst, &Xbyak::CodeGenerator::mulsd);
 }
 
+void EmitX64::EmitFPMulAdd32(EmitContext& ctx, IR::Inst* inst) {
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
+        FPFourOp32(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
+            code.vfmadd231ss(result, operand2, operand3);
+        });
+        return;
+    }
+
+    // TODO: Improve accuracy.
+    FPFourOp32(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
+        code.mulss(operand2, operand3);
+        code.addss(result, operand2);
+    });
+}
+
+void EmitX64::EmitFPMulAdd64(EmitContext& ctx, IR::Inst* inst) {
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
+        FPFourOp64(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
+            code.vfmadd231sd(result, operand2, operand3);
+        });
+        return;
+    }
+
+    // TODO: Improve accuracy.
+    FPFourOp64(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
+        code.mulsd(operand2, operand3);
+        code.addsd(result, operand2);
+    });
+}
+
 void EmitX64::EmitFPSqrt32(EmitContext& ctx, IR::Inst* inst) {
     FPTwoOp32(code, ctx, inst, &Xbyak::CodeGenerator::sqrtss);
 }
diff --git a/src/common/fp_util.h b/src/common/fp_util.h
index 9f65e5f9..9469a223 100644
--- a/src/common/fp_util.h
+++ b/src/common/fp_util.h
@@ -41,6 +41,25 @@ inline boost::optional<u32> ProcessNaNs(u32 a, u32 b) {
     return boost::none;
 }
 
+/// Given three arguments, return the NaN value which would be returned by an ARM processor.
+/// If none of the arguments is a NaN, returns boost::none.
+inline boost::optional<u32> ProcessNaNs(u32 a, u32 b, u32 c) {
+    if (IsSNaN(a)) {
+        return a | 0x00400000;
+    } else if (IsSNaN(b)) {
+        return b | 0x00400000;
+    } else if (IsSNaN(c)) {
+        return c | 0x00400000;
+    } else if (IsQNaN(a)) {
+        return a;
+    } else if (IsQNaN(b)) {
+        return b;
+    } else if (IsQNaN(c)) {
+        return c;
+    }
+    return boost::none;
+}
+
 /// Is 64-bit floating point value a QNaN?
 constexpr bool IsQNaN(u64 value) {
     return (value & 0x7FF8'0000'0000'0000) == 0x7FF8'0000'0000'0000;
@@ -72,5 +91,24 @@ inline boost::optional<u64> ProcessNaNs(u64 a, u64 b) {
     return boost::none;
 }
 
+/// Given three arguments, return the NaN value which would be returned by an ARM processor.
+/// If none of the arguments is a NaN, returns boost::none.
+inline boost::optional<u64> ProcessNaNs(u64 a, u64 b, u64 c) {
+    if (IsSNaN(a)) {
+        return a | 0x0008'0000'0000'0000;
+    } else if (IsSNaN(b)) {
+        return b | 0x0008'0000'0000'0000;
+    } else if (IsSNaN(c)) {
+        return c | 0x0008'0000'0000'0000;
+    } else if (IsQNaN(a)) {
+        return a;
+    } else if (IsQNaN(b)) {
+        return b;
+    } else if (IsQNaN(c)) {
+        return c;
+    }
+    return boost::none;
+}
+
 } // namespace Common
 } // namespace Dynarmic
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 374ea56f..a6047737 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1405,6 +1405,16 @@ U32U64 IREmitter::FPMul(const U32U64& a, const U32U64& b, bool fpscr_controlled)
     }
 }
 
+U32U64 IREmitter::FPMulAdd(const U32U64& a, const U32U64& b, const U32U64& c, bool fpscr_controlled) {
+    ASSERT(fpscr_controlled);
+    ASSERT(a.GetType() == b.GetType());
+    if (a.GetType() == Type::U32) {
+        return Inst<U32>(Opcode::FPMulAdd32, a, b, c);
+    } else {
+        return Inst<U64>(Opcode::FPMulAdd64, a, b, c);
+    }
+}
+
 U32U64 IREmitter::FPNeg(const U32U64& a) {
     if (a.GetType() == Type::U32) {
         return Inst<U32>(Opcode::FPNeg32, a);
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index f5386116..43e5895e 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -258,6 +258,7 @@ public:
     U32U64 FPMin(const U32U64& a, const U32U64& b, bool fpscr_controlled);
     U32U64 FPMinNumeric(const U32U64& a, const U32U64& b, bool fpscr_controlled);
     U32U64 FPMul(const U32U64& a, const U32U64& b, bool fpscr_controlled);
+    U32U64 FPMulAdd(const U32U64& a, const U32U64& b, const U32U64& c, bool fpscr_controlled);
     U32U64 FPNeg(const U32U64& a);
     U32U64 FPSqrt(const U32U64& a);
     U32U64 FPSub(const U32U64& a, const U32U64& b, bool fpscr_controlled);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index af0184f7..72ba3a16 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -374,6 +374,8 @@ OPCODE(FPMinNumeric32, T::U32, T::U32, T::U
 OPCODE(FPMinNumeric64, T::U64, T::U64, T::U64 )
 OPCODE(FPMul32, T::U32, T::U32, T::U32 )
 OPCODE(FPMul64, T::U64, T::U64, T::U64 )
+OPCODE(FPMulAdd32, T::U32, T::U32, T::U32, T::U32 )
+OPCODE(FPMulAdd64, T::U64, T::U64, T::U64, T::U64 )
 OPCODE(FPNeg32, T::U32, T::U32 )
 OPCODE(FPNeg64, T::U64, T::U64 )
 OPCODE(FPSqrt32, T::U32, T::U32 )
diff --git a/tests/A64/fuzz_with_unicorn.cpp b/tests/A64/fuzz_with_unicorn.cpp
index afe2e030..1d2a760a 100644
--- a/tests/A64/fuzz_with_unicorn.cpp
+++ b/tests/A64/fuzz_with_unicorn.cpp
@@ -77,6 +77,8 @@ static u32 GenRandomInst(u64 pc, bool is_last_inst) {
             "LDLAR",
             // Dynarmic and QEMU currently differ on how the exclusive monitor's address range works.
             "STXR", "STLXR", "STXP", "STLXP", "LDXR", "LDAXR", "LDXP", "LDAXP",
+            // Approximation. Produces inaccurate results.
+            "FMADD_float", "FMSUB_float", "FNMADD_float", "FNMSUB_float",
         };
 
         for (const auto& [fn, bitstring] : list) {
@@ -89,7 +91,6 @@ static u32 GenRandomInst(u64 pc, bool is_last_inst) {
             }
             result.emplace_back(InstructionGenerator{bitstring});
         }
-
         return result;
     }();
 
@@ -115,6 +116,8 @@ static u32 GenFloatInst(u64 pc, bool is_last_inst) {
         const std::vector<std::string> do_not_test {
             // QEMU's implementation of FCVT is incorrect
             "FCVT_float",
+            // Approximation. Produces incorrect results.
+            "FMADD_float", "FMSUB_float", "FNMADD_float", "FNMSUB_float",
         };
 
         std::vector<InstructionGenerator> result;
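
Note on the non-FMA fallback paths marked "// TODO: Improve accuracy." above: mulss/addss (and mulsd/addsd) round the product before the addition, whereas the vfmadd path and a real ARM FMADD round only once, which is why FMADD_float and friends are excluded from the Unicorn fuzz lists. The following standalone C++ snippet is not part of the patch; it is a minimal sketch, with operand values chosen purely for illustration, showing how the two strategies can produce different results:

    #include <cmath>
    #include <cstdio>

    int main() {
        // Exact product is 1 - 2^-46, just below 1.0f.
        const float a = 1.0f + 0x1.0p-23f; // one ulp above 1
        const float b = 1.0f - 0x1.0p-23f; // two ulps below 1
        const float c = -1.0f;

        // mulss+addss style: the product rounds to 1.0f first, so the sum is 0.0f.
        const float product = a * b;
        const float separate = product + c;

        // Fused style (vfmadd231ss / ARM FMADD): the unrounded product is used,
        // so the sum is -2^-46.
        const float fused = std::fmaf(a, b, c);

        std::printf("separate = %a\nfused    = %a\n", separate, fused);
    }

The difference is confined to the least significant bits of the result, which is why the patch keeps the two-instruction sequence as an approximation when the host CPU lacks FMA support.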