IR: Implement FPMulAdd
This commit is contained in:
parent
24e3299276
commit
8c90fcf58e
7 changed files with 207 additions and 1 deletions
|
@ -89,6 +89,7 @@ add_library(dynarmic
|
||||||
frontend/A64/translate/impl/floating_point_conditional_compare.cpp
|
frontend/A64/translate/impl/floating_point_conditional_compare.cpp
|
||||||
frontend/A64/translate/impl/floating_point_conditional_select.cpp
|
frontend/A64/translate/impl/floating_point_conditional_select.cpp
|
||||||
frontend/A64/translate/impl/floating_point_data_processing_one_register.cpp
|
frontend/A64/translate/impl/floating_point_data_processing_one_register.cpp
|
||||||
|
frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp
|
||||||
frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp
|
frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp
|
||||||
frontend/A64/translate/impl/impl.cpp
|
frontend/A64/translate/impl/impl.cpp
|
||||||
frontend/A64/translate/impl/impl.h
|
frontend/A64/translate/impl/impl.h
|
||||||
|
|
|
@ -130,6 +130,35 @@ static void PreProcessNaNs32(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbya
|
||||||
code.SwitchToNearCode();
|
code.SwitchToNearCode();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void PreProcessNaNs32(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end) {
|
||||||
|
Xbyak::Label nan;
|
||||||
|
|
||||||
|
code.ucomiss(a, b);
|
||||||
|
code.jp(nan, code.T_NEAR);
|
||||||
|
code.ucomiss(c, c);
|
||||||
|
code.jp(nan, code.T_NEAR);
|
||||||
|
code.SwitchToFarCode();
|
||||||
|
code.L(nan);
|
||||||
|
|
||||||
|
code.sub(rsp, 8);
|
||||||
|
ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
|
||||||
|
code.xor_(code.ABI_PARAM1.cvt32(), code.ABI_PARAM1.cvt32());
|
||||||
|
code.xor_(code.ABI_PARAM2.cvt32(), code.ABI_PARAM2.cvt32());
|
||||||
|
code.xor_(code.ABI_PARAM3.cvt32(), code.ABI_PARAM3.cvt32());
|
||||||
|
code.movd(code.ABI_PARAM1.cvt32(), a);
|
||||||
|
code.movd(code.ABI_PARAM2.cvt32(), b);
|
||||||
|
code.movd(code.ABI_PARAM3.cvt32(), c);
|
||||||
|
code.CallFunction(static_cast<u32(*)(u32, u32, u32)>([](u32 a, u32 b, u32 c) -> u32 {
|
||||||
|
return *Common::ProcessNaNs(a, b, c);
|
||||||
|
}));
|
||||||
|
code.movd(a, code.ABI_RETURN.cvt32());
|
||||||
|
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
|
||||||
|
code.add(rsp, 8);
|
||||||
|
|
||||||
|
code.jmp(end, code.T_NEAR);
|
||||||
|
code.SwitchToNearCode();
|
||||||
|
}
|
||||||
|
|
||||||
static void PostProcessNaNs32(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) {
|
static void PostProcessNaNs32(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) {
|
||||||
code.movaps(tmp, result);
|
code.movaps(tmp, result);
|
||||||
code.cmpunordps(tmp, tmp);
|
code.cmpunordps(tmp, tmp);
|
||||||
|
@ -168,6 +197,32 @@ static void PreProcessNaNs64(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbya
|
||||||
code.SwitchToNearCode();
|
code.SwitchToNearCode();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void PreProcessNaNs64(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end) {
|
||||||
|
Xbyak::Label nan;
|
||||||
|
|
||||||
|
code.ucomisd(a, b);
|
||||||
|
code.jp(nan, code.T_NEAR);
|
||||||
|
code.ucomisd(c, c);
|
||||||
|
code.jp(nan, code.T_NEAR);
|
||||||
|
code.SwitchToFarCode();
|
||||||
|
code.L(nan);
|
||||||
|
|
||||||
|
code.sub(rsp, 8);
|
||||||
|
ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
|
||||||
|
code.movq(code.ABI_PARAM1, a);
|
||||||
|
code.movq(code.ABI_PARAM2, b);
|
||||||
|
code.movq(code.ABI_PARAM3, c);
|
||||||
|
code.CallFunction(static_cast<u64(*)(u64, u64, u64)>([](u64 a, u64 b, u64 c) -> u64 {
|
||||||
|
return *Common::ProcessNaNs(a, b, c);
|
||||||
|
}));
|
||||||
|
code.movq(a, code.ABI_RETURN);
|
||||||
|
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx()));
|
||||||
|
code.add(rsp, 8);
|
||||||
|
|
||||||
|
code.jmp(end, code.T_NEAR);
|
||||||
|
code.SwitchToNearCode();
|
||||||
|
}
|
||||||
|
|
||||||
static void PostProcessNaNs64(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) {
|
static void PostProcessNaNs64(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) {
|
||||||
code.movaps(tmp, result);
|
code.movaps(tmp, result);
|
||||||
code.cmpunordpd(tmp, tmp);
|
code.cmpunordpd(tmp, tmp);
|
||||||
|
@ -365,6 +420,72 @@ static void FPTwoOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Funct
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename Function>
|
||||||
|
static void FPFourOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
|
||||||
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
|
Xbyak::Label end;
|
||||||
|
|
||||||
|
Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
|
Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||||
|
Xbyak::Xmm operand3 = ctx.reg_alloc.UseScratchXmm(args[2]);
|
||||||
|
Xbyak::Reg32 gpr_scratch = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
|
||||||
|
if (ctx.FPSCR_FTZ()) {
|
||||||
|
DenormalsAreZero32(code, result, gpr_scratch);
|
||||||
|
DenormalsAreZero32(code, operand2, gpr_scratch);
|
||||||
|
DenormalsAreZero32(code, operand3, gpr_scratch);
|
||||||
|
}
|
||||||
|
if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
|
||||||
|
PreProcessNaNs32(code, result, operand2, operand3, end);
|
||||||
|
}
|
||||||
|
fn(result, operand2, operand3);
|
||||||
|
if (ctx.FPSCR_FTZ()) {
|
||||||
|
FlushToZero32(code, result, gpr_scratch);
|
||||||
|
}
|
||||||
|
if (ctx.FPSCR_DN()) {
|
||||||
|
DefaultNaN32(code, result);
|
||||||
|
} else if (ctx.AccurateNaN()) {
|
||||||
|
PostProcessNaNs32(code, result, operand2);
|
||||||
|
}
|
||||||
|
code.L(end);
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Function>
|
||||||
|
static void FPFourOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
|
||||||
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
|
Xbyak::Label end;
|
||||||
|
|
||||||
|
Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
|
Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||||
|
Xbyak::Xmm operand3 = ctx.reg_alloc.UseScratchXmm(args[2]);
|
||||||
|
Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr();
|
||||||
|
|
||||||
|
if (ctx.FPSCR_FTZ()) {
|
||||||
|
DenormalsAreZero64(code, result, gpr_scratch);
|
||||||
|
DenormalsAreZero64(code, operand2, gpr_scratch);
|
||||||
|
DenormalsAreZero64(code, operand3, gpr_scratch);
|
||||||
|
}
|
||||||
|
if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) {
|
||||||
|
PreProcessNaNs64(code, result, operand2, operand3, end);
|
||||||
|
}
|
||||||
|
fn(result, operand2, operand3);
|
||||||
|
if (ctx.FPSCR_FTZ()) {
|
||||||
|
FlushToZero64(code, result, gpr_scratch);
|
||||||
|
}
|
||||||
|
if (ctx.FPSCR_DN()) {
|
||||||
|
DefaultNaN64(code, result);
|
||||||
|
} else if (ctx.AccurateNaN()) {
|
||||||
|
PostProcessNaNs64(code, result, operand2);
|
||||||
|
}
|
||||||
|
code.L(end);
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
}
|
||||||
|
|
||||||
void EmitX64::EmitFPAbs32(EmitContext& ctx, IR::Inst* inst) {
|
void EmitX64::EmitFPAbs32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
|
@ -628,6 +749,36 @@ void EmitX64::EmitFPMul64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
FPThreeOp64(code, ctx, inst, &Xbyak::CodeGenerator::mulsd);
|
FPThreeOp64(code, ctx, inst, &Xbyak::CodeGenerator::mulsd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void EmitX64::EmitFPMulAdd32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
|
||||||
|
FPFourOp32(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
|
||||||
|
code.vfmadd231ss(result, operand2, operand3);
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Improve accuracy.
|
||||||
|
FPFourOp32(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
|
||||||
|
code.mulss(operand2, operand3);
|
||||||
|
code.addss(result, operand2);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void EmitX64::EmitFPMulAdd64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) {
|
||||||
|
FPFourOp64(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
|
||||||
|
code.vfmadd231sd(result, operand2, operand3);
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Improve accuracy.
|
||||||
|
FPFourOp64(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand2, Xbyak::Xmm operand3) {
|
||||||
|
code.mulsd(operand2, operand3);
|
||||||
|
code.addsd(result, operand2);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
void EmitX64::EmitFPSqrt32(EmitContext& ctx, IR::Inst* inst) {
|
void EmitX64::EmitFPSqrt32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
FPTwoOp32(code, ctx, inst, &Xbyak::CodeGenerator::sqrtss);
|
FPTwoOp32(code, ctx, inst, &Xbyak::CodeGenerator::sqrtss);
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,6 +41,25 @@ inline boost::optional<u32> ProcessNaNs(u32 a, u32 b) {
|
||||||
return boost::none;
|
return boost::none;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Given three arguments, return the NaN value which would be returned by an ARM processor.
|
||||||
|
/// If none of the arguments is a NaN, returns boost::none.
|
||||||
|
inline boost::optional<u32> ProcessNaNs(u32 a, u32 b, u32 c) {
|
||||||
|
if (IsSNaN(a)) {
|
||||||
|
return a | 0x00400000;
|
||||||
|
} else if (IsSNaN(b)) {
|
||||||
|
return b | 0x00400000;
|
||||||
|
} else if (IsSNaN(c)) {
|
||||||
|
return c | 0x00400000;
|
||||||
|
} else if (IsQNaN(a)) {
|
||||||
|
return a;
|
||||||
|
} else if (IsQNaN(b)) {
|
||||||
|
return b;
|
||||||
|
} else if (IsQNaN(c)) {
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
return boost::none;
|
||||||
|
}
|
||||||
|
|
||||||
/// Is 64-bit floating point value a QNaN?
|
/// Is 64-bit floating point value a QNaN?
|
||||||
constexpr bool IsQNaN(u64 value) {
|
constexpr bool IsQNaN(u64 value) {
|
||||||
return (value & 0x7FF8'0000'0000'0000) == 0x7FF8'0000'0000'0000;
|
return (value & 0x7FF8'0000'0000'0000) == 0x7FF8'0000'0000'0000;
|
||||||
|
@ -72,5 +91,24 @@ inline boost::optional<u64> ProcessNaNs(u64 a, u64 b) {
|
||||||
return boost::none;
|
return boost::none;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Given three arguments, return the NaN value which would be returned by an ARM processor.
|
||||||
|
/// If none of the arguments is a NaN, returns boost::none.
|
||||||
|
inline boost::optional<u64> ProcessNaNs(u64 a, u64 b, u64 c) {
|
||||||
|
if (IsSNaN(a)) {
|
||||||
|
return a | 0x0008'0000'0000'0000;
|
||||||
|
} else if (IsSNaN(b)) {
|
||||||
|
return b | 0x0008'0000'0000'0000;
|
||||||
|
} else if (IsSNaN(c)) {
|
||||||
|
return c | 0x0008'0000'0000'0000;
|
||||||
|
} else if (IsQNaN(a)) {
|
||||||
|
return a;
|
||||||
|
} else if (IsQNaN(b)) {
|
||||||
|
return b;
|
||||||
|
} else if (IsQNaN(c)) {
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
return boost::none;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace Common
|
} // namespace Common
|
||||||
} // namespace Dynarmic
|
} // namespace Dynarmic
|
||||||
|
|
|
@ -1405,6 +1405,16 @@ U32U64 IREmitter::FPMul(const U32U64& a, const U32U64& b, bool fpscr_controlled)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
U32U64 IREmitter::FPMulAdd(const U32U64& a, const U32U64& b, const U32U64& c, bool fpscr_controlled) {
|
||||||
|
ASSERT(fpscr_controlled);
|
||||||
|
ASSERT(a.GetType() == b.GetType());
|
||||||
|
if (a.GetType() == Type::U32) {
|
||||||
|
return Inst<U32>(Opcode::FPMulAdd32, a, b, c);
|
||||||
|
} else {
|
||||||
|
return Inst<U64>(Opcode::FPMulAdd64, a, b, c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
U32U64 IREmitter::FPNeg(const U32U64& a) {
|
U32U64 IREmitter::FPNeg(const U32U64& a) {
|
||||||
if (a.GetType() == Type::U32) {
|
if (a.GetType() == Type::U32) {
|
||||||
return Inst<U32>(Opcode::FPNeg32, a);
|
return Inst<U32>(Opcode::FPNeg32, a);
|
||||||
|
|
|
@ -258,6 +258,7 @@ public:
|
||||||
U32U64 FPMin(const U32U64& a, const U32U64& b, bool fpscr_controlled);
|
U32U64 FPMin(const U32U64& a, const U32U64& b, bool fpscr_controlled);
|
||||||
U32U64 FPMinNumeric(const U32U64& a, const U32U64& b, bool fpscr_controlled);
|
U32U64 FPMinNumeric(const U32U64& a, const U32U64& b, bool fpscr_controlled);
|
||||||
U32U64 FPMul(const U32U64& a, const U32U64& b, bool fpscr_controlled);
|
U32U64 FPMul(const U32U64& a, const U32U64& b, bool fpscr_controlled);
|
||||||
|
U32U64 FPMulAdd(const U32U64& a, const U32U64& b, const U32U64& c, bool fpscr_controlled);
|
||||||
U32U64 FPNeg(const U32U64& a);
|
U32U64 FPNeg(const U32U64& a);
|
||||||
U32U64 FPSqrt(const U32U64& a);
|
U32U64 FPSqrt(const U32U64& a);
|
||||||
U32U64 FPSub(const U32U64& a, const U32U64& b, bool fpscr_controlled);
|
U32U64 FPSub(const U32U64& a, const U32U64& b, bool fpscr_controlled);
|
||||||
|
|
|
@ -374,6 +374,8 @@ OPCODE(FPMinNumeric32, T::U32, T::U32, T::U
|
||||||
OPCODE(FPMinNumeric64, T::U64, T::U64, T::U64 )
|
OPCODE(FPMinNumeric64, T::U64, T::U64, T::U64 )
|
||||||
OPCODE(FPMul32, T::U32, T::U32, T::U32 )
|
OPCODE(FPMul32, T::U32, T::U32, T::U32 )
|
||||||
OPCODE(FPMul64, T::U64, T::U64, T::U64 )
|
OPCODE(FPMul64, T::U64, T::U64, T::U64 )
|
||||||
|
OPCODE(FPMulAdd32, T::U32, T::U32, T::U32, T::U32 )
|
||||||
|
OPCODE(FPMulAdd64, T::U64, T::U64, T::U64, T::U64 )
|
||||||
OPCODE(FPNeg32, T::U32, T::U32 )
|
OPCODE(FPNeg32, T::U32, T::U32 )
|
||||||
OPCODE(FPNeg64, T::U64, T::U64 )
|
OPCODE(FPNeg64, T::U64, T::U64 )
|
||||||
OPCODE(FPSqrt32, T::U32, T::U32 )
|
OPCODE(FPSqrt32, T::U32, T::U32 )
|
||||||
|
|
|
@ -77,6 +77,8 @@ static u32 GenRandomInst(u64 pc, bool is_last_inst) {
|
||||||
"LDLAR",
|
"LDLAR",
|
||||||
// Dynarmic and QEMU currently differ on how the exclusive monitor's address range works.
|
// Dynarmic and QEMU currently differ on how the exclusive monitor's address range works.
|
||||||
"STXR", "STLXR", "STXP", "STLXP", "LDXR", "LDAXR", "LDXP", "LDAXP",
|
"STXR", "STLXR", "STXP", "STLXP", "LDXR", "LDAXR", "LDXP", "LDAXP",
|
||||||
|
// Approximation. Produces inaccurate results.
|
||||||
|
"FMADD_float", "FMSUB_float", "FNMADD_float", "FNMSUB_float",
|
||||||
};
|
};
|
||||||
|
|
||||||
for (const auto& [fn, bitstring] : list) {
|
for (const auto& [fn, bitstring] : list) {
|
||||||
|
@ -89,7 +91,6 @@ static u32 GenRandomInst(u64 pc, bool is_last_inst) {
|
||||||
}
|
}
|
||||||
result.emplace_back(InstructionGenerator{bitstring});
|
result.emplace_back(InstructionGenerator{bitstring});
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}();
|
}();
|
||||||
|
|
||||||
|
@ -115,6 +116,8 @@ static u32 GenFloatInst(u64 pc, bool is_last_inst) {
|
||||||
const std::vector<std::string> do_not_test {
|
const std::vector<std::string> do_not_test {
|
||||||
// QEMU's implementation of FCVT is incorrect
|
// QEMU's implementation of FCVT is incorrect
|
||||||
"FCVT_float",
|
"FCVT_float",
|
||||||
|
// Approximation. Produces incorrect results.
|
||||||
|
"FMADD_float", "FMSUB_float", "FNMADD_float", "FNMSUB_float",
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector<InstructionGenerator> result;
|
std::vector<InstructionGenerator> result;
|
||||||
|
|
Loading…
Add table
Reference in a new issue