ir: Implement FPMulSub

zmt00 authored on 2024-02-06 18:49:15 -08:00; committed by merry
parent a32e6f52ef
commit 0785a6d027
11 changed files with 104 additions and 14 deletions
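
Note: in IR terms, the new FPMulSub{16,32,64} opcodes compute minuend - op1 * op2 with a single rounding; the soft-float fallback added further down defines the operation as FPMulAdd with op1's sign bit flipped. A minimal reference sketch of the intended semantics (illustrative only, not code from this commit):

    // Reference semantics sketch: a fused multiply-subtract is FPMulAdd with the
    // first factor negated. std::fma stands in for the FPCR-aware rounding.
    #include <cmath>

    double fpmulsub_ref(double minuend, double op1, double op2) {
        return std::fma(-op1, op2, minuend);  // minuend - op1 * op2, rounded once
    }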

View file

@ -328,6 +328,24 @@ void EmitIR<IR::Opcode::FPMulAdd64>(oaknut::CodeGenerator& code, EmitContext& ct
    EmitFourOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& D1, auto& D2) { code.FMADD(Dresult, D1, D2, Da); });
}

+template<>
+void EmitIR<IR::Opcode::FPMulSub16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+    (void)code;
+    (void)ctx;
+    (void)inst;
+    ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMulSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+    EmitFourOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& S1, auto& S2) { code.FMSUB(Sresult, S1, S2, Sa); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMulSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+    EmitFourOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& D1, auto& D2) { code.FMSUB(Dresult, D1, D2, Da); });
+}
+
template<>
void EmitIR<IR::Opcode::FPMulX32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
    EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FMULX(Sresult, Sa, Sb); });
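
The 32- and 64-bit cases map onto a single AArch64 FMSUB, whose accumulator operand comes last, matching the lambda's (Dresult, D1, D2, Da) ordering; the half-precision case is left unimplemented and asserts. For reference, the usual A64 definition (hedged summary, not from the commit):

    \mathrm{FMSUB}\ d,\ n,\ m,\ a \;=\; \operatorname{round}_{\mathrm{FPCR}}\!\left(a - n \times m\right)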

View file

@ -624,9 +624,10 @@ void EmitX64::EmitFPMul64(EmitContext& ctx, IR::Inst* inst) {
    FPThreeOp<64>(code, ctx, inst, &Xbyak::CodeGenerator::mulsd);
}

-template<size_t fsize>
+template<size_t fsize, bool negate_product>
static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    using FPT = mcl::unsigned_integer_of_size<fsize>;
+    const auto fallback_fn = negate_product ? &FP::FPMulSub<FPT> : &FP::FPMulAdd<FPT>;
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@ -639,7 +640,11 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
    const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
-    FCODE(vfmadd231s)(result, operand2, operand3);
+    if constexpr (negate_product) {
+        FCODE(vfnmadd231s)(result, operand2, operand3);
+    } else {
+        FCODE(vfmadd231s)(result, operand2, operand3);
+    }
    if (ctx.FPCR().DN()) {
        ForceToDefaultNaN<fsize>(code, result);
    }
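
On x64 the existing EmitFPMulAdd helper is reused and parameterised by negate_product: in the FMA fast path, vfnmadd231 computes -(op1 * op2) + result, which is the multiply-subtract once result already holds the addend. An illustrative intrinsics equivalent (an assumption for exposition, not code from the commit):

    // Illustrative only: what the vfnmadd231s form computes, written with FMA intrinsics.
    // Requires an FMA-capable target (e.g. -mfma); only the low scalar lane is relevant.
    #include <immintrin.h>

    __m128 fmsub_scalar_ref(__m128 addend, __m128 op1, __m128 op2) {
        return _mm_fnmadd_ss(op1, op2, addend);  // -(op1 * op2) + addend, single rounding
    }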
@ -657,7 +662,11 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
    code.movaps(result, operand1);
-    FCODE(vfmadd231s)(result, operand2, operand3);
+    if constexpr (negate_product) {
+        FCODE(vfnmadd231s)(result, operand2, operand3);
+    } else {
+        FCODE(vfmadd231s)(result, operand2, operand3);
+    }
    if (needs_rounding_correction && needs_nan_correction) {
        code.vandps(xmm0, result, code.Const(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
@ -703,11 +712,11 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    code.sub(rsp, 16 + ABI_SHADOW_SPACE);
    code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
    code.mov(qword[rsp + ABI_SHADOW_SPACE], rax);
-    code.CallFunction(&FP::FPMulAdd<FPT>);
+    code.CallFunction(fallback_fn);
    code.add(rsp, 16 + ABI_SHADOW_SPACE);
#else
    code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
-    code.CallFunction(&FP::FPMulAdd<FPT>);
+    code.CallFunction(fallback_fn);
#endif
    code.movq(result, code.ABI_RETURN);
    ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
@ -758,6 +767,9 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    code.ptest(operand2, xmm0);
    code.jnz(op2_done);
    code.vorps(result, operand2, xmm0);
+    if constexpr (negate_product) {
+        code.xorps(result, code.Const(xword, FP::FPInfo<FPT>::sign_mask));
+    }
    code.jmp(*end);
    code.L(op2_done);
@ -769,6 +781,16 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    code.jmp(*end);
    code.L(op3_done);
+    // at this point, all SNaNs have been handled
+    // if op1 was not a QNaN and op2 is, negate the result
+    if constexpr (negate_product) {
+        FCODE(ucomis)(operand1, operand1);
+        code.jp(*end);
+        FCODE(ucomis)(operand2, operand2);
+        code.jnp(*end);
+        code.xorps(result, code.Const(xword, FP::FPInfo<FPT>::sign_mask));
+    }
    code.jmp(*end);
}
});
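
The extra sign handling in the NaN paths appears to exist because the soft-float fallback (added below) negates op1 before deferring to FPMulAdd, so a QNaN propagated from op1 comes back with its sign bit flipped, while the hardware FMA result keeps the payload unchanged; the xorps patches the fast path to match. A tiny sketch of the behaviour being matched (an inference; f32 layout and sign-mask value assumed):

    // Sketch: what the accurate path returns when op1 supplies the propagated QNaN.
    #include <cstdint>

    constexpr std::uint32_t f32_sign_mask = 0x80000000u;

    std::uint32_t expected_propagated_qnan(std::uint32_t op1_qnan_bits) {
        return op1_qnan_bits ^ f32_sign_mask;  // op1 was negated before FPMulAdd saw it
    }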
@ -782,6 +804,9 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
    const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
+    if constexpr (negate_product) {
+        code.xorps(operand2, code.Const(xword, FP::FPInfo<FPT>::sign_mask));
+    }
    FCODE(muls)(operand2, operand3);
    FCODE(adds)(operand1, operand2);
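
In the path that computes the product unfused (plain multiply followed by add), flipping op1's sign up front is sufficient; the result is then rounded twice rather than once. Roughly (a sketch, not the emitted code):

    // Unfused sketch: same sign handling, two roundings instead of one.
    double fpmulsub_unfused(double addend, double op1, double op2) {
        const double product = (-op1) * op2;  // xorps sign flip, then muls
        return addend + product;              // adds
    }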
@ -796,24 +821,36 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE);
    code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
    code.mov(qword[rsp + ABI_SHADOW_SPACE], rax);
-    code.CallFunction(&FP::FPMulAdd<FPT>);
+    code.CallFunction(fallback_fn);
    ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE);
#else
    code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
-    code.CallFunction(&FP::FPMulAdd<FPT>);
+    code.CallFunction(fallback_fn);
#endif
}

void EmitX64::EmitFPMulAdd16(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPMulAdd<16>(code, ctx, inst);
+    EmitFPMulAdd<16, false>(code, ctx, inst);
}

void EmitX64::EmitFPMulAdd32(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPMulAdd<32>(code, ctx, inst);
+    EmitFPMulAdd<32, false>(code, ctx, inst);
}

void EmitX64::EmitFPMulAdd64(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPMulAdd<64>(code, ctx, inst);
+    EmitFPMulAdd<64, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMulSub16(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPMulAdd<16, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMulSub32(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPMulAdd<32, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMulSub64(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPMulAdd<64, true>(code, ctx, inst);
}

template<size_t fsize>

View file

@ -78,4 +78,13 @@ template u16 FPMulAdd<u16>(u16 addend, u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr);
template u32 FPMulAdd<u32>(u32 addend, u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr);
template u64 FPMulAdd<u64>(u64 addend, u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr);

+template<typename FPT>
+FPT FPMulSub(FPT minuend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
+    return FPMulAdd<FPT>(minuend, (op1 ^ FPInfo<FPT>::sign_mask), op2, fpcr, fpsr);
+}
+
+template u16 FPMulSub<u16>(u16 minuend, u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr);
+template u32 FPMulSub<u32>(u32 minuend, u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr);
+template u64 FPMulSub<u64>(u64 minuend, u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr);

} // namespace Dynarmic::FP
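
The soft-float definition relies on negation of a floating-point input being exact (only the sign bit changes), so folding it into FPMulAdd preserves the single rounding:

    \mathrm{FPMulSub}(a, x, y) = \mathrm{FPMulAdd}(a, -x, y) = \operatorname{round}_{\mathrm{FPCR}}\!\bigl(a + (-x)\,y\bigr) = \operatorname{round}_{\mathrm{FPCR}}\!\bigl(a - x\,y\bigr)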

View file

@ -13,4 +13,7 @@ class FPSR;
template<typename FPT>
FPT FPMulAdd(FPT addend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr);
+
+template<typename FPT>
+FPT FPMulSub(FPT minuend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr);

} // namespace Dynarmic::FP

View file

@ -293,7 +293,7 @@ bool TranslatorVisitor::vfp_VFNMA(Cond cond, bool D, size_t Vn, size_t Vd, bool
        const auto reg_n = ir.GetExtendedRegister(n);
        const auto reg_m = ir.GetExtendedRegister(m);
        const auto reg_d = ir.GetExtendedRegister(d);
-        const auto result = ir.FPMulAdd(ir.FPNeg(reg_d), ir.FPNeg(reg_n), reg_m);
+        const auto result = ir.FPMulSub(ir.FPNeg(reg_d), reg_n, reg_m);
        ir.SetExtendedRegister(d, result);
    });
}
@ -333,7 +333,7 @@ bool TranslatorVisitor::vfp_VFMS(Cond cond, bool D, size_t Vn, size_t Vd, bool s
        const auto reg_n = ir.GetExtendedRegister(n);
        const auto reg_m = ir.GetExtendedRegister(m);
        const auto reg_d = ir.GetExtendedRegister(d);
-        const auto result = ir.FPMulAdd(reg_d, ir.FPNeg(reg_n), reg_m);
+        const auto result = ir.FPMulSub(reg_d, reg_n, reg_m);
        ir.SetExtendedRegister(d, result);
    });
}
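
In the A32 translators the change is a pure rewrite: FPMulAdd(d, FPNeg(n), m) becomes FPMulSub(d, n, m), letting backends emit FMSUB/VFNMADD directly instead of materialising the negation. What the two affected encodings now compute, as a scalar sketch (doubles stand in for the IR values; FPCR and NaN details ignored):

    #include <cmath>

    // VFNMA: FPMulSub(FPNeg(d), n, m)  ->  -d - n*m
    double vfnma_ref(double d, double n, double m) { return std::fma(-n, m, -d); }
    // VFMS:  FPMulSub(d, n, m)         ->   d - n*m
    double vfms_ref(double d, double n, double m) { return std::fma(-n, m, d); }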

View file

@ -30,7 +30,7 @@ bool TranslatorVisitor::FMSUB_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd)
    const IR::U16U32U64 operanda = V_scalar(*datasize, Va);
    const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn);
    const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm);
-    const IR::U16U32U64 result = ir.FPMulAdd(operanda, ir.FPNeg(operand1), operand2);
+    const IR::U16U32U64 result = ir.FPMulSub(operanda, operand1, operand2);
    V_scalar(*datasize, Vd, result);
    return true;
}
@ -44,7 +44,7 @@ bool TranslatorVisitor::FNMADD_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd
    const IR::U16U32U64 operanda = V_scalar(*datasize, Va);
    const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn);
    const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm);
-    const IR::U16U32U64 result = ir.FPMulAdd(ir.FPNeg(operanda), ir.FPNeg(operand1), operand2);
+    const IR::U16U32U64 result = ir.FPMulSub(ir.FPNeg(operanda), operand1, operand2);
    V_scalar(*datasize, Vd, result);
    return true;
}
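
Same rewrite on the A64 side: the explicit FPNeg of operand1 is folded into the new opcode, so FMSUB_float and FNMADD_float express their pseudocode forms directly. Scalar stand-ins (illustrative only, ignoring FPCR and NaN handling):

    #include <cmath>

    // FMSUB:  FPMulSub(a, n, m)         ->   a - n*m
    double fmsub_ref(double a, double n, double m) { return std::fma(-n, m, a); }
    // FNMADD: FPMulSub(FPNeg(a), n, m)  ->  -a - n*m
    double fnmadd_ref(double a, double n, double m) { return std::fma(-n, m, -a); }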

View file

@ -2190,6 +2190,21 @@ U16U32U64 IREmitter::FPMulAdd(const U16U32U64& a, const U16U32U64& b, const U16U
    }
}

+U16U32U64 IREmitter::FPMulSub(const U16U32U64& a, const U16U32U64& b, const U16U32U64& c) {
+    ASSERT(a.GetType() == b.GetType());
+
+    switch (a.GetType()) {
+    case Type::U16:
+        return Inst<U16>(Opcode::FPMulSub16, a, b, c);
+    case Type::U32:
+        return Inst<U32>(Opcode::FPMulSub32, a, b, c);
+    case Type::U64:
+        return Inst<U64>(Opcode::FPMulSub64, a, b, c);
+    default:
+        UNREACHABLE();
+    }
+}
+
U32U64 IREmitter::FPMulX(const U32U64& a, const U32U64& b) {
    ASSERT(a.GetType() == b.GetType());

View file

@ -335,6 +335,7 @@ public:
    U32U64 FPMinNumeric(const U32U64& a, const U32U64& b);
    U32U64 FPMul(const U32U64& a, const U32U64& b);
    U16U32U64 FPMulAdd(const U16U32U64& addend, const U16U32U64& op1, const U16U32U64& op2);
+    U16U32U64 FPMulSub(const U16U32U64& minuend, const U16U32U64& op1, const U16U32U64& op2);
    U32U64 FPMulX(const U32U64& a, const U32U64& b);
    U16U32U64 FPNeg(const U16U32U64& a);
    U16U32U64 FPRecipEstimate(const U16U32U64& a);

View file

@ -308,6 +308,9 @@ bool Inst::ReadsFromAndWritesToFPSRCumulativeExceptionBits() const {
    case Opcode::FPMulAdd16:
    case Opcode::FPMulAdd32:
    case Opcode::FPMulAdd64:
+    case Opcode::FPMulSub16:
+    case Opcode::FPMulSub32:
+    case Opcode::FPMulSub64:
    case Opcode::FPRecipEstimate16:
    case Opcode::FPRecipEstimate32:
    case Opcode::FPRecipEstimate64:

View file

@ -578,6 +578,9 @@ OPCODE(FPMul64, U64, U64,
OPCODE(FPMulAdd16, U16, U16, U16, U16 )
OPCODE(FPMulAdd32, U32, U32, U32, U32 )
OPCODE(FPMulAdd64, U64, U64, U64, U64 )
+OPCODE(FPMulSub16, U16, U16, U16, U16 )
+OPCODE(FPMulSub32, U32, U32, U32, U32 )
+OPCODE(FPMulSub64, U64, U64, U64, U64 )
OPCODE(FPMulX32, U32, U32, U32 )
OPCODE(FPMulX64, U64, U64, U64 )
OPCODE(FPNeg16, U16, U16 )

View file

@ -103,6 +103,7 @@ bool ShouldTestInst(IR::Block& block) {
    // Half-precision
    case IR::Opcode::FPAbs16:
    case IR::Opcode::FPMulAdd16:
+    case IR::Opcode::FPMulSub16:
    case IR::Opcode::FPNeg16:
    case IR::Opcode::FPRecipEstimate16:
    case IR::Opcode::FPRecipExponent16: