diff --git a/src/backend_x64/emit_x64_floating_point.cpp b/src/backend_x64/emit_x64_floating_point.cpp
index 6cb46dd7..cbd85ccc 100644
--- a/src/backend_x64/emit_x64_floating_point.cpp
+++ b/src/backend_x64/emit_x64_floating_point.cpp
@@ -36,6 +36,8 @@ namespace mp = Dynarmic::Common::mp;
 
 namespace {
 
+const Xbyak::Reg64 INVALID_REG = Xbyak::Reg64(-1);
+
 constexpr u64 f32_negative_zero = 0x80000000u;
 constexpr u64 f32_nan = 0x7fc00000u;
 constexpr u64 f32_non_sign_mask = 0x7fffffffu;
@@ -669,6 +671,107 @@ void EmitX64::EmitFPMulAdd64(EmitContext& ctx, IR::Inst* inst) {
     EmitFPMulAdd<64>(code, ctx, inst);
 }
 
+template<size_t fsize>
+static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const bool do_default_nan = ctx.FPSCR_DN() || !ctx.AccurateNaN();
+
+    const Xbyak::Xmm op1 = ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm op2 = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Reg32 tmp = do_default_nan ? INVALID_REG.cvt32() : ctx.reg_alloc.ScratchGpr().cvt32();
+
+    Xbyak::Label end, nan, op_are_nans;
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+        FCODE(vmuls)(result, op1, op2);
+    } else {
+        code.movaps(result, op1);
+        FCODE(muls)(result, op2);
+    }
+    FCODE(ucomis)(result, result);
+    code.jp(nan, code.T_NEAR);
+    code.L(end);
+
+    code.SwitchToFarCode();
+    code.L(nan);
+    FCODE(ucomis)(op1, op2);
+    code.jp(op_are_nans);
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+        code.vxorps(result, op1, op2);
+    } else {
+        code.movaps(result, op1);
+        code.xorps(result, op2);
+    }
+    code.andps(result, code.MConst(xword, FP::FPInfo<FPT>::sign_mask));
+    code.orps(result, code.MConst(xword, FP::FPValue<FPT, false, 0, 2>()));
+    code.jmp(end, code.T_NEAR);
+    code.L(op_are_nans);
+    if (do_default_nan) {
+        code.movaps(result, code.MConst(xword, FP::FPInfo<FPT>::DefaultNaN()));
+        code.jmp(end, code.T_NEAR);
+    } else {
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            code.vxorps(xmm0, op1, op2);
+        } else {
+            code.movaps(xmm0, op1);
+            code.xorps(xmm0, op2);
+        }
+
+        constexpr FPT exponent_mask = FP::FPInfo<FPT>::exponent_mask;
+        constexpr FPT mantissa_msb = FP::FPInfo<FPT>::mantissa_msb;
+        constexpr u8 mantissa_msb_bit = static_cast<u8>(FP::FPInfo<FPT>::explicit_mantissa_width - 1);
+        constexpr size_t shift = fsize == 32 ? 0 : 48;
+
+        if constexpr (fsize == 32) {
+            code.movd(tmp, xmm0);
+        } else {
+            code.pextrw(tmp, xmm0, shift / 16);
+        }
+        code.and_(tmp, static_cast<u32>((exponent_mask | mantissa_msb) >> shift));
+        code.cmp(tmp, static_cast<u32>(mantissa_msb >> shift));
+        code.jne(end, code.T_NEAR); // (op1 != NaN || op2 != NaN) OR (op1 == SNaN && op2 == SNaN) OR (op1 == QNaN && op2 == QNaN) OR (op1 == SNaN && op2 == Inf) OR (op1 == Inf && op2 == SNaN)
+
+        // If we're here there are four cases left:
+        // op1 == SNaN && op2 == QNaN
+        // op1 == Inf && op2 == QNaN
+        // op1 == QNaN && op2 == SNaN <<< The problematic case
+        // op1 == QNaN && op2 == Inf
+
+        if constexpr (fsize == 32) {
+            code.movd(tmp, op2);
+            code.shl(tmp, 32 - mantissa_msb_bit);
+        } else {
+            code.movq(tmp.cvt64(), op2);
+            code.shl(tmp.cvt64(), 64 - mantissa_msb_bit);
+        }
+        // If op2 is a SNaN, CF = 0 and ZF = 0.
+        code.jna(end, code.T_NEAR);
+
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            code.vorps(result, op2, code.MConst(xword, mantissa_msb));
+        } else {
+            code.movaps(result, op2);
+            code.orps(result, code.MConst(xword, mantissa_msb));
+        }
+        code.jmp(end, code.T_NEAR);
+    }
+    code.SwitchToNearCode();
+
+    ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPMulX32(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPMulX<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMulX64(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPMulX<64>(code, ctx, inst);
+}
+
 template<size_t fsize>
 static void EmitFPRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 399af66d..ae763eb4 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1612,6 +1612,15 @@ U32U64 IREmitter::FPMulAdd(const U32U64& a, const U32U64& b, const U32U64& c, bo
     }
 }
 
+U32U64 IREmitter::FPMulX(const U32U64& a, const U32U64& b) {
+    ASSERT(a.GetType() == b.GetType());
+    if (a.GetType() == Type::U32) {
+        return Inst<U32>(Opcode::FPMulX32, a, b);
+    } else {
+        return Inst<U64>(Opcode::FPMulX64, a, b);
+    }
+}
+
 U32U64 IREmitter::FPNeg(const U32U64& a) {
     if (a.GetType() == Type::U32) {
         return Inst<U32>(Opcode::FPNeg32, a);
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index fa7a907e..8bc631d6 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -278,6 +278,7 @@ public:
     U32U64 FPMinNumeric(const U32U64& a, const U32U64& b, bool fpscr_controlled);
     U32U64 FPMul(const U32U64& a, const U32U64& b, bool fpscr_controlled);
     U32U64 FPMulAdd(const U32U64& addend, const U32U64& op1, const U32U64& op2, bool fpscr_controlled);
+    U32U64 FPMulX(const U32U64& a, const U32U64& b);
     U32U64 FPNeg(const U32U64& a);
     U32U64 FPRecipEstimate(const U32U64& a);
     U32U64 FPRecipStepFused(const U32U64& a, const U32U64& b);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 105da9dc..c885adf8 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -421,6 +421,8 @@ OPCODE(FPMul32, T::U32, T::U32,
 OPCODE(FPMul64,             T::U64,     T::U64,     T::U64                  )
 OPCODE(FPMulAdd32,          T::U32,     T::U32,     T::U32,     T::U32      )
 OPCODE(FPMulAdd64,          T::U64,     T::U64,     T::U64,     T::U64      )
+OPCODE(FPMulX32,            T::U32,     T::U32,     T::U32                  )
+OPCODE(FPMulX64,            T::U64,     T::U64,     T::U64                  )
 OPCODE(FPNeg32,             T::U32,     T::U32                              )
 OPCODE(FPNeg64,             T::U64,     T::U64                              )
 OPCODE(FPRecipEstimate32,   T::U32,     T::U32                              )
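
For reference, FMULX differs from an ordinary FMUL only in its special cases: infinity multiplied by zero returns 2.0 with the XORed sign of the operands (which is why the NaN produced by the SSE multiply is intercepted and replaced by the sign-masked 2.0 constant above), and NaN operands are propagated with signalling NaNs quietened, op1 taking priority over op2. The scalar sketch below restates those f64 special cases under the assumption that default-NaN mode is off and accurate NaN propagation is requested; the helper names (Bits, FromBits, IsSNaN, Quieten, FMulX64Model) are illustrative only and are not part of dynarmic or this patch.

#include <cmath>
#include <cstdint>
#include <cstring>

namespace {

std::uint64_t Bits(double x) {
    std::uint64_t b;
    std::memcpy(&b, &x, sizeof(b));
    return b;
}

double FromBits(std::uint64_t b) {
    double x;
    std::memcpy(&x, &b, sizeof(x));
    return x;
}

// A signalling NaN has an all-ones exponent, a non-zero mantissa and a clear
// mantissa MSB (the quiet bit, bit 51 for f64).
bool IsSNaN(double x) {
    return std::isnan(x) && (Bits(x) & 0x0008000000000000ULL) == 0;
}

// Quieten by setting the mantissa MSB, mirroring the orps with mantissa_msb in
// the emitted code above.
double Quieten(double x) {
    return FromBits(Bits(x) | 0x0008000000000000ULL);
}

} // namespace

// Scalar model of the f64 FMULX special cases (FPCR.DN assumed clear).
double FMulX64Model(double op1, double op2) {
    if (std::isnan(op1) || std::isnan(op2)) {
        // NaN propagation: a signalling NaN wins over a quiet one, op1 over op2.
        // Returning quietened op2 covers the op1 == QNaN && op2 == SNaN case the
        // far-code path above calls "the problematic case".
        if (IsSNaN(op1)) return Quieten(op1);
        if (IsSNaN(op2)) return Quieten(op2);
        return std::isnan(op1) ? op1 : op2;
    }
    if ((std::isinf(op1) && op2 == 0.0) || (op1 == 0.0 && std::isinf(op2))) {
        // Unlike FMUL, infinity * zero yields 2.0 with the combined sign.
        const std::uint64_t sign = (Bits(op1) ^ Bits(op2)) & 0x8000000000000000ULL;
        return FromBits(sign | Bits(2.0));
    }
    return op1 * op2;
}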