diff --git a/src/backend_x64/emit_x64_vector_floating_point.cpp b/src/backend_x64/emit_x64_vector_floating_point.cpp index 0fa31698..df1c7634 100644 --- a/src/backend_x64/emit_x64_vector_floating_point.cpp +++ b/src/backend_x64/emit_x64_vector_floating_point.cpp @@ -370,4 +370,101 @@ void EmitX64::EmitFPVectorSub64(EmitContext& ctx, IR::Inst* inst) { EmitVectorOperation64(code, ctx, inst, &Xbyak::CodeGenerator::subpd); } +void EmitX64::EmitFPVectorU32ToSingle(EmitContext& ctx, IR::Inst* inst) { /* Emits x64 code for a packed unsigned 32-bit integer to single-precision conversion of the 128-bit operand args[0]; xmm serves as both source and destination and is registered as the value of inst through DefineValue. */ + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512DQ) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { + code.vcvtudq2ps(xmm, xmm); /* single-instruction path. NOTE(review): the Intel SDM lists VCVTUDQ2PS under AVX512F (plus AVX512VL for xmm operands), so the AVX512DQ test looks stricter than required - confirm */ + } else { /* fallback: split every lane into 16-bit halves, turn each half into a biased float by bit manipulation, then remove the biases with float additions */ + const Xbyak::Address mem_4B000000 = code.MConst(xword, 0x4B0000004B000000, 0x4B0000004B000000); /* 0x4B000000 is 2^23 as an IEEE-754 binary32 bit pattern, repeated in all four lanes */ + const Xbyak::Address mem_53000000 = code.MConst(xword, 0x5300000053000000, 0x5300000053000000); /* 0x53000000 is 2^39 as binary32 */ + const Xbyak::Address mem_D3000080 = code.MConst(xword, 0xD3000080D3000080, 0xD3000080D3000080); /* 0xD3000080 is -(2^39 + 2^23) as binary32, i.e. both biases negated */ + + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { + code.vpblendw(tmp, xmm, mem_4B000000, 0b10101010); /* tmp = low word of each lane from xmm, high word 0x4B00 from the constant: the float 2^23 + lo16 */ + code.vpsrld(xmm, xmm, 16); /* moves the high 16 bits of each lane into the low word */ + code.vpblendw(xmm, xmm, mem_53000000, 0b10101010); /* xmm = the float 2^39 + hi16 * 2^16 */ + code.vaddps(xmm, xmm, mem_D3000080); /* xmm = hi16 * 2^16 - 2^23 */ + code.vaddps(xmm, tmp, xmm); /* adds 2^23 + lo16, leaving hi16 * 2^16 + lo16 */ + } else { /* same arithmetic using pand and por in place of the word blends */ + const Xbyak::Address mem_0xFFFF = code.MConst(xword, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF); /* mask keeping the low 16 bits of every 32-bit lane */ + + code.movdqa(tmp, mem_0xFFFF); + + code.pand(tmp, xmm); + code.por(tmp, mem_4B000000); /* tmp = the float 2^23 + lo16 */ + code.psrld(xmm, 16); + code.por(xmm, mem_53000000); /* xmm = the float 2^39 + hi16 * 2^16 */ + code.addps(xmm, mem_D3000080); + code.addps(xmm, tmp); + } + } + + if (ctx.FPSCR_RMode() == FP::RoundingMode::TowardsMinusInfinity) { /* only when the guest rounding mode is TowardsMinusInfinity */ + code.pand(xmm, code.MConst(xword, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF)); /* clears the sign bit of every 32-bit lane; presumably guards against a -0.0 result for a zero input under this rounding mode - confirm */ + } + + ctx.reg_alloc.DefineValue(inst, xmm); +} + +void EmitX64::EmitFPVectorU64ToDouble(EmitContext& ctx, IR::Inst* inst) { /* Emits x64 code for a packed unsigned 64-bit integer to double-precision conversion of the 128-bit operand args[0]; xmm serves as both source and destination and is registered as the value of inst through DefineValue. */ + auto args = 
ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512DQ) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { + code.vcvtuqq2pd(xmm, xmm); /* single-instruction path, taken when both AVX512DQ and AVX512VL are reported */ + } else { /* fallback: pair each 32-bit half of an element with a magic high dword to form a biased double, subtract the bias, then add the two halves */ + const Xbyak::Address unpack = code.MConst(xword, 0x4530000043300000, 0); /* dwords 0x43300000 and 0x45300000 are the upper halves of the doubles 2^52 and 2^84; assumes MConst takes (lower, upper) qwords - TODO confirm */ + const Xbyak::Address subtrahend = code.MConst(xword, 0x4330000000000000, 0x4530000000000000); /* 0x4330000000000000 is 2^52 and 0x4530000000000000 is 2^84 as IEEE-754 binary64 */ + + const Xbyak::Xmm unpack_reg = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm subtrahend_reg = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); + + if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { + code.vmovapd(unpack_reg, unpack); + code.vmovapd(subtrahend_reg, subtrahend); + + code.vunpcklps(tmp1, xmm, unpack_reg); /* interleaves the two dwords of the low qword of xmm with the magic dwords: tmp1 = { 2^52 + lo32, 2^84 + hi32 * 2^32 } as doubles */ + code.vsubpd(tmp1, tmp1, subtrahend_reg); /* removes the biases: tmp1 = { lo32, hi32 * 2^32 } */ + + code.vpermilps(xmm, xmm, 0b01001110); /* swaps the two 64-bit halves so the other element sits in the low qword */ + + code.vunpcklps(xmm, xmm, unpack_reg); + code.vsubpd(xmm, xmm, subtrahend_reg); + + code.vhaddpd(xmm, tmp1, xmm); /* lane 0 = sum of the tmp1 pair, lane 1 = sum of the xmm pair */ + } else { /* same computation without vhaddpd: each pair is summed with its qword-swapped copy */ + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + + code.movapd(unpack_reg, unpack); + code.movapd(subtrahend_reg, subtrahend); + + code.pshufd(tmp1, xmm, 0b01001110); /* tmp1 = xmm with its 64-bit halves swapped, taken before xmm is overwritten */ + + code.punpckldq(xmm, unpack_reg); + code.subpd(xmm, subtrahend_reg); + code.pshufd(tmp2, xmm, 0b01001110); + code.addpd(xmm, tmp2); /* both lanes of xmm now hold the converted low-qword element */ + + code.punpckldq(tmp1, unpack_reg); + code.subpd(tmp1, subtrahend_reg); + + code.pshufd(unpack_reg, tmp1, 0b01001110); /* unpack_reg is reused as scratch here; it is not read as the unpack constant after this point */ + code.addpd(unpack_reg, tmp1); + + code.unpcklpd(xmm, unpack_reg); /* low qword of xmm joined with low qword of unpack_reg */ + } + } + + if (ctx.FPSCR_RMode() == FP::RoundingMode::TowardsMinusInfinity) { /* only when the guest rounding mode is TowardsMinusInfinity */ + code.pand(xmm, code.MConst(xword, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF)); /* clears the sign bit of both 64-bit lanes; presumably guards against a -0.0 result for a zero input under this rounding mode - confirm */ + } + + ctx.reg_alloc.DefineValue(inst, xmm); +} + } // namespace Dynarmic::BackendX64 diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp index 10a05fab..3f7c6d33 100644 --- a/src/frontend/ir/ir_emitter.cpp +++ b/src/frontend/ir/ir_emitter.cpp @@ -1620,6 +1620,14 @@ U128 IREmitter::FPVectorS64ToDouble(const U128& a) { 
return Inst(Opcode::FPVectorS64ToDouble, a); } +U128 IREmitter::FPVectorU32ToSingle(const U128& a) { /* Builds an FPVectorU32ToSingle IR instruction taking a as its only operand and returns the Inst result. */ + return Inst(Opcode::FPVectorU32ToSingle, a); +} + +U128 IREmitter::FPVectorU64ToDouble(const U128& a) { /* Builds an FPVectorU64ToDouble IR instruction taking a as its only operand and returns the Inst result. */ + return Inst(Opcode::FPVectorU64ToDouble, a); +} + void IREmitter::Breakpoint() { Inst(Opcode::Breakpoint); } diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h index 20702666..8bc248d0 100644 --- a/src/frontend/ir/ir_emitter.h +++ b/src/frontend/ir/ir_emitter.h @@ -288,6 +288,8 @@ public: U128 FPVectorSub(size_t esize, const U128& a, const U128& b); U128 FPVectorS32ToSingle(const U128& a); U128 FPVectorS64ToDouble(const U128& a); + U128 FPVectorU32ToSingle(const U128& a); /* unsigned counterpart of FPVectorS32ToSingle; emits Opcode::FPVectorU32ToSingle */ + U128 FPVectorU64ToDouble(const U128& a); /* unsigned counterpart of FPVectorS64ToDouble; emits Opcode::FPVectorU64ToDouble */ void Breakpoint(); diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc index 9b570638..83ec4820 100644 --- a/src/frontend/ir/opcodes.inc +++ b/src/frontend/ir/opcodes.inc @@ -421,6 +421,8 @@ OPCODE(FPVectorS32ToSingle, T::U128, T::U128 OPCODE(FPVectorS64ToDouble, T::U128, T::U128 ) OPCODE(FPVectorSub32, T::U128, T::U128, T::U128 ) OPCODE(FPVectorSub64, T::U128, T::U128, T::U128 ) +OPCODE(FPVectorU32ToSingle, T::U128, T::U128 ) /* one U128 operand, U128 result, same shape as the S32ToSingle entry */ +OPCODE(FPVectorU64ToDouble, T::U128, T::U128 ) /* one U128 operand, U128 result, same shape as the S64ToDouble entry */ // A32 Memory access A32OPC(ClearExclusive, T::Void, )