diff --git a/src/backend/x64/emit_x64.cpp b/src/backend/x64/emit_x64.cpp index 50198aee..191bb050 100644 --- a/src/backend/x64/emit_x64.cpp +++ b/src/backend/x64/emit_x64.cpp @@ -106,6 +106,14 @@ void EmitX64::EmitGetGEFromOp(EmitContext&, IR::Inst*) { ASSERT_MSG(false, "should never happen"); } +void EmitX64::EmitGetUpperFromOp(EmitContext&, IR::Inst*) { + ASSERT_MSG(false, "should never happen"); +} + +void EmitX64::EmitGetLowerFromOp(EmitContext&, IR::Inst*) { + ASSERT_MSG(false, "should never happen"); +} + void EmitX64::EmitGetNZCVFromOp(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); diff --git a/src/backend/x64/emit_x64_vector.cpp b/src/backend/x64/emit_x64_vector.cpp index 7a032624..8999370a 100644 --- a/src/backend/x64/emit_x64_vector.cpp +++ b/src/backend/x64/emit_x64_vector.cpp @@ -2702,6 +2702,129 @@ void EmitX64::EmitVectorSignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* i EmitVectorSignedAbsoluteDifference(32, ctx, inst, code); } +void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) { + const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp); + const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + + if (upper_inst) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { + code.vpmulhw(result, x, y); + } else { + code.movdqa(result, x); + code.pmulhw(result, y); + } + + ctx.reg_alloc.DefineValue(upper_inst, result); + ctx.EraseInstruction(upper_inst); + } + + if (lower_inst) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { + code.vpmullw(result, x, y); + } else { + code.movdqa(result, x); + code.pmullw(result, y); + } + ctx.reg_alloc.DefineValue(lower_inst, result); + ctx.EraseInstruction(lower_inst); + } +} + +void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) { + const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp); + const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (lower_inst && !upper_inst && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { + const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.vpmulld(result, x, y); + + ctx.reg_alloc.DefineValue(lower_inst, result); + ctx.EraseInstruction(lower_inst); + return; + } + + if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + + if (lower_inst) { + const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(); + code.vpmulld(lower_result, x, y); + ctx.reg_alloc.DefineValue(lower_inst, lower_result); + ctx.EraseInstruction(lower_inst); + } + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.vpmuldq(result, x, y); + code.vpsrlq(x, x, 32); + code.vpsrlq(y, y, 32); + code.vpmuldq(x, x, y); + code.shufps(result, x, 0b11011101); + + ctx.reg_alloc.DefineValue(upper_inst, result); + ctx.EraseInstruction(upper_inst); + return; + } + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = 
ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(); + + // calculate sign correction + code.movdqa(tmp, x); + code.movdqa(sign_correction, y); + code.psrad(tmp, 31); + code.psrad(sign_correction, 31); + code.pand(tmp, y); + code.pand(sign_correction, x); + code.paddd(sign_correction, tmp); + code.pand(sign_correction, code.MConst(xword, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF)); + + // calculate unsigned multiply + code.movdqa(tmp, x); + code.pmuludq(tmp, y); + code.psrlq(x, 32); + code.psrlq(y, 32); + code.pmuludq(x, y); + + // put everything into place + code.pcmpeqw(upper_result, upper_result); + code.pcmpeqw(lower_result, lower_result); + code.psllq(upper_result, 32); + code.psrlq(lower_result, 32); + code.pand(upper_result, x); + code.pand(lower_result, tmp); + code.psrlq(tmp, 32); + code.psllq(x, 32); + code.por(upper_result, tmp); + code.por(lower_result, x); + code.psubd(upper_result, sign_correction); + + if (upper_inst) { + ctx.reg_alloc.DefineValue(upper_inst, upper_result); + ctx.EraseInstruction(upper_inst); + } + if (lower_inst) { + ctx.reg_alloc.DefineValue(lower_inst, lower_result); + ctx.EraseInstruction(lower_inst); + } +} + static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); @@ -3473,6 +3596,117 @@ void EmitX64::EmitVectorUnsignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* EmitVectorUnsignedAbsoluteDifference(32, ctx, inst, code); } +void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) { + const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp); + const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + + if (upper_inst) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { + code.vpmulhuw(result, x, y); + } else { + code.movdqa(result, x); + code.pmulhuw(result, y); + } + + ctx.reg_alloc.DefineValue(upper_inst, result); + ctx.EraseInstruction(upper_inst); + } + + if (lower_inst) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { + code.vpmullw(result, x, y); + } else { + code.movdqa(result, x); + code.pmullw(result, y); + } + ctx.reg_alloc.DefineValue(lower_inst, result); + ctx.EraseInstruction(lower_inst); + } +} + +void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) { + const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp); + const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (lower_inst && !upper_inst && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { + const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.vpmulld(result, x, y); + + ctx.reg_alloc.DefineValue(lower_inst, result); + ctx.EraseInstruction(lower_inst); + return; + } + + if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { + const Xbyak::Xmm x = 
ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + + if (lower_inst) { + const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(); + code.vpmulld(lower_result, x, y); + ctx.reg_alloc.DefineValue(lower_inst, lower_result); + ctx.EraseInstruction(lower_inst); + } + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.vpmuludq(result, x, y); + code.vpsrlq(x, x, 32); + code.vpsrlq(y, y, 32); + code.vpmuludq(x, x, y); + code.shufps(result, x, 0b11011101); + + ctx.reg_alloc.DefineValue(upper_inst, result); + ctx.EraseInstruction(upper_inst); + return; + } + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(); + + // calculate unsigned multiply + code.movdqa(tmp, x); + code.pmuludq(tmp, y); + code.psrlq(x, 32); + code.psrlq(y, 32); + code.pmuludq(x, y); + + // put everything into place + code.pcmpeqw(upper_result, upper_result); + code.pcmpeqw(lower_result, lower_result); + code.psllq(upper_result, 32); + code.psrlq(lower_result, 32); + code.pand(upper_result, x); + code.pand(lower_result, tmp); + code.psrlq(tmp, 32); + code.psllq(x, 32); + code.por(upper_result, tmp); + code.por(lower_result, x); + + if (upper_inst) { + ctx.reg_alloc.DefineValue(upper_inst, upper_result); + ctx.EraseInstruction(upper_inst); + } + if (lower_inst) { + ctx.reg_alloc.DefineValue(lower_inst, lower_result); + ctx.EraseInstruction(lower_inst); + } +} + void EmitX64::EmitVectorUnsignedRecipEstimate(EmitContext& ctx, IR::Inst* inst) { EmitOneArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a) { for (size_t i = 0; i < result.size(); i++) { diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp index daf6e675..6729eacd 100644 --- a/src/frontend/ir/ir_emitter.cpp +++ b/src/frontend/ir/ir_emitter.cpp @@ -1527,6 +1527,24 @@ U128 IREmitter::VectorSignedAbsoluteDifference(size_t esize, const U128& a, cons return {}; } +UpperAndLower IREmitter::VectorSignedMultiply(size_t esize, const U128& a, const U128& b) { + const Value multiply = [&] { + switch (esize) { + case 16: + return Inst(Opcode::VectorSignedMultiply16, a, b); + case 32: + return Inst(Opcode::VectorSignedMultiply32, a, b); + } + UNREACHABLE(); + return Value{}; + }(); + + return { + Inst(Opcode::GetUpperFromOp, multiply), + Inst(Opcode::GetLowerFromOp, multiply), + }; +} + U128 IREmitter::VectorSignedSaturatedAbs(size_t esize, const U128& a) { switch (esize) { case 8: diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h index 9dc58d82..d7764451 100644 --- a/src/frontend/ir/ir_emitter.h +++ b/src/frontend/ir/ir_emitter.h @@ -52,6 +52,11 @@ struct ResultAndGE { U32 ge; }; +struct UpperAndLower { + U128 upper; + U128 lower; +}; + /** * Convenience class to construct a basic block of the intermediate representation. * `block` is the resulting block. 
@@ -265,6 +270,7 @@ public: U128 VectorShuffleWords(const U128& a, u8 mask); U128 VectorSignExtend(size_t original_esize, const U128& a); U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b); + UpperAndLower VectorSignedMultiply(size_t esize, const U128& a, const U128& b); U128 VectorSignedSaturatedAbs(size_t esize, const U128& a); U128 VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b); U128 VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b); diff --git a/src/frontend/ir/microinstruction.cpp b/src/frontend/ir/microinstruction.cpp index 91fc53d5..78c690f5 100644 --- a/src/frontend/ir/microinstruction.cpp +++ b/src/frontend/ir/microinstruction.cpp @@ -437,6 +437,8 @@ bool Inst::IsAPseudoOperation() const { case Opcode::GetOverflowFromOp: case Opcode::GetGEFromOp: case Opcode::GetNZCVFromOp: + case Opcode::GetUpperFromOp: + case Opcode::GetLowerFromOp: return true; default: @@ -470,7 +472,7 @@ bool Inst::AreAllArgsImmediates() const { } bool Inst::HasAssociatedPseudoOperation() const { - return carry_inst || overflow_inst || ge_inst || nzcv_inst; + return carry_inst || overflow_inst || ge_inst || nzcv_inst || upper_inst || lower_inst; } Inst* Inst::GetAssociatedPseudoOperation(Opcode opcode) { @@ -488,6 +490,12 @@ Inst* Inst::GetAssociatedPseudoOperation(Opcode opcode) { case Opcode::GetNZCVFromOp: ASSERT(!nzcv_inst || nzcv_inst->GetOpcode() == Opcode::GetNZCVFromOp); return nzcv_inst; + case Opcode::GetUpperFromOp: + ASSERT(!upper_inst || upper_inst->GetOpcode() == Opcode::GetUpperFromOp); + return upper_inst; + case Opcode::GetLowerFromOp: + ASSERT(!lower_inst || lower_inst->GetOpcode() == Opcode::GetLowerFromOp); + return lower_inst; default: break; } @@ -574,6 +582,14 @@ void Inst::Use(const Value& value) { ASSERT_MSG(value.GetInst()->MayGetNZCVFromOp(), "This value doesn't support the GetNZCVFromOp pseduo-op"); value.GetInst()->nzcv_inst = this; break; + case Opcode::GetUpperFromOp: + ASSERT_MSG(!value.GetInst()->upper_inst, "Only one of each type of pseudo-op allowed"); + value.GetInst()->upper_inst = this; + break; + case Opcode::GetLowerFromOp: + ASSERT_MSG(!value.GetInst()->lower_inst, "Only one of each type of pseudo-op allowed"); + value.GetInst()->lower_inst = this; + break; default: break; } @@ -599,6 +615,14 @@ void Inst::UndoUse(const Value& value) { ASSERT(value.GetInst()->nzcv_inst->GetOpcode() == Opcode::GetNZCVFromOp); value.GetInst()->nzcv_inst = nullptr; break; + case Opcode::GetUpperFromOp: + ASSERT(value.GetInst()->upper_inst->GetOpcode() == Opcode::GetUpperFromOp); + value.GetInst()->upper_inst = nullptr; + break; + case Opcode::GetLowerFromOp: + ASSERT(value.GetInst()->lower_inst->GetOpcode() == Opcode::GetLowerFromOp); + value.GetInst()->lower_inst = nullptr; + break; default: break; } diff --git a/src/frontend/ir/microinstruction.h b/src/frontend/ir/microinstruction.h index ee2d2602..089884a2 100644 --- a/src/frontend/ir/microinstruction.h +++ b/src/frontend/ir/microinstruction.h @@ -145,9 +145,13 @@ private: union { Inst* carry_inst = nullptr; Inst* ge_inst; + Inst* upper_inst; }; Inst* overflow_inst = nullptr; - Inst* nzcv_inst = nullptr; + union { + Inst* nzcv_inst = nullptr; + Inst* lower_inst; + }; }; } // namespace Dynarmic::IR diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc index 3977e0dc..585f712b 100644 --- a/src/frontend/ir/opcodes.inc +++ b/src/frontend/ir/opcodes.inc @@ -79,6 +79,8 @@ OPCODE(GetCarryFromOp, U1, Opaqu 
OPCODE(GetOverflowFromOp, U1, Opaque ) OPCODE(GetGEFromOp, U32, Opaque ) OPCODE(GetNZCVFromOp, NZCV, Opaque ) +OPCODE(GetUpperFromOp, U128, Opaque ) +OPCODE(GetLowerFromOp, U128, Opaque ) OPCODE(NZCVFromPackedFlags, NZCV, U32 ) @@ -396,6 +398,8 @@ OPCODE(VectorSignExtend64, U128, U128 OPCODE(VectorSignedAbsoluteDifference8, U128, U128, U128 ) OPCODE(VectorSignedAbsoluteDifference16, U128, U128, U128 ) OPCODE(VectorSignedAbsoluteDifference32, U128, U128, U128 ) +OPCODE(VectorSignedMultiply16, Void, U128, U128 ) +OPCODE(VectorSignedMultiply32, Void, U128, U128 ) OPCODE(VectorSignedSaturatedAbs8, U128, U128 ) OPCODE(VectorSignedSaturatedAbs16, U128, U128 ) OPCODE(VectorSignedSaturatedAbs32, U128, U128 ) @@ -425,6 +429,8 @@ OPCODE(VectorTableLookup, U128, U128, OPCODE(VectorUnsignedAbsoluteDifference8, U128, U128, U128 ) OPCODE(VectorUnsignedAbsoluteDifference16, U128, U128, U128 ) OPCODE(VectorUnsignedAbsoluteDifference32, U128, U128, U128 ) +OPCODE(VectorUnsignedMultiply16, Void, U128, U128 ) +OPCODE(VectorUnsignedMultiply32, Void, U128, U128 ) OPCODE(VectorUnsignedRecipEstimate, U128, U128 ) OPCODE(VectorUnsignedRecipSqrtEstimate, U128, U128 ) OPCODE(VectorUnsignedSaturatedAccumulateSigned8, U128, U128, U128 )
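// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): a scalar, per-lane model of the
// VectorSignedMultiply32 result pair and of the identity that the non-AVX
// "calculate sign correction" block in emit_x64_vector.cpp relies on. All names
// below (UpperLower, SignedMultiply32, SignedHighViaUnsigned) are invented for
// this sketch; only the behaviour is meant to mirror the patch.
#include <cassert>
#include <cstdint>
#include <initializer_list>

struct UpperLower {
    uint32_t upper; // the lane value GetUpperFromOp extracts
    uint32_t lower; // the lane value GetLowerFromOp extracts
};

// Reference semantics: the full 64-bit signed product, split into halves.
UpperLower SignedMultiply32(int32_t a, int32_t b) {
    const int64_t product = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    return {static_cast<uint32_t>(static_cast<uint64_t>(product) >> 32),
            static_cast<uint32_t>(product)};
}

// SSE2 only provides an unsigned widening multiply (pmuludq; the signed pmuldq
// needs SSE4.1), so the fallback path recovers the signed high half via
//     signed_high(a, b) == unsigned_high(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)   (mod 2^32)
// which is what the psrad/pand/paddd "sign correction" sequence computes.
uint32_t SignedHighViaUnsigned(int32_t a, int32_t b) {
    const uint32_t ua = static_cast<uint32_t>(a);
    const uint32_t ub = static_cast<uint32_t>(b);
    const uint32_t unsigned_high = static_cast<uint32_t>((static_cast<uint64_t>(ua) * ub) >> 32);
    const uint32_t correction = (a < 0 ? ub : 0u) + (b < 0 ? ua : 0u);
    return unsigned_high - correction; // wraps mod 2^32, matching the vector code
}

int main() {
    for (const int32_t a : {-2147483647 - 1, -3, -1, 0, 1, 7, 2147483647}) {
        for (const int32_t b : {-2147483647 - 1, -5, -1, 0, 1, 9, 2147483647}) {
            assert(SignedMultiply32(a, b).upper == SignedHighViaUnsigned(a, b));
        }
    }
    return 0;
}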
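// Usage-wise, a frontend would consume the new IREmitter helper roughly as
//     const auto parts = ir.VectorSignedMultiply(32, a, b);
// with parts.upper / parts.lower feeding the GetUpperFromOp / GetLowerFromOp
// pseudo-ops that the x64 backend pattern-matches above. (Hypothetical call
// site for illustration only; the decoders that will use the helper are not
// part of this diff.)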