IR: Implement Vector{Signed,Unsigned}Multiply{16,32}
This commit is contained in:
parent
b6df34cdde
commit
08c0e017a5
7 changed files with 302 additions and 2 deletions
|
@ -106,6 +106,14 @@ void EmitX64::EmitGetGEFromOp(EmitContext&, IR::Inst*) {
|
|||
ASSERT_MSG(false, "should never happen");
|
||||
}
|
||||
|
||||
// GetUpperFromOp is a pseudo-operation. The vector multiply emitters in this
// file define its value and erase it while emitting the parent multiply, so
// emitting it on its own is treated as a logic error.
void EmitX64::EmitGetUpperFromOp(EmitContext&, IR::Inst*) {
    ASSERT_MSG(false, "should never happen");
}
|
||||
|
||||
// GetLowerFromOp is a pseudo-operation. The vector multiply emitters in this
// file define its value and erase it while emitting the parent multiply, so
// emitting it on its own is treated as a logic error.
void EmitX64::EmitGetLowerFromOp(EmitContext&, IR::Inst*) {
    ASSERT_MSG(false, "should never happen");
}
|
||||
|
||||
void EmitX64::EmitGetNZCVFromOp(EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
|
|
|
@ -2702,6 +2702,129 @@ void EmitX64::EmitVectorSignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* i
|
|||
EmitVectorSignedAbsoluteDifference(32, ctx, inst, code);
|
||||
}
|
||||
|
||||
// Emits a signed multiply of 16-bit lanes.
//
// The multiply itself produces no value; its results are consumed through the
// GetUpperFromOp / GetLowerFromOp pseudo-operations attached to `inst`. For
// each pseudo-op that is present, the matching half of every lane product is
// computed into a fresh scratch register, bound to the pseudo-op, and the
// pseudo-op is erased so it is not emitted separately.
void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
    IR::Inst* const high_user = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp);
    IR::Inst* const low_user = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const Xbyak::Xmm lhs = ctx.reg_alloc.UseXmm(args[0]);
    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);

    // Queried only where needed, after the scratch register has been allocated.
    const auto has_avx = [&] { return code.DoesCpuSupport(Xbyak::util::Cpu::tAVX); };

    if (high_user) {
        const Xbyak::Xmm high_halves = ctx.reg_alloc.ScratchXmm();

        if (!has_avx()) {
            // Two-operand SSE form: copy first so the inputs stay intact.
            code.movdqa(high_halves, lhs);
            code.pmulhw(high_halves, rhs);
        } else {
            code.vpmulhw(high_halves, lhs, rhs);
        }

        ctx.reg_alloc.DefineValue(high_user, high_halves);
        ctx.EraseInstruction(high_user);
    }

    if (low_user) {
        const Xbyak::Xmm low_halves = ctx.reg_alloc.ScratchXmm();

        if (!has_avx()) {
            code.movdqa(low_halves, lhs);
            code.pmullw(low_halves, rhs);
        } else {
            code.vpmullw(low_halves, lhs, rhs);
        }

        ctx.reg_alloc.DefineValue(low_user, low_halves);
        ctx.EraseInstruction(low_user);
    }
}
|
||||
|
||||
// Emits a signed multiply of 32-bit lanes.
//
// The multiply itself produces no value; the upper and lower 32 bits of each
// 64-bit lane product are delivered through the GetUpperFromOp /
// GetLowerFromOp pseudo-operations attached to `inst`. Each pseudo-op that is
// present is bound to its result register and erased.
//
// In the lane comments below, p_i denotes the 64-bit product x[i] * y[i].
void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
    const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp);
    const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    // Fast path: only the low halves are wanted, which is a single instruction.
    if (lower_inst && !upper_inst && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
        const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
        const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();

        code.vpmulld(result, x, y);

        ctx.reg_alloc.DefineValue(lower_inst, result);
        ctx.EraseInstruction(lower_inst);
        return;
    }

    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
        const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
        const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);

        if (lower_inst) {
            // Must happen before x and y are shifted below.
            const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
            code.vpmulld(lower_result, x, y);
            ctx.reg_alloc.DefineValue(lower_inst, lower_result);
            ctx.EraseInstruction(lower_inst);
        }

        // Guarded like the SSE path below: binding a value to a null pseudo-op
        // is never valid.
        if (upper_inst) {
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();

            code.vpmuldq(result, x, y);          // result = [lo p0, hi p0, lo p2, hi p2]
            code.vpsrlq(x, x, 32);               // move odd lanes into even positions
            code.vpsrlq(y, y, 32);
            code.vpmuldq(x, x, y);               // x      = [lo p1, hi p1, lo p3, hi p3]
            code.shufps(result, x, 0b11011101);  // result = [hi p0, hi p2, hi p1, hi p3]
            // shufps takes its two low lanes from the destination and its two
            // high lanes from the source, so lanes 1 and 2 come out swapped.
            // Restore natural lane order.
            code.pshufd(result, result, 0b11011000);  // [hi p0, hi p1, hi p2, hi p3]

            ctx.reg_alloc.DefineValue(upper_inst, result);
            ctx.EraseInstruction(upper_inst);
        }
        return;
    }

    // SSE2 fallback: there is no signed 32x32->64 multiply, so compute the
    // unsigned product and correct the high halves afterwards.
    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
    const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm();
    const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm();
    const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();

    // calculate sign correction
    //   signed_high = unsigned_high - (x < 0 ? y : 0) - (y < 0 ? x : 0)   (mod 2^32)
    // The full 32-bit sum is required. Masking off bit 31 gives a wrong high
    // half when both operands are negative (e.g. x = y = -1).
    code.movdqa(tmp, x);
    code.movdqa(sign_correction, y);
    code.psrad(tmp, 31);                // all-ones where x < 0
    code.psrad(sign_correction, 31);    // all-ones where y < 0
    code.pand(tmp, y);                  // (x < 0) ? y : 0
    code.pand(sign_correction, x);      // (y < 0) ? x : 0
    code.paddd(sign_correction, tmp);

    // calculate unsigned multiply
    code.movdqa(tmp, x);
    code.pmuludq(tmp, y);               // tmp = [lo p0, hi p0, lo p2, hi p2]
    code.psrlq(x, 32);
    code.psrlq(y, 32);
    code.pmuludq(x, y);                 // x   = [lo p1, hi p1, lo p3, hi p3]

    // put everything into place
    code.pcmpeqw(upper_result, upper_result);
    code.pcmpeqw(lower_result, lower_result);
    code.psllq(upper_result, 32);       // mask selecting the odd 32-bit lanes
    code.psrlq(lower_result, 32);       // mask selecting the even 32-bit lanes
    code.pand(upper_result, x);         // [0, hi p1, 0, hi p3]
    code.pand(lower_result, tmp);       // [lo p0, 0, lo p2, 0]
    code.psrlq(tmp, 32);                // [hi p0, 0, hi p2, 0]
    code.psllq(x, 32);                  // [0, lo p1, 0, lo p3]
    code.por(upper_result, tmp);        // [hi p0, hi p1, hi p2, hi p3]
    code.por(lower_result, x);          // [lo p0, lo p1, lo p2, lo p3]
    code.psubd(upper_result, sign_correction);

    if (upper_inst) {
        ctx.reg_alloc.DefineValue(upper_inst, upper_result);
        ctx.EraseInstruction(upper_inst);
    }
    if (lower_inst) {
        ctx.reg_alloc.DefineValue(lower_inst, lower_result);
        ctx.EraseInstruction(lower_inst);
    }
}
|
||||
|
||||
static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
|
@ -3473,6 +3596,117 @@ void EmitX64::EmitVectorUnsignedAbsoluteDifference32(EmitContext& ctx, IR::Inst*
|
|||
EmitVectorUnsignedAbsoluteDifference(32, ctx, inst, code);
|
||||
}
|
||||
|
||||
// Emits an unsigned multiply of 16-bit lanes.
//
// The multiply itself produces no value; its results are consumed through the
// GetUpperFromOp / GetLowerFromOp pseudo-operations attached to `inst`. For
// each pseudo-op that is present, the matching half of every lane product is
// computed into a fresh scratch register, bound to the pseudo-op, and the
// pseudo-op is erased so it is not emitted separately.
void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
    IR::Inst* const high_user = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp);
    IR::Inst* const low_user = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const Xbyak::Xmm lhs = ctx.reg_alloc.UseXmm(args[0]);
    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);

    // Queried only where needed, after the scratch register has been allocated.
    const auto has_avx = [&] { return code.DoesCpuSupport(Xbyak::util::Cpu::tAVX); };

    if (high_user) {
        const Xbyak::Xmm high_halves = ctx.reg_alloc.ScratchXmm();

        if (!has_avx()) {
            // Two-operand SSE form: copy first so the inputs stay intact.
            code.movdqa(high_halves, lhs);
            code.pmulhuw(high_halves, rhs);
        } else {
            code.vpmulhuw(high_halves, lhs, rhs);
        }

        ctx.reg_alloc.DefineValue(high_user, high_halves);
        ctx.EraseInstruction(high_user);
    }

    if (low_user) {
        const Xbyak::Xmm low_halves = ctx.reg_alloc.ScratchXmm();

        if (!has_avx()) {
            code.movdqa(low_halves, lhs);
            code.pmullw(low_halves, rhs);
        } else {
            code.vpmullw(low_halves, lhs, rhs);
        }

        ctx.reg_alloc.DefineValue(low_user, low_halves);
        ctx.EraseInstruction(low_user);
    }
}
|
||||
|
||||
// Emits an unsigned multiply of 32-bit lanes.
//
// The multiply itself produces no value; the upper and lower 32 bits of each
// 64-bit lane product are delivered through the GetUpperFromOp /
// GetLowerFromOp pseudo-operations attached to `inst`. Each pseudo-op that is
// present is bound to its result register and erased.
//
// In the lane comments below, p_i denotes the 64-bit product x[i] * y[i].
void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
    const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp);
    const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    // Fast path: only the low halves are wanted, which is a single instruction.
    if (lower_inst && !upper_inst && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
        const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
        const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();

        code.vpmulld(result, x, y);

        ctx.reg_alloc.DefineValue(lower_inst, result);
        ctx.EraseInstruction(lower_inst);
        return;
    }

    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
        const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
        const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);

        if (lower_inst) {
            // Must happen before x and y are shifted below.
            const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
            code.vpmulld(lower_result, x, y);
            ctx.reg_alloc.DefineValue(lower_inst, lower_result);
            ctx.EraseInstruction(lower_inst);
        }

        // Guarded like the SSE path below: binding a value to a null pseudo-op
        // is never valid.
        if (upper_inst) {
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();

            code.vpmuludq(result, x, y);         // result = [lo p0, hi p0, lo p2, hi p2]
            code.vpsrlq(x, x, 32);               // move odd lanes into even positions
            code.vpsrlq(y, y, 32);
            code.vpmuludq(x, x, y);              // x      = [lo p1, hi p1, lo p3, hi p3]
            code.shufps(result, x, 0b11011101);  // result = [hi p0, hi p2, hi p1, hi p3]
            // shufps takes its two low lanes from the destination and its two
            // high lanes from the source, so lanes 1 and 2 come out swapped.
            // Restore natural lane order.
            code.pshufd(result, result, 0b11011000);  // [hi p0, hi p1, hi p2, hi p3]

            ctx.reg_alloc.DefineValue(upper_inst, result);
            ctx.EraseInstruction(upper_inst);
        }
        return;
    }

    // SSE2 fallback: build both halves from two 32x32->64 unsigned multiplies.
    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
    const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm();
    const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();

    // calculate unsigned multiply
    code.movdqa(tmp, x);
    code.pmuludq(tmp, y);               // tmp = [lo p0, hi p0, lo p2, hi p2]
    code.psrlq(x, 32);
    code.psrlq(y, 32);
    code.pmuludq(x, y);                 // x   = [lo p1, hi p1, lo p3, hi p3]

    // put everything into place
    code.pcmpeqw(upper_result, upper_result);
    code.pcmpeqw(lower_result, lower_result);
    code.psllq(upper_result, 32);       // mask selecting the odd 32-bit lanes
    code.psrlq(lower_result, 32);       // mask selecting the even 32-bit lanes
    code.pand(upper_result, x);         // [0, hi p1, 0, hi p3]
    code.pand(lower_result, tmp);       // [lo p0, 0, lo p2, 0]
    code.psrlq(tmp, 32);                // [hi p0, 0, hi p2, 0]
    code.psllq(x, 32);                  // [0, lo p1, 0, lo p3]
    code.por(upper_result, tmp);        // [hi p0, hi p1, hi p2, hi p3]
    code.por(lower_result, x);          // [lo p0, lo p1, lo p2, lo p3]

    if (upper_inst) {
        ctx.reg_alloc.DefineValue(upper_inst, upper_result);
        ctx.EraseInstruction(upper_inst);
    }
    if (lower_inst) {
        ctx.reg_alloc.DefineValue(lower_inst, lower_result);
        ctx.EraseInstruction(lower_inst);
    }
}
|
||||
|
||||
void EmitX64::EmitVectorUnsignedRecipEstimate(EmitContext& ctx, IR::Inst* inst) {
|
||||
EmitOneArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a) {
|
||||
for (size_t i = 0; i < result.size(); i++) {
|
||||
|
|
|
@ -1527,6 +1527,24 @@ U128 IREmitter::VectorSignedAbsoluteDifference(size_t esize, const U128& a, cons
|
|||
return {};
|
||||
}
|
||||
|
||||
// Emits a signed vector multiply for 16- or 32-bit elements and returns the
// two pseudo-operation values attached to it: GetUpperFromOp first, then
// GetLowerFromOp. Any other element size hits UNREACHABLE().
UpperAndLower IREmitter::VectorSignedMultiply(size_t esize, const U128& a, const U128& b) {
    const auto emit_product = [&]() -> Value {
        if (esize == 16) {
            return Inst(Opcode::VectorSignedMultiply16, a, b);
        }
        if (esize == 32) {
            return Inst(Opcode::VectorSignedMultiply32, a, b);
        }
        UNREACHABLE();
        return Value{};
    };

    const Value product = emit_product();

    // Emission order matters: the upper pseudo-op is created before the lower one.
    const U128 upper_half = Inst<U128>(Opcode::GetUpperFromOp, product);
    const U128 lower_half = Inst<U128>(Opcode::GetLowerFromOp, product);

    return {upper_half, lower_half};
}
|
||||
|
||||
U128 IREmitter::VectorSignedSaturatedAbs(size_t esize, const U128& a) {
|
||||
switch (esize) {
|
||||
case 8:
|
||||
|
|
|
@ -52,6 +52,11 @@ struct ResultAndGE {
|
|||
U32 ge;
|
||||
};
|
||||
|
||||
struct UpperAndLower {
|
||||
U128 upper;
|
||||
U128 lower;
|
||||
};
|
||||
|
||||
/**
|
||||
* Convenience class to construct a basic block of the intermediate representation.
|
||||
* `block` is the resulting block.
|
||||
|
@ -265,6 +270,7 @@ public:
|
|||
U128 VectorShuffleWords(const U128& a, u8 mask);
|
||||
U128 VectorSignExtend(size_t original_esize, const U128& a);
|
||||
U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
|
||||
UpperAndLower VectorSignedMultiply(size_t esize, const U128& a, const U128& b);
|
||||
U128 VectorSignedSaturatedAbs(size_t esize, const U128& a);
|
||||
U128 VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b);
|
||||
U128 VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b);
|
||||
|
|
|
@ -437,6 +437,8 @@ bool Inst::IsAPseudoOperation() const {
|
|||
case Opcode::GetOverflowFromOp:
|
||||
case Opcode::GetGEFromOp:
|
||||
case Opcode::GetNZCVFromOp:
|
||||
case Opcode::GetUpperFromOp:
|
||||
case Opcode::GetLowerFromOp:
|
||||
return true;
|
||||
|
||||
default:
|
||||
|
@ -470,7 +472,7 @@ bool Inst::AreAllArgsImmediates() const {
|
|||
}
|
||||
|
||||
bool Inst::HasAssociatedPseudoOperation() const {
|
||||
return carry_inst || overflow_inst || ge_inst || nzcv_inst;
|
||||
return carry_inst || overflow_inst || ge_inst || nzcv_inst || upper_inst || lower_inst;
|
||||
}
|
||||
|
||||
Inst* Inst::GetAssociatedPseudoOperation(Opcode opcode) {
|
||||
|
@ -488,6 +490,12 @@ Inst* Inst::GetAssociatedPseudoOperation(Opcode opcode) {
|
|||
case Opcode::GetNZCVFromOp:
|
||||
ASSERT(!nzcv_inst || nzcv_inst->GetOpcode() == Opcode::GetNZCVFromOp);
|
||||
return nzcv_inst;
|
||||
case Opcode::GetUpperFromOp:
|
||||
ASSERT(!upper_inst || upper_inst->GetOpcode() == Opcode::GetUpperFromOp);
|
||||
return upper_inst;
|
||||
case Opcode::GetLowerFromOp:
|
||||
ASSERT(!lower_inst || lower_inst->GetOpcode() == Opcode::GetLowerFromOp);
|
||||
return lower_inst;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
@ -574,6 +582,14 @@ void Inst::Use(const Value& value) {
|
|||
ASSERT_MSG(value.GetInst()->MayGetNZCVFromOp(), "This value doesn't support the GetNZCVFromOp pseduo-op");
|
||||
value.GetInst()->nzcv_inst = this;
|
||||
break;
|
||||
case Opcode::GetUpperFromOp:
|
||||
ASSERT_MSG(!value.GetInst()->upper_inst, "Only one of each type of pseudo-op allowed");
|
||||
value.GetInst()->upper_inst = this;
|
||||
break;
|
||||
case Opcode::GetLowerFromOp:
|
||||
ASSERT_MSG(!value.GetInst()->lower_inst, "Only one of each type of pseudo-op allowed");
|
||||
value.GetInst()->lower_inst = this;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
@ -599,6 +615,14 @@ void Inst::UndoUse(const Value& value) {
|
|||
ASSERT(value.GetInst()->nzcv_inst->GetOpcode() == Opcode::GetNZCVFromOp);
|
||||
value.GetInst()->nzcv_inst = nullptr;
|
||||
break;
|
||||
case Opcode::GetUpperFromOp:
|
||||
ASSERT(value.GetInst()->upper_inst->GetOpcode() == Opcode::GetUpperFromOp);
|
||||
value.GetInst()->upper_inst = nullptr;
|
||||
break;
|
||||
case Opcode::GetLowerFromOp:
|
||||
ASSERT(value.GetInst()->lower_inst->GetOpcode() == Opcode::GetLowerFromOp);
|
||||
value.GetInst()->lower_inst = nullptr;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -145,9 +145,13 @@ private:
|
|||
union {
|
||||
Inst* carry_inst = nullptr;
|
||||
Inst* ge_inst;
|
||||
Inst* upper_inst;
|
||||
};
|
||||
Inst* overflow_inst = nullptr;
|
||||
Inst* nzcv_inst = nullptr;
|
||||
union {
|
||||
Inst* nzcv_inst = nullptr;
|
||||
Inst* lower_inst;
|
||||
};
|
||||
};
|
||||
|
||||
} // namespace Dynarmic::IR
|
||||
|
|
|
@ -79,6 +79,8 @@ OPCODE(GetCarryFromOp, U1, Opaqu
|
|||
OPCODE(GetOverflowFromOp, U1, Opaque )
|
||||
OPCODE(GetGEFromOp, U32, Opaque )
|
||||
OPCODE(GetNZCVFromOp, NZCV, Opaque )
|
||||
OPCODE(GetUpperFromOp, U128, Opaque )
|
||||
OPCODE(GetLowerFromOp, U128, Opaque )
|
||||
|
||||
OPCODE(NZCVFromPackedFlags, NZCV, U32 )
|
||||
|
||||
|
@ -396,6 +398,8 @@ OPCODE(VectorSignExtend64, U128, U128
|
|||
OPCODE(VectorSignedAbsoluteDifference8, U128, U128, U128 )
|
||||
OPCODE(VectorSignedAbsoluteDifference16, U128, U128, U128 )
|
||||
OPCODE(VectorSignedAbsoluteDifference32, U128, U128, U128 )
|
||||
OPCODE(VectorSignedMultiply16, Void, U128, U128 )
|
||||
OPCODE(VectorSignedMultiply32, Void, U128, U128 )
|
||||
OPCODE(VectorSignedSaturatedAbs8, U128, U128 )
|
||||
OPCODE(VectorSignedSaturatedAbs16, U128, U128 )
|
||||
OPCODE(VectorSignedSaturatedAbs32, U128, U128 )
|
||||
|
@ -425,6 +429,8 @@ OPCODE(VectorTableLookup, U128, U128,
|
|||
OPCODE(VectorUnsignedAbsoluteDifference8, U128, U128, U128 )
|
||||
OPCODE(VectorUnsignedAbsoluteDifference16, U128, U128, U128 )
|
||||
OPCODE(VectorUnsignedAbsoluteDifference32, U128, U128, U128 )
|
||||
OPCODE(VectorUnsignedMultiply16, Void, U128, U128 )
|
||||
OPCODE(VectorUnsignedMultiply32, Void, U128, U128 )
|
||||
OPCODE(VectorUnsignedRecipEstimate, U128, U128 )
|
||||
OPCODE(VectorUnsignedRecipSqrtEstimate, U128, U128 )
|
||||
OPCODE(VectorUnsignedSaturatedAccumulateSigned8, U128, U128, U128 )
|
||||
|
|
Loading…
Reference in a new issue