emit_x64_vector: Changes to VectorSignedSaturatedDoublingMultiply
* Return both the upper and lower parts of the multiply if required
* SSE2 does not support the pmuldq instruction, so perform sign correction on an unsigned multiply result instead
* Improve port utilisation where possible (punpck instructions were a bottleneck)
parent 08c0e017a5
commit 06b31448aa
8 changed files with 233 additions and 70 deletions
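The SSE2 fallback below recovers a signed multiply from pmuludq's unsigned result using a standard two's-complement identity. A minimal scalar sketch of that identity follows (hypothetical helper, not part of the commit); the emitted code additionally doubles the correction (pslld by 1) because the final product is doubled:

    #include <cassert>
    #include <cstdint>

    // Signed 32x32->64 multiply from an unsigned multiply plus a sign
    // correction; this mirrors what the psrad/pand/paddd sequence computes
    // per lane. Working mod 2^64 makes dropping any carry safe, and the
    // final conversion assumes two's complement (guaranteed since C++20).
    int64_t signed_mul_via_unsigned(int32_t a, int32_t b) {
        const uint32_t au = static_cast<uint32_t>(a);
        const uint32_t bu = static_cast<uint32_t>(b);
        const uint64_t unsigned_product = static_cast<uint64_t>(au) * bu; // pmuludq
        // (a >> 31) is all-ones exactly when a is negative, so this adds bu
        // and/or au only for negative operands (psrad + pand + paddd).
        const uint32_t correction = ((a >> 31) & bu) + ((b >> 31) & au);
        return static_cast<int64_t>(unsigned_product - (static_cast<uint64_t>(correction) << 32));
    }

    int main() {
        assert(signed_mul_via_unsigned(-2, 3) == -6);
        assert(signed_mul_via_unsigned(INT32_MIN, INT32_MIN) == static_cast<int64_t>(INT32_MIN) * INT32_MIN);
    }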
@@ -3077,74 +3077,189 @@ void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR
     EmitVectorSignedSaturatedAccumulateUnsigned<64>(code, ctx, inst);
 }
 
-void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyReturnHigh16(EmitContext& ctx, IR::Inst* inst) {
+void EmitX64::EmitVectorSignedSaturatedDoublingMultiply16(EmitContext& ctx, IR::Inst* inst) {
+    const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp);
+    const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);
+
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm upper_tmp = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm lower_tmp = ctx.reg_alloc.ScratchXmm();
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+        code.vpmulhw(upper_tmp, x, y);
+    } else {
+        code.movdqa(upper_tmp, x);
+        code.pmulhw(upper_tmp, y);
+    }
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+        code.vpmullw(lower_tmp, x, y);
+    } else {
+        code.movdqa(lower_tmp, x);
+        code.pmullw(lower_tmp, y);
+    }
+
+    ctx.reg_alloc.Release(x);
+    ctx.reg_alloc.Release(y);
+
+    if (lower_inst) {
+        const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
+
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            code.vpaddw(lower_result, lower_tmp, lower_tmp);
+        } else {
+            code.movdqa(lower_result, lower_tmp);
+            code.paddw(lower_result, lower_result);
+        }
+
+        ctx.reg_alloc.DefineValue(lower_inst, lower_result);
+        ctx.EraseInstruction(lower_inst);
+    }
+
+    if (upper_inst) {
+        const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm();
+
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            code.vpsrlw(lower_tmp, lower_tmp, 15);
+            code.vpaddw(upper_tmp, upper_tmp, upper_tmp);
+            code.vpor(upper_result, upper_tmp, lower_tmp);
+            code.vpcmpeqw(upper_tmp, upper_result, code.MConst(xword, 0x8000800080008000, 0x8000800080008000));
+            code.vpxor(upper_result, upper_result, upper_tmp);
+        } else {
+            code.paddw(upper_tmp, upper_tmp);
+            code.psrlw(lower_tmp, 15);
+            code.movdqa(upper_result, upper_tmp);
+            code.por(upper_result, lower_tmp);
+            code.movdqa(upper_tmp, code.MConst(xword, 0x8000800080008000, 0x8000800080008000));
+            code.pcmpeqw(upper_tmp, upper_result);
+            code.pxor(upper_result, upper_tmp);
+        }
+
+        const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+        code.pmovmskb(bit, upper_tmp);
+        code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+        ctx.reg_alloc.DefineValue(upper_inst, upper_result);
+        ctx.EraseInstruction(upper_inst);
+    }
+}
+
+void EmitX64::EmitVectorSignedSaturatedDoublingMultiply32(EmitContext& ctx, IR::Inst* inst) {
+    const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp);
+    const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+        const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+        const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+        const Xbyak::Xmm odds = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Xmm even = ctx.reg_alloc.ScratchXmm();
+
+        code.vpmuldq(odds, x, y);
+        code.vpsrlq(x, x, 32);
+        code.vpsrlq(y, y, 32);
+        code.vpmuldq(even, x, y);
+
+        ctx.reg_alloc.Release(x);
+        ctx.reg_alloc.Release(y);
+
+        code.vpaddq(odds, odds, odds);
+        code.vpaddq(even, even, even);
+
+        if (upper_inst) {
+            const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm();
+
+            code.vpsrlq(upper_result, odds, 32);
+            code.vblendps(upper_result, upper_result, even, 0b1010);
+
+            const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
+            const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+
+            code.vpcmpeqd(mask, upper_result, code.MConst(xword, 0x8000000080000000, 0x8000000080000000));
+            code.vpxor(upper_result, upper_result, mask);
+            code.pmovmskb(bit, mask);
+            code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+            ctx.reg_alloc.Release(mask);
+            ctx.reg_alloc.Release(bit);
+
+            ctx.reg_alloc.DefineValue(upper_inst, upper_result);
+            ctx.EraseInstruction(upper_inst);
+        }
+
+        if (lower_inst) {
+            const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
+
+            code.vpsllq(lower_result, even, 32);
+            code.vblendps(lower_result, lower_result, even, 0b0101);
+
+            ctx.reg_alloc.DefineValue(lower_inst, lower_result);
+            ctx.EraseInstruction(lower_inst);
+        }
+
+        return;
+    }
+
     const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
     const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
     const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
 
+    // calculate sign correction
     code.movdqa(tmp, x);
-    code.pmulhw(tmp, y);
-    code.paddw(tmp, tmp);
-    code.pmullw(y, x);
-    code.psrlw(y, 15);
-    code.por(y, tmp);
+    code.movdqa(sign_correction, y);
+    code.psrad(tmp, 31);
+    code.psrad(sign_correction, 31);
+    code.pand(tmp, y);
+    code.pand(sign_correction, x);
+    code.paddd(sign_correction, tmp);
+    code.pslld(sign_correction, 1);
 
-    code.movdqa(x, code.MConst(xword, 0x8000800080008000, 0x8000800080008000));
-    code.pcmpeqw(x, y);
+    // unsigned multiply
     code.movdqa(tmp, x);
-    code.pxor(x, y);
+    code.pmuludq(tmp, y);
+    code.psrlq(x, 32);
+    code.psrlq(y, 32);
+    code.pmuludq(x, y);
 
-    // Check if any saturation occurred (i.e. if any halfwords in x were
-    // 0x8000 before saturating
-    const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32();
-    code.pmovmskb(mask, tmp);
-    code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask);
+    // double
+    code.paddq(tmp, tmp);
+    code.paddq(x, x);
 
-    ctx.reg_alloc.DefineValue(inst, x);
+    // put everything into place
+    code.pcmpeqw(upper_result, upper_result);
+    code.pcmpeqw(lower_result, lower_result);
+    code.psllq(upper_result, 32);
+    code.psrlq(lower_result, 32);
+    code.pand(upper_result, x);
+    code.pand(lower_result, tmp);
+    code.psrlq(tmp, 32);
+    code.psllq(x, 32);
+    code.por(upper_result, tmp);
+    code.por(lower_result, x);
+    code.psubd(upper_result, sign_correction);
+
+    if (upper_inst) {
+        const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+
+        code.movdqa(tmp, code.MConst(xword, 0x8000000080000000, 0x8000000080000000));
+        code.pcmpeqd(tmp, upper_result);
+        code.pxor(upper_result, tmp);
+        code.pmovmskb(bit, tmp);
+        code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+        ctx.reg_alloc.DefineValue(upper_inst, upper_result);
+        ctx.EraseInstruction(upper_inst);
+    }
+
+    if (lower_inst) {
+        ctx.reg_alloc.DefineValue(lower_inst, lower_result);
+        ctx.EraseInstruction(lower_inst);
+    }
 }
 
-void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyReturnHigh32(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
-    const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
-    const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
-
-    code.movdqa(tmp1, x);
-    code.punpckldq(tmp1, y);
-
-    code.movdqa(tmp2, y);
-    code.punpckldq(tmp2, x);
-
-    code.pmuldq(tmp2, tmp1);
-    code.paddq(tmp2, tmp2);
-
-    code.movdqa(tmp1, x);
-    code.punpckhdq(tmp1, y);
-    code.punpckhdq(y, x);
-
-    code.pmuldq(y, tmp1);
-    code.paddq(y, y);
-
-    code.pshufd(tmp1, tmp2, 0b11101101);
-    code.pshufd(x, y, 0b11101101);
-    code.punpcklqdq(tmp1, x);
-
-    code.movdqa(x, code.MConst(xword, 0x8000000080000000, 0x8000000080000000));
-    code.pcmpeqd(x, tmp1);
-    code.movdqa(tmp2, x);
-    code.pxor(x, tmp1);
-
-    // Check if any saturation occurred (i.e. if any words in x were
-    // 0x80000000 before saturating
-    const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32();
-    code.pmovmskb(mask, tmp2);
-    code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask);
-
-    ctx.reg_alloc.DefineValue(inst, x);
-}
-
 static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
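For reference, a scalar model of what one lane of the new code computes (a sketch of the intended semantics as read from the diff, not dynarmic's API). For 32-bit lanes the doubled 64-bit product is split into upper and lower words; an upper word of 0x80000000 is reachable only for 0x80000000 * 0x80000000, which is exactly the case the pcmpeqd/pxor sequence saturates:

    #include <cstdint>

    struct UpperLower {
        uint32_t upper;
        uint32_t lower;
    };

    // One 32-bit lane: upper/lower words of 2*a*b, with the single saturating
    // case (INT32_MIN * INT32_MIN) flipped to 0x7fffffff and QC set, as the
    // emitted pcmpeqd/pxor/pmovmskb sequence does.
    UpperLower doubling_multiply_lane32(int32_t a, int32_t b, bool& qc) {
        const int64_t product = static_cast<int64_t>(a) * b;
        // (2*product) >> 32, computed as bits 31..62 of product so the
        // doubling cannot overflow 64 bits.
        uint32_t upper = static_cast<uint32_t>(static_cast<uint64_t>(product) >> 31);
        const uint32_t lower = static_cast<uint32_t>(product) << 1;
        if (upper == 0x80000000u) { // only possible when a == b == INT32_MIN
            upper = 0x7fffffffu;
            qc = true;
        }
        return {upper, lower};
    }

    // The 16-bit path reconstructs the doubled upper half from the separate
    // pmulhw/pmullw results the same way: (high << 1) | (low >> 15).
    uint16_t doubled_upper_lane16(int16_t a, int16_t b) {
        const int32_t product = static_cast<int32_t>(a) * b;        // fits in int32
        const uint16_t high = static_cast<uint16_t>(product >> 16); // pmulhw
        const uint16_t low = static_cast<uint16_t>(product);        // pmullw
        return static_cast<uint16_t>((high << 1) | (low >> 15));    // paddw/psrlw/por
    }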
@@ -432,7 +432,7 @@ bool TranslatorVisitor::SQDMULH_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec V
 
     const IR::U128 operand1 = V(datasize, Vn);
     const IR::U128 operand2 = V(datasize, Vm);
-    const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyReturnHigh(esize, operand1, operand2);
+    const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiply(esize, operand1, operand2).upper;
 
     V(datasize, Vd, result);
     return true;
@@ -233,7 +233,7 @@ bool TranslatorVisitor::SQDMULH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, I
     const IR::U128 operand1 = V(datasize, Vn);
     const IR::U128 operand2 = V(idxsize, concatenate(Vmhi, Vmlo).ZeroExtend<Vec>());
     const IR::U128 index_vector = ir.VectorBroadcast(esize, ir.VectorGetElement(esize, operand2, index));
-    const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyReturnHigh(esize, operand1, index_vector);
+    const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiply(esize, operand1, index_vector).upper;
 
     V(datasize, Vd, result);
     return true;
@@ -1575,15 +1575,23 @@ U128 IREmitter::VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128
     return {};
 }
 
-U128 IREmitter::VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b) {
+UpperAndLower IREmitter::VectorSignedSaturatedDoublingMultiply(size_t esize, const U128& a, const U128& b) {
+    const Value multiply = [&] {
         switch (esize) {
         case 16:
-            return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyReturnHigh16, a, b);
+            return Inst(Opcode::VectorSignedSaturatedDoublingMultiply16, a, b);
         case 32:
-            return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyReturnHigh32, a, b);
-        }
+            return Inst(Opcode::VectorSignedSaturatedDoublingMultiply32, a, b);
+        default:
             UNREACHABLE();
-            return {};
+            return Value{};
+        }
+    }();
+
+    return {
+        Inst<U128>(Opcode::GetUpperFromOp, multiply),
+        Inst<U128>(Opcode::GetLowerFromOp, multiply),
+    };
 }
 
 U128 IREmitter::VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a) {
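The helper now returns both halves through the GetUpperFromOp/GetLowerFromOp pseudo-ops, so a consumer that reads only one half (as both SQDMULH translators above do with .upper) leaves the other pseudo-op dead, and the backend's if (upper_inst)/if (lower_inst) checks skip the corresponding work. The shape of UpperAndLower is assumed here, by analogy with VectorSignedMultiply; its definition is outside this diff:

    // Assumed shape of the return type (a sketch, not quoted from the commit):
    struct UpperAndLower {
        U128 upper;
        U128 lower;
    };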
@@ -273,7 +273,7 @@ public:
     UpperAndLower VectorSignedMultiply(size_t esize, const U128& a, const U128& b);
     U128 VectorSignedSaturatedAbs(size_t esize, const U128& a);
     U128 VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b);
-    U128 VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b);
+    UpperAndLower VectorSignedSaturatedDoublingMultiply(size_t esize, const U128& a, const U128& b);
     U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a);
     U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a);
     U128 VectorSignedSaturatedNeg(size_t esize, const U128& a);
@@ -361,8 +361,6 @@ bool Inst::WritesToFPSRCumulativeSaturationBit() const {
     case Opcode::VectorSignedSaturatedNarrowToUnsigned16:
     case Opcode::VectorSignedSaturatedNarrowToUnsigned32:
     case Opcode::VectorSignedSaturatedNarrowToUnsigned64:
-    case Opcode::VectorSignedSaturatedDoublingMultiplyReturnHigh16:
-    case Opcode::VectorSignedSaturatedDoublingMultiplyReturnHigh32:
     case Opcode::VectorSignedSaturatedNeg8:
     case Opcode::VectorSignedSaturatedNeg16:
     case Opcode::VectorSignedSaturatedNeg32:
@@ -408,8 +408,8 @@ OPCODE(VectorSignedSaturatedAccumulateUnsigned8, U128, U128,
 OPCODE(VectorSignedSaturatedAccumulateUnsigned16,           U128,           U128,           U128            )
 OPCODE(VectorSignedSaturatedAccumulateUnsigned32,           U128,           U128,           U128            )
 OPCODE(VectorSignedSaturatedAccumulateUnsigned64,           U128,           U128,           U128            )
-OPCODE(VectorSignedSaturatedDoublingMultiplyReturnHigh16,   U128,           U128,           U128            )
-OPCODE(VectorSignedSaturatedDoublingMultiplyReturnHigh32,   U128,           U128,           U128            )
+OPCODE(VectorSignedSaturatedDoublingMultiply16,             Void,           U128,           U128            )
+OPCODE(VectorSignedSaturatedDoublingMultiply32,             Void,           U128,           U128            )
 OPCODE(VectorSignedSaturatedNarrowToSigned16,               U128,           U128            )
 OPCODE(VectorSignedSaturatedNarrowToSigned32,               U128,           U128            )
 OPCODE(VectorSignedSaturatedNarrowToSigned64,               U128,           U128            )
@@ -494,3 +494,45 @@ TEST_CASE("A64: FRSQRTS", "[a64]") {
 
     REQUIRE(jit.GetVector(13) == Vector{0xff7fffff, 0});
 }
+
+TEST_CASE("A64: SQDMULH.8H (saturate)", "[a64]") {
+    A64TestEnv env;
+    Dynarmic::A64::Jit jit{Dynarmic::A64::UserConfig{&env}};
+
+    env.code_mem.emplace_back(0x4e62b420); // SQDMULH.8H V0, V1, V2
+    env.code_mem.emplace_back(0x14000000); // B .
+
+    // Make sure that saturating values are tested
+
+    jit.SetPC(0);
+    jit.SetVector(1, {0x7fff80007ffe8001, 0x7fff80007ffe8001});
+    jit.SetVector(2, {0x7fff80007ffe8001, 0x80007fff80017ffe});
+    jit.SetFpsr(0);
+
+    env.ticks_left = 2;
+    jit.Run();
+
+    REQUIRE(jit.GetVector(0) == Vector{0x7ffe7fff7ffc7ffe, 0x8001800180028002});
+    REQUIRE(FP::FPSR{jit.GetFpsr()}.QC() == true);
+}
+
+TEST_CASE("A64: SQDMULH.4S (saturate)", "[a64]") {
+    A64TestEnv env;
+    Dynarmic::A64::Jit jit{Dynarmic::A64::UserConfig{&env}};
+
+    env.code_mem.emplace_back(0x4ea2b420); // SQDMULH.4S V0, V1, V2
+    env.code_mem.emplace_back(0x14000000); // B .
+
+    // Make sure that saturating values are tested
+
+    jit.SetPC(0);
+    jit.SetVector(1, {0x7fffffff80000000, 0x7fffffff80000000});
+    jit.SetVector(2, {0x7fffffff80000000, 0x800000007fffffff});
+    jit.SetFpsr(0);
+
+    env.ticks_left = 2;
+    jit.Run();
+
+    REQUIRE(jit.GetVector(0) == Vector{0x7ffffffe7fffffff, 0x8000000180000001});
+    REQUIRE(FP::FPSR{jit.GetFpsr()}.QC() == true);
+}
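The expected vectors above can be checked by hand against a scalar Q15 model of one SQDMULH lane (a checking aid, not part of the test suite):

    #include <cstdint>

    // One 16-bit lane of SQDMULH: saturating doubling multiply returning the
    // high half. Negative saturation is unreachable; only 0x8000 * 0x8000
    // overflows, producing 0x7fff and setting QC.
    int16_t sqdmulh16(int16_t a, int16_t b, bool& qc) {
        const int64_t doubled = 2 * static_cast<int64_t>(a) * b;
        const int32_t high = static_cast<int32_t>(doubled >> 16);
        if (high > INT16_MAX) {
            qc = true;
            return INT16_MAX;
        }
        return static_cast<int16_t>(high);
    }

    // e.g. lane 0 of the 8H test: sqdmulh16(0x8001, 0x8001, qc) == 0x7ffe
    // (QC untouched); lane 2: sqdmulh16(0x8000, 0x8000, qc) == 0x7fff, QC set.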