emit_x64_vector: SSE implementation of VectorSignedSaturatedAccumulateUnsigned{8,16,32}

MerryMage 2018-09-11 19:56:06 +01:00
parent 26df6e5e7b
commit 1492573267


@@ -2810,49 +2810,8 @@ void EmitX64::EmitVectorSignedSaturatedAbs64(EmitContext& ctx, IR::Inst* inst) {
     });
 }
 
-// Simple generic case for 8, 16, and 32-bit values. 64-bit values
-// will need to be special-cased as we can't simply use a larger integral size.
-template <typename T>
-static bool EmitSignedSaturatedAccumulateUnsigned(VectorArray<T>& result, const VectorArray<T>& lhs, const VectorArray<T>& rhs) {
-    static_assert(std::is_signed_v<T>, "T must be signed.");
-    static_assert(Common::BitSize<T>() < 64, "T must be less than 64 bits in size.");
-
-    bool qc_flag = false;
-
-    for (size_t i = 0; i < result.size(); i++) {
-        // We treat lhs' members as unsigned, so cast to unsigned before signed to inhibit sign-extension.
-        // We use the unsigned equivalent of T, as we want zero-extension to occur, rather than a plain move.
-        const s64 x = static_cast<s64>(static_cast<std::make_unsigned_t<T>>(lhs[i]));
-        const s64 y = rhs[i];
-        const s64 sum = x + y;
-
-        if (sum > std::numeric_limits<T>::max()) {
-            result[i] = std::numeric_limits<T>::max();
-            qc_flag = true;
-        } else if (sum < std::numeric_limits<T>::min()) {
-            result[i] = std::numeric_limits<T>::min();
-            qc_flag = true;
-        } else {
-            result[i] = static_cast<T>(sum);
-        }
-    }
-
-    return qc_flag;
-}
-
-void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned8(EmitContext& ctx, IR::Inst* inst) {
-    EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitSignedSaturatedAccumulateUnsigned<s8>);
-}
-
-void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned16(EmitContext& ctx, IR::Inst* inst) {
-    EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitSignedSaturatedAccumulateUnsigned<s16>);
-}
-
-void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned32(EmitContext& ctx, IR::Inst* inst) {
-    EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitSignedSaturatedAccumulateUnsigned<s32>);
-}
-
-void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR::Inst* inst) {
+template<size_t bit_width>
+static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
     const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
@@ -2863,52 +2822,111 @@ void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR
     const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
     const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
 
-    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
-        code.vpaddq(result, x, xmm0);
-    } else {
-        code.movdqa(result, x);
-        code.paddq(result, xmm0);
-    }
+    switch (bit_width) {
+    case 8:
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            code.vpaddb(result, x, xmm0);
+        } else {
+            code.movdqa(result, x);
+            code.paddb(result, xmm0);
+        }
+        break;
+    case 16:
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            code.vpaddw(result, x, xmm0);
+        } else {
+            code.movdqa(result, x);
+            code.paddw(result, xmm0);
+        }
+        break;
+    case 32:
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            code.vpaddd(result, x, xmm0);
+        } else {
+            code.movdqa(result, x);
+            code.paddd(result, xmm0);
+        }
+        break;
+    case 64:
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            code.vpaddq(result, x, xmm0);
+        } else {
+            code.movdqa(result, x);
+            code.paddq(result, xmm0);
+        }
+        break;
+    }
 
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) {
         // xmm0 = majority(~y, x, res)
+        // y x res  xmm0
+        // 0 0 0    0
+        // 0 0 1    1
+        // 0 1 0    1
+        // 0 1 1    1
+        // 1 0 0    0
+        // 1 0 1    0
+        // 1 1 0    0
+        // 1 1 1    1
         code.vpternlogd(xmm0, x, result, 0b10001110);
-        code.vpsraq(xmm0, xmm0, 63);
-        code.movdqa(tmp, xmm0);
     } else if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
         code.vpor(tmp, x, result);
         code.pand(x, result);
-        code.pblendvb(tmp, x);
-        code.psrad(tmp, 31);
-        code.pshufd(tmp, tmp, 0b11110101);
-        code.movdqa(xmm0, tmp);
+        code.vpblendvb(xmm0, tmp, x, xmm0);
     } else {
         code.movdqa(tmp, x);
-        code.por(tmp, result);
-        code.pand(x, result);
-        code.pand(x, xmm0);
-        code.pandn(xmm0, tmp);
-        code.por(xmm0, x);
-        code.psrad(xmm0, 31);
-        code.pshufd(xmm0, xmm0, 0b11110101);
-        code.movdqa(tmp, xmm0);
+        code.pxor(x, result);
+        code.pand(tmp, result);
+        code.pandn(xmm0, x);
+        code.por(xmm0, tmp);
     }
 
+    ctx.reg_alloc.Release(x);
+
+    switch (bit_width) {
+    case 8:
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+            code.pcmpeqb(tmp2, tmp2);
+            code.pxor(tmp, tmp);
+            code.vpblendvb(xmm0, tmp, tmp2, xmm0);
+            ctx.reg_alloc.Release(tmp2);
+        } else {
+            code.pand(xmm0, code.MConst(xword, 0x8080808080808080, 0x8080808080808080));
+            code.movdqa(tmp, xmm0);
+            code.psrlw(tmp, 7);
+            code.pxor(xmm0, xmm0);
+            code.psubb(xmm0, tmp);
+        }
+        break;
+    case 16:
+        code.psraw(xmm0, 15);
+        break;
+    case 32:
+        code.psrad(xmm0, 31);
+        break;
+    case 64:
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) {
+            code.vpsraq(xmm0, xmm0, 63);
+        } else {
+            code.psrad(xmm0, 31);
+            code.pshufd(xmm0, xmm0, 0b11110101);
+        }
+        break;
+    }
+
+    code.movdqa(tmp, xmm0);
+    switch (bit_width) {
+    case 8:
+        code.paddb(tmp, tmp);
+        code.psrlw(tmp, 1);
+        break;
+    case 16:
+        code.psrlw(tmp, 1);
+        break;
+    case 32:
+        code.psrld(tmp, 1);
+        break;
+    case 64:
+        code.psrlq(tmp, 1);
+        break;
+    }
 
     const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32();
     code.pmovmskb(mask, xmm0);
     code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask);
 
-    code.psrlq(tmp, 1);
-
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
         code.pblendvb(result, tmp);
     } else {
@@ -2920,6 +2938,22 @@ void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR
     ctx.reg_alloc.DefineValue(inst, result);
 }
 
+void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned8(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturatedAccumulateUnsigned<8>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned16(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturatedAccumulateUnsigned<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned32(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturatedAccumulateUnsigned<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturatedAccumulateUnsigned<64>(code, ctx, inst);
+}
+
 void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyReturnHigh16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
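
Note (not part of the commit): the saturation test above keys off the sign bit of majority(~y, x, res), where x is the operand treated as unsigned, y is the signed operand, and res is the wrapped sum produced by padd{b,w,d,q}. The vpternlogd immediate 0b10001110 encodes exactly the truth table in the comment: bit (y*4 + x*2 + res) of the immediate gives the output bit. The resulting all-ones lane mask is then logically shifted right by one to build the INT_MAX saturation constant that pblendvb writes into saturating lanes (for 8-bit lanes the missing psrlb is emulated with paddb followed by psrlw). The following is a minimal standalone sketch that checks this lane model against a plain saturating reference for the 8-bit case; the scalar modelling and names here are illustrative only and are not taken from the repository.

// Standalone check: models one 8-bit lane of the vector algorithm above and
// compares it with a straightforward saturating reference. x is treated as
// unsigned, y as signed; the sum saturates to the signed range, and negative
// overflow cannot occur for this operation, so the saturated value is always INT8_MAX.
#include <cstdint>
#include <cstdio>

int main() {
    for (int xi = 0; xi < 256; ++xi) {
        for (int yi = -128; yi < 128; ++yi) {
            // Reference semantics (what the removed C++ fallback computed).
            const int wide_sum = xi + yi;            // zero-extended x + sign-extended y
            const bool ref_qc = wide_sum > 127;      // wide_sum < -128 is impossible here
            const std::uint8_t ref = static_cast<std::uint8_t>(ref_qc ? 127 : wide_sum);

            // Per-lane model of the emitted SSE code.
            const std::uint8_t x = static_cast<std::uint8_t>(xi);
            const std::uint8_t y = static_cast<std::uint8_t>(yi);
            const std::uint8_t res = static_cast<std::uint8_t>(x + y);        // paddb: wrapped sum
            const std::uint8_t maj =
                static_cast<std::uint8_t>((~y & (x | res)) | (x & res));      // majority(~y, x, res)
            const std::uint8_t mask =
                static_cast<std::uint8_t>((maj & 0x80) ? 0xFF : 0x00);        // sign bit -> lane mask
            const std::uint8_t sat = static_cast<std::uint8_t>(mask >> 1);    // 0x7F in saturating lanes
            const std::uint8_t out = mask ? sat : res;                        // pblendvb
            const bool qc = mask != 0;                                        // pmovmskb -> fpsr_qc

            if (out != ref || qc != ref_qc) {
                std::printf("mismatch at x=%d y=%d\n", xi, yi);
                return 1;
            }
        }
    }
    std::puts("8-bit lane model matches the saturating reference for all inputs");
    return 0;
}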