emit_x64_vector: Optimize VectorSignedSaturatedAbs
This commit is contained in:
parent
cc9f00645d
commit
0adc972cd9
1 changed files with 61 additions and 75 deletions
|
@ -4064,71 +4064,75 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
|
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
const Xbyak::Xmm data_test = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
const Xbyak::Xmm sign = ctx.reg_alloc.ScratchXmm();
|
|
||||||
const Xbyak::Address mask = [esize, &code] {
|
// SSE absolute value functions return an unsigned result
|
||||||
|
// this means abs(SIGNED_MIN) returns its value unchanged, leaving the most significant bit set
|
||||||
|
// so doing a movemask operation on the result of abs(data) before processing saturation is enough to determine
|
||||||
|
// if the QC bit needs to be set
|
||||||
|
|
||||||
|
// to perform the actual saturation, either do a minimum operation with a vector of SIGNED_MAX,
|
||||||
|
// or shift in sign bits to create a mask of (msb == 1 ? -1 : 0), then add to the result vector
|
||||||
switch (esize) {
|
switch (esize) {
|
||||||
case 8:
|
case 8: {
|
||||||
return code.Const(xword, 0x8080808080808080, 0x8080808080808080);
|
VectorAbs8(code, ctx, data);
|
||||||
case 16:
|
code.pmovmskb(bit, data);
|
||||||
return code.Const(xword, 0x8000800080008000, 0x8000800080008000);
|
|
||||||
case 32:
|
code.pminub(data, code.BConst<8>(xword, 0x7F));
|
||||||
return code.Const(xword, 0x8000000080000000, 0x8000000080000000);
|
break;
|
||||||
case 64:
|
}
|
||||||
return code.Const(xword, 0x8000000000000000, 0x8000000000000000);
|
case 16: {
|
||||||
|
VectorAbs16(code, ctx, data);
|
||||||
|
code.pmovmskb(bit, data);
|
||||||
|
code.and_(bit, 0xAAAA); // toggle mask bits that aren't the msb of an int16 to 0
|
||||||
|
|
||||||
|
if (code.HasHostFeature(HostFeature::SSE41)) {
|
||||||
|
code.pminuw(data, code.BConst<16>(xword, 0x7FFF));
|
||||||
|
} else {
|
||||||
|
const Xbyak::Xmm tmp = xmm0;
|
||||||
|
code.movdqa(tmp, data);
|
||||||
|
code.psraw(data, 15);
|
||||||
|
code.paddw(data, tmp);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 32: {
|
||||||
|
VectorAbs32(code, ctx, data);
|
||||||
|
code.movmskps(bit, data);
|
||||||
|
|
||||||
|
if (code.HasHostFeature(HostFeature::SSE41)) {
|
||||||
|
code.pminud(data, code.BConst<32>(xword, 0x7FFFFFFF));
|
||||||
|
} else {
|
||||||
|
const Xbyak::Xmm tmp = xmm0;
|
||||||
|
code.movdqa(tmp, data);
|
||||||
|
code.psrad(data, 31);
|
||||||
|
code.paddd(data, tmp);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 64: {
|
||||||
|
VectorAbs64(code, ctx, data);
|
||||||
|
code.movmskpd(bit, data);
|
||||||
|
|
||||||
|
const Xbyak::Xmm tmp = xmm0;
|
||||||
|
if (code.HasHostFeature(HostFeature::SSE42)) {
|
||||||
|
// create a -1 mask if msb is set
|
||||||
|
code.pxor(tmp, tmp);
|
||||||
|
code.pcmpgtq(tmp, data);
|
||||||
|
} else {
|
||||||
|
// replace the lower part of each 64-bit value with the upper 32 bits,
|
||||||
|
// then shift in sign bits from there
|
||||||
|
code.pshufd(tmp, data, 0b11110101);
|
||||||
|
code.psrad(tmp, 31);
|
||||||
|
}
|
||||||
|
code.paddq(data, tmp);
|
||||||
|
break;
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
UNREACHABLE();
|
UNREACHABLE();
|
||||||
}
|
}
|
||||||
}();
|
|
||||||
|
|
||||||
const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const Xbyak::Xmm& y) {
|
|
||||||
switch (esize) {
|
|
||||||
case 8:
|
|
||||||
code.pcmpeqb(x, y);
|
|
||||||
break;
|
|
||||||
case 16:
|
|
||||||
code.pcmpeqw(x, y);
|
|
||||||
break;
|
|
||||||
case 32:
|
|
||||||
code.pcmpeqd(x, y);
|
|
||||||
break;
|
|
||||||
case 64:
|
|
||||||
code.pcmpeqq(x, y);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Keep a copy of the initial data for determining whether or not
|
|
||||||
// to set the Q flag
|
|
||||||
code.movdqa(data_test, data);
|
|
||||||
|
|
||||||
switch (esize) {
|
|
||||||
case 8:
|
|
||||||
VectorAbs8(code, ctx, data);
|
|
||||||
break;
|
|
||||||
case 16:
|
|
||||||
VectorAbs16(code, ctx, data);
|
|
||||||
break;
|
|
||||||
case 32:
|
|
||||||
VectorAbs32(code, ctx, data);
|
|
||||||
break;
|
|
||||||
case 64:
|
|
||||||
VectorAbs64(code, ctx, data);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
code.movdqa(sign, mask);
|
|
||||||
vector_equality(sign, data);
|
|
||||||
code.pxor(data, sign);
|
|
||||||
|
|
||||||
// Check if the initial data contained any elements with the value 0x80.
|
|
||||||
// If any exist, then the Q flag needs to be set.
|
|
||||||
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
|
|
||||||
code.movdqa(sign, mask);
|
|
||||||
vector_equality(data_test, sign);
|
|
||||||
code.pmovmskb(bit, data_test);
|
|
||||||
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
|
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, data);
|
ctx.reg_alloc.DefineValue(inst, data);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4145,25 +4149,7 @@ void EmitX64::EmitVectorSignedSaturatedAbs32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void EmitX64::EmitVectorSignedSaturatedAbs64(EmitContext& ctx, IR::Inst* inst) {
|
void EmitX64::EmitVectorSignedSaturatedAbs64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
if (code.HasHostFeature(HostFeature::SSE41)) {
|
|
||||||
EmitVectorSignedSaturatedAbs(64, code, ctx, inst);
|
EmitVectorSignedSaturatedAbs(64, code, ctx, inst);
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& data) {
|
|
||||||
bool qc_flag = false;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < result.size(); i++) {
|
|
||||||
if (static_cast<u64>(data[i]) == 0x8000000000000000) {
|
|
||||||
result[i] = 0x7FFFFFFFFFFFFFFF;
|
|
||||||
qc_flag = true;
|
|
||||||
} else {
|
|
||||||
result[i] = std::abs(data[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return qc_flag;
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<size_t bit_width>
|
template<size_t bit_width>
|
||||||
|
|
Loading…
Reference in a new issue