ir: Add opcodes for signed saturated absolute values
This commit is contained in:
parent
27427595b7
commit
0507e47420
5 changed files with 202 additions and 34 deletions
|
@ -6,6 +6,7 @@
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <bitset>
|
#include <bitset>
|
||||||
|
#include <cstdlib>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
|
|
||||||
|
@ -308,13 +309,7 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void EmitVectorAbs(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
|
static void VectorAbs8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
|
||||||
|
|
||||||
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
|
|
||||||
|
|
||||||
switch (esize) {
|
|
||||||
case 8:
|
|
||||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
|
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
|
||||||
code.pabsb(data, data);
|
code.pabsb(data, data);
|
||||||
} else {
|
} else {
|
||||||
|
@ -323,8 +318,9 @@ static void EmitVectorAbs(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockO
|
||||||
code.psubb(temp, data);
|
code.psubb(temp, data);
|
||||||
code.pminub(data, temp);
|
code.pminub(data, temp);
|
||||||
}
|
}
|
||||||
break;
|
}
|
||||||
case 16:
|
|
||||||
|
static void VectorAbs16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
|
||||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
|
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
|
||||||
code.pabsw(data, data);
|
code.pabsw(data, data);
|
||||||
} else {
|
} else {
|
||||||
|
@ -333,8 +329,9 @@ static void EmitVectorAbs(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockO
|
||||||
code.psubw(temp, data);
|
code.psubw(temp, data);
|
||||||
code.pmaxsw(data, temp);
|
code.pmaxsw(data, temp);
|
||||||
}
|
}
|
||||||
break;
|
}
|
||||||
case 32:
|
|
||||||
|
static void VectorAbs32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
|
||||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
|
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
|
||||||
code.pabsd(data, data);
|
code.pabsd(data, data);
|
||||||
} else {
|
} else {
|
||||||
|
@ -344,8 +341,9 @@ static void EmitVectorAbs(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockO
|
||||||
code.pxor(data, temp);
|
code.pxor(data, temp);
|
||||||
code.psubd(data, temp);
|
code.psubd(data, temp);
|
||||||
}
|
}
|
||||||
break;
|
}
|
||||||
case 64:
|
|
||||||
|
static void VectorAbs64(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
|
||||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) {
|
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) {
|
||||||
code.vpabsq(data, data);
|
code.vpabsq(data, data);
|
||||||
} else {
|
} else {
|
||||||
|
@ -355,6 +353,25 @@ static void EmitVectorAbs(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockO
|
||||||
code.pxor(data, temp);
|
code.pxor(data, temp);
|
||||||
code.psubq(data, temp);
|
code.psubq(data, temp);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void EmitVectorAbs(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
|
||||||
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
|
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
|
|
||||||
|
switch (esize) {
|
||||||
|
case 8:
|
||||||
|
VectorAbs8(code, ctx, data);
|
||||||
|
break;
|
||||||
|
case 16:
|
||||||
|
VectorAbs16(code, ctx, data);
|
||||||
|
break;
|
||||||
|
case 32:
|
||||||
|
VectorAbs32(code, ctx, data);
|
||||||
|
break;
|
||||||
|
case 64:
|
||||||
|
VectorAbs64(code, ctx, data);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2613,6 +2630,133 @@ void EmitX64::EmitVectorSignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* i
|
||||||
EmitVectorSignedAbsoluteDifference(32, ctx, inst, code);
|
EmitVectorSignedAbsoluteDifference(32, ctx, inst, code);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
|
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
|
const Xbyak::Xmm data_test = ctx.reg_alloc.ScratchXmm();
|
||||||
|
const Xbyak::Xmm sign = ctx.reg_alloc.ScratchXmm();
|
||||||
|
const Xbyak::Address mask = [esize, &code] {
|
||||||
|
switch (esize) {
|
||||||
|
case 8:
|
||||||
|
return code.MConst(xword, 0x8080808080808080, 0x8080808080808080);
|
||||||
|
case 16:
|
||||||
|
return code.MConst(xword, 0x8000800080008000, 0x8000800080008000);
|
||||||
|
case 32:
|
||||||
|
return code.MConst(xword, 0x8000000080000000, 0x8000000080000000);
|
||||||
|
case 64:
|
||||||
|
return code.MConst(xword, 0x8000000000000000, 0x8000000000000000);
|
||||||
|
default:
|
||||||
|
UNREACHABLE();
|
||||||
|
return Xbyak::Address{0};
|
||||||
|
}
|
||||||
|
}();
|
||||||
|
|
||||||
|
const u32 test_mask = [esize] {
|
||||||
|
switch (esize) {
|
||||||
|
case 8:
|
||||||
|
return 0b1111'1111'1111'1111;
|
||||||
|
case 16:
|
||||||
|
return 0b1010'1010'1010'1010;
|
||||||
|
case 32:
|
||||||
|
return 0b1000'1000'1000'1000;
|
||||||
|
case 64:
|
||||||
|
return 0b10000000'10000000;
|
||||||
|
default:
|
||||||
|
UNREACHABLE();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}();
|
||||||
|
|
||||||
|
const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const Xbyak::Xmm& y) {
|
||||||
|
switch (esize) {
|
||||||
|
case 8:
|
||||||
|
code.pcmpeqb(x, y);
|
||||||
|
break;
|
||||||
|
case 16:
|
||||||
|
code.pcmpeqw(x, y);
|
||||||
|
break;
|
||||||
|
case 32:
|
||||||
|
code.pcmpeqd(x, y);
|
||||||
|
break;
|
||||||
|
case 64:
|
||||||
|
code.pcmpeqq(x, y);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Keep a copy of the initial data for determining whether or not
|
||||||
|
// to set the Q flag
|
||||||
|
code.movdqa(data_test, data);
|
||||||
|
|
||||||
|
switch (esize) {
|
||||||
|
case 8:
|
||||||
|
VectorAbs8(code, ctx, data);
|
||||||
|
break;
|
||||||
|
case 16:
|
||||||
|
VectorAbs16(code, ctx, data);
|
||||||
|
break;
|
||||||
|
case 32:
|
||||||
|
VectorAbs32(code, ctx, data);
|
||||||
|
break;
|
||||||
|
case 64:
|
||||||
|
VectorAbs64(code, ctx, data);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
code.movdqa(sign, mask);
|
||||||
|
vector_equality(sign, data);
|
||||||
|
code.pxor(data, sign);
|
||||||
|
|
||||||
|
// Check if the initial data contained any elements with the value 0x80.
|
||||||
|
// If any exist, then the Q flag needs to be set.
|
||||||
|
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
code.movdqa(sign, mask);
|
||||||
|
vector_equality(data_test, sign);
|
||||||
|
code.pmovmskb(bit, data_test);
|
||||||
|
code.test(bit, test_mask);
|
||||||
|
code.setnz(bit.cvt8());
|
||||||
|
|
||||||
|
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit.cvt8());
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Emitter for the VectorSignedSaturatedAbs8 opcode: delegates to the shared
/// saturating-abs helper with a lane width of 8 bits.
void EmitX64::EmitVectorSignedSaturatedAbs8(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t lane_bits = 8;
    EmitVectorSignedSaturatedAbs(lane_bits, code, ctx, inst);
}
|
||||||
|
|
||||||
|
/// Emitter for the VectorSignedSaturatedAbs16 opcode: delegates to the shared
/// saturating-abs helper with a lane width of 16 bits.
void EmitX64::EmitVectorSignedSaturatedAbs16(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t lane_bits = 16;
    EmitVectorSignedSaturatedAbs(lane_bits, code, ctx, inst);
}
|
||||||
|
|
||||||
|
/// Emitter for the VectorSignedSaturatedAbs32 opcode: delegates to the shared
/// saturating-abs helper with a lane width of 32 bits.
void EmitX64::EmitVectorSignedSaturatedAbs32(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t lane_bits = 32;
    EmitVectorSignedSaturatedAbs(lane_bits, code, ctx, inst);
}
|
||||||
|
|
||||||
|
/// Emitter for the VectorSignedSaturatedAbs64 opcode.
///
/// With SSE4.1 available the shared helper is used with 64-bit lanes
/// (NOTE(review): presumably because its 64-bit path emits pcmpeqq — confirm).
/// Otherwise the operation is carried out by a host-side fallback that clamps
/// 0x8000000000000000 to 0x7FFFFFFFFFFFFFFF and reports whether any lane was
/// clamped.
void EmitX64::EmitVectorSignedSaturatedAbs64(EmitContext& ctx, IR::Inst* inst) {
    if (!code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
        EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& data) {
            bool saturated = false;

            for (size_t lane = 0; lane < result.size(); lane++) {
                // Ordinary lanes: the plain absolute value is representable.
                if (static_cast<u64>(data[lane]) != 0x8000000000000000) {
                    result[lane] = std::abs(data[lane]);
                    continue;
                }

                // Most negative value: clamp and remember the saturation.
                result[lane] = 0x7FFFFFFFFFFFFFFF;
                saturated = true;
            }

            return saturated;
        });
        return;
    }

    EmitVectorSignedSaturatedAbs(64, code, ctx, inst);
}
|
||||||
|
|
||||||
static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(args[0]);
|
const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(args[0]);
|
||||||
|
|
|
@ -1496,6 +1496,21 @@ U128 IREmitter::VectorSignedAbsoluteDifference(size_t esize, const U128& a, cons
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Appends the signed saturating absolute-value IR instruction that matches the
/// requested lane width.
///
/// @param esize Lane width in bits; 8, 16, 32 and 64 are handled, anything else
///              reaches UNREACHABLE().
/// @param a     Input vector.
/// @return The U128 value produced by the emitted instruction.
U128 IREmitter::VectorSignedSaturatedAbs(size_t esize, const U128& a) {
    if (esize == 8) {
        return Inst<U128>(Opcode::VectorSignedSaturatedAbs8, a);
    }
    if (esize == 16) {
        return Inst<U128>(Opcode::VectorSignedSaturatedAbs16, a);
    }
    if (esize == 32) {
        return Inst<U128>(Opcode::VectorSignedSaturatedAbs32, a);
    }
    if (esize == 64) {
        return Inst<U128>(Opcode::VectorSignedSaturatedAbs64, a);
    }

    UNREACHABLE();
    return {};
}
|
||||||
|
|
||||||
U128 IREmitter::VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a) {
|
U128 IREmitter::VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a) {
|
||||||
switch (original_esize) {
|
switch (original_esize) {
|
||||||
case 16:
|
case 16:
|
||||||
|
|
|
@ -263,6 +263,7 @@ public:
|
||||||
U128 VectorShuffleWords(const U128& a, u8 mask);
|
U128 VectorShuffleWords(const U128& a, u8 mask);
|
||||||
U128 VectorSignExtend(size_t original_esize, const U128& a);
|
U128 VectorSignExtend(size_t original_esize, const U128& a);
|
||||||
U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
|
U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
|
||||||
|
U128 VectorSignedSaturatedAbs(size_t esize, const U128& a);
|
||||||
U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a);
|
U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a);
|
||||||
U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a);
|
U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a);
|
||||||
U128 VectorSub(size_t esize, const U128& a, const U128& b);
|
U128 VectorSub(size_t esize, const U128& a, const U128& b);
|
||||||
|
|
|
@ -347,6 +347,10 @@ bool Inst::ReadsFromFPSRCumulativeSaturationBit() const {
|
||||||
bool Inst::WritesToFPSRCumulativeSaturationBit() const {
|
bool Inst::WritesToFPSRCumulativeSaturationBit() const {
|
||||||
switch (op) {
|
switch (op) {
|
||||||
case Opcode::A64OrQC:
|
case Opcode::A64OrQC:
|
||||||
|
case Opcode::VectorSignedSaturatedAbs8:
|
||||||
|
case Opcode::VectorSignedSaturatedAbs16:
|
||||||
|
case Opcode::VectorSignedSaturatedAbs32:
|
||||||
|
case Opcode::VectorSignedSaturatedAbs64:
|
||||||
case Opcode::VectorSignedSaturatedNarrowToSigned16:
|
case Opcode::VectorSignedSaturatedNarrowToSigned16:
|
||||||
case Opcode::VectorSignedSaturatedNarrowToSigned32:
|
case Opcode::VectorSignedSaturatedNarrowToSigned32:
|
||||||
case Opcode::VectorSignedSaturatedNarrowToSigned64:
|
case Opcode::VectorSignedSaturatedNarrowToSigned64:
|
||||||
|
|
|
@ -391,6 +391,10 @@ OPCODE(VectorSignExtend64, U128, U128
|
||||||
OPCODE(VectorSignedAbsoluteDifference8, U128, U128, U128 )
|
OPCODE(VectorSignedAbsoluteDifference8, U128, U128, U128 )
|
||||||
OPCODE(VectorSignedAbsoluteDifference16, U128, U128, U128 )
|
OPCODE(VectorSignedAbsoluteDifference16, U128, U128, U128 )
|
||||||
OPCODE(VectorSignedAbsoluteDifference32, U128, U128, U128 )
|
OPCODE(VectorSignedAbsoluteDifference32, U128, U128, U128 )
|
||||||
|
OPCODE(VectorSignedSaturatedAbs8, U128, U128 )
|
||||||
|
OPCODE(VectorSignedSaturatedAbs16, U128, U128 )
|
||||||
|
OPCODE(VectorSignedSaturatedAbs32, U128, U128 )
|
||||||
|
OPCODE(VectorSignedSaturatedAbs64, U128, U128 )
|
||||||
OPCODE(VectorSignedSaturatedNarrowToSigned16, U128, U128 )
|
OPCODE(VectorSignedSaturatedNarrowToSigned16, U128, U128 )
|
||||||
OPCODE(VectorSignedSaturatedNarrowToSigned32, U128, U128 )
|
OPCODE(VectorSignedSaturatedNarrowToSigned32, U128, U128 )
|
||||||
OPCODE(VectorSignedSaturatedNarrowToSigned64, U128, U128 )
|
OPCODE(VectorSignedSaturatedNarrowToSigned64, U128, U128 )
|
||||||
|
|
Loading…
Reference in a new issue