ir: Add opcodes for signed saturated accumulations of unsigned values

This commit is contained in:
Lioncash 2018-09-08 21:08:34 -04:00 committed by MerryMage
parent 9a3d38d2ee
commit 6f911a26da
5 changed files with 116 additions and 0 deletions

View file

@ -99,6 +99,34 @@ static void EmitOneArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext
ctx.reg_alloc.DefineValue(inst, result); ctx.reg_alloc.DefineValue(inst, result);
} }
template <typename Lambda>
static void EmitTwoArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
constexpr u32 stack_space = 3 * 16;
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(nullptr);
code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
code.movaps(xword[code.ABI_PARAM2], arg1);
code.movaps(xword[code.ABI_PARAM3], arg2);
code.CallFunction(fn);
code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
code.add(rsp, stack_space + ABI_SHADOW_SPACE);
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8());
ctx.reg_alloc.DefineValue(inst, result);
}
template <typename Lambda> template <typename Lambda>
static void EmitTwoArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { static void EmitTwoArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda); const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
@ -2764,6 +2792,70 @@ void EmitX64::EmitVectorSignedSaturatedAbs64(EmitContext& ctx, IR::Inst* inst) {
}); });
} }
// Simple generic case for 8, 16, and 32-bit values. 64-bit values
// will need to be special-cased as we can't simply use a larger integral size.
template <typename T>
static bool EmitSignedSaturatedAccumulateUnsigned(VectorArray<T>& result, const VectorArray<T>& lhs, const VectorArray<T>& rhs) {
static_assert(std::is_signed_v<T>, "T must be signed.");
static_assert(sizeof(T) < 64, "T must be less than 64 bits in size.");
bool qc_flag = false;
for (size_t i = 0; i < result.size(); i++) {
// We treat lhs' members as unsigned, so cast to unsigned before signed to inhibit sign-extension.
// We use the unsigned equivalent of T, as we want zero-extension to occur, rather than a plain move.
const s64 x = static_cast<s64>(static_cast<std::make_unsigned_t<T>>(lhs[i]));
const s64 y = rhs[i];
const s64 sum = x + y;
if (sum > std::numeric_limits<T>::max()) {
result[i] = std::numeric_limits<T>::max();
qc_flag = true;
} else if (sum < std::numeric_limits<T>::min()) {
result[i] = std::numeric_limits<T>::min();
qc_flag = true;
} else {
result[i] = static_cast<T>(sum);
}
}
return qc_flag;
}
void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned8(EmitContext& ctx, IR::Inst* inst) {
EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitSignedSaturatedAccumulateUnsigned<s8>);
}
void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned16(EmitContext& ctx, IR::Inst* inst) {
EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitSignedSaturatedAccumulateUnsigned<s16>);
}
void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned32(EmitContext& ctx, IR::Inst* inst) {
EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitSignedSaturatedAccumulateUnsigned<s32>);
}
void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR::Inst* inst) {
EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& lhs, const VectorArray<u64>& rhs) {
bool qc_flag = false;
for (size_t i = 0; i < result.size(); i++) {
const u64 x = lhs[i];
const u64 y = rhs[i];
const u64 res = x + y;
// Check sign bits to determine if an overflow occurred.
if (((x & res) | (~y & res) | (x & ~y)) & 0x8000000000000000) {
result[i] = static_cast<u64>(INT64_MAX);
qc_flag = true;
} else {
result[i] = res;
}
}
return qc_flag;
});
}
void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyReturnHigh16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyReturnHigh16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);

View file

@ -1529,6 +1529,21 @@ U128 IREmitter::VectorSignedSaturatedAbs(size_t esize, const U128& a) {
return {}; return {};
} }
U128 IREmitter::VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b) {
switch (esize) {
case 8:
return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned8, a, b);
case 16:
return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned16, a, b);
case 32:
return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned32, a, b);
case 64:
return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned64, a, b);
}
UNREACHABLE();
return {};
}
U128 IREmitter::VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b) { U128 IREmitter::VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b) {
switch (esize) { switch (esize) {
case 16: case 16:

View file

@ -265,6 +265,7 @@ public:
U128 VectorSignExtend(size_t original_esize, const U128& a); U128 VectorSignExtend(size_t original_esize, const U128& a);
U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b); U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
U128 VectorSignedSaturatedAbs(size_t esize, const U128& a); U128 VectorSignedSaturatedAbs(size_t esize, const U128& a);
U128 VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b);
U128 VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b); U128 VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b);
U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a); U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a);
U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a); U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a);

View file

@ -351,6 +351,10 @@ bool Inst::WritesToFPSRCumulativeSaturationBit() const {
case Opcode::VectorSignedSaturatedAbs16: case Opcode::VectorSignedSaturatedAbs16:
case Opcode::VectorSignedSaturatedAbs32: case Opcode::VectorSignedSaturatedAbs32:
case Opcode::VectorSignedSaturatedAbs64: case Opcode::VectorSignedSaturatedAbs64:
case Opcode::VectorSignedSaturatedAccumulateUnsigned8:
case Opcode::VectorSignedSaturatedAccumulateUnsigned16:
case Opcode::VectorSignedSaturatedAccumulateUnsigned32:
case Opcode::VectorSignedSaturatedAccumulateUnsigned64:
case Opcode::VectorSignedSaturatedNarrowToSigned16: case Opcode::VectorSignedSaturatedNarrowToSigned16:
case Opcode::VectorSignedSaturatedNarrowToSigned32: case Opcode::VectorSignedSaturatedNarrowToSigned32:
case Opcode::VectorSignedSaturatedNarrowToSigned64: case Opcode::VectorSignedSaturatedNarrowToSigned64:

View file

@ -397,6 +397,10 @@ OPCODE(VectorSignedSaturatedAbs8, U128, U128
OPCODE(VectorSignedSaturatedAbs16, U128, U128 ) OPCODE(VectorSignedSaturatedAbs16, U128, U128 )
OPCODE(VectorSignedSaturatedAbs32, U128, U128 ) OPCODE(VectorSignedSaturatedAbs32, U128, U128 )
OPCODE(VectorSignedSaturatedAbs64, U128, U128 ) OPCODE(VectorSignedSaturatedAbs64, U128, U128 )
OPCODE(VectorSignedSaturatedAccumulateUnsigned8, U128, U128, U128 )
OPCODE(VectorSignedSaturatedAccumulateUnsigned16, U128, U128, U128 )
OPCODE(VectorSignedSaturatedAccumulateUnsigned32, U128, U128, U128 )
OPCODE(VectorSignedSaturatedAccumulateUnsigned64, U128, U128, U128 )
OPCODE(VectorSignedSaturatedDoublingMultiplyReturnHigh16, U128, U128, U128 ) OPCODE(VectorSignedSaturatedDoublingMultiplyReturnHigh16, U128, U128, U128 )
OPCODE(VectorSignedSaturatedDoublingMultiplyReturnHigh32, U128, U128, U128 ) OPCODE(VectorSignedSaturatedDoublingMultiplyReturnHigh32, U128, U128, U128 )
OPCODE(VectorSignedSaturatedNarrowToSigned16, U128, U128 ) OPCODE(VectorSignedSaturatedNarrowToSigned16, U128, U128 )