ir: Add opcodes for signed saturated accumulations of unsigned values
This commit is contained in:
parent
9a3d38d2ee
commit
6f911a26da
5 changed files with 116 additions and 0 deletions
|
@ -99,6 +99,34 @@ static void EmitOneArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext
|
|||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
}
|
||||
|
||||
template <typename Lambda>
|
||||
static void EmitTwoArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
|
||||
const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
|
||||
constexpr u32 stack_space = 3 * 16;
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
|
||||
const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||
ctx.reg_alloc.EndOfAllocScope();
|
||||
|
||||
ctx.reg_alloc.HostCall(nullptr);
|
||||
code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
|
||||
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
|
||||
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
|
||||
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
|
||||
|
||||
code.movaps(xword[code.ABI_PARAM2], arg1);
|
||||
code.movaps(xword[code.ABI_PARAM3], arg2);
|
||||
code.CallFunction(fn);
|
||||
code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
|
||||
|
||||
code.add(rsp, stack_space + ABI_SHADOW_SPACE);
|
||||
|
||||
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8());
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
}
|
||||
|
||||
template <typename Lambda>
|
||||
static void EmitTwoArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
|
||||
const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
|
||||
|
@ -2764,6 +2792,70 @@ void EmitX64::EmitVectorSignedSaturatedAbs64(EmitContext& ctx, IR::Inst* inst) {
|
|||
});
|
||||
}
|
||||
|
||||
// Simple generic case for 8, 16, and 32-bit values. 64-bit values
|
||||
// will need to be special-cased as we can't simply use a larger integral size.
|
||||
template <typename T>
|
||||
static bool EmitSignedSaturatedAccumulateUnsigned(VectorArray<T>& result, const VectorArray<T>& lhs, const VectorArray<T>& rhs) {
|
||||
static_assert(std::is_signed_v<T>, "T must be signed.");
|
||||
static_assert(sizeof(T) < 64, "T must be less than 64 bits in size.");
|
||||
|
||||
bool qc_flag = false;
|
||||
|
||||
for (size_t i = 0; i < result.size(); i++) {
|
||||
// We treat lhs' members as unsigned, so cast to unsigned before signed to inhibit sign-extension.
|
||||
// We use the unsigned equivalent of T, as we want zero-extension to occur, rather than a plain move.
|
||||
const s64 x = static_cast<s64>(static_cast<std::make_unsigned_t<T>>(lhs[i]));
|
||||
const s64 y = rhs[i];
|
||||
const s64 sum = x + y;
|
||||
|
||||
if (sum > std::numeric_limits<T>::max()) {
|
||||
result[i] = std::numeric_limits<T>::max();
|
||||
qc_flag = true;
|
||||
} else if (sum < std::numeric_limits<T>::min()) {
|
||||
result[i] = std::numeric_limits<T>::min();
|
||||
qc_flag = true;
|
||||
} else {
|
||||
result[i] = static_cast<T>(sum);
|
||||
}
|
||||
}
|
||||
|
||||
return qc_flag;
|
||||
}
|
||||
|
||||
// 8-bit lanes: dispatches to the generic host fallback instantiated for s8.
void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned8(EmitContext& ctx, IR::Inst* inst) {
    EmitTwoArgumentFallbackWithSaturation(
        code, ctx, inst,
        &EmitSignedSaturatedAccumulateUnsigned<s8>);
}
|
||||
|
||||
// 16-bit lanes: dispatches to the generic host fallback instantiated for s16.
void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned16(EmitContext& ctx, IR::Inst* inst) {
    EmitTwoArgumentFallbackWithSaturation(
        code, ctx, inst,
        &EmitSignedSaturatedAccumulateUnsigned<s16>);
}
|
||||
|
||||
// 32-bit lanes: dispatches to the generic host fallback instantiated for s32.
void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned32(EmitContext& ctx, IR::Inst* inst) {
    EmitTwoArgumentFallbackWithSaturation(
        code, ctx, inst,
        &EmitSignedSaturatedAccumulateUnsigned<s32>);
}
|
||||
|
||||
// 64-bit lanes: no wider integer is used for the sum here, so saturation is detected
// from the top bits of the operands and of the wrapped 64-bit sum.
void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR::Inst* inst) {
    EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& lhs, const VectorArray<u64>& rhs) {
        constexpr u64 top_bit = 0x8000000000000000;
        bool saturated = false;

        for (size_t lane = 0; lane < result.size(); ++lane) {
            const u64 addend = lhs[lane];
            const u64 accumulator = rhs[lane];
            const u64 wrapped_sum = addend + accumulator;

            // The top bit of this expression is set exactly when the lane must saturate:
            // either both the addend and the wrapped sum have their top bit set, or the
            // accumulator's top bit is clear while the addend's or the sum's is set.
            const u64 overflow_bits = (addend & wrapped_sum) | (~accumulator & wrapped_sum) | (addend & ~accumulator);

            if ((overflow_bits & top_bit) == 0) {
                result[lane] = wrapped_sum;
                continue;
            }

            // Saturation only ever clamps to the largest signed 64-bit value.
            result[lane] = static_cast<u64>(INT64_MAX);
            saturated = true;
        }

        return saturated;
    });
}
|
||||
|
||||
void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyReturnHigh16(EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
|
|
|
@ -1529,6 +1529,21 @@ U128 IREmitter::VectorSignedSaturatedAbs(size_t esize, const U128& a) {
|
|||
return {};
|
||||
}
|
||||
|
||||
// Emits the VectorSignedSaturatedAccumulateUnsigned opcode matching the requested
// element size in bits (8, 16, 32 or 64), with a and b as its operands.
// Any other element size hits UNREACHABLE().
U128 IREmitter::VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b) {
    if (esize == 8) {
        return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned8, a, b);
    }
    if (esize == 16) {
        return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned16, a, b);
    }
    if (esize == 32) {
        return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned32, a, b);
    }
    if (esize == 64) {
        return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned64, a, b);
    }

    UNREACHABLE();
    return {};
}
|
||||
|
||||
U128 IREmitter::VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b) {
|
||||
switch (esize) {
|
||||
case 16:
|
||||
|
|
|
@ -265,6 +265,7 @@ public:
|
|||
U128 VectorSignExtend(size_t original_esize, const U128& a);
|
||||
U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
|
||||
U128 VectorSignedSaturatedAbs(size_t esize, const U128& a);
|
||||
U128 VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b);
|
||||
U128 VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b);
|
||||
U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a);
|
||||
U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a);
|
||||
|
|
|
@ -351,6 +351,10 @@ bool Inst::WritesToFPSRCumulativeSaturationBit() const {
|
|||
case Opcode::VectorSignedSaturatedAbs16:
|
||||
case Opcode::VectorSignedSaturatedAbs32:
|
||||
case Opcode::VectorSignedSaturatedAbs64:
|
||||
case Opcode::VectorSignedSaturatedAccumulateUnsigned8:
|
||||
case Opcode::VectorSignedSaturatedAccumulateUnsigned16:
|
||||
case Opcode::VectorSignedSaturatedAccumulateUnsigned32:
|
||||
case Opcode::VectorSignedSaturatedAccumulateUnsigned64:
|
||||
case Opcode::VectorSignedSaturatedNarrowToSigned16:
|
||||
case Opcode::VectorSignedSaturatedNarrowToSigned32:
|
||||
case Opcode::VectorSignedSaturatedNarrowToSigned64:
|
||||
|
|
|
@ -397,6 +397,10 @@ OPCODE(VectorSignedSaturatedAbs8, U128, U128
|
|||
OPCODE(VectorSignedSaturatedAbs16, U128, U128 )
|
||||
OPCODE(VectorSignedSaturatedAbs32, U128, U128 )
|
||||
OPCODE(VectorSignedSaturatedAbs64, U128, U128 )
|
||||
OPCODE(VectorSignedSaturatedAccumulateUnsigned8, U128, U128, U128 )
|
||||
OPCODE(VectorSignedSaturatedAccumulateUnsigned16, U128, U128, U128 )
|
||||
OPCODE(VectorSignedSaturatedAccumulateUnsigned32, U128, U128, U128 )
|
||||
OPCODE(VectorSignedSaturatedAccumulateUnsigned64, U128, U128, U128 )
|
||||
OPCODE(VectorSignedSaturatedDoublingMultiplyReturnHigh16, U128, U128, U128 )
|
||||
OPCODE(VectorSignedSaturatedDoublingMultiplyReturnHigh32, U128, U128, U128 )
|
||||
OPCODE(VectorSignedSaturatedNarrowToSigned16, U128, U128 )
|
||||
|
|
Loading…
Add table
Reference in a new issue