diff --git a/src/backend/x64/emit_x64_vector.cpp b/src/backend/x64/emit_x64_vector.cpp
index 5b52861f..e7f09631 100644
--- a/src/backend/x64/emit_x64_vector.cpp
+++ b/src/backend/x64/emit_x64_vector.cpp
@@ -3445,6 +3445,73 @@ void EmitX64::EmitVectorUnsignedRecipSqrtEstimate(EmitContext& ctx, IR::Inst* in
     });
 }
 
+// Simple generic case for 8, 16, and 32-bit values. 64-bit values
+// will need to be special-cased as we can't simply use a larger integral size.
+template <typename T, typename U = std::make_unsigned_t<T>>
+bool EmitVectorUnsignedSaturatedAccumulateSigned(VectorArray<U>& result, const VectorArray<T>& lhs, const VectorArray<T>& rhs) {
+    static_assert(std::is_signed_v<T>, "T must be signed.");
+    static_assert(sizeof(T) * 8 < 64, "T must be less than 64 bits in size.");
+
+    bool qc_flag = false;
+
+    for (size_t i = 0; i < result.size(); i++) {
+        // We treat rhs' members as unsigned, so cast to unsigned before signed to inhibit sign-extension.
+        // We use the unsigned equivalent of T, as we want zero-extension to occur, rather than a plain move.
+        const s64 x = s64{lhs[i]};
+        const s64 y = static_cast<s64>(static_cast<std::make_unsigned_t<T>>(rhs[i]));
+        const s64 sum = x + y;
+
+        if (sum > std::numeric_limits<U>::max()) {
+            result[i] = std::numeric_limits<U>::max();
+            qc_flag = true;
+        } else if (sum < 0) {
+            result[i] = std::numeric_limits<U>::min();
+            qc_flag = true;
+        } else {
+            result[i] = static_cast<U>(sum);
+        }
+    }
+
+    return qc_flag;
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAccumulateSigned8(EmitContext& ctx, IR::Inst* inst) {
+    EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitVectorUnsignedSaturatedAccumulateSigned<s8>);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAccumulateSigned16(EmitContext& ctx, IR::Inst* inst) {
+    EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitVectorUnsignedSaturatedAccumulateSigned<s16>);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAccumulateSigned32(EmitContext& ctx, IR::Inst* inst) {
+    EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitVectorUnsignedSaturatedAccumulateSigned<s32>);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAccumulateSigned64(EmitContext& ctx, IR::Inst* inst) {
+    EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& lhs, const VectorArray<u64>& rhs) {
+        bool qc_flag = false;
+
+        for (size_t i = 0; i < result.size(); i++) {
+            const u64 x = lhs[i];
+            const u64 y = rhs[i];
+            const u64 res = x + y;
+
+            // Check sign bits to determine if an overflow occurred.
+            if ((~x & y & ~res) & 0x8000000000000000) {
+                result[i] = UINT64_MAX;
+                qc_flag = true;
+            } else if ((x & ~y & res) & 0x8000000000000000) {
+                result[i] = 0;
+                qc_flag = true;
+            } else {
+                result[i] = res;
+            }
+        }
+
+        return qc_flag;
+    });
+}
+
 void EmitX64::EmitVectorUnsignedSaturatedNarrow16(EmitContext& ctx, IR::Inst* inst) {
     EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u16>& a) {
         bool qc_flag = false;
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 6c14720b..7c79e42d 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1642,6 +1642,21 @@ U128 IREmitter::VectorUnsignedRecipSqrtEstimate(const U128& a) {
     return Inst<U128>(Opcode::VectorUnsignedRecipSqrtEstimate, a);
 }
 
+U128 IREmitter::VectorUnsignedSaturatedAccumulateSigned(size_t esize, const U128& a, const U128& b) {
+    switch (esize) {
+    case 8:
+        return Inst<U128>(Opcode::VectorUnsignedSaturatedAccumulateSigned8, a, b);
+    case 16:
+        return Inst<U128>(Opcode::VectorUnsignedSaturatedAccumulateSigned16, a, b);
+    case 32:
+        return Inst<U128>(Opcode::VectorUnsignedSaturatedAccumulateSigned32, a, b);
+    case 64:
+        return Inst<U128>(Opcode::VectorUnsignedSaturatedAccumulateSigned64, a, b);
+    }
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::VectorUnsignedSaturatedNarrow(size_t esize, const U128& a) {
     switch (esize) {
     case 16:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 7fa48028..79d1d219 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -276,6 +276,7 @@ public:
     U128 VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
     U128 VectorUnsignedRecipEstimate(const U128& a);
     U128 VectorUnsignedRecipSqrtEstimate(const U128& a);
+    U128 VectorUnsignedSaturatedAccumulateSigned(size_t esize, const U128& a, const U128& b);
     U128 VectorUnsignedSaturatedNarrow(size_t esize, const U128& a);
     U128 VectorZeroExtend(size_t original_esize, const U128& a);
     U128 VectorZeroUpper(const U128& a);
diff --git a/src/frontend/ir/microinstruction.cpp b/src/frontend/ir/microinstruction.cpp
index 85128b5b..91fc53d5 100644
--- a/src/frontend/ir/microinstruction.cpp
+++ b/src/frontend/ir/microinstruction.cpp
@@ -367,6 +367,10 @@ bool Inst::WritesToFPSRCumulativeSaturationBit() const {
     case Opcode::VectorSignedSaturatedNeg16:
     case Opcode::VectorSignedSaturatedNeg32:
     case Opcode::VectorSignedSaturatedNeg64:
+    case Opcode::VectorUnsignedSaturatedAccumulateSigned8:
+    case Opcode::VectorUnsignedSaturatedAccumulateSigned16:
+    case Opcode::VectorUnsignedSaturatedAccumulateSigned32:
+    case Opcode::VectorUnsignedSaturatedAccumulateSigned64:
     case Opcode::VectorUnsignedSaturatedNarrow16:
     case Opcode::VectorUnsignedSaturatedNarrow32:
     case Opcode::VectorUnsignedSaturatedNarrow64:
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 5f79c424..d8fbc54a 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -424,6 +424,10 @@ OPCODE(VectorUnsignedAbsoluteDifference16, U128, U128,
 OPCODE(VectorUnsignedAbsoluteDifference32,                  U128, U128, U128           )
 OPCODE(VectorUnsignedRecipEstimate,                         U128, U128                 )
 OPCODE(VectorUnsignedRecipSqrtEstimate,                     U128, U128                 )
+OPCODE(VectorUnsignedSaturatedAccumulateSigned8,            U128, U128, U128           )
+OPCODE(VectorUnsignedSaturatedAccumulateSigned16,           U128, U128, U128           )
+OPCODE(VectorUnsignedSaturatedAccumulateSigned32,           U128, U128, U128           )
+OPCODE(VectorUnsignedSaturatedAccumulateSigned64,           U128, U128, U128           )
 OPCODE(VectorUnsignedSaturatedNarrow16,                     U128, U128                 )
 OPCODE(VectorUnsignedSaturatedNarrow32,                     U128, U128                 )
 OPCODE(VectorUnsignedSaturatedNarrow64,                     U128, U128                 )
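
Reviewer note: the sub-64-bit path in the patch widens both operands into s64,
so a scalar model of it is easy to check by hand. The sketch below is
illustrative only (usqadd_scalar and the test values are not part of the
patch); it assumes T is a signed type narrower than 64 bits, and it takes the
accumulator directly as the unsigned type U rather than reinterpreting it from
a VectorArray<T> as the patch does.

    #include <cstdint>
    #include <iostream>
    #include <limits>
    #include <type_traits>

    // Scalar model of the generic case: sign-extend the signed addend,
    // zero-extend the unsigned accumulator, add in 64 bits, then clamp
    // the sum to [0, max(U)] and report saturation via qc_flag.
    template <typename T, typename U = std::make_unsigned_t<T>>
    U usqadd_scalar(U acc, T addend, bool& qc_flag) {
        static_assert(std::is_signed_v<T> && sizeof(T) * 8 < 64);
        const std::int64_t sum = static_cast<std::int64_t>(acc) + std::int64_t{addend};
        if (sum > static_cast<std::int64_t>(std::numeric_limits<U>::max())) {
            qc_flag = true;
            return std::numeric_limits<U>::max();
        }
        if (sum < 0) {
            qc_flag = true;
            return 0;
        }
        return static_cast<U>(sum);
    }

    int main() {
        bool qc = false;
        // 200 + 100 exceeds the u8 range, so the result pins at 255 and QC is set.
        std::cout << int(usqadd_scalar<std::int8_t>(std::uint8_t{200}, std::int8_t{100}, qc)) << ' ' << qc << '\n'; // 255 1
        // 50 + (-100) would go negative, so the result pins at 0.
        std::cout << int(usqadd_scalar<std::int8_t>(std::uint8_t{50}, std::int8_t{-100}, qc)) << ' ' << qc << '\n'; // 0 1
    }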
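
Reviewer note: the 64-bit lambda cannot widen, so it decides saturation from
the sign bits of the signed addend x, the unsigned accumulator y, and the raw
sum. If x is non-negative, y has bit 63 set, and the sum's bit 63 is clear,
the addition carried past UINT64_MAX and must pin high; if x is negative, y's
bit 63 is clear, and the sum's bit 63 is set, the true sum was negative and
must pin to zero. A standalone model of that check (saturating_usqadd64 is an
illustrative name, not part of the patch):

    #include <cstdint>
    #include <iostream>

    std::uint64_t saturating_usqadd64(std::uint64_t x, std::uint64_t y, bool& qc_flag) {
        const std::uint64_t res = x + y;
        if ((~x & y & ~res) >> 63) { // carried past UINT64_MAX
            qc_flag = true;
            return UINT64_MAX;
        }
        if ((x & ~y & res) >> 63) { // true sum was negative
            qc_flag = true;
            return 0;
        }
        return res;
    }

    int main() {
        bool qc = false;
        // Adding the signed value -1 (all bits set) to an accumulator of 0
        // goes negative, so the result saturates to 0 and QC is set.
        std::cout << saturating_usqadd64(~std::uint64_t{0}, 0, qc) << ' ' << qc << '\n'; // 0 1
    }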