diff --git a/src/backend_x64/emit_x64_vector.cpp b/src/backend_x64/emit_x64_vector.cpp
index a9d58c99..6f5dd977 100644
--- a/src/backend_x64/emit_x64_vector.cpp
+++ b/src/backend_x64/emit_x64_vector.cpp
@@ -2193,6 +2193,73 @@ void EmitX64::EmitVectorSignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* i
     EmitVectorSignedAbsoluteDifference(32, ctx, inst, code);
 }
 
+static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm sign = ctx.reg_alloc.ScratchXmm();
+
+    code.movdqa(dest, src);
+
+    switch (original_esize) {
+    case 16:
+        code.packsswb(dest, dest); // narrow s16 -> s8 with signed saturation
+        code.movdqa(sign, src);
+        code.psraw(sign, 15); // sign-extension words of the original elements
+        code.packsswb(sign, sign);
+        code.movdqa(reconstructed, dest);
+        code.punpcklbw(reconstructed, sign); // widen result back to s16 for comparison
+        break;
+    case 32:
+        code.packssdw(dest, dest); // narrow s32 -> s16 with signed saturation
+        code.movdqa(reconstructed, dest);
+        code.movdqa(sign, dest);
+        code.psraw(sign, 15);
+        code.punpcklwd(reconstructed, sign); // widen result back to s32 for comparison
+        break;
+    default:
+        UNREACHABLE();
+        break;
+    }
+
+    const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+        code.pxor(reconstructed, src);
+        code.ptest(reconstructed, reconstructed); // ZF set iff reconstructed == src (no lane saturated)
+    } else {
+        code.pcmpeqd(reconstructed, src);
+        code.movmskps(bit, reconstructed);
+        code.cmp(bit, 0b1111); // ZF only when all four dwords compare equal (no saturation)
+    }
+
+    code.setnz(bit.cvt8()); // QC := any lane saturated
+    code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit.cvt8());
+
+    ctx.reg_alloc.DefineValue(inst, dest);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNarrowToSigned16(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturatedNarrowToSigned(16, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNarrowToSigned32(EmitContext& ctx, IR::Inst* inst) {
+    
EmitVectorSignedSaturatedNarrowToSigned(32, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNarrowToSigned64(EmitContext& ctx, IR::Inst* inst) {
+    // No packing instruction narrows s64 lanes; use the interpreted fallback.
+    EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s64>& a) {
+        bool qc_flag = false;
+        for (size_t i = 0; i < a.size(); ++i) {
+            const s64 saturated = std::clamp<s64>(a[i], -s64(0x80000000), s64(0x7FFFFFFF)); // saturate to s32 range
+            result[i] = static_cast<s32>(saturated);
+            qc_flag |= saturated != a[i]; // QC when any element changed value
+        }
+        return qc_flag;
+    });
+}
+
 static void EmitVectorSignedSaturatedNarrowToUnsigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(args[0]);
diff --git a/src/frontend/A64/decoder/a64.inc b/src/frontend/A64/decoder/a64.inc
index 349db178..eca666e7 100644
--- a/src/frontend/A64/decoder/a64.inc
+++ b/src/frontend/A64/decoder/a64.inc
@@ -579,7 +579,7 @@ INST(CMEQ_zero_2, "CMEQ (zero)", "0Q001
 INST(CMLT_2, "CMLT (zero)", "0Q001110zz100000101010nnnnnddddd")
 INST(ABS_2, "ABS", "0Q001110zz100000101110nnnnnddddd")
 INST(XTN, "XTN, XTN2", "0Q001110zz100001001010nnnnnddddd")
-//INST(SQXTN_2, "SQXTN, SQXTN2", "0Q001110zz100001010010nnnnnddddd")
+INST(SQXTN_2, "SQXTN, SQXTN2", "0Q001110zz100001010010nnnnnddddd")
 //INST(FCVTN, "FCVTN, FCVTN2", "0Q0011100z100001011010nnnnnddddd")
 //INST(FCVTL, "FCVTL, FCVTL2", "0Q0011100z100001011110nnnnnddddd")
 //INST(FRINTN_1, "FRINTN (vector)", "0Q00111001111001100010nnnnnddddd")
diff --git a/src/frontend/A64/translate/impl/impl.h b/src/frontend/A64/translate/impl/impl.h
index 4664ceec..97c9456d 100644
--- a/src/frontend/A64/translate/impl/impl.h
+++ b/src/frontend/A64/translate/impl/impl.h
@@ -515,35 +515,20 @@ struct TranslatorVisitor final {
     // Data Processing - FP and SIMD - Scalar two-register misc
     bool SUQADD_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool SUQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
     bool SQABS_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool SQABS_2(bool 
Q, Imm<2> size, Vec Vn, Vec Vd);
     bool CMGT_zero_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool CMGT_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
     bool CMEQ_zero_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool CMEQ_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
     bool CMLT_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool CMLT_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
     bool ABS_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool ABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
     bool SQXTN_1(Imm<2> size, Vec Vn, Reg Rd);
-    bool SQXTN_2(bool Q, Imm<2> size, Vec Vn, Reg Rd);
     bool USQADD_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool USQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
     bool SQNEG_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool SQNEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
     bool CMGE_zero_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool CMGE_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
     bool CMLE_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool CMLE_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
     bool NEG_1(Imm<2> size, Vec Vn, Vec Vd);
-    bool NEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
     bool SQXTUN_1(Imm<2> size, Vec Vn, Reg Rd);
-    bool SQXTUN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
     bool UQXTN_1(Imm<2> size, Vec Vn, Reg Rd);
-    bool UQXTN_2(bool Q, Imm<2> size, Vec Vn, Reg Rd);
     bool FCVTXN_1(bool sz, Vec Vn, Reg Rd);
-    bool FCVTXN_2(bool Q, bool sz, Vec Vn, Reg Rd);
 
     // Data Processing - FP and SIMD - SIMD Scalar pairwise
     bool ADDP_pair(Imm<2> size, Vec Vn, Vec Vd);
@@ -704,28 +689,6 @@ struct TranslatorVisitor final {
     bool FMINNMP_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
     bool FMINP_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
 
-    // Data Processing - FP and SIMD - SIMD Two-register misc
-    bool FRINTN_1(bool Q, Vec Vn, Vec Vd);
-    bool FRINTN_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FRINTM_1(bool Q, Vec Vn, Vec Vd);
-    bool FRINTM_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FABS_1(bool Q, Vec Vn, Vec Vd);
-    bool FABS_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FRINTP_1(bool Q, Vec Vn, Vec Vd);
-    bool FRINTP_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FRINTZ_1(bool Q, Vec Vn, 
Vec Vd);
-    bool FRINTZ_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FRINTA_1(bool Q, Vec Vn, Vec Vd);
-    bool FRINTA_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FRINTX_1(bool Q, Vec Vn, Vec Vd);
-    bool FRINTX_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FNEG_1(bool Q, Vec Vn, Vec Vd);
-    bool FNEG_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FRINTI_1(bool Q, Vec Vn, Vec Vd);
-    bool FRINTI_2(bool Q, bool sz, Vec Vn, Vec Vd);
-    bool FSQRT_1(bool Q, Vec Vn, Vec Vd);
-    bool FSQRT_2(bool Q, bool sz, Vec Vn, Vec Vd);
-
     // Data Processing - FP and SIMD - SIMD Three same extra
     bool SDOT_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
     bool UDOT_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
@@ -751,6 +714,41 @@ struct TranslatorVisitor final {
     bool NOT(bool Q, Vec Vn, Vec Vd);
     bool RBIT_asimd(bool Q, Vec Vn, Vec Vd);
     bool URSQRTE(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool SUQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool SQABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool CMGT_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool CMEQ_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool CMLT_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool ABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool SQXTN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool USQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool SQNEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool CMGE_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool CMLE_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool NEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool SQXTUN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+    bool UQXTN_2(bool Q, Imm<2> size, Vec Vn, Reg Rd);
+    bool FCVTXN_2(bool Q, bool sz, Vec Vn, Reg Rd);
+    bool FRINTN_1(bool Q, Vec Vn, Vec Vd);
+    bool FRINTN_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FRINTM_1(bool Q, Vec Vn, Vec Vd);
+    bool FRINTM_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FABS_1(bool Q, Vec Vn, Vec Vd);
+    bool FABS_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FRINTP_1(bool Q, Vec Vn, Vec Vd);
+    bool FRINTP_2(bool Q, 
bool sz, Vec Vn, Vec Vd);
+    bool FRINTZ_1(bool Q, Vec Vn, Vec Vd);
+    bool FRINTZ_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FRINTA_1(bool Q, Vec Vn, Vec Vd);
+    bool FRINTA_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FRINTX_1(bool Q, Vec Vn, Vec Vd);
+    bool FRINTX_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FNEG_1(bool Q, Vec Vn, Vec Vd);
+    bool FNEG_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FRINTI_1(bool Q, Vec Vn, Vec Vd);
+    bool FRINTI_2(bool Q, bool sz, Vec Vn, Vec Vd);
+    bool FSQRT_1(bool Q, Vec Vn, Vec Vd);
+    bool FSQRT_2(bool Q, bool sz, Vec Vn, Vec Vd);
 
     // Data Processing - FP and SIMD - SIMD across lanes
     bool SADDLV(bool Q, Imm<2> size, Vec Vn, Vec Vd);
diff --git a/src/frontend/A64/translate/impl/simd_two_register_misc.cpp b/src/frontend/A64/translate/impl/simd_two_register_misc.cpp
index 014127a0..f26f6caa 100644
--- a/src/frontend/A64/translate/impl/simd_two_register_misc.cpp
+++ b/src/frontend/A64/translate/impl/simd_two_register_misc.cpp
@@ -291,6 +291,22 @@ bool TranslatorVisitor::SQXTUN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
     return true;
 }
 
+bool TranslatorVisitor::SQXTN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+    if (size == 0b11) {
+        return ReservedValue();
+    }
+
+    const size_t esize = 8 << size.ZeroExtend();
+    const size_t datasize = 64;
+    const size_t part = Q ? 1 : 0;
+
+    const IR::U128 operand = V(2 * datasize, Vn);
+    const IR::U128 result = ir.VectorSignedSaturatedNarrowToSigned(2 * esize, operand);
+
+    Vpart(datasize, Vd, part, result);
+    return true;
+}
+
 bool TranslatorVisitor::NOT(bool Q, Vec Vn, Vec Vd) {
     const size_t datasize = Q ? 
128 : 64;
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 0f335302..5b3055b1 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1292,6 +1292,19 @@ U128 IREmitter::VectorSignedAbsoluteDifference(size_t esize, const U128& a, cons
     return {};
 }
 
+U128 IREmitter::VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a) {
+    switch (original_esize) {
+    case 16:
+        return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToSigned16, a);
+    case 32:
+        return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToSigned32, a);
+    case 64:
+        return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToSigned64, a);
+    }
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a) {
     switch (original_esize) {
     case 16:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index c981c915..2e879bc0 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -249,6 +249,7 @@ public:
     U128 VectorShuffleWords(const U128& a, u8 mask);
     U128 VectorSignExtend(size_t original_esize, const U128& a);
     U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
+    U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a);
     U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a);
     U128 VectorSub(size_t esize, const U128& a, const U128& b);
     U128 VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
diff --git a/src/frontend/ir/microinstruction.cpp b/src/frontend/ir/microinstruction.cpp
index a26dc81a..76a5af53 100644
--- a/src/frontend/ir/microinstruction.cpp
+++ b/src/frontend/ir/microinstruction.cpp
@@ -341,6 +341,9 @@ bool Inst::ReadsFromFPSRCumulativeSaturationBit() const {
 
 bool Inst::WritesToFPSRCumulativeSaturationBit() const {
     switch (op) {
+    case Opcode::VectorSignedSaturatedNarrowToSigned16:
+    case Opcode::VectorSignedSaturatedNarrowToSigned32:
+    case 
Opcode::VectorSignedSaturatedNarrowToSigned64:
     case Opcode::VectorSignedSaturatedNarrowToUnsigned16:
     case Opcode::VectorSignedSaturatedNarrowToUnsigned32:
     case Opcode::VectorSignedSaturatedNarrowToUnsigned64:
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 8d55164b..a0bca9a1 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -347,6 +347,9 @@ OPCODE(VectorSignExtend64, T::U128, T::U128
 OPCODE(VectorSignedAbsoluteDifference8, T::U128, T::U128, T::U128 )
 OPCODE(VectorSignedAbsoluteDifference16, T::U128, T::U128, T::U128 )
 OPCODE(VectorSignedAbsoluteDifference32, T::U128, T::U128, T::U128 )
+OPCODE(VectorSignedSaturatedNarrowToSigned16, T::U128, T::U128 )
+OPCODE(VectorSignedSaturatedNarrowToSigned32, T::U128, T::U128 )
+OPCODE(VectorSignedSaturatedNarrowToSigned64, T::U128, T::U128 )
 OPCODE(VectorSignedSaturatedNarrowToUnsigned16, T::U128, T::U128 )
 OPCODE(VectorSignedSaturatedNarrowToUnsigned32, T::U128, T::U128 )
 OPCODE(VectorSignedSaturatedNarrowToUnsigned64, T::U128, T::U128 )