NOTE(review): this patch was rebuilt from a copy whose line breaks and indentation had been
flattened and whose <...> template arguments had been dropped. The restored arguments
(VectorArray<s64>, static_cast<u64>, Inst<U128>, std::vector<U128>) and all whitespace are
best-effort reconstructions: apply with whitespace-insensitive matching and confirm them
against the target tree.

diff --git a/src/backend/x64/emit_x64_vector.cpp b/src/backend/x64/emit_x64_vector.cpp
index 6eec6fce..e71bfec4 100644
--- a/src/backend/x64/emit_x64_vector.cpp
+++ b/src/backend/x64/emit_x64_vector.cpp
@@ -2981,6 +2981,140 @@ void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned64(EmitContext& ctx, IR::
     });
 }
 
+// Emits x64 code for a lane-wise signed saturating negation of a 128-bit vector.
+// esize is the lane width in bits (8, 16, 32 or 64). A lane holding the most negative
+// value saturates to the most positive one, and the FPSR QC byte in the JIT state is
+// set if any lane saturated. The 64-bit path emits pcmpeqq (SSE4.1); this helper does not
+// check CPU support itself, EmitVectorSignedSaturatedNeg64 does so before calling it.
+static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm data = ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+    // One sign bit per lane: the only input whose negation does not fit in the lane.
+    const Xbyak::Address mask = [esize, &code] {
+        switch (esize) {
+        case 8:
+            return code.MConst(xword, 0x8080808080808080, 0x8080808080808080);
+        case 16:
+            return code.MConst(xword, 0x8000800080008000, 0x8000800080008000);
+        case 32:
+            return code.MConst(xword, 0x8000000080000000, 0x8000000080000000);
+        case 64:
+            return code.MConst(xword, 0x8000000000000000, 0x8000000000000000);
+        default:
+            UNREACHABLE();
+            return Xbyak::Address{0};
+        }
+    }();
+
+    // Bits of the pmovmskb result that correspond to the top byte of each lane.
+    const u32 test_mask = [esize] {
+        switch (esize) {
+        case 8:
+            return 0b1111'1111'1111'1111;
+        case 16:
+            return 0b1010'1010'1010'1010;
+        case 32:
+            return 0b1000'1000'1000'1000;
+        case 64:
+            return 0b10000000'10000000;
+        default:
+            UNREACHABLE();
+            return 0;
+        }
+    }();
+
+    // Lane-wise equality compare at the requested width; the result is written to x.
+    const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const auto& y) {
+        switch (esize) {
+        case 8:
+            code.pcmpeqb(x, y);
+            break;
+        case 16:
+            code.pcmpeqw(x, y);
+            break;
+        case 32:
+            code.pcmpeqd(x, y);
+            break;
+        case 64:
+            code.pcmpeqq(x, y);
+            break;
+        }
+    };
+
+    // tmp becomes all-ones in every lane that holds the most negative value.
+    code.movdqa(tmp, data);
+    vector_equality(tmp, mask);
+
+    // Negate as 0 - data. psubsb/psubsw saturate by themselves; psubd/psubq wrap, so the
+    // lanes flagged in tmp are bit-flipped afterwards (0x80..00 -> 0x7F..FF).
+    code.pxor(zero, zero);
+    switch (esize) {
+    case 8:
+        code.psubsb(zero, data);
+        break;
+    case 16:
+        code.psubsw(zero, data);
+        break;
+    case 32:
+        code.psubd(zero, data);
+        code.pxor(zero, tmp);
+        break;
+    case 64:
+        code.psubq(zero, data);
+        code.pxor(zero, tmp);
+        break;
+    }
+
+    // Check if any elements matched the mask prior to performing saturation. If so, set the Q bit.
+    const Xbyak::Reg64 bit = ctx.reg_alloc.ScratchGpr();
+    code.pmovmskb(bit, tmp);
+    code.test(bit.cvt32(), test_mask);
+    code.setnz(bit.cvt8());
+    code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit.cvt8());
+
+    ctx.reg_alloc.DefineValue(inst, zero);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNeg8(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturatedNeg(8, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNeg16(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturatedNeg(16, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNeg32(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturatedNeg(32, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNeg64(EmitContext& ctx, IR::Inst* inst) {
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+        EmitVectorSignedSaturatedNeg(64, code, ctx, inst);
+        return;
+    }
+
+    // pcmpeqq needs SSE4.1, so older CPUs take the per-element fallback below.
+    // NOTE(review): the <s64>/<u64> template arguments were missing from the flattened patch
+    // text and are reconstructed from the 64-bit constants and signed negation - confirm.
+    EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& data) {
+        bool qc_flag = false;
+
+        for (size_t i = 0; i < result.size(); i++) {
+            if (static_cast<u64>(data[i]) == 0x8000000000000000) {
+                result[i] = 0x7FFFFFFFFFFFFFFF;
+                qc_flag = true;
+            } else {
+                result[i] = -data[i];
+            }
+        }
+
+        return qc_flag;
+    });
+}
+
 void EmitX64::EmitVectorSub8(EmitContext& ctx, IR::Inst* inst) {
     EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubb);
 }
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 53025eac..a54038bf 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1566,6 +1566,23 @@ U128 IREmitter::VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, con
     return {};
 }
 
+// Emits the VectorSignedSaturatedNeg8/16/32/64 opcode that matches esize (lane width in bits).
+// NOTE(review): <U128> on Inst was missing from the flattened patch text and is restored here - confirm.
+U128 IREmitter::VectorSignedSaturatedNeg(size_t esize, const U128& a) {
+    switch (esize) {
+    case 8:
+        return Inst<U128>(Opcode::VectorSignedSaturatedNeg8, a);
+    case 16:
+        return Inst<U128>(Opcode::VectorSignedSaturatedNeg16, a);
+    case 32:
+        return Inst<U128>(Opcode::VectorSignedSaturatedNeg32, a);
+    case 64:
+        return Inst<U128>(Opcode::VectorSignedSaturatedNeg64, a);
+    }
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::VectorSub(size_t esize, const U128& a, const U128& b) {
     switch (esize) {
     case 8:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 6fa579aa..7a732b14 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -268,6 +268,7 @@ public:
     U128 VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b);
     U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a);
     U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a);
+    U128 VectorSignedSaturatedNeg(size_t esize, const U128& a);
     U128 VectorSub(size_t esize, const U128& a, const U128& b);
     Table VectorTable(std::vector<U128> values);
     U128 VectorTableLookup(const U128& defaults, const Table& table, const U128& indices);
diff --git a/src/frontend/ir/microinstruction.cpp b/src/frontend/ir/microinstruction.cpp
index fe1cac6c..9e20bd90 100644
--- a/src/frontend/ir/microinstruction.cpp
+++ b/src/frontend/ir/microinstruction.cpp
@@ -359,6 +359,10 @@ bool Inst::WritesToFPSRCumulativeSaturationBit() const {
     case Opcode::VectorSignedSaturatedNarrowToUnsigned64:
     case Opcode::VectorSignedSaturatedDoublingMultiplyReturnHigh16:
     case Opcode::VectorSignedSaturatedDoublingMultiplyReturnHigh32:
+    case Opcode::VectorSignedSaturatedNeg8:
+    case Opcode::VectorSignedSaturatedNeg16:
+    case Opcode::VectorSignedSaturatedNeg32:
+    case Opcode::VectorSignedSaturatedNeg64:
     case Opcode::VectorUnsignedSaturatedNarrow16:
     case Opcode::VectorUnsignedSaturatedNarrow32:
     case Opcode::VectorUnsignedSaturatedNarrow64:
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 9532f38a..3e542c2e 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -405,6 +405,10 @@ OPCODE(VectorSignedSaturatedNarrowToSigned64, U128, U128
 OPCODE(VectorSignedSaturatedNarrowToUnsigned16,             U128,           U128                 )
 OPCODE(VectorSignedSaturatedNarrowToUnsigned32,             U128,           U128                 )
 OPCODE(VectorSignedSaturatedNarrowToUnsigned64,             U128,           U128                 )
+OPCODE(VectorSignedSaturatedNeg8,                           U128,           U128                 )
+OPCODE(VectorSignedSaturatedNeg16,                          U128,           U128                 )
+OPCODE(VectorSignedSaturatedNeg32,                          U128,           U128                 )
+OPCODE(VectorSignedSaturatedNeg64,                          U128,           U128                 )
 OPCODE(VectorSub8,                                          U128,           U128,           U128 )
 OPCODE(VectorSub16,                                         U128,           U128,           U128 )
 OPCODE(VectorSub32,                                         U128,           U128,           U128 )