From fca7eddb9eed833d6ba2aa4564d34f24923f2977 Mon Sep 17 00:00:00 2001
From: Lioncash
Date: Thu, 6 Sep 2018 15:50:25 -0400
Subject: [PATCH] A64: Add opcodes for signed saturating negations

---
 src/backend/x64/emit_x64_vector.cpp  | 121 +++++++++++++++++++++++++++
 src/frontend/ir/ir_emitter.cpp       |  15 ++++
 src/frontend/ir/ir_emitter.h         |   1 +
 src/frontend/ir/microinstruction.cpp |   4 +
 src/frontend/ir/opcodes.inc          |   4 +
 5 files changed, 145 insertions(+)

diff --git a/src/backend/x64/emit_x64_vector.cpp b/src/backend/x64/emit_x64_vector.cpp
index 6eec6fce..e71bfec4 100644
--- a/src/backend/x64/emit_x64_vector.cpp
+++ b/src/backend/x64/emit_x64_vector.cpp
@@ -2981,6 +2981,127 @@ void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned64(EmitContext& ctx, IR::Inst* inst) {
     });
 }
 
+static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm data = ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Address mask = [esize, &code] {
+        switch (esize) {
+        case 8:
+            return code.MConst(xword, 0x8080808080808080, 0x8080808080808080);
+        case 16:
+            return code.MConst(xword, 0x8000800080008000, 0x8000800080008000);
+        case 32:
+            return code.MConst(xword, 0x8000000080000000, 0x8000000080000000);
+        case 64:
+            return code.MConst(xword, 0x8000000000000000, 0x8000000000000000);
+        default:
+            UNREACHABLE();
+            return Xbyak::Address{0};
+        }
+    }();
+
+    const u32 test_mask = [esize] {
+        switch (esize) {
+        case 8:
+            return 0b1111'1111'1111'1111;
+        case 16:
+            return 0b1010'1010'1010'1010;
+        case 32:
+            return 0b1000'1000'1000'1000;
+        case 64:
+            return 0b10000000'10000000;
+        default:
+            UNREACHABLE();
+            return 0;
+        }
+    }();
+
+    const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const auto& y) {
+        switch (esize) {
+        case 8:
+            code.pcmpeqb(x, y);
+            break;
+        case 16:
+            code.pcmpeqw(x, y);
+            break;
+        case 32:
+            code.pcmpeqd(x, y);
+            break;
+        case 64:
+            code.pcmpeqq(x, y);
+            break;
+        }
+    };
+
+    code.movdqa(tmp, data);
+    vector_equality(tmp, mask);
+
+    // Perform negation
+    code.pxor(zero, zero);
+    switch (esize) {
+    case 8:
+        code.psubsb(zero, data);
+        break;
+    case 16:
+        code.psubsw(zero, data);
+        break;
+    case 32:
+        code.psubd(zero, data);
+        code.pxor(zero, tmp);
+        break;
+    case 64:
+        code.psubq(zero, data);
+        code.pxor(zero, tmp);
+        break;
+    }
+
+    // Check if any elements matched the mask prior to performing saturation. If so, set the Q bit.
+    const Xbyak::Reg64 bit = ctx.reg_alloc.ScratchGpr();
+    code.pmovmskb(bit, tmp);
+    code.test(bit.cvt32(), test_mask);
+    code.setnz(bit.cvt8());
+    code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit.cvt8());
+
+    ctx.reg_alloc.DefineValue(inst, zero);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNeg8(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturatedNeg(8, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNeg16(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturatedNeg(16, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNeg32(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorSignedSaturatedNeg(32, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNeg64(EmitContext& ctx, IR::Inst* inst) {
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+        EmitVectorSignedSaturatedNeg(64, code, ctx, inst);
+        return;
+    }
+
+    EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& data) {
+        bool qc_flag = false;
+
+        for (size_t i = 0; i < result.size(); i++) {
+            if (static_cast<u64>(data[i]) == 0x8000000000000000) {
+                result[i] = 0x7FFFFFFFFFFFFFFF;
+                qc_flag = true;
+            } else {
+                result[i] = -data[i];
+            }
+        }
+
+        return qc_flag;
+    });
+}
+
 void EmitX64::EmitVectorSub8(EmitContext& ctx, IR::Inst* inst) {
     EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubb);
 }
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 53025eac..a54038bf 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1566,6 +1566,21 @@ U128 IREmitter::VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a) {
     return {};
 }
 
+U128 IREmitter::VectorSignedSaturatedNeg(size_t esize, const U128& a) {
+    switch (esize) {
+    case 8:
+        return Inst<U128>(Opcode::VectorSignedSaturatedNeg8, a);
+    case 16:
+        return Inst<U128>(Opcode::VectorSignedSaturatedNeg16, a);
+    case 32:
+        return Inst<U128>(Opcode::VectorSignedSaturatedNeg32, a);
+    case 64:
+        return Inst<U128>(Opcode::VectorSignedSaturatedNeg64, a);
+    }
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::VectorSub(size_t esize, const U128& a, const U128& b) {
     switch (esize) {
     case 8:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 6fa579aa..7a732b14 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -268,6 +268,7 @@ public:
     U128 VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b);
     U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a);
     U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a);
+    U128 VectorSignedSaturatedNeg(size_t esize, const U128& a);
     U128 VectorSub(size_t esize, const U128& a, const U128& b);
     Table VectorTable(std::vector<U64> values);
     U128 VectorTableLookup(const U128& defaults, const Table& table, const U128& indices);
diff --git a/src/frontend/ir/microinstruction.cpp b/src/frontend/ir/microinstruction.cpp
index fe1cac6c..9e20bd90 100644
--- a/src/frontend/ir/microinstruction.cpp
+++ b/src/frontend/ir/microinstruction.cpp
@@ -359,6 +359,10 @@ bool Inst::WritesToFPSRCumulativeSaturationBit() const {
     case Opcode::VectorSignedSaturatedNarrowToUnsigned64:
     case Opcode::VectorSignedSaturatedDoublingMultiplyReturnHigh16:
     case Opcode::VectorSignedSaturatedDoublingMultiplyReturnHigh32:
+    case Opcode::VectorSignedSaturatedNeg8:
+    case Opcode::VectorSignedSaturatedNeg16:
+    case Opcode::VectorSignedSaturatedNeg32:
+    case Opcode::VectorSignedSaturatedNeg64:
     case Opcode::VectorUnsignedSaturatedNarrow16:
     case Opcode::VectorUnsignedSaturatedNarrow32:
     case Opcode::VectorUnsignedSaturatedNarrow64:
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 9532f38a..3e542c2e 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -405,6 +405,10 @@ OPCODE(VectorSignedSaturatedNarrowToSigned64, U128, U128 )
 OPCODE(VectorSignedSaturatedNarrowToUnsigned16,             U128,           U128                                            )
 OPCODE(VectorSignedSaturatedNarrowToUnsigned32,             U128,           U128                                            )
 OPCODE(VectorSignedSaturatedNarrowToUnsigned64,             U128,           U128                                            )
+OPCODE(VectorSignedSaturatedNeg8,                           U128,           U128                                            )
+OPCODE(VectorSignedSaturatedNeg16,                          U128,           U128                                            )
+OPCODE(VectorSignedSaturatedNeg32,                          U128,           U128                                            )
+OPCODE(VectorSignedSaturatedNeg64,                          U128,           U128                                            )
 OPCODE(VectorSub8,                                          U128,           U128,           U128                            )
 OPCODE(VectorSub16,                                         U128,           U128,           U128                            )
 OPCODE(VectorSub32,                                         U128,           U128,           U128                            )
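
Note (not part of the patch itself): the per-element behaviour these opcodes encode is that of AArch64 SQNEG. Each signed lane is negated, except that the most negative representable value has no positive counterpart, so it saturates to the most positive value and sets FPSR.QC; that is the case the mask/pmovmskb sequence and the 64-bit fallback lambda above detect. A minimal stand-alone C++ sketch of that semantic follows; the helper name SignedSaturatedNeg is illustrative only and is not a dynarmic API.

    #include <array>
    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Illustrative reference model of per-element signed saturating negation
    // (AArch64 SQNEG); mirrors the 64-bit scalar fallback in the patch.
    template <typename T>
    T SignedSaturatedNeg(T value, bool& qc) {
        if (value == std::numeric_limits<T>::min()) {
            qc = true;                            // saturation occurred -> FPSR.QC
            return std::numeric_limits<T>::max();
        }
        return static_cast<T>(-value);
    }

    int main() {
        bool qc = false;
        const std::array<std::int8_t, 4> input{1, -1, -128, 127};
        for (const std::int8_t x : input) {
            std::printf("sqneg(%d) = %d\n", x, SignedSaturatedNeg(x, qc));
        }
        std::printf("qc = %d\n", qc);             // 1: negating -128 saturated to 127
    }

Running this prints sqneg(-128) = 127 and qc = 1, the saturating case that the x64 backend above routes into the cumulative Q bit.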