emit_x64_vector_saturation: AVX512 implementation of VectorUnsignedSaturated{Add,Sub}{32,64}

2021-05-22 11:38:07 -07:00 · 2021-05-22 11:38:07 -07:00 · 332c26d432
commit 332c26d432
parent fa8cc1ac36
1 changed files with 107 additions and 39 deletions
--- a/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp
@ -179,11 +179,24 @@ void EmitX64::EmitVectorUnsignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst)
    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(args[1]);
    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
    const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
-    // TODO AVX2, AVX-512: vpternlog
+    // TODO AVX2
    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
        // Do a regular unsigned addition
        code.vpaddd(result, result, addend);
        // Test if an overflow happened
        code.vpcmpud(k1, result, addend, CmpInt::LessThan);
        // Write 0b1111... where overflows have happened
        // This is just a quick way to do this without touching memory
        code.vpternlogd(result | k1, result, result, 0xFF);
        // Clear ZF if an overflow happened (ktestb sets ZF only when the mask is all zero)
        code.ktestb(k1, k1);
    } else {
        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
        code.movaps(tmp, result);
        code.movaps(xmm0, result);
@ -203,6 +216,8 @@ void EmitX64::EmitVectorUnsignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst)
            code.movmskps(overflow.cvt32(), tmp);
            code.test(overflow.cvt32(), overflow.cvt32());
        }
    }
    code.setnz(overflow);
    code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
@ -214,11 +229,24 @@ void EmitX64::EmitVectorUnsignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst)
    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(args[1]);
    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
    const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
-    // TODO AVX2, AVX-512: vpternlog
+    // TODO AVX2
    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
        // Do a regular unsigned addition
        code.vpaddq(result, result, addend);
        // Test if an overflow happened
        code.vpcmpuq(k1, result, addend, CmpInt::LessThan);
        // Write 0b1111... where overflows have happened
        // This is just a quick way to do this without touching memory
        code.vpternlogq(result | k1, result, result, 0xFF);
        // Clear ZF if an overflow happened (ktestb sets ZF only when the mask is all zero)
        code.ktestb(k1, k1);
    } else {
        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
        code.movaps(tmp, result);
        code.movaps(xmm0, result);
@ -239,6 +267,8 @@ void EmitX64::EmitVectorUnsignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst)
            code.movmskpd(overflow.cvt32(), tmp);
            code.test(overflow.cvt32(), overflow.cvt32());
        }
    }
    code.setnz(overflow);
    code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
@ -261,9 +291,28 @@ void EmitX64::EmitVectorUnsignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst)
    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
    const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
    // TODO AVX2, AVX-512: vpternlog
    code.movaps(tmp, result);
    // TODO AVX2
    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
        // Do a regular unsigned subtraction
        code.vpsubd(result, result, subtrahend);
        // Test if an underflow happened
        code.vpcmpud(k1, result, tmp, CmpInt::GreaterThan);
        // Write 0 where underflows have happened
        code.vpxord(result | k1, result, result);
        // Clear ZF if an underflow happened (ktestb sets ZF only when the mask is all zero)
        code.ktestb(k1, k1);
        code.setnz(overflow);
        code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
        ctx.reg_alloc.DefineValue(inst, result);
        return;
    }
    code.movaps(xmm0, subtrahend);
    code.pxor(tmp, subtrahend);
@ -295,9 +344,28 @@ void EmitX64::EmitVectorUnsignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst)
    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
    const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
    // TODO AVX2, AVX-512: vpternlog
    code.movaps(tmp, result);
    // TODO AVX2
    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
        // Do a regular unsigned subtraction
        code.vpsubq(result, result, subtrahend);
        // Test if an underflow happened
        code.vpcmpuq(k1, result, tmp, CmpInt::GreaterThan);
        // Write 0 where underflows have happened
        code.vpxorq(result | k1, result, result);
        // Clear ZF if an underflow happened (ktestb sets ZF only when the mask is all zero)
        code.ktestb(k1, k1);
        code.setnz(overflow);
        code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
        ctx.reg_alloc.DefineValue(inst, result);
        return;
    }
    code.movaps(xmm0, subtrahend);
    code.pxor(tmp, subtrahend);