From 332c26d432e2af7ce0291dc6ec3f062307ebf3ed Mon Sep 17 00:00:00 2001
From: Wunkolo
Date: Sat, 22 May 2021 11:38:07 -0700
Subject: [PATCH] emit_x64_vector_saturation: AVX512 implementation of
 VectorUnsignedSaturated{Add,Sub}{32,64}

---
 .../x64/emit_x64_vector_saturation.cpp        | 146 +++++++++++++-----
 1 file changed, 107 insertions(+), 39 deletions(-)

diff --git a/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp b/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp
index 2c7b8ff5..6c736aad 100644
--- a/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp
@@ -179,30 +179,45 @@ void EmitX64::EmitVectorUnsignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst)
 
     const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
     const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
     const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
 
-    // TODO AVX2, AVX-512: vpternlog
+    // TODO AVX2
+    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
+        // Do a regular unsigned addition
+        code.vpaddd(result, result, addend);
 
-    code.movaps(tmp, result);
-    code.movaps(xmm0, result);
+        // Test if an overflow happened
+        code.vpcmpud(k1, result, addend, CmpInt::LessThan);
 
-    code.pxor(xmm0, addend);
-    code.pand(tmp, addend);
-    code.paddd(result, addend);
+        // Write 0b1111... where overflows have happened
+        // This is just a quick way to do this without touching memory
+        code.vpternlogd(result | k1, result, result, 0xFF);
 
-    code.psrld(xmm0, 1);
-    code.paddd(tmp, xmm0);
-    code.psrad(tmp, 31);
-
-    code.por(result, tmp);
-
-    if (code.HasHostFeature(HostFeature::SSE41)) {
-        code.ptest(tmp, tmp);
+        // Set ZF if an overflow happened
+        code.ktestb(k1, k1);
     } else {
-        code.movmskps(overflow.cvt32(), tmp);
-        code.test(overflow.cvt32(), overflow.cvt32());
+        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+        code.movaps(tmp, result);
+        code.movaps(xmm0, result);
+
+        code.pxor(xmm0, addend);
+        code.pand(tmp, addend);
+        code.paddd(result, addend);
+
+        code.psrld(xmm0, 1);
+        code.paddd(tmp, xmm0);
+        code.psrad(tmp, 31);
+
+        code.por(result, tmp);
+
+        if (code.HasHostFeature(HostFeature::SSE41)) {
+            code.ptest(tmp, tmp);
+        } else {
+            code.movmskps(overflow.cvt32(), tmp);
+            code.test(overflow.cvt32(), overflow.cvt32());
+        }
     }
+
     code.setnz(overflow);
     code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
 
@@ -214,31 +229,46 @@ void EmitX64::EmitVectorUnsignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst)
 
     const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
     const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
     const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
 
-    // TODO AVX2, AVX-512: vpternlog
+    // TODO AVX2
+    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
+        // Do a regular unsigned addition
+        code.vpaddq(result, result, addend);
 
-    code.movaps(tmp, result);
-    code.movaps(xmm0, result);
+        // Test if an overflow happened
+        code.vpcmpuq(k1, result, addend, CmpInt::LessThan);
 
-    code.pxor(xmm0, addend);
-    code.pand(tmp, addend);
-    code.paddq(result, addend);
+        // Write 0b1111... where overflows have happened
+        // This is just a quick way to do this without touching memory
+        code.vpternlogq(result | k1, result, result, 0xFF);
 
-    code.psrlq(xmm0, 1);
-    code.paddq(tmp, xmm0);
-    code.psrad(tmp, 31);
-    code.pshufd(tmp, tmp, 0b11110101);
-
-    code.por(result, tmp);
-
-    if (code.HasHostFeature(HostFeature::SSE41)) {
-        code.ptest(tmp, tmp);
+        // Set ZF if an overflow happened
+        code.ktestb(k1, k1);
     } else {
-        code.movmskpd(overflow.cvt32(), tmp);
-        code.test(overflow.cvt32(), overflow.cvt32());
+        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+        code.movaps(tmp, result);
+        code.movaps(xmm0, result);
+
+        code.pxor(xmm0, addend);
+        code.pand(tmp, addend);
+        code.paddq(result, addend);
+
+        code.psrlq(xmm0, 1);
+        code.paddq(tmp, xmm0);
+        code.psrad(tmp, 31);
+        code.pshufd(tmp, tmp, 0b11110101);
+
+        code.por(result, tmp);
+
+        if (code.HasHostFeature(HostFeature::SSE41)) {
+            code.ptest(tmp, tmp);
+        } else {
+            code.movmskpd(overflow.cvt32(), tmp);
+            code.test(overflow.cvt32(), overflow.cvt32());
+        }
     }
+
     code.setnz(overflow);
     code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
 
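The overflow test in both Add paths relies on unsigned wraparound: for c = a + b mod 2^N, the addition overflowed iff c < b (equivalently c < a), so the vpcmpud/vpcmpuq against one operand recovers the per-lane carry-out. vpternlog with immediate 0xFF is a truth table that outputs 1 for every input combination; under merge-masking (result | k1) it writes all-ones only to the lanes selected by k1, saturating them without loading a constant from memory. The pre-AVX-512 fallback derives the same carry from the identity a + b = (a ^ b) + 2*(a & b): the sum (a & b) + ((a ^ b) >> 1) places the carry-out in each lane's sign bit, which psrad then smears across the lane. A scalar sketch of the per-lane behaviour, assuming hypothetical names (saturated_add_u32, qc) that are not part of the patch:

    #include <cstdint>

    // Scalar model of the AVX-512 path: wrapping add, carry test,
    // merge-masked saturation to all-ones.
    static uint32_t saturated_add_u32(uint32_t a, uint32_t b, bool& qc) {
        const uint32_t sum = a + b;     // vpaddd: modular addition
        const bool overflow = sum < b;  // vpcmpud ... CmpInt::LessThan
        qc = qc || overflow;            // sticky saturation flag (FPSR.QC)
        return overflow ? UINT32_MAX    // vpternlogd result{k1}, imm 0xFF
                        : sum;
    }
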
@@ -261,9 +291,28 @@ void EmitX64::EmitVectorUnsignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst)
     const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
     const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
 
-    // TODO AVX2, AVX-512: vpternlog
     code.movaps(tmp, result);
+
+    // TODO AVX2
+    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
+        // Do a regular unsigned subtraction
+        code.vpsubd(result, result, subtrahend);
+
+        // Test if an underflow happened
+        code.vpcmpud(k1, result, tmp, CmpInt::GreaterThan);
+
+        // Write 0 where underflows have happened
+        code.vpxord(result | k1, result, result);
+
+        // Set ZF if an underflow happened
+        code.ktestb(k1, k1);
+
+        code.setnz(overflow);
+        code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
+        ctx.reg_alloc.DefineValue(inst, result);
+        return;
+    }
+
     code.movaps(xmm0, subtrahend);
     code.pxor(tmp, subtrahend);
@@ -295,9 +344,28 @@ void EmitX64::EmitVectorUnsignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst)
     const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
     const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
 
-    // TODO AVX2, AVX-512: vpternlog
     code.movaps(tmp, result);
+
+    // TODO AVX2
+    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
+        // Do a regular unsigned subtraction
+        code.vpsubq(result, result, subtrahend);
+
+        // Test if an underflow happened
+        code.vpcmpuq(k1, result, tmp, CmpInt::GreaterThan);
+
+        // Write 0 where underflows have happened
+        code.vpxorq(result | k1, result, result);
+
+        // Set ZF if an underflow happened
+        code.ktestb(k1, k1);
+
+        code.setnz(overflow);
+        code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
+        ctx.reg_alloc.DefineValue(inst, result);
+        return;
+    }
+
     code.movaps(xmm0, subtrahend);
     code.pxor(tmp, subtrahend);
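
The subtraction paths run the same wraparound argument in reverse: for c = a - b mod 2^N, the subtraction borrowed iff c > a, which is why code.movaps(tmp, result) must still execute before the AVX-512 branch so that tmp holds the original minuend when vpcmpud/vpcmpuq compares against it. The merge-masked vpxord/vpxorq then zeroes exactly the lanes that borrowed, clamping them to the unsigned minimum, and ktestb (an AVX512DQ instruction, hence the feature check) sets ZF only when the mask is empty, so setnz records whether any lane saturated. A scalar sketch under the same assumptions (hypothetical names, not from the patch):

    #include <cstdint>

    // Scalar model of the AVX-512 path: wrapping subtract, borrow test
    // against the saved minuend, merge-masked clamp to zero.
    static uint64_t saturated_sub_u64(uint64_t a, uint64_t b, bool& qc) {
        const uint64_t saved = a;          // movaps tmp, result
        const uint64_t diff = a - b;       // vpsubq: modular subtraction
        const bool borrow = diff > saved;  // vpcmpuq ... CmpInt::GreaterThan
        qc = qc || borrow;                 // sticky saturation flag (FPSR.QC)
        return borrow ? 0 : diff;          // vpxorq result{k1}, result, result
    }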