From da261772eae1c1958cc4290375d07a151d920a06 Mon Sep 17 00:00:00 2001
From: MerryMage
Date: Thu, 26 Jul 2018 10:10:47 +0100
Subject: [PATCH] emit_x64_vector_floating_point: AVX implementation of
 FPVector{Max,Min}

---
 .../emit_x64_vector_floating_point.cpp | 50 ++++++++++++-------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/src/backend_x64/emit_x64_vector_floating_point.cpp b/src/backend_x64/emit_x64_vector_floating_point.cpp
index f746dc29..48c735b1 100644
--- a/src/backend_x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend_x64/emit_x64_vector_floating_point.cpp
@@ -504,23 +504,30 @@ void EmitX64::EmitFPVectorGreaterEqual64(EmitContext& ctx, IR::Inst* inst) {
 template<size_t fsize>
 static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     EmitThreeOpVectorOperation(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
-        const Xbyak::Xmm neq_mask = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
 
         // What we are doing here is handling the case when the inputs are differently signed zeros.
         // x86-64 treats differently signed zeros as equal while ARM does not.
         // Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero.
 
-        code.movaps(neq_mask, result);
-        code.movaps(anded, result);
-        FCODE(cmpneqp)(neq_mask, xmm_b);
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            FCODE(vcmpeqp)(mask, result, xmm_b);
+            FCODE(vandp)(anded, result, xmm_b);
+            FCODE(vmaxp)(result, result, xmm_b);
+            FCODE(vblendvp)(result, result, anded, mask);
+        } else {
+            code.movaps(mask, result);
+            code.movaps(anded, result);
+            FCODE(cmpneqp)(mask, xmm_b);
 
-        code.andps(anded, xmm_b);
-        FCODE(maxp)(result, xmm_b);
+            code.andps(anded, xmm_b);
+            FCODE(maxp)(result, xmm_b);
 
-        code.andps(result, neq_mask);
-        code.andnps(neq_mask, anded);
-        code.orps(result, neq_mask);
+            code.andps(result, mask);
+            code.andnps(mask, anded);
+            code.orps(result, mask);
+        }
     });
 }
 
@@ -535,23 +542,30 @@ void EmitX64::EmitFPVectorMax64(EmitContext& ctx, IR::Inst* inst) {
 template<size_t fsize>
 static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     EmitThreeOpVectorOperation(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
-        const Xbyak::Xmm neq_mask = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
         const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
 
         // What we are doing here is handling the case when the inputs are differently signed zeros.
         // x86-64 treats differently signed zeros as equal while ARM does not.
         // Thus if we OR together things that x86-64 thinks are equal we'll get the negative zero.
 
-        code.movaps(neq_mask, result);
-        code.movaps(ored, result);
-        FCODE(cmpneqp)(neq_mask, xmm_b);
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            FCODE(vcmpeqp)(mask, result, xmm_b);
+            FCODE(vorp)(ored, result, xmm_b);
+            FCODE(vminp)(result, result, xmm_b);
+            FCODE(vblendvp)(result, result, ored, mask);
+        } else {
+            code.movaps(mask, result);
+            code.movaps(ored, result);
+            FCODE(cmpneqp)(mask, xmm_b);
 
-        code.orps(ored, xmm_b);
-        FCODE(minp)(result, xmm_b);
+            code.orps(ored, xmm_b);
+            FCODE(minp)(result, xmm_b);
 
-        code.andps(result, neq_mask);
-        code.andnps(neq_mask, ored);
-        code.orps(result, neq_mask);
+            code.andps(result, mask);
+            code.andnps(mask, ored);
+            code.orps(result, mask);
+        }
     });
 }
 
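
For reference, the comments in both hunks describe why differently signed zeros need special handling: x86's maxps/maxpd (and minps/minpd) return the second operand when the inputs compare equal, so max(+0.0, -0.0) does not necessarily yield +0.0 as ARM requires. A minimal scalar C++ sketch of the same idea follows; the helper name fmax_arm_like is illustrative only and is not part of dynarmic, and NaN handling is out of scope for the sketch.

// Scalar sketch of the signed-zero handling emitted above (illustrative only).
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// ARM FMAX requires max(+0.0, -0.0) == +0.0, while x86 maxps/maxpd returns the
// second operand for equal inputs. When the operands compare equal, ANDing
// their bit patterns yields +0.0 whenever either input is +0.0 (and is a no-op
// for bit-identical values); this is what vblendvp selects wherever the
// vcmpeqp mask is set. FPVectorMin is symmetric: OR the bit patterns instead,
// so that min(+0.0, -0.0) == -0.0.
static double fmax_arm_like(double a, double b) {
    if (a == b) {
        std::uint64_t ua, ub;
        std::memcpy(&ua, &a, sizeof(ua));
        std::memcpy(&ub, &b, sizeof(ub));
        const std::uint64_t anded = ua & ub;  // the "anded" register above
        double r;
        std::memcpy(&r, &anded, sizeof(r));
        return r;
    }
    return a > b ? a : b;  // the ordinary maxp path
}

int main() {
    const double r = fmax_arm_like(-0.0, +0.0);
    std::printf("signbit=%d\n", std::signbit(r) ? 1 : 0);  // prints signbit=0, i.e. +0.0
}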