emit_x64_vector_floating_point: AVX implementation of FPVector{Max,Min}

This commit is contained in:
MerryMage 2018-07-26 10:10:47 +01:00
parent a0d6f0de57
commit da261772ea

View file

@ -504,23 +504,30 @@ void EmitX64::EmitFPVectorGreaterEqual64(EmitContext& ctx, IR::Inst* inst) {
template<size_t fsize> template<size_t fsize>
static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){ EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
const Xbyak::Xmm neq_mask = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
// What we are doing here is handling the case when the inputs are differently signed zeros. // What we are doing here is handling the case when the inputs are differently signed zeros.
// x86-64 treats differently signed zeros as equal while ARM does not. // x86-64 treats differently signed zeros as equal while ARM does not.
// Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero. // Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero.
code.movaps(neq_mask, result); if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
FCODE(vcmpeqp)(mask, result, xmm_b);
FCODE(vandp)(anded, result, xmm_b);
FCODE(vmaxp)(result, result, xmm_b);
FCODE(vblendvp)(result, result, anded, mask);
} else {
code.movaps(mask, result);
code.movaps(anded, result); code.movaps(anded, result);
FCODE(cmpneqp)(neq_mask, xmm_b); FCODE(cmpneqp)(mask, xmm_b);
code.andps(anded, xmm_b); code.andps(anded, xmm_b);
FCODE(maxp)(result, xmm_b); FCODE(maxp)(result, xmm_b);
code.andps(result, neq_mask); code.andps(result, mask);
code.andnps(neq_mask, anded); code.andnps(mask, anded);
code.orps(result, neq_mask); code.orps(result, mask);
}
}); });
} }
@ -535,23 +542,30 @@ void EmitX64::EmitFPVectorMax64(EmitContext& ctx, IR::Inst* inst) {
template<size_t fsize> template<size_t fsize>
static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){ EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
const Xbyak::Xmm neq_mask = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
// What we are doing here is handling the case when the inputs are differently signed zeros. // What we are doing here is handling the case when the inputs are differently signed zeros.
// x86-64 treats differently signed zeros as equal while ARM does not. // x86-64 treats differently signed zeros as equal while ARM does not.
// Thus if we OR together things that x86-64 thinks are equal we'll get the negative zero. // Thus if we OR together things that x86-64 thinks are equal we'll get the negative zero.
code.movaps(neq_mask, result); if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
FCODE(vcmpeqp)(mask, result, xmm_b);
FCODE(vorp)(ored, result, xmm_b);
FCODE(vminp)(result, result, xmm_b);
FCODE(vblendvp)(result, result, ored, mask);
} else {
code.movaps(mask, result);
code.movaps(ored, result); code.movaps(ored, result);
FCODE(cmpneqp)(neq_mask, xmm_b); FCODE(cmpneqp)(mask, xmm_b);
code.orps(ored, xmm_b); code.orps(ored, xmm_b);
FCODE(minp)(result, xmm_b); FCODE(minp)(result, xmm_b);
code.andps(result, neq_mask); code.andps(result, mask);
code.andnps(neq_mask, ored); code.andnps(mask, ored);
code.orps(result, neq_mask); code.orps(result, mask);
}
}); });
} }