emit_x64_vector_floating_point: Prefer blendvp{s,d} to vblendvp{s,d} where possible
It's a cheaper instruction.
This commit is contained in:
parent
476c0f15da
commit
dfb660cd16
1 changed file with 10 additions and 10 deletions
|
@ -153,10 +153,10 @@ Xbyak::Address GetSmallestNormalVector(BlockOfCode& code) {
|
||||||
template<size_t fsize>
|
template<size_t fsize>
|
||||||
void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
|
void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
|
||||||
if (ctx.FPSCR_DN()) {
|
if (ctx.FPSCR_DN()) {
|
||||||
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm nan_mask = xmm0;
|
||||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
|
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
|
||||||
FCODE(vcmpunordp)(nan_mask, result, result);
|
FCODE(vcmpunordp)(nan_mask, result, result);
|
||||||
FCODE(vblendvp)(result, result, GetNaNVector<fsize>(code), nan_mask);
|
FCODE(blendvp)(result, GetNaNVector<fsize>(code));
|
||||||
} else {
|
} else {
|
||||||
code.movaps(nan_mask, result);
|
code.movaps(nan_mask, result);
|
||||||
FCODE(cmpordp)(nan_mask, nan_mask);
|
FCODE(cmpordp)(nan_mask, nan_mask);
|
||||||
|
@ -572,7 +572,7 @@ static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
|
||||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
const Xbyak::Xmm xmm_b = ctx.FPSCR_FTZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm xmm_b = ctx.FPSCR_FTZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
|
||||||
|
|
||||||
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm mask = xmm0;
|
||||||
const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
|
||||||
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
|
@ -583,7 +583,7 @@ static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
|
||||||
FCODE(vcmpunordp)(nan_mask, result, xmm_b);
|
FCODE(vcmpunordp)(nan_mask, result, xmm_b);
|
||||||
FCODE(vandp)(anded, result, xmm_b);
|
FCODE(vandp)(anded, result, xmm_b);
|
||||||
FCODE(vmaxp)(result, result, xmm_b);
|
FCODE(vmaxp)(result, result, xmm_b);
|
||||||
FCODE(vblendvp)(result, result, anded, mask);
|
FCODE(blendvp)(result, anded);
|
||||||
FCODE(vblendvp)(result, result, GetNaNVector<fsize>(code), nan_mask);
|
FCODE(vblendvp)(result, result, GetNaNVector<fsize>(code), nan_mask);
|
||||||
} else {
|
} else {
|
||||||
code.movaps(mask, result);
|
code.movaps(mask, result);
|
||||||
|
@ -610,7 +610,7 @@ static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
|
||||||
}
|
}
|
||||||
|
|
||||||
EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b){
|
EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b){
|
||||||
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm mask = xmm0;
|
||||||
const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
if (ctx.FPSCR_FTZ()) {
|
if (ctx.FPSCR_FTZ()) {
|
||||||
|
@ -628,7 +628,7 @@ static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
|
||||||
FCODE(vcmpeqp)(mask, result, xmm_b);
|
FCODE(vcmpeqp)(mask, result, xmm_b);
|
||||||
FCODE(vandp)(anded, result, xmm_b);
|
FCODE(vandp)(anded, result, xmm_b);
|
||||||
FCODE(vmaxp)(result, result, xmm_b);
|
FCODE(vmaxp)(result, result, xmm_b);
|
||||||
FCODE(vblendvp)(result, result, anded, mask);
|
FCODE(blendvp)(result, anded);
|
||||||
} else {
|
} else {
|
||||||
code.movaps(mask, result);
|
code.movaps(mask, result);
|
||||||
code.movaps(anded, result);
|
code.movaps(anded, result);
|
||||||
|
@ -659,7 +659,7 @@ static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
|
||||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
const Xbyak::Xmm xmm_b = ctx.FPSCR_FTZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm xmm_b = ctx.FPSCR_FTZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
|
||||||
|
|
||||||
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm mask = xmm0;
|
||||||
const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
|
||||||
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
|
@ -670,7 +670,7 @@ static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
|
||||||
FCODE(vcmpunordp)(nan_mask, result, xmm_b);
|
FCODE(vcmpunordp)(nan_mask, result, xmm_b);
|
||||||
FCODE(vorp)(ored, result, xmm_b);
|
FCODE(vorp)(ored, result, xmm_b);
|
||||||
FCODE(vminp)(result, result, xmm_b);
|
FCODE(vminp)(result, result, xmm_b);
|
||||||
FCODE(vblendvp)(result, result, ored, mask);
|
FCODE(blendvp)(result, ored);
|
||||||
FCODE(vblendvp)(result, result, GetNaNVector<fsize>(code), nan_mask);
|
FCODE(vblendvp)(result, result, GetNaNVector<fsize>(code), nan_mask);
|
||||||
} else {
|
} else {
|
||||||
code.movaps(mask, result);
|
code.movaps(mask, result);
|
||||||
|
@ -697,7 +697,7 @@ static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
|
||||||
}
|
}
|
||||||
|
|
||||||
EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b){
|
EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b){
|
||||||
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm mask = xmm0;
|
||||||
const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
if (ctx.FPSCR_FTZ()) {
|
if (ctx.FPSCR_FTZ()) {
|
||||||
|
@ -715,7 +715,7 @@ static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst)
|
||||||
FCODE(vcmpeqp)(mask, result, xmm_b);
|
FCODE(vcmpeqp)(mask, result, xmm_b);
|
||||||
FCODE(vorp)(ored, result, xmm_b);
|
FCODE(vorp)(ored, result, xmm_b);
|
||||||
FCODE(vminp)(result, result, xmm_b);
|
FCODE(vminp)(result, result, xmm_b);
|
||||||
FCODE(vblendvp)(result, result, ored, mask);
|
FCODE(blendvp)(result, ored);
|
||||||
} else {
|
} else {
|
||||||
code.movaps(mask, result);
|
code.movaps(mask, result);
|
||||||
code.movaps(ored, result);
|
code.movaps(ored, result);
|
||||||
|
|
Loading…
Reference in a new issue