emit_x64_packed: Optimize GE flag generation for signed packed add/sub

sum >= 0 is equivalent to sum > -1
This commit is contained in:
Merry 2022-05-17 23:44:38 +01:00
parent b224fad171
commit 2779f24862

View file

@ -47,15 +47,13 @@ void EmitX64::EmitPackedAddS8(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
if (ge_inst) {
const Xbyak::Xmm saturated_sum = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
code.pxor(xmm_ge, xmm_ge);
code.movdqa(saturated_sum, xmm_a);
code.paddsb(saturated_sum, xmm_b);
code.pcmpgtb(xmm_ge, saturated_sum);
code.pcmpeqb(saturated_sum, saturated_sum);
code.pxor(xmm_ge, saturated_sum);
code.pcmpeqb(xmm0, xmm0);
code.movdqa(xmm_ge, xmm_a);
code.paddsb(xmm_ge, xmm_b);
code.pcmpgtb(xmm_ge, xmm0);
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
ctx.EraseInstruction(ge_inst);
@ -116,15 +114,13 @@ void EmitX64::EmitPackedAddS16(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
if (ge_inst) {
const Xbyak::Xmm saturated_sum = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
code.pxor(xmm_ge, xmm_ge);
code.movdqa(saturated_sum, xmm_a);
code.paddsw(saturated_sum, xmm_b);
code.pcmpgtw(xmm_ge, saturated_sum);
code.pcmpeqw(saturated_sum, saturated_sum);
code.pxor(xmm_ge, saturated_sum);
code.pcmpeqw(xmm0, xmm0);
code.movdqa(xmm_ge, xmm_a);
code.paddsw(xmm_ge, xmm_b);
code.pcmpgtw(xmm_ge, xmm0);
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
ctx.EraseInstruction(ge_inst);
@ -166,15 +162,13 @@ void EmitX64::EmitPackedSubS8(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
if (ge_inst) {
const Xbyak::Xmm saturated_sum = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
code.pxor(xmm_ge, xmm_ge);
code.movdqa(saturated_sum, xmm_a);
code.psubsb(saturated_sum, xmm_b);
code.pcmpgtb(xmm_ge, saturated_sum);
code.pcmpeqb(saturated_sum, saturated_sum);
code.pxor(xmm_ge, saturated_sum);
code.pcmpeqb(xmm0, xmm0);
code.movdqa(xmm_ge, xmm_a);
code.psubsb(xmm_ge, xmm_b);
code.pcmpgtb(xmm_ge, xmm0);
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
ctx.EraseInstruction(ge_inst);
@ -244,15 +238,13 @@ void EmitX64::EmitPackedSubS16(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
if (ge_inst) {
const Xbyak::Xmm saturated_diff = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
code.pxor(xmm_ge, xmm_ge);
code.movdqa(saturated_diff, xmm_a);
code.psubsw(saturated_diff, xmm_b);
code.pcmpgtw(xmm_ge, saturated_diff);
code.pcmpeqw(saturated_diff, saturated_diff);
code.pxor(xmm_ge, saturated_diff);
code.pcmpeqw(xmm0, xmm0);
code.movdqa(xmm_ge, xmm_a);
code.psubsw(xmm_ge, xmm_b);
code.pcmpgtw(xmm_ge, xmm0);
ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
ctx.EraseInstruction(ge_inst);