emit_x64_data_processing: Use BMI2 shifts where possible

MerryMage 2020-12-28 22:29:44 +00:00
parent ba6654b0e7
commit b47e5ea1e1

@@ -291,8 +291,6 @@ void EmitX64::EmitLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
auto& shift_arg = args[1];
auto& carry_arg = args[2];
// TODO: Consider using BMI2 instructions like SHLX when arm-in-host flags is implemented.
if (!carry_inst) {
if (shift_arg.IsImmediate()) {
const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
@@ -304,6 +302,18 @@ void EmitX64::EmitLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
code.xor_(result, result);
}
ctx.reg_alloc.DefineValue(inst, result);
} else if (code.HasBMI2()) {
const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32();
const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 zero = ctx.reg_alloc.ScratchGpr().cvt32();
code.shlx(result, operand, shift);
code.xor_(zero, zero);
code.cmp(shift.cvt8(), 32);
code.cmovnb(result, zero);
ctx.reg_alloc.DefineValue(inst, result);
} else {
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
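
Unlike the legacy SHL used in the fallback just above, SHLX takes its count from any general-purpose register and leaves RFLAGS untouched, so the shift argument no longer has to be pinned to HostLoc::RCX. It is also why the BMI2 path is gated on !carry_inst: BMI2 shifts produce no carry. A minimal sketch, in plain C++ rather than dynarmic code, of the value the SHLX/XOR/CMP/CMOVNB sequence computes (the function name is illustrative):

    #include <cstdint>

    uint32_t lsl32(uint32_t operand, uint8_t shift) {
        const uint32_t shifted = operand << (shift & 31);  // SHLX masks the count to 5 bits
        return shift >= 32 ? 0 : shifted;                  // CMP shift, 32 + CMOVNB selects zero
    }
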
@@ -398,6 +408,18 @@ void EmitX64::EmitLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
code.xor_(result.cvt32(), result.cvt32());
}
ctx.reg_alloc.DefineValue(inst, result);
} else if (code.HasBMI2()) {
const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg);
const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
code.shlx(result, operand, shift);
code.xor_(zero.cvt32(), zero.cvt32());
code.cmp(shift.cvt8(), 64);
code.cmovnb(result, zero);
ctx.reg_alloc.DefineValue(inst, result);
} else {
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
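
A small detail in the 64-bit paths: the zero register is cleared with xor_(zero.cvt32(), zero.cvt32()) rather than a 64-bit XOR. Writing a 32-bit register on x64 zero-extends into the full 64-bit register, so the 32-bit XOR clears all 64 bits with a shorter encoding.
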
@@ -405,7 +427,7 @@ void EmitX64::EmitLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
// The x64 SHL instruction masks the shift count by 0x3F before performing the shift.
// ARM differs from this behaviour: it does not mask the count, so shifts above 31 result in zeros.
// ARM differs from this behaviour: it does not mask the count, so shifts above 63 result in zeros.
code.shl(result, code.cl);
code.xor_(zero.cvt32(), zero.cvt32());
@@ -435,6 +457,18 @@ void EmitX64::EmitLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
code.xor_(result, result);
}
ctx.reg_alloc.DefineValue(inst, result);
} else if (code.HasBMI2()) {
const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32();
const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 zero = ctx.reg_alloc.ScratchGpr().cvt32();
code.shrx(result, operand, shift);
code.xor_(zero, zero);
code.cmp(shift.cvt8(), 32);
code.cmovnb(result, zero);
ctx.reg_alloc.DefineValue(inst, result);
} else {
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
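
The right shift is symmetric: SHRX also masks its count to the low 5 bits, so the CMP/CMOVNB pair again supplies the ARM behaviour for counts of 32 and above. A sketch under the same assumptions as the LSL one:

    #include <cstdint>

    uint32_t lsr32(uint32_t operand, uint8_t shift) {
        const uint32_t shifted = operand >> (shift & 31);  // SHRX masks the count to 5 bits
        return shift >= 32 ? 0 : shifted;                  // counts of 32+ must yield zero
    }
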
@@ -530,6 +564,18 @@ void EmitX64::EmitLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
code.xor_(result.cvt32(), result.cvt32());
}
ctx.reg_alloc.DefineValue(inst, result);
} else if (code.HasBMI2()) {
const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg);
const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
code.shrx(result, operand, shift);
code.xor_(zero.cvt32(), zero.cvt32());
code.cmp(shift.cvt8(), 64);
code.cmovnb(result, zero);
ctx.reg_alloc.DefineValue(inst, result);
} else {
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
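
Note that the comparison bound in the 64-bit case must be 64 rather than 63: a count of exactly 63 is still a legal shift whose result can be non-zero (the operand's top bit), and only counts of 64 and above collapse to zero. In sketch form:

    #include <cstdint>

    uint64_t lsr64(uint64_t operand, uint8_t shift) {
        const uint64_t shifted = operand >> (shift & 63);  // SHRX masks a 64-bit count to 6 bits
        return shift >= 64 ? 0 : shifted;                  // shift == 63 is still a valid shift
    }
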
@@ -537,7 +583,7 @@ void EmitX64::EmitLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
// The x64 SHR instruction masks the shift count by 0x3F before performing the shift.
// ARM differs from this behaviour: it does not mask the count, so shifts above 31 result in zeros.
// ARM differs from this behaviour: it does not mask the count, so shifts above 63 result in zeros.
code.shr(result, code.cl);
code.xor_(zero.cvt32(), zero.cvt32());
@@ -563,6 +609,22 @@ void EmitX64::EmitArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
code.sar(result, u8(shift < 31 ? shift : 31));
ctx.reg_alloc.DefineValue(inst, result);
} else if (code.HasBMI2()) {
const Xbyak::Reg32 shift = ctx.reg_alloc.UseScratchGpr(shift_arg).cvt32();
const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 const31 = ctx.reg_alloc.ScratchGpr().cvt32();
// The 32-bit x64 SAR instruction masks the shift count by 0x1F before performing the shift.
// ARM differs from this behaviour: it does not mask the count.
// We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31.
code.mov(const31, 31);
code.cmp(shift.cvt8(), 31);
code.cmovnb(shift, const31);
code.sarx(result, operand, shift);
ctx.reg_alloc.DefineValue(inst, result);
} else {
ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
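
For arithmetic shifts nothing needs zeroing; the count is saturated instead, since a shift by 31 already fills the result with copies of the sign bit and every larger count behaves identically. A sketch of the saturate-then-SARX sequence (assuming the usual sign-extending >> on signed integers):

    #include <cstdint>

    int32_t asr32(int32_t operand, uint8_t shift) {
        const uint8_t s = shift >= 31 ? 31 : shift;  // CMP shift, 31 + CMOVNB saturates the count
        return operand >> s;                         // SARX performs the arithmetic shift
    }
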
@@ -574,9 +636,8 @@ void EmitX64::EmitArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
// We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31.
code.mov(const31, 31);
code.movzx(code.ecx, code.cl);
code.cmp(code.ecx, u32(31));
code.cmovg(code.ecx, const31);
code.cmp(code.cl, u32(31));
code.cmova(code.ecx, const31);
code.sar(result, code.cl);
ctx.reg_alloc.DefineValue(inst, result);
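
The switch from CMOVG to CMOVA in this fallback is not cosmetic: once the comparison is narrowed to CL, shift amounts in the range 128-255 read as negative signed bytes, so the signed "greater" condition would wrongly skip the saturation, while the unsigned "above" condition covers the full 0-255 range an A32 register shift can supply. The same reasoning applies to the 64-bit path below. An illustrative check of the two conditions:

    #include <cstdint>

    bool cmova_condition(uint8_t cl) { return cl > 31u; }                      // unsigned "above"
    bool cmovg_condition(uint8_t cl) { return static_cast<int8_t>(cl) > 31; }  // signed "greater"
    // For cl == 200: cmova_condition is true (saturates), cmovg_condition is false.
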
@@ -647,6 +708,18 @@ void EmitX64::EmitArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) {
code.sar(result, u8(shift < 63 ? shift : 63));
ctx.reg_alloc.DefineValue(inst, result);
} else if (code.HasBMI2()) {
const Xbyak::Reg64 shift = ctx.reg_alloc.UseScratchGpr(shift_arg);
const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 const63 = ctx.reg_alloc.ScratchGpr();
code.mov(const63.cvt32(), 63);
code.cmp(shift.cvt8(), 63);
code.cmovnb(shift, const63);
code.sarx(result, operand, shift);
ctx.reg_alloc.DefineValue(inst, result);
} else {
ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
@@ -658,8 +731,7 @@ void EmitX64::EmitArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) {
// We note that all shift values above 63 have the same behaviour as 63 does, so we saturate `shift` to 63.
code.mov(const63, 63);
code.movzx(code.ecx, code.cl);
code.cmp(code.ecx, u32(63));
code.cmp(code.cl, u32(63));
code.cmova(code.ecx, const63);
code.sar(result, code.cl);
@@ -676,7 +748,15 @@ void EmitX64::EmitRotateRight32(EmitContext& ctx, IR::Inst* inst) {
auto& carry_arg = args[2];
if (!carry_inst) {
if (shift_arg.IsImmediate()) {
if (shift_arg.IsImmediate() && code.HasBMI2()) {
const u8 shift = shift_arg.GetImmediateU8();
const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
code.rorx(result, operand, shift);
ctx.reg_alloc.DefineValue(inst, result);
} else if (shift_arg.IsImmediate()) {
const u8 shift = shift_arg.GetImmediateU8();
const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
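
RORX rotates by an immediate, writes to a separate destination register, and leaves flags untouched, so the immediate rotate no longer needs to clobber a scratch copy of the operand. A sketch of the rotation it performs (RORX takes the count modulo the operand width; names are illustrative):

    #include <cstdint>

    uint32_t ror32(uint32_t x, uint8_t n) {
        n &= 31;                                    // RORX uses count mod 32
        return (x >> n) | (x << ((32u - n) & 31));  // well-defined even when n == 0
    }
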
@@ -751,7 +831,15 @@ void EmitX64::EmitRotateRight64(EmitContext& ctx, IR::Inst* inst) {
auto& operand_arg = args[0];
auto& shift_arg = args[1];
if (shift_arg.IsImmediate()) {
if (shift_arg.IsImmediate() && code.HasBMI2()) {
const u8 shift = shift_arg.GetImmediateU8();
const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
code.rorx(result, operand, shift);
ctx.reg_alloc.DefineValue(inst, result);
} else if (shift_arg.IsImmediate()) {
const u8 shift = shift_arg.GetImmediateU8();
const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);