emit_x64_data_processing: Use BMI2 shifts where possible
This commit is contained in:
parent
ba6654b0e7
commit
b47e5ea1e1
1 changed file with 99 additions and 11 deletions
|
@ -291,8 +291,6 @@ void EmitX64::EmitLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
auto& shift_arg = args[1];
|
auto& shift_arg = args[1];
|
||||||
auto& carry_arg = args[2];
|
auto& carry_arg = args[2];
|
||||||
|
|
||||||
// TODO: Consider using BMI2 instructions like SHLX when arm-in-host flags is implemented.
|
|
||||||
|
|
||||||
if (!carry_inst) {
|
if (!carry_inst) {
|
||||||
if (shift_arg.IsImmediate()) {
|
if (shift_arg.IsImmediate()) {
|
||||||
const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
|
const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
|
||||||
|
@ -304,6 +302,18 @@ void EmitX64::EmitLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.xor_(result, result);
|
code.xor_(result, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
} else if (code.HasBMI2()) {
|
||||||
|
const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32();
|
||||||
|
const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
|
||||||
|
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
const Xbyak::Reg32 zero = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
|
||||||
|
code.shlx(result, operand, shift);
|
||||||
|
code.xor_(zero, zero);
|
||||||
|
code.cmp(shift.cvt8(), 32);
|
||||||
|
code.cmovnb(result, zero);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
} else {
|
} else {
|
||||||
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
|
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
|
||||||
|
@ -398,6 +408,18 @@ void EmitX64::EmitLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.xor_(result.cvt32(), result.cvt32());
|
code.xor_(result.cvt32(), result.cvt32());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
} else if (code.HasBMI2()) {
|
||||||
|
const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg);
|
||||||
|
const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
|
||||||
|
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
|
||||||
|
const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
|
||||||
|
|
||||||
|
code.shlx(result, operand, shift);
|
||||||
|
code.xor_(zero.cvt32(), zero.cvt32());
|
||||||
|
code.cmp(shift.cvt8(), 64);
|
||||||
|
code.cmovnb(result, zero);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
} else {
|
} else {
|
||||||
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
|
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
|
||||||
|
@ -405,7 +427,7 @@ void EmitX64::EmitLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
|
const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
|
||||||
|
|
||||||
// The x64 SHL instruction masks the shift count by 0x1F before performing the shift.
|
// The 64-bit x64 SHL instruction masks the shift count by 0x3F before performing the shift.
|
||||||
// ARM differs from the behaviour: It does not mask the count, so shifts above 31 result in zeros.
|
// ARM differs from the behaviour: It does not mask the count, so shifts above 63 result in zeros.
|
||||||
|
|
||||||
code.shl(result, code.cl);
|
code.shl(result, code.cl);
|
||||||
code.xor_(zero.cvt32(), zero.cvt32());
|
code.xor_(zero.cvt32(), zero.cvt32());
|
||||||
|
@ -435,6 +457,18 @@ void EmitX64::EmitLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.xor_(result, result);
|
code.xor_(result, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
} else if (code.HasBMI2()) {
|
||||||
|
const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32();
|
||||||
|
const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
|
||||||
|
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
const Xbyak::Reg32 zero = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
|
||||||
|
code.shrx(result, operand, shift);
|
||||||
|
code.xor_(zero, zero);
|
||||||
|
code.cmp(shift.cvt8(), 32);
|
||||||
|
code.cmovnb(result, zero);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
} else {
|
} else {
|
||||||
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
|
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
|
||||||
|
@ -530,6 +564,18 @@ void EmitX64::EmitLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.xor_(result.cvt32(), result.cvt32());
|
code.xor_(result.cvt32(), result.cvt32());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
} else if (code.HasBMI2()) {
|
||||||
|
const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg);
|
||||||
|
const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
|
||||||
|
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
|
||||||
|
const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
|
||||||
|
|
||||||
|
code.shrx(result, operand, shift);
|
||||||
|
code.xor_(zero.cvt32(), zero.cvt32());
|
||||||
|
code.cmp(shift.cvt8(), 63);
|
||||||
|
code.cmovnb(result, zero);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
} else {
|
} else {
|
||||||
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
|
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
|
||||||
|
@ -537,7 +583,7 @@ void EmitX64::EmitLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
|
const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
|
||||||
|
|
||||||
// The x64 SHR instruction masks the shift count by 0x1F before performing the shift.
|
// The 64-bit x64 SHR instruction masks the shift count by 0x3F before performing the shift.
|
||||||
// ARM differs from the behaviour: It does not mask the count, so shifts above 31 result in zeros.
|
// ARM differs from the behaviour: It does not mask the count, so shifts above 63 result in zeros.
|
||||||
|
|
||||||
code.shr(result, code.cl);
|
code.shr(result, code.cl);
|
||||||
code.xor_(zero.cvt32(), zero.cvt32());
|
code.xor_(zero.cvt32(), zero.cvt32());
|
||||||
|
@ -563,6 +609,22 @@ void EmitX64::EmitArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
code.sar(result, u8(shift < 31 ? shift : 31));
|
code.sar(result, u8(shift < 31 ? shift : 31));
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
} else if (code.HasBMI2()) {
|
||||||
|
const Xbyak::Reg32 shift = ctx.reg_alloc.UseScratchGpr(shift_arg).cvt32();
|
||||||
|
const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
|
||||||
|
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
const Xbyak::Reg32 const31 = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
|
||||||
|
// The 32-bit x64 SAR instruction masks the shift count by 0x1F before performing the shift.
|
||||||
|
// ARM differs from the behaviour: It does not mask the count.
|
||||||
|
|
||||||
|
// We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31.
|
||||||
|
code.mov(const31, 31);
|
||||||
|
code.cmp(shift.cvt8(), 31);
|
||||||
|
code.cmovnb(shift, const31);
|
||||||
|
code.sarx(result, operand, shift);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
} else {
|
} else {
|
||||||
ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
|
ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
|
||||||
|
@ -574,9 +636,8 @@ void EmitX64::EmitArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
// We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31.
|
// We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31.
|
||||||
code.mov(const31, 31);
|
code.mov(const31, 31);
|
||||||
code.movzx(code.ecx, code.cl);
|
code.cmp(code.cl, u32(31));
|
||||||
code.cmp(code.ecx, u32(31));
|
code.cmova(code.ecx, const31);
|
||||||
code.cmovg(code.ecx, const31);
|
|
||||||
code.sar(result, code.cl);
|
code.sar(result, code.cl);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -647,6 +708,18 @@ void EmitX64::EmitArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
code.sar(result, u8(shift < 63 ? shift : 63));
|
code.sar(result, u8(shift < 63 ? shift : 63));
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
} else if (code.HasBMI2()) {
|
||||||
|
const Xbyak::Reg64 shift = ctx.reg_alloc.UseScratchGpr(shift_arg);
|
||||||
|
const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
|
||||||
|
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
|
||||||
|
const Xbyak::Reg64 const63 = ctx.reg_alloc.ScratchGpr();
|
||||||
|
|
||||||
|
code.mov(const63.cvt32(), 63);
|
||||||
|
code.cmp(shift.cvt8(), 63);
|
||||||
|
code.cmovnb(shift, const63);
|
||||||
|
code.sarx(result, operand, shift);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
} else {
|
} else {
|
||||||
ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
|
ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
|
||||||
|
@ -658,8 +731,7 @@ void EmitX64::EmitArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
// We note that all shift values above 63 have the same behaviour as 63 does, so we saturate `shift` to 63.
|
// We note that all shift values above 63 have the same behaviour as 63 does, so we saturate `shift` to 63.
|
||||||
code.mov(const63, 63);
|
code.mov(const63, 63);
|
||||||
code.movzx(code.ecx, code.cl);
|
code.cmp(code.cl, u32(63));
|
||||||
code.cmp(code.ecx, u32(63));
|
|
||||||
code.cmovg(code.ecx, const63);
|
code.cmovg(code.ecx, const63);
|
||||||
code.sar(result, code.cl);
|
code.sar(result, code.cl);
|
||||||
|
|
||||||
|
@ -676,7 +748,15 @@ void EmitX64::EmitRotateRight32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
auto& carry_arg = args[2];
|
auto& carry_arg = args[2];
|
||||||
|
|
||||||
if (!carry_inst) {
|
if (!carry_inst) {
|
||||||
if (shift_arg.IsImmediate()) {
|
if (shift_arg.IsImmediate() && code.HasBMI2()) {
|
||||||
|
const u8 shift = shift_arg.GetImmediateU8();
|
||||||
|
const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
|
||||||
|
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
|
||||||
|
code.rorx(result, operand, shift);
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
} else if (shift_arg.IsImmediate()) {
|
||||||
const u8 shift = shift_arg.GetImmediateU8();
|
const u8 shift = shift_arg.GetImmediateU8();
|
||||||
const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
|
const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
|
||||||
|
|
||||||
|
@ -751,7 +831,15 @@ void EmitX64::EmitRotateRight64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
auto& operand_arg = args[0];
|
auto& operand_arg = args[0];
|
||||||
auto& shift_arg = args[1];
|
auto& shift_arg = args[1];
|
||||||
|
|
||||||
if (shift_arg.IsImmediate()) {
|
if (shift_arg.IsImmediate() && code.HasBMI2()) {
|
||||||
|
const u8 shift = shift_arg.GetImmediateU8();
|
||||||
|
const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
|
||||||
|
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
|
||||||
|
|
||||||
|
code.rorx(result, operand, shift);
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
} else if (shift_arg.IsImmediate()) {
|
||||||
const u8 shift = shift_arg.GetImmediateU8();
|
const u8 shift = shift_arg.GetImmediateU8();
|
||||||
const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
|
const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue