block_of_code: Encapsulate CPU feature detection code

This commit is contained in:
MerryMage 2020-06-09 21:25:57 +01:00
parent feddf69cb4
commit f495018f53
14 changed files with 259 additions and 184 deletions

View file

@ -284,7 +284,7 @@ void A32EmitX64::GenTerminalHandlers() {
calculate_location_descriptor(); calculate_location_descriptor();
code.L(rsb_cache_miss); code.L(rsb_cache_miss);
code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data())); code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE42)) { if (code.HasSSE42()) {
code.crc32(ebp, r12d); code.crc32(ebp, r12d);
} }
code.and_(ebp, fast_dispatch_table_mask); code.and_(ebp, fast_dispatch_table_mask);
@ -302,7 +302,7 @@ void A32EmitX64::GenTerminalHandlers() {
code.align(); code.align();
fast_dispatch_table_lookup = code.getCurr<FastDispatchEntry&(*)(u64)>(); fast_dispatch_table_lookup = code.getCurr<FastDispatchEntry&(*)(u64)>();
code.mov(code.ABI_PARAM2, reinterpret_cast<u64>(fast_dispatch_table.data())); code.mov(code.ABI_PARAM2, reinterpret_cast<u64>(fast_dispatch_table.data()));
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE42)) { if (code.HasSSE42()) {
code.crc32(code.ABI_PARAM1.cvt32(), code.ABI_PARAM2.cvt32()); code.crc32(code.ABI_PARAM1.cvt32(), code.ABI_PARAM2.cvt32());
} }
code.and_(code.ABI_PARAM1.cvt32(), fast_dispatch_table_mask); code.and_(code.ABI_PARAM1.cvt32(), fast_dispatch_table_mask);
@ -417,7 +417,7 @@ static u32 GetCpsrImpl(A32JitState* jit_state) {
} }
void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) { void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) { if (code.HasBMI2()) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
@ -456,7 +456,7 @@ static void SetCpsrImpl(u32 value, A32JitState* jit_state) {
void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) { void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) { if (code.HasBMI2()) {
const Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); const Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
@ -514,7 +514,7 @@ void A32EmitX64::EmitA32SetCpsrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
const u32 imm = args[0].GetImmediateU32(); const u32 imm = args[0].GetImmediateU32();
code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm)); code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
} else if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) { } else if (code.HasBMI2()) {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
@ -539,7 +539,7 @@ void A32EmitX64::EmitA32SetCpsrNZCVQ(A32EmitContext& ctx, IR::Inst* inst) {
code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm)); code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_q)], u8((imm & 0x08000000) != 0 ? 1 : 0)); code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_q)], u8((imm & 0x08000000) != 0 ? 1 : 0));
} else if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) { } else if (code.HasBMI2()) {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
@ -666,7 +666,7 @@ void A32EmitX64::EmitA32SetGEFlagsCompressed(A32EmitContext& ctx, IR::Inst* inst
ge |= Common::Bit<16>(imm) ? 0x000000FF : 0; ge |= Common::Bit<16>(imm) ? 0x000000FF : 0;
code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], ge); code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], ge);
} else if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) { } else if (code.HasBMI2()) {
const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();

View file

@ -154,7 +154,7 @@ void A64EmitX64::GenMemory128Accessors() {
#else #else
code.sub(rsp, 8); code.sub(rsp, 8);
Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCall(code); Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCall(code);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.movq(xmm1, code.ABI_RETURN); code.movq(xmm1, code.ABI_RETURN);
code.pinsrq(xmm1, code.ABI_RETURN2, 1); code.pinsrq(xmm1, code.ABI_RETURN2, 1);
} else { } else {
@ -177,7 +177,7 @@ void A64EmitX64::GenMemory128Accessors() {
code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE); code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE);
#else #else
code.sub(rsp, 8); code.sub(rsp, 8);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.movq(code.ABI_PARAM3, xmm1); code.movq(code.ABI_PARAM3, xmm1);
code.pextrq(code.ABI_PARAM4, xmm1, 1); code.pextrq(code.ABI_PARAM4, xmm1, 1);
} else { } else {
@ -328,7 +328,7 @@ void A64EmitX64::GenTerminalHandlers() {
calculate_location_descriptor(); calculate_location_descriptor();
code.L(rsb_cache_miss); code.L(rsb_cache_miss);
code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data())); code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE42)) { if (code.HasSSE42()) {
code.crc32(rbx, r12d); code.crc32(rbx, r12d);
} }
code.and_(ebp, fast_dispatch_table_mask); code.and_(ebp, fast_dispatch_table_mask);
@ -346,7 +346,7 @@ void A64EmitX64::GenTerminalHandlers() {
code.align(); code.align();
fast_dispatch_table_lookup = code.getCurr<FastDispatchEntry&(*)(u64)>(); fast_dispatch_table_lookup = code.getCurr<FastDispatchEntry&(*)(u64)>();
code.mov(code.ABI_PARAM2, reinterpret_cast<u64>(fast_dispatch_table.data())); code.mov(code.ABI_PARAM2, reinterpret_cast<u64>(fast_dispatch_table.data()));
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE42)) { if (code.HasSSE42()) {
code.crc32(code.ABI_PARAM1, code.ABI_PARAM2); code.crc32(code.ABI_PARAM1, code.ABI_PARAM2);
} }
code.and_(code.ABI_PARAM1.cvt32(), fast_dispatch_table_mask); code.and_(code.ABI_PARAM1.cvt32(), fast_dispatch_table_mask);

View file

@ -61,7 +61,7 @@ void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size, const
size_t xmm_offset = frame_info.xmm_offset; size_t xmm_offset = frame_info.xmm_offset;
for (HostLoc xmm : regs) { for (HostLoc xmm : regs) {
if (HostLocIsXMM(xmm)) { if (HostLocIsXMM(xmm)) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vmovaps(code.xword[rsp + xmm_offset], HostLocToXmm(xmm)); code.vmovaps(code.xword[rsp + xmm_offset], HostLocToXmm(xmm));
} else { } else {
code.movaps(code.xword[rsp + xmm_offset], HostLocToXmm(xmm)); code.movaps(code.xword[rsp + xmm_offset], HostLocToXmm(xmm));
@ -83,7 +83,7 @@ void ABI_PopRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size, const
size_t xmm_offset = frame_info.xmm_offset; size_t xmm_offset = frame_info.xmm_offset;
for (HostLoc xmm : regs) { for (HostLoc xmm : regs) {
if (HostLocIsXMM(xmm)) { if (HostLocIsXMM(xmm)) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vmovaps(HostLocToXmm(xmm), code.xword[rsp + xmm_offset]); code.vmovaps(HostLocToXmm(xmm), code.xword[rsp + xmm_offset]);
} else { } else {
code.movaps(HostLocToXmm(xmm), code.xword[rsp + xmm_offset]); code.movaps(HostLocToXmm(xmm), code.xword[rsp + xmm_offset]);

View file

@ -315,6 +315,67 @@ void BlockOfCode::EnsurePatchLocationSize(CodePtr begin, size_t size) {
nop(size - current_size); nop(size - current_size);
} }
bool BlockOfCode::HasSSSE3() const {
return DoesCpuSupport(Xbyak::util::Cpu::tSSSE3);
}
bool BlockOfCode::HasSSE41() const {
return DoesCpuSupport(Xbyak::util::Cpu::tSSE41);
}
bool BlockOfCode::HasSSE42() const {
return DoesCpuSupport(Xbyak::util::Cpu::tSSE42);
}
bool BlockOfCode::HasPCLMULQDQ() const {
return DoesCpuSupport(Xbyak::util::Cpu::tPCLMULQDQ);
}
bool BlockOfCode::HasAVX() const {
return DoesCpuSupport(Xbyak::util::Cpu::tAVX);
}
bool BlockOfCode::HasF16C() const {
return DoesCpuSupport(Xbyak::util::Cpu::tF16C);
}
bool BlockOfCode::HasAESNI() const {
return DoesCpuSupport(Xbyak::util::Cpu::tAESNI);
}
bool BlockOfCode::HasLZCNT() const {
return DoesCpuSupport(Xbyak::util::Cpu::tLZCNT);
}
bool BlockOfCode::HasBMI1() const {
return DoesCpuSupport(Xbyak::util::Cpu::tBMI1);
}
bool BlockOfCode::HasBMI2() const {
return DoesCpuSupport(Xbyak::util::Cpu::tBMI2);
}
bool BlockOfCode::HasFMA() const {
return DoesCpuSupport(Xbyak::util::Cpu::tFMA);
}
bool BlockOfCode::HasAVX2() const {
return DoesCpuSupport(Xbyak::util::Cpu::tAVX2);
}
bool BlockOfCode::HasAVX512_Skylake() const {
// The feature set formerly known as AVX3.2. (Introduced with Skylake.)
return DoesCpuSupport(Xbyak::util::Cpu::tAVX512F)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512CD)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512BW)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512DQ)
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL);
}
bool BlockOfCode::HasAVX512_BITALG() const {
return DoesCpuSupport(Xbyak::util::Cpu::tAVX512_BITALG);
}
bool BlockOfCode::DoesCpuSupport([[maybe_unused]] Xbyak::util::Cpu::Type type) const { bool BlockOfCode::DoesCpuSupport([[maybe_unused]] Xbyak::util::Cpu::Type type) const {
#ifdef DYNARMIC_ENABLE_CPU_FEATURE_DETECTION #ifdef DYNARMIC_ENABLE_CPU_FEATURE_DETECTION
return cpu_info.has(type); return cpu_info.has(type);

View file

@ -138,10 +138,23 @@ public:
static const std::array<Xbyak::Reg64, 6> ABI_PARAMS; static const std::array<Xbyak::Reg64, 6> ABI_PARAMS;
#endif #endif
bool DoesCpuSupport(Xbyak::util::Cpu::Type type) const;
JitStateInfo GetJitStateInfo() const { return jsi; } JitStateInfo GetJitStateInfo() const { return jsi; }
// CPU feature queries used by the emitters to choose an instruction sequence.
// Each one forwards to DoesCpuSupport() with the matching Xbyak::util::Cpu
// feature flag.
bool HasSSSE3() const;
bool HasSSE41() const;
bool HasSSE42() const;
bool HasPCLMULQDQ() const;
bool HasAVX() const;
bool HasF16C() const;
bool HasAESNI() const;
bool HasLZCNT() const;
bool HasBMI1() const;
bool HasBMI2() const;
bool HasFMA() const;
bool HasAVX2() const;
// True only when AVX512F, AVX512CD, AVX512BW, AVX512DQ and AVX512VL are all reported.
bool HasAVX512_Skylake() const;
bool HasAVX512_BITALG() const;
private: private:
RunCodeCallbacks cb; RunCodeCallbacks cb;
JitStateInfo jsi; JitStateInfo jsi;
@ -165,6 +178,7 @@ private:
void GenRunCode(std::function<void(BlockOfCode&)> rcp); void GenRunCode(std::function<void(BlockOfCode&)> rcp);
Xbyak::util::Cpu cpu_info; Xbyak::util::Cpu cpu_info;
bool DoesCpuSupport(Xbyak::util::Cpu::Type type) const;
}; };
} // namespace Dynarmic::Backend::X64 } // namespace Dynarmic::Backend::X64

View file

@ -56,7 +56,7 @@ void EmitX64::EmitAESEncryptSingleRound(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitAESInverseMixColumns(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitAESInverseMixColumns(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAESNI)) { if (code.HasAESNI()) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
code.aesimc(data, data); code.aesimc(data, data);

View file

@ -19,7 +19,7 @@ namespace CRC32 = Common::Crypto::CRC32;
static void EmitCRC32Castagnoli(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, const int data_size) { static void EmitCRC32Castagnoli(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, const int data_size) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE42)) { if (code.HasSSE42()) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[1]).changeBit(data_size); const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[1]).changeBit(data_size);
code.crc32(crc, value); code.crc32(crc, value);
@ -35,7 +35,7 @@ static void EmitCRC32Castagnoli(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, const int data_size) { static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, const int data_size) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tPCLMULQDQ) && data_size < 32) { if (code.HasPCLMULQDQ() && data_size < 32) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg64 value = ctx.reg_alloc.UseScratchGpr(args[1]); const Xbyak::Reg64 value = ctx.reg_alloc.UseScratchGpr(args[1]);
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
@ -49,7 +49,7 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
code.movd(xmm_tmp, value.cvt32()); code.movd(xmm_tmp, value.cvt32());
code.pslldq(xmm_tmp, (64 - data_size) / 8); code.pslldq(xmm_tmp, (64 - data_size) / 8);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpclmulqdq(xmm_value, xmm_tmp, xmm_const, 0x00); code.vpclmulqdq(xmm_value, xmm_tmp, xmm_const, 0x00);
code.pclmulqdq(xmm_value, xmm_const, 0x10); code.pclmulqdq(xmm_value, xmm_const, 0x10);
code.pxor(xmm_value, xmm_tmp); code.pxor(xmm_value, xmm_tmp);
@ -66,7 +66,7 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tPCLMULQDQ) && data_size == 32) { if (code.HasPCLMULQDQ() && data_size == 32) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(args[1]).cvt32(); const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(args[1]).cvt32();
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
@ -87,7 +87,7 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tPCLMULQDQ) && data_size == 64) { if (code.HasPCLMULQDQ() && data_size == 64) {
const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]); const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]);
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();

View file

@ -36,7 +36,7 @@ void EmitX64::EmitPack2x64To1x128(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg64 hi = ctx.reg_alloc.UseGpr(args[1]); const Xbyak::Reg64 hi = ctx.reg_alloc.UseGpr(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.movq(result, lo); code.movq(result, lo);
code.pinsrq(result, hi, 1); code.pinsrq(result, hi, 1);
} else { } else {
@ -791,7 +791,7 @@ static void EmitMaskedShift32(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
} }
if constexpr (!std::is_same_v<BMI2FT, std::nullptr_t>) { if constexpr (!std::is_same_v<BMI2FT, std::nullptr_t>) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) { if (code.HasBMI2()) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32(); const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32(); const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32();
@ -828,7 +828,7 @@ static void EmitMaskedShift64(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
} }
if constexpr (!std::is_same_v<BMI2FT, std::nullptr_t>) { if constexpr (!std::is_same_v<BMI2FT, std::nullptr_t>) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) { if (code.HasBMI2()) {
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg); const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg); const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg);
@ -1424,7 +1424,7 @@ void EmitX64::EmitByteReverseDual(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tLZCNT)) { if (code.HasLZCNT()) {
const Xbyak::Reg32 source = ctx.reg_alloc.UseGpr(args[0]).cvt32(); const Xbyak::Reg32 source = ctx.reg_alloc.UseGpr(args[0]).cvt32();
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
@ -1448,7 +1448,7 @@ void EmitX64::EmitCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitCountLeadingZeros64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitCountLeadingZeros64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tLZCNT)) { if (code.HasLZCNT()) {
const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]).cvt64(); const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]).cvt64();
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr().cvt64(); const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr().cvt64();

View file

@ -93,7 +93,7 @@ void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list
code.andps(xmm0, xmm); code.andps(xmm0, xmm);
if constexpr (fsize == 32) { if constexpr (fsize == 32) {
code.pcmpgtd(xmm0, code.MConst(xword, f32_smallest_normal - 1)); code.pcmpgtd(xmm0, code.MConst(xword, f32_smallest_normal - 1));
} else if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE42)) { } else if (code.HasSSE42()) {
code.pcmpgtq(xmm0, code.MConst(xword, f64_smallest_normal - 1)); code.pcmpgtq(xmm0, code.MConst(xword, f64_smallest_normal - 1));
} else { } else {
code.pcmpgtd(xmm0, code.MConst(xword, f64_smallest_normal - 1)); code.pcmpgtd(xmm0, code.MConst(xword, f64_smallest_normal - 1));
@ -114,7 +114,7 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch)
template<size_t fsize> template<size_t fsize>
void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) { void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
FCODE(vcmpunords)(xmm0, result, result); FCODE(vcmpunords)(xmm0, result, result);
FCODE(blendvp)(result, code.MConst(xword, fsize == 32 ? f32_nan : f64_nan)); FCODE(blendvp)(result, code.MConst(xword, fsize == 32 ? f32_nan : f64_nan));
} else { } else {
@ -202,7 +202,7 @@ void EmitPostProcessNaNs(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm op1, X
// op1 == QNaN && op2 == QNaN is the most common case. With this method // op1 == QNaN && op2 == QNaN is the most common case. With this method
// that case would only require one branch. // that case would only require one branch.
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vxorps(xmm0, op1, op2); code.vxorps(xmm0, op1, op2);
} else { } else {
code.movaps(xmm0, op1); code.movaps(xmm0, op1);
@ -237,7 +237,7 @@ void EmitPostProcessNaNs(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm op1, X
code.jna(end, code.T_NEAR); code.jna(end, code.T_NEAR);
// Silence the SNaN as required by spec. // Silence the SNaN as required by spec.
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vorps(result, op2, code.MConst(xword, mantissa_msb)); code.vorps(result, op2, code.MConst(xword, mantissa_msb));
} else { } else {
code.movaps(result, op2); code.movaps(result, op2);
@ -589,7 +589,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
using FPT = mp::unsigned_integer_of_size<fsize>; using FPT = mp::unsigned_integer_of_size<fsize>;
if constexpr (fsize != 16) { if constexpr (fsize != 16) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) { if (code.HasFMA()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
Xbyak::Label end, fallback; Xbyak::Label end, fallback;
@ -682,7 +682,7 @@ static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
Xbyak::Label end, nan, op_are_nans; Xbyak::Label end, nan, op_are_nans;
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
FCODE(vmuls)(result, op1, op2); FCODE(vmuls)(result, op1, op2);
} else { } else {
code.movaps(result, op1); code.movaps(result, op1);
@ -696,7 +696,7 @@ static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
code.L(nan); code.L(nan);
FCODE(ucomis)(op1, op2); FCODE(ucomis)(op1, op2);
code.jp(op_are_nans); code.jp(op_are_nans);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vxorps(result, op1, op2); code.vxorps(result, op1, op2);
} else { } else {
code.movaps(result, op1); code.movaps(result, op1);
@ -772,7 +772,7 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
using FPT = mp::unsigned_integer_of_size<fsize>; using FPT = mp::unsigned_integer_of_size<fsize>;
if constexpr (fsize != 16) { if constexpr (fsize != 16) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA)) { if (code.HasFMA()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
Xbyak::Label end, fallback; Xbyak::Label end, fallback;
@ -833,7 +833,7 @@ static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, siz
const bool exact = inst->GetArg(2).GetU1(); const bool exact = inst->GetArg(2).GetU1();
const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode); const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
if (fsize != 16 && code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41) && round_imm && !exact) { if (fsize != 16 && code.HasSSE41() && round_imm && !exact) {
if (fsize == 64) { if (fsize == 64) {
FPTwoOp<64>(code, ctx, inst, [&](Xbyak::Xmm result) { FPTwoOp<64>(code, ctx, inst, [&](Xbyak::Xmm result) {
code.roundsd(result, result, *round_imm); code.roundsd(result, result, *round_imm);
@ -924,7 +924,7 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
using FPT = mp::unsigned_integer_of_size<fsize>; using FPT = mp::unsigned_integer_of_size<fsize>;
if constexpr (fsize != 16) { if constexpr (fsize != 16) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasFMA() && code.HasAVX()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
Xbyak::Label end, fallback; Xbyak::Label end, fallback;
@ -1071,7 +1071,7 @@ void EmitX64::EmitFPHalfToDouble(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8()); const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
if (code.DoesCpuSupport(Xbyak::util::Cpu::tF16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) { if (code.HasF16C() && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]);
@ -1097,7 +1097,7 @@ void EmitX64::EmitFPHalfToSingle(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8()); const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
if (code.DoesCpuSupport(Xbyak::util::Cpu::tF16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) { if (code.HasF16C() && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]);
@ -1144,7 +1144,7 @@ void EmitX64::EmitFPSingleToHalf(EmitContext& ctx, IR::Inst* inst) {
const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8()); const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode); const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tF16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) { if (code.HasF16C() && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
if (ctx.FPCR().DN()) { if (ctx.FPCR().DN()) {
@ -1209,7 +1209,7 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
if constexpr (fsize != 16) { if constexpr (fsize != 16) {
const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode); const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41) && round_imm){ if (code.HasSSE41() && round_imm){
const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm();
const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr().cvt64(); const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr().cvt64();
@ -1387,7 +1387,7 @@ void EmitX64::EmitFPFixedU32ToSingle(EmitContext& ctx, IR::Inst* inst) {
const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
ASSERT(rounding_mode == ctx.FPCR().RMode()); ASSERT(rounding_mode == ctx.FPCR().RMode());
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512F)) { if (code.HasAVX512_Skylake()) {
const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]); const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]);
code.vcvtusi2ss(result, result, from.cvt32()); code.vcvtusi2ss(result, result, from.cvt32());
} else { } else {
@ -1470,7 +1470,7 @@ void EmitX64::EmitFPFixedU32ToDouble(EmitContext& ctx, IR::Inst* inst) {
const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
ASSERT(rounding_mode == ctx.FPCR().RMode()); ASSERT(rounding_mode == ctx.FPCR().RMode());
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512F)) { if (code.HasAVX512_Skylake()) {
const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]); const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]);
code.vcvtusi2sd(to, to, from.cvt32()); code.vcvtusi2sd(to, to, from.cvt32());
} else { } else {
@ -1497,7 +1497,7 @@ void EmitX64::EmitFPFixedU64ToDouble(EmitContext& ctx, IR::Inst* inst) {
const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
ASSERT(rounding_mode == ctx.FPCR().RMode()); ASSERT(rounding_mode == ctx.FPCR().RMode());
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512F)) { if (code.HasAVX512_Skylake()) {
code.vcvtusi2sd(result, result, from); code.vcvtusi2sd(result, result, from);
} else { } else {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
@ -1528,7 +1528,7 @@ void EmitX64::EmitFPFixedU64ToSingle(EmitContext& ctx, IR::Inst* inst) {
const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
ASSERT(rounding_mode == ctx.FPCR().RMode()); ASSERT(rounding_mode == ctx.FPCR().RMode());
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512F)) { if (code.HasAVX512_Skylake()) {
const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]); const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]);
code.vcvtusi2ss(result, result, from); code.vcvtusi2ss(result, result, from);
} else { } else {

View file

@ -76,7 +76,7 @@ void EmitX64::EmitPackedAddU16(EmitContext& ctx, IR::Inst* inst) {
code.paddw(xmm_a, xmm_b); code.paddw(xmm_a, xmm_b);
if (ge_inst) { if (ge_inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();
@ -199,7 +199,7 @@ void EmitX64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) {
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
@ -673,7 +673,7 @@ void EmitX64::EmitPackedSelect(EmitContext& ctx, IR::Inst* inst) {
code.por(from, ge); code.por(from, ge);
ctx.reg_alloc.DefineValue(inst, from); ctx.reg_alloc.DefineValue(inst, from);
} else if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI1)) { } else if (code.HasBMI1()) {
const Xbyak::Reg32 ge = ctx.reg_alloc.UseGpr(args[0]).cvt32(); const Xbyak::Reg32 ge = ctx.reg_alloc.UseGpr(args[0]).cvt32();
const Xbyak::Reg32 to = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32(); const Xbyak::Reg32 to = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
const Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32(); const Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32();

View file

@ -164,7 +164,7 @@ void EmitX64::EmitVectorGetElement8(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr().cvt32();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.pextrb(dest, source, index); code.pextrb(dest, source, index);
} else { } else {
code.pextrw(dest, source, index / 2); code.pextrw(dest, source, index / 2);
@ -204,7 +204,7 @@ void EmitX64::EmitVectorGetElement32(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr().cvt32();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]);
code.pextrd(dest, source, index); code.pextrd(dest, source, index);
} else { } else {
@ -228,7 +228,7 @@ void EmitX64::EmitVectorGetElement64(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr().cvt64(); const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr().cvt64();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]);
code.pextrq(dest, source, 1); code.pextrq(dest, source, 1);
} else { } else {
@ -246,7 +246,7 @@ void EmitX64::EmitVectorSetElement8(EmitContext& ctx, IR::Inst* inst) {
const u8 index = args[1].GetImmediateU8(); const u8 index = args[1].GetImmediateU8();
const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
const Xbyak::Reg8 source_elem = ctx.reg_alloc.UseGpr(args[2]).cvt8(); const Xbyak::Reg8 source_elem = ctx.reg_alloc.UseGpr(args[2]).cvt8();
code.pinsrb(source_vector, source_elem.cvt32(), index); code.pinsrb(source_vector, source_elem.cvt32(), index);
@ -291,7 +291,7 @@ void EmitX64::EmitVectorSetElement32(EmitContext& ctx, IR::Inst* inst) {
const u8 index = args[1].GetImmediateU8(); const u8 index = args[1].GetImmediateU8();
const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
const Xbyak::Reg32 source_elem = ctx.reg_alloc.UseGpr(args[2]).cvt32(); const Xbyak::Reg32 source_elem = ctx.reg_alloc.UseGpr(args[2]).cvt32();
code.pinsrd(source_vector, source_elem, index); code.pinsrd(source_vector, source_elem, index);
@ -314,7 +314,7 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
const u8 index = args[1].GetImmediateU8(); const u8 index = args[1].GetImmediateU8();
const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(args[2]); const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(args[2]);
code.pinsrq(source_vector, source_elem, index); code.pinsrq(source_vector, source_elem, index);
@ -337,7 +337,7 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
} }
static void VectorAbs8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) { static void VectorAbs8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { if (code.HasSSSE3()) {
code.pabsb(data, data); code.pabsb(data, data);
} else { } else {
const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
@ -348,7 +348,7 @@ static void VectorAbs8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& da
} }
static void VectorAbs16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) { static void VectorAbs16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { if (code.HasSSSE3()) {
code.pabsw(data, data); code.pabsw(data, data);
} else { } else {
const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
@ -359,7 +359,7 @@ static void VectorAbs16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& d
} }
static void VectorAbs32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) { static void VectorAbs32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { if (code.HasSSSE3()) {
code.pabsd(data, data); code.pabsd(data, data);
} else { } else {
const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
@ -371,7 +371,7 @@ static void VectorAbs32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& d
} }
static void VectorAbs64(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) { static void VectorAbs64(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { if (code.HasAVX512_Skylake()) {
code.vpabsq(data, data); code.vpabsq(data, data);
} else { } else {
const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
@ -489,7 +489,7 @@ void EmitX64::EmitVectorArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst)
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const u8 shift_amount = std::min(args[1].GetImmediateU8(), u8(63)); const u8 shift_amount = std::min(args[1].GetImmediateU8(), u8(63));
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { if (code.HasAVX512_Skylake()) {
code.vpsraq(result, result, shift_amount); code.vpsraq(result, result, shift_amount);
} else { } else {
const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
@ -543,7 +543,7 @@ void EmitX64::EmitVectorArithmeticVShift8(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorArithmeticVShift16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorArithmeticVShift16(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512BW)) { if (code.HasAVX512_Skylake()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -575,7 +575,7 @@ void EmitX64::EmitVectorArithmeticVShift16(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorArithmeticVShift32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorArithmeticVShift32(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX2)) { if (code.HasAVX2()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -606,7 +606,7 @@ void EmitX64::EmitVectorArithmeticVShift32(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorArithmeticVShift64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorArithmeticVShift64(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512F) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { if (code.HasAVX512_Skylake()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -640,10 +640,10 @@ void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX2)) { if (code.HasAVX2()) {
code.vpbroadcastb(a, a); code.vpbroadcastb(a, a);
code.vmovq(a, a); code.vmovq(a, a);
} else if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { } else if (code.HasSSSE3()) {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
code.pxor(tmp, tmp); code.pxor(tmp, tmp);
@ -679,9 +679,9 @@ void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX2)) { if (code.HasAVX2()) {
code.vpbroadcastb(a, a); code.vpbroadcastb(a, a);
} else if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { } else if (code.HasSSSE3()) {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
code.pxor(tmp, tmp); code.pxor(tmp, tmp);
@ -699,7 +699,7 @@ void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX2)) { if (code.HasAVX2()) {
code.vpbroadcastw(a, a); code.vpbroadcastw(a, a);
} else { } else {
code.pshuflw(a, a, 0); code.pshuflw(a, a, 0);
@ -713,7 +713,7 @@ void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX2)) { if (code.HasAVX2()) {
code.vpbroadcastd(a, a); code.vpbroadcastd(a, a);
} else { } else {
code.pshufd(a, a, 0); code.pshufd(a, a, 0);
@ -726,7 +726,7 @@ void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX2)) { if (code.HasAVX2()) {
code.vpbroadcastq(a, a); code.vpbroadcastq(a, a);
} else { } else {
code.punpcklqdq(a, a); code.punpcklqdq(a, a);
@ -751,7 +751,7 @@ static void EmitVectorCountLeadingZeros(VectorArray<T>& result, const VectorArra
} }
void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { if (code.HasSSSE3()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -780,7 +780,7 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -812,7 +812,7 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { if (code.HasSSSE3()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -852,7 +852,7 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512CD) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { if (code.HasAVX512_Skylake()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -903,7 +903,7 @@ void EmitX64::EmitVectorDeinterleaveEven32(EmitContext& ctx, IR::Inst* inst) {
code.pshufd(lhs, lhs, 0b10001000); code.pshufd(lhs, lhs, 0b10001000);
code.pshufd(rhs, rhs, 0b10001000); code.pshufd(rhs, rhs, 0b10001000);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.pblendw(lhs, rhs, 0b11110000); code.pblendw(lhs, rhs, 0b11110000);
} else { } else {
code.punpcklqdq(lhs, rhs); code.punpcklqdq(lhs, rhs);
@ -956,7 +956,7 @@ void EmitX64::EmitVectorDeinterleaveOdd32(EmitContext& ctx, IR::Inst* inst) {
code.pshufd(lhs, lhs, 0b11011101); code.pshufd(lhs, lhs, 0b11011101);
code.pshufd(rhs, rhs, 0b11011101); code.pshufd(rhs, rhs, 0b11011101);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.pblendw(lhs, rhs, 0b11110000); code.pblendw(lhs, rhs, 0b11110000);
} else { } else {
code.punpcklqdq(lhs, rhs); code.punpcklqdq(lhs, rhs);
@ -992,7 +992,7 @@ void EmitX64::EmitVectorEqual32(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorEqual64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqq); EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqq);
return; return;
} }
@ -1013,7 +1013,7 @@ void EmitX64::EmitVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorEqual128(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorEqual128(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
@ -1049,7 +1049,7 @@ void EmitX64::EmitVectorExtract(EmitContext& ctx, IR::Inst* inst) {
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { if (code.HasSSSE3()) {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]); const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
@ -1100,7 +1100,7 @@ void EmitX64::EmitVectorGreaterS32(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorGreaterS64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorGreaterS64(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE42)) { if (code.HasSSE42()) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtq); EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtq);
return; return;
} }
@ -1473,7 +1473,7 @@ void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512BW)) { if (code.HasAVX512_Skylake()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -1501,7 +1501,7 @@ void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX2)) { if (code.HasAVX2()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -1529,7 +1529,7 @@ void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX2)) { if (code.HasAVX2()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -1557,7 +1557,7 @@ void EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorMaxS8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorMaxS8(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb); EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb);
return; return;
} }
@ -1582,7 +1582,7 @@ void EmitX64::EmitVectorMaxS16(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorMaxS32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorMaxS32(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd); EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd);
return; return;
} }
@ -1603,12 +1603,12 @@ void EmitX64::EmitVectorMaxS32(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorMaxS64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorMaxS64(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { if (code.HasAVX512_Skylake()) {
EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxsq); EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxsq);
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -1631,7 +1631,7 @@ void EmitX64::EmitVectorMaxU8(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorMaxU16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorMaxU16(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw); EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw);
return; return;
} }
@ -1647,7 +1647,7 @@ void EmitX64::EmitVectorMaxU16(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorMaxU32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorMaxU32(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud); EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud);
return; return;
} }
@ -1674,12 +1674,12 @@ void EmitX64::EmitVectorMaxU32(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { if (code.HasAVX512_Skylake()) {
EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxuq); EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxuq);
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -1702,7 +1702,7 @@ void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorMinS8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorMinS8(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsb); EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsb);
return; return;
} }
@ -1727,7 +1727,7 @@ void EmitX64::EmitVectorMinS16(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorMinS32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorMinS32(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsd); EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsd);
return; return;
} }
@ -1748,12 +1748,12 @@ void EmitX64::EmitVectorMinS32(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorMinS64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorMinS64(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { if (code.HasAVX512_Skylake()) {
EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminsq); EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminsq);
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
@ -1776,7 +1776,7 @@ void EmitX64::EmitVectorMinU8(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorMinU16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorMinU16(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminuw); EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminuw);
return; return;
} }
@ -1795,7 +1795,7 @@ void EmitX64::EmitVectorMinU16(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorMinU32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorMinU32(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminud); EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminud);
return; return;
} }
@ -1824,12 +1824,12 @@ void EmitX64::EmitVectorMinU32(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorMinU64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorMinU64(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { if (code.HasAVX512_Skylake()) {
EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminuq); EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminuq);
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
@ -1877,7 +1877,7 @@ void EmitX64::EmitVectorMultiply16(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorMultiply32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorMultiply32(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmulld); EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmulld);
return; return;
} }
@ -1900,14 +1900,14 @@ void EmitX64::EmitVectorMultiply32(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512DQ) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { if (code.HasAVX512_Skylake()) {
EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmullq); EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmullq);
return; return;
} }
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Reg64 tmp1 = ctx.reg_alloc.ScratchGpr(); const Xbyak::Reg64 tmp1 = ctx.reg_alloc.ScratchGpr();
@ -1953,7 +1953,7 @@ void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512BW)) { if (code.HasAVX512_Skylake()) {
const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
@ -1981,7 +1981,7 @@ void EmitX64::EmitVectorNarrow32(EmitContext& ctx, IR::Inst* inst) {
// TODO: AVX512F implementation // TODO: AVX512F implementation
code.pxor(zeros, zeros); code.pxor(zeros, zeros);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.pblendw(a, zeros, 0b10101010); code.pblendw(a, zeros, 0b10101010);
code.packusdw(a, zeros); code.packusdw(a, zeros);
} else { } else {
@ -2048,7 +2048,7 @@ void EmitX64::EmitVectorPairedAddLower16(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
code.punpcklqdq(xmm_a, xmm_b); code.punpcklqdq(xmm_a, xmm_b);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { if (code.HasSSSE3()) {
code.pxor(tmp, tmp); code.pxor(tmp, tmp);
code.phaddw(xmm_a, tmp); code.phaddw(xmm_a, tmp);
} else { } else {
@ -2071,7 +2071,7 @@ void EmitX64::EmitVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
code.punpcklqdq(xmm_a, xmm_b); code.punpcklqdq(xmm_a, xmm_b);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { if (code.HasSSSE3()) {
code.pxor(tmp, tmp); code.pxor(tmp, tmp);
code.phaddd(xmm_a, tmp); code.phaddd(xmm_a, tmp);
} else { } else {
@ -2109,7 +2109,7 @@ void EmitX64::EmitVectorPairedAdd8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAdd16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedAdd16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { if (code.HasSSSE3()) {
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
@ -2139,7 +2139,7 @@ void EmitX64::EmitVectorPairedAdd16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { if (code.HasSSSE3()) {
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
@ -2215,7 +2215,7 @@ void EmitX64::EmitVectorPairedAddSignedWiden32(EmitContext& ctx, IR::Inst* inst)
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { if (code.HasAVX512_Skylake()) {
code.vpsraq(c, a, 32); code.vpsraq(c, a, 32);
code.vpsllq(a, a, 32); code.vpsllq(a, a, 32);
code.vpsraq(a, a, 32); code.vpsraq(a, a, 32);
@ -2332,7 +2332,7 @@ void EmitX64::EmitVectorPairedMaxS32(EmitContext& ctx, IR::Inst* inst) {
code.shufps(tmp, y, 0b10001000); code.shufps(tmp, y, 0b10001000);
code.shufps(x, y, 0b11011101); code.shufps(x, y, 0b11011101);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.pmaxsd(x, tmp); code.pmaxsd(x, tmp);
ctx.reg_alloc.DefineValue(inst, x); ctx.reg_alloc.DefineValue(inst, x);
@ -2372,7 +2372,7 @@ void EmitX64::EmitVectorPairedMaxU32(EmitContext& ctx, IR::Inst* inst) {
code.shufps(tmp1, y, 0b10001000); code.shufps(tmp1, y, 0b10001000);
code.shufps(x, y, 0b11011101); code.shufps(x, y, 0b11011101);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.pmaxud(x, tmp1); code.pmaxud(x, tmp1);
ctx.reg_alloc.DefineValue(inst, x); ctx.reg_alloc.DefineValue(inst, x);
@ -2417,7 +2417,7 @@ void EmitX64::EmitVectorPairedMinS32(EmitContext& ctx, IR::Inst* inst) {
code.shufps(tmp, y, 0b10001000); code.shufps(tmp, y, 0b10001000);
code.shufps(x, y, 0b11011101); code.shufps(x, y, 0b11011101);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.pminsd(x, tmp); code.pminsd(x, tmp);
ctx.reg_alloc.DefineValue(inst, x); ctx.reg_alloc.DefineValue(inst, x);
@ -2457,7 +2457,7 @@ void EmitX64::EmitVectorPairedMinU32(EmitContext& ctx, IR::Inst* inst) {
code.shufps(tmp1, y, 0b10001000); code.shufps(tmp1, y, 0b10001000);
code.shufps(x, y, 0b11011101); code.shufps(x, y, 0b11011101);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.pminud(x, tmp1); code.pminud(x, tmp1);
ctx.reg_alloc.DefineValue(inst, x); ctx.reg_alloc.DefineValue(inst, x);
@ -2529,7 +2529,7 @@ void EmitX64::EmitVectorPolynomialMultiplyLong64(EmitContext& ctx, IR::Inst* ins
} }
void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512_BITALG)) { if (code.HasAVX512_BITALG()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -2539,7 +2539,7 @@ void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { if (code.HasSSSE3()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm low_a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm low_a = ctx.reg_alloc.UseScratchXmm(args[0]);
@ -2582,7 +2582,7 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
code.pxor(data, high_nibble_reg); code.pxor(data, high_nibble_reg);
code.psrld(high_nibble_reg, 4); code.psrld(high_nibble_reg, 4);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) { if (code.HasSSSE3()) {
// High lookup // High lookup
const Xbyak::Xmm high_reversed_reg = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm high_reversed_reg = ctx.reg_alloc.ScratchXmm();
code.movdqa(high_reversed_reg, code.MConst(xword, 0xE060A020C0408000, 0xF070B030D0509010)); code.movdqa(high_reversed_reg, code.MConst(xword, 0xE060A020C0408000, 0xF070B030D0509010));
@ -2822,7 +2822,7 @@ void EmitX64::EmitVectorShuffleWords(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
code.pmovsxbw(a, a); code.pmovsxbw(a, a);
ctx.reg_alloc.DefineValue(inst, a); ctx.reg_alloc.DefineValue(inst, a);
@ -2838,7 +2838,7 @@ void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorSignExtend16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorSignExtend16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
code.pmovsxwd(a, a); code.pmovsxwd(a, a);
ctx.reg_alloc.DefineValue(inst, a); ctx.reg_alloc.DefineValue(inst, a);
@ -2856,7 +2856,7 @@ void EmitX64::EmitVectorSignExtend32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.pmovsxdq(a, a); code.pmovsxdq(a, a);
} else { } else {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
@ -2878,7 +2878,7 @@ void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) {
code.movq(gpr_tmp, data); code.movq(gpr_tmp, data);
code.sar(gpr_tmp, 63); code.sar(gpr_tmp, 63);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.pinsrq(data, gpr_tmp, 1); code.pinsrq(data, gpr_tmp, 1);
} else { } else {
const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm();
@ -2949,7 +2949,7 @@ void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
if (upper_inst) { if (upper_inst) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpmulhw(result, x, y); code.vpmulhw(result, x, y);
} else { } else {
code.movdqa(result, x); code.movdqa(result, x);
@ -2962,7 +2962,7 @@ void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
if (lower_inst) { if (lower_inst) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpmullw(result, x, y); code.vpmullw(result, x, y);
} else { } else {
code.movdqa(result, x); code.movdqa(result, x);
@ -2979,7 +2979,7 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (lower_inst && !upper_inst && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (lower_inst && !upper_inst && code.HasAVX()) {
const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
@ -2991,7 +2991,7 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
@ -3148,7 +3148,7 @@ void EmitX64::EmitVectorSignedSaturatedAbs32(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorSignedSaturatedAbs64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorSignedSaturatedAbs64(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
EmitVectorSignedSaturatedAbs(64, code, ctx, inst); EmitVectorSignedSaturatedAbs(64, code, ctx, inst);
return; return;
} }
@ -3183,7 +3183,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
switch (bit_width) { switch (bit_width) {
case 8: case 8:
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpaddb(result, x, xmm0); code.vpaddb(result, x, xmm0);
} else { } else {
code.movdqa(result, x); code.movdqa(result, x);
@ -3191,7 +3191,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
} }
break; break;
case 16: case 16:
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpaddw(result, x, xmm0); code.vpaddw(result, x, xmm0);
} else { } else {
code.movdqa(result, x); code.movdqa(result, x);
@ -3199,7 +3199,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
} }
break; break;
case 32: case 32:
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpaddd(result, x, xmm0); code.vpaddd(result, x, xmm0);
} else { } else {
code.movdqa(result, x); code.movdqa(result, x);
@ -3207,7 +3207,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
} }
break; break;
case 64: case 64:
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpaddq(result, x, xmm0); code.vpaddq(result, x, xmm0);
} else { } else {
code.movdqa(result, x); code.movdqa(result, x);
@ -3216,10 +3216,10 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
break; break;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { if (code.HasAVX512_Skylake()) {
// xmm0 = majority(~y, x, res) // xmm0 = majority(~y, x, res)
code.vpternlogd(xmm0, x, result, 0b10001110); code.vpternlogd(xmm0, x, result, 0b10001110);
} else if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { } else if (code.HasAVX()) {
code.vpor(tmp, x, result); code.vpor(tmp, x, result);
code.pand(x, result); code.pand(x, result);
code.vpblendvb(xmm0, tmp, x, xmm0); code.vpblendvb(xmm0, tmp, x, xmm0);
@ -3235,7 +3235,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
switch (bit_width) { switch (bit_width) {
case 8: case 8:
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
code.pcmpeqb(tmp2, tmp2); code.pcmpeqb(tmp2, tmp2);
code.pxor(tmp, tmp); code.pxor(tmp, tmp);
@ -3256,7 +3256,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
code.psrad(xmm0, 31); code.psrad(xmm0, 31);
break; break;
case 64: case 64:
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { if (code.HasAVX512_Skylake()) {
code.vpsraq(xmm0, xmm0, 63); code.vpsraq(xmm0, xmm0, 63);
} else { } else {
code.psrad(xmm0, 31); code.psrad(xmm0, 31);
@ -3286,7 +3286,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
code.pmovmskb(mask, xmm0); code.pmovmskb(mask, xmm0);
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask); code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.pblendvb(result, tmp); code.pblendvb(result, tmp);
} else { } else {
code.pandn(xmm0, result); code.pandn(xmm0, result);
@ -3323,14 +3323,14 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiply16(EmitContext& ctx, IR::
const Xbyak::Xmm upper_tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm upper_tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm lower_tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm lower_tmp = ctx.reg_alloc.ScratchXmm();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpmulhw(upper_tmp, x, y); code.vpmulhw(upper_tmp, x, y);
} else { } else {
code.movdqa(upper_tmp, x); code.movdqa(upper_tmp, x);
code.pmulhw(upper_tmp, y); code.pmulhw(upper_tmp, y);
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpmullw(lower_tmp, x, y); code.vpmullw(lower_tmp, x, y);
} else { } else {
code.movdqa(lower_tmp, x); code.movdqa(lower_tmp, x);
@ -3343,7 +3343,7 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiply16(EmitContext& ctx, IR::
if (lower_inst) { if (lower_inst) {
const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpaddw(lower_result, lower_tmp, lower_tmp); code.vpaddw(lower_result, lower_tmp, lower_tmp);
} else { } else {
code.movdqa(lower_result, lower_tmp); code.movdqa(lower_result, lower_tmp);
@ -3357,7 +3357,7 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiply16(EmitContext& ctx, IR::
if (upper_inst) { if (upper_inst) {
const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpsrlw(lower_tmp, lower_tmp, 15); code.vpsrlw(lower_tmp, lower_tmp, 15);
code.vpaddw(upper_tmp, upper_tmp, upper_tmp); code.vpaddw(upper_tmp, upper_tmp, upper_tmp);
code.vpor(upper_result, upper_tmp, lower_tmp); code.vpor(upper_result, upper_tmp, lower_tmp);
@ -3388,7 +3388,7 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiply32(EmitContext& ctx, IR::
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm odds = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm odds = ctx.reg_alloc.ScratchXmm();
@ -3508,7 +3508,7 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx,
code.punpcklwd(y, y); code.punpcklwd(y, y);
code.pmaddwd(x, y); code.pmaddwd(x, y);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpcmpeqd(y, x, code.MConst(xword, 0x8000000080000000, 0x8000000080000000)); code.vpcmpeqd(y, x, code.MConst(xword, 0x8000000080000000, 0x8000000080000000));
code.vpxor(x, x, y); code.vpxor(x, x, y);
} else { } else {
@ -3530,7 +3530,7 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx,
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpmovsxdq(x, x); code.vpmovsxdq(x, x);
code.vpmovsxdq(y, y); code.vpmovsxdq(y, y);
code.vpmuldq(x, x, y); code.vpmuldq(x, x, y);
@ -3561,7 +3561,7 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx,
} }
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpcmpeqq(y, x, code.MConst(xword, 0x8000000000000000, 0x8000000000000000)); code.vpcmpeqq(y, x, code.MConst(xword, 0x8000000000000000, 0x8000000000000000));
code.vpxor(x, x, y); code.vpxor(x, x, y);
code.vpmovmskb(bit, y); code.vpmovmskb(bit, y);
@ -3652,7 +3652,7 @@ static void EmitVectorSignedSaturatedNarrowToUnsigned(size_t original_esize, Blo
code.punpcklbw(reconstructed, zero); code.punpcklbw(reconstructed, zero);
break; break;
case 32: case 32:
ASSERT(code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)); ASSERT(code.HasSSE41());
code.packusdw(dest, dest); // SSE4.1 code.packusdw(dest, dest); // SSE4.1
code.movdqa(reconstructed, dest); code.movdqa(reconstructed, dest);
code.punpcklwd(reconstructed, zero); code.punpcklwd(reconstructed, zero);
@ -3675,7 +3675,7 @@ void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned16(EmitContext& ctx, IR::
} }
void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned32(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
EmitVectorSignedSaturatedNarrowToUnsigned(32, code, ctx, inst); EmitVectorSignedSaturatedNarrowToUnsigned(32, code, ctx, inst);
return; return;
} }
@ -3784,7 +3784,7 @@ void EmitX64::EmitVectorSignedSaturatedNeg32(EmitContext& ctx, IR::Inst* inst) {
} }
void EmitX64::EmitVectorSignedSaturatedNeg64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorSignedSaturatedNeg64(EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
EmitVectorSignedSaturatedNeg(64, code, ctx, inst); EmitVectorSignedSaturatedNeg(64, code, ctx, inst);
return; return;
} }
@ -3955,7 +3955,7 @@ void EmitX64::EmitVectorTableLookup(EmitContext& ctx, IR::Inst* inst) {
// TODO: AVX512VL implementation when available (VPERMB / VPERMI2B / VPERMT2B) // TODO: AVX512VL implementation when available (VPERMB / VPERMI2B / VPERMT2B)
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSSE3) && is_defaults_zero && table_size == 1) { if (code.HasSSSE3() && is_defaults_zero && table_size == 1) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
@ -3966,12 +3966,12 @@ void EmitX64::EmitVectorTableLookup(EmitContext& ctx, IR::Inst* inst) {
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41) && table_size == 1) { if (code.HasSSE41() && table_size == 1) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]); const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpaddusb(xmm0, indicies, code.MConst(xword, 0x7070707070707070, 0x7070707070707070)); code.vpaddusb(xmm0, indicies, code.MConst(xword, 0x7070707070707070, 0x7070707070707070));
} else { } else {
code.movaps(xmm0, indicies); code.movaps(xmm0, indicies);
@ -3984,12 +3984,12 @@ void EmitX64::EmitVectorTableLookup(EmitContext& ctx, IR::Inst* inst) {
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41) && is_defaults_zero && table_size == 2) { if (code.HasSSE41() && is_defaults_zero && table_size == 2) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]); const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpaddusb(xmm0, indicies, code.MConst(xword, 0x7070707070707070, 0x7070707070707070)); code.vpaddusb(xmm0, indicies, code.MConst(xword, 0x7070707070707070, 0x7070707070707070));
} else { } else {
code.movaps(xmm0, indicies); code.movaps(xmm0, indicies);
@ -4004,7 +4004,7 @@ void EmitX64::EmitVectorTableLookup(EmitContext& ctx, IR::Inst* inst) {
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]); const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm();
@ -4020,7 +4020,7 @@ void EmitX64::EmitVectorTableLookup(EmitContext& ctx, IR::Inst* inst) {
if (table_index == 0) { if (table_index == 0) {
code.pxor(xmm0, xmm0); code.pxor(xmm0, xmm0);
code.pcmpeqb(xmm0, masked); code.pcmpeqb(xmm0, masked);
} else if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { } else if (code.HasAVX()) {
code.vpcmpeqb(xmm0, masked, code.MConst(xword, table_index, table_index)); code.vpcmpeqb(xmm0, masked, code.MConst(xword, table_index, table_index));
} else { } else {
code.movaps(xmm0, code.MConst(xword, table_index, table_index)); code.movaps(xmm0, code.MConst(xword, table_index, table_index));
@ -4101,7 +4101,7 @@ static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx,
break; break;
} }
case 32: case 32:
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
@ -4151,7 +4151,7 @@ void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
if (upper_inst) { if (upper_inst) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpmulhuw(result, x, y); code.vpmulhuw(result, x, y);
} else { } else {
code.movdqa(result, x); code.movdqa(result, x);
@ -4164,7 +4164,7 @@ void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
if (lower_inst) { if (lower_inst) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpmullw(result, x, y); code.vpmullw(result, x, y);
} else { } else {
code.movdqa(result, x); code.movdqa(result, x);
@ -4181,7 +4181,7 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (lower_inst && !upper_inst && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (lower_inst && !upper_inst && code.HasAVX()) {
const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
@ -4193,7 +4193,7 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
return; return;
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
@ -4442,7 +4442,7 @@ void EmitX64::EmitVectorUnsignedSaturatedShiftLeft64(EmitContext& ctx, IR::Inst*
void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.pmovzxbw(a, a); code.pmovzxbw(a, a);
} else { } else {
const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
@ -4455,7 +4455,7 @@ void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.pmovzxwd(a, a); code.pmovzxwd(a, a);
} else { } else {
const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
@ -4468,7 +4468,7 @@ void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorZeroExtend32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorZeroExtend32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.pmovzxdq(a, a); code.pmovzxdq(a, a);
} else { } else {
const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();

View file

@ -82,7 +82,7 @@ template<size_t fsize, size_t nargs, typename NaNHandler>
void HandleNaNs(BlockOfCode& code, EmitContext& ctx, std::array<Xbyak::Xmm, nargs + 1> xmms, const Xbyak::Xmm& nan_mask, NaNHandler nan_handler) { void HandleNaNs(BlockOfCode& code, EmitContext& ctx, std::array<Xbyak::Xmm, nargs + 1> xmms, const Xbyak::Xmm& nan_mask, NaNHandler nan_handler) {
static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64"); static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.ptest(nan_mask, nan_mask); code.ptest(nan_mask, nan_mask);
} else { } else {
const Xbyak::Reg32 bitmask = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 bitmask = ctx.reg_alloc.ScratchGpr().cvt32();
@ -169,7 +169,7 @@ template<size_t fsize>
void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) { void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
if (ctx.FPCR().DN()) { if (ctx.FPCR().DN()) {
const Xbyak::Xmm nan_mask = xmm0; const Xbyak::Xmm nan_mask = xmm0;
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
FCODE(vcmpunordp)(nan_mask, result, result); FCODE(vcmpunordp)(nan_mask, result, result);
FCODE(blendvp)(result, GetNaNVector<fsize>(code)); FCODE(blendvp)(result, GetNaNVector<fsize>(code));
} else { } else {
@ -185,7 +185,7 @@ void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) {
template<size_t fsize> template<size_t fsize>
void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) { void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) {
const Xbyak::Xmm nan_mask = xmm0; const Xbyak::Xmm nan_mask = xmm0;
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
FCODE(vcmpordp)(nan_mask, result, result); FCODE(vcmpordp)(nan_mask, result, result);
FCODE(vandp)(result, result, nan_mask); FCODE(vandp)(result, result, nan_mask);
} else { } else {
@ -301,7 +301,7 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
fn(result, xmm_a); fn(result, xmm_a);
} }
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
FCODE(vcmpunordp)(nan_mask, result, result); FCODE(vcmpunordp)(nan_mask, result, result);
} else { } else {
code.movaps(nan_mask, result); code.movaps(nan_mask, result);
@ -588,9 +588,9 @@ void EmitX64::EmitFPVectorFromSignedFixed64(EmitContext& ctx, IR::Inst* inst) {
const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
ASSERT(rounding_mode == ctx.FPCR().RMode()); ASSERT(rounding_mode == ctx.FPCR().RMode());
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512DQ)) { if (code.HasAVX512_Skylake()) {
code.vcvtqq2pd(xmm, xmm); code.vcvtqq2pd(xmm, xmm);
} else if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { } else if (code.HasSSE41()) {
const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
@ -636,7 +636,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst)
const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
ASSERT(rounding_mode == ctx.FPCR().RMode()); ASSERT(rounding_mode == ctx.FPCR().RMode());
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512DQ) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { if (code.HasAVX512_Skylake()) {
code.vcvtudq2ps(xmm, xmm); code.vcvtudq2ps(xmm, xmm);
} else { } else {
const Xbyak::Address mem_4B000000 = code.MConst(xword, 0x4B0000004B000000, 0x4B0000004B000000); const Xbyak::Address mem_4B000000 = code.MConst(xword, 0x4B0000004B000000, 0x4B0000004B000000);
@ -645,7 +645,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst)
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vpblendw(tmp, xmm, mem_4B000000, 0b10101010); code.vpblendw(tmp, xmm, mem_4B000000, 0b10101010);
code.vpsrld(xmm, xmm, 16); code.vpsrld(xmm, xmm, 16);
code.vpblendw(xmm, xmm, mem_53000000, 0b10101010); code.vpblendw(xmm, xmm, mem_53000000, 0b10101010);
@ -683,7 +683,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst)
const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
ASSERT(rounding_mode == ctx.FPCR().RMode()); ASSERT(rounding_mode == ctx.FPCR().RMode());
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512DQ) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)) { if (code.HasAVX512_Skylake()) {
code.vcvtuqq2pd(xmm, xmm); code.vcvtuqq2pd(xmm, xmm);
} else { } else {
const Xbyak::Address unpack = code.MConst(xword, 0x4530000043300000, 0); const Xbyak::Address unpack = code.MConst(xword, 0x4530000043300000, 0);
@ -693,7 +693,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst)
const Xbyak::Xmm subtrahend_reg = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm subtrahend_reg = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
code.vmovapd(unpack_reg, unpack); code.vmovapd(unpack_reg, unpack);
code.vmovapd(subtrahend_reg, subtrahend); code.vmovapd(subtrahend_reg, subtrahend);
@ -793,7 +793,7 @@ static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask); DenormalsAreZero<fsize>(code, ctx, {result, xmm_b}, mask);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
FCODE(vcmpeqp)(mask, result, xmm_b); FCODE(vcmpeqp)(mask, result, xmm_b);
FCODE(vcmpunordp)(nan_mask, result, xmm_b); FCODE(vcmpunordp)(nan_mask, result, xmm_b);
if constexpr (is_max) { if constexpr (is_max) {
@ -849,7 +849,7 @@ static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
// x86-64 treats differently signed zeros as equal while ARM does not. // x86-64 treats differently signed zeros as equal while ARM does not.
// Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero. // Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero.
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasAVX()) {
FCODE(vcmpeqp)(mask, result, xmm_b); FCODE(vcmpeqp)(mask, result, xmm_b);
if constexpr (is_max) { if constexpr (is_max) {
FCODE(vandp)(eq, result, xmm_b); FCODE(vandp)(eq, result, xmm_b);
@ -914,7 +914,7 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
}; };
if constexpr (fsize != 16) { if constexpr (fsize != 16) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasFMA() && code.HasAVX()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
@ -971,7 +971,7 @@ static void EmitFPVectorMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (ctx.FPCR().DN() && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (ctx.FPCR().DN() && code.HasAVX()) {
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
@ -1120,7 +1120,7 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
}; };
if constexpr (fsize != 16) { if constexpr (fsize != 16) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasFMA() && code.HasAVX()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
@ -1176,7 +1176,7 @@ void EmitFPVectorRoundInt(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
const bool exact = inst->GetArg(2).GetU1(); const bool exact = inst->GetArg(2).GetU1();
if constexpr (fsize != 16) { if constexpr (fsize != 16) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) { if (code.HasSSE41() && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) {
const u8 round_imm = [&]() -> u8 { const u8 round_imm = [&]() -> u8 {
switch (rounding) { switch (rounding) {
case FP::RoundingMode::ToNearest_TieEven: case FP::RoundingMode::ToNearest_TieEven:
@ -1276,7 +1276,7 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
}; };
if constexpr (fsize != 16) { if constexpr (fsize != 16) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tFMA) && code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { if (code.HasFMA() && code.HasAVX()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
@ -1364,7 +1364,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
// TODO: AVX512 implementation // TODO: AVX512 implementation
if constexpr (fsize != 16) { if constexpr (fsize != 16) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) { if (code.HasSSE41() && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(args[0]);

View file

@ -28,7 +28,7 @@ void EmitVectorSaturatedNative(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
(code.*unsaturated_fn)(xmm0, addend); (code.*unsaturated_fn)(xmm0, addend);
(code.*sub_fn)(xmm0, result); (code.*sub_fn)(xmm0, result);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.ptest(xmm0, xmm0); code.ptest(xmm0, xmm0);
} else { } else {
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
@ -96,7 +96,7 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
} }
code.pxor(tmp, code.MConst(xword, msb_mask, msb_mask)); code.pxor(tmp, code.MConst(xword, msb_mask, msb_mask));
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.ptest(xmm0, code.MConst(xword, msb_mask, msb_mask)); code.ptest(xmm0, code.MConst(xword, msb_mask, msb_mask));
} else { } else {
if constexpr (esize == 32) { if constexpr (esize == 32) {
@ -109,7 +109,7 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
code.setnz(overflow); code.setnz(overflow);
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow); code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
if constexpr (esize == 32) { if constexpr (esize == 32) {
code.blendvps(result, tmp); code.blendvps(result, tmp);
} else { } else {
@ -196,7 +196,7 @@ void EmitX64::EmitVectorUnsignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst)
code.por(result, tmp); code.por(result, tmp);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.ptest(tmp, tmp); code.ptest(tmp, tmp);
} else { } else {
code.movmskps(overflow.cvt32(), tmp); code.movmskps(overflow.cvt32(), tmp);
@ -232,7 +232,7 @@ void EmitX64::EmitVectorUnsignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst)
code.por(result, tmp); code.por(result, tmp);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.ptest(tmp, tmp); code.ptest(tmp, tmp);
} else { } else {
code.movmskpd(overflow.cvt32(), tmp); code.movmskpd(overflow.cvt32(), tmp);
@ -273,7 +273,7 @@ void EmitX64::EmitVectorUnsignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst)
code.psubd(tmp, xmm0); code.psubd(tmp, xmm0);
code.psrad(tmp, 31); code.psrad(tmp, 31);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.ptest(tmp, tmp); code.ptest(tmp, tmp);
} else { } else {
code.movmskps(overflow.cvt32(), tmp); code.movmskps(overflow.cvt32(), tmp);
@ -308,7 +308,7 @@ void EmitX64::EmitVectorUnsignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst)
code.psrad(tmp, 31); code.psrad(tmp, 31);
code.pshufd(tmp, tmp, 0b11110101); code.pshufd(tmp, tmp, 0b11110101);
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { if (code.HasSSE41()) {
code.ptest(tmp, tmp); code.ptest(tmp, tmp);
} else { } else {
code.movmskpd(overflow.cvt32(), tmp); code.movmskpd(overflow.cvt32(), tmp);

View file

@ -18,7 +18,7 @@ namespace Dynarmic::Backend::X64 {
#define MAYBE_AVX(OPCODE, ...) \ #define MAYBE_AVX(OPCODE, ...) \
[&] { \ [&] { \
if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { \ if (code.HasAVX()) { \
code.v##OPCODE(__VA_ARGS__); \ code.v##OPCODE(__VA_ARGS__); \
} else { \ } else { \
code.OPCODE(__VA_ARGS__); \ code.OPCODE(__VA_ARGS__); \