backend/x64: Touch PEXT/PDEP code
* Use pext/pdep where not previously used
* Limit pext/pdep to non-AMD platforms due to slowness on AMD
* Use imul/and as alternatives for AMD and non-BMI2 platforms
This commit is contained in:
parent f495018f53
commit 55bddc767f
5 changed files with 125 additions and 62 deletions
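
For context on the change: pdep scatters the low-order bits of a source into the bit positions selected by a mask, and pext gathers the mask-selected bits down into the low end of the result. On Intel CPUs with BMI2 both are fast single instructions, while AMD implementations before Zen 3 microcode them and they become far slower than a multiply, which is why the emitters below fall back to imul/and sequences unless HasFastBMI2() reports true. A minimal scalar model of the two instructions, for illustration only (not part of this commit):

#include <cstdint>

// Software model of PDEP: deposit the low bits of `value` into the
// positions that are set in `mask`, lowest mask bit first.
inline uint32_t pdep32(uint32_t value, uint32_t mask) {
    uint32_t result = 0;
    for (uint32_t bit = 1; mask != 0; bit <<= 1) {
        const uint32_t lowest = mask & (0u - mask);  // lowest set bit of mask
        if (value & bit) {
            result |= lowest;
        }
        mask &= mask - 1;  // clear that mask bit
    }
    return result;
}

// Software model of PEXT: gather the bits of `value` selected by `mask`
// and pack them contiguously at the low end of the result.
inline uint32_t pext32(uint32_t value, uint32_t mask) {
    uint32_t result = 0;
    for (uint32_t bit = 1; mask != 0; bit <<= 1) {
        const uint32_t lowest = mask & (0u - mask);
        if (value & lowest) {
            result |= bit;
        }
        mask &= mask - 1;
    }
    return result;
}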
@@ -412,16 +412,12 @@ void A32EmitX64::EmitA32SetVector(A32EmitContext& ctx, IR::Inst* inst) {
    }
}

static u32 GetCpsrImpl(A32JitState* jit_state) {
    return jit_state->Cpsr();
}

void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
    if (code.HasBMI2()) {
        const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
        const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
        const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();

        if (code.HasFastBMI2()) {
            // Here we observe that cpsr_et and cpsr_ge are right next to each other in memory,
            // so we load them both at the same time with one 64-bit read. This allows us to
            // extract all of their bits together at once with one pext.
@@ -431,32 +427,43 @@ void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
            code.pext(result.cvt64(), result.cvt64(), tmp.cvt64());
            code.mov(tmp, 0x000f0220);
            code.pdep(result, result, tmp);
        } else {
            code.mov(result, dword[r15 + offsetof(A32JitState, upper_location_descriptor)]);
            code.imul(result, result, 0x12);
            code.and_(result, 0x00000220);

            code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_ge)]);
            code.and_(tmp, 0x80808080);
            code.imul(tmp, tmp, 0x00204081);
            code.shr(tmp, 12);
            code.and_(tmp, 0x000f0000);
            code.or_(result, tmp);
        }

        code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_q)]);
        code.shl(tmp, 27);
        code.or_(result, tmp);

        code.mov(tmp2, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
        if (code.HasFastBMI2()) {
            code.mov(tmp, NZCV::x64_mask);
            code.pext(tmp2, tmp2, tmp);
            code.shl(tmp2, 28);
        } else {
            code.and_(tmp2, NZCV::x64_mask);
            code.imul(tmp2, tmp2, NZCV::from_x64_multiplier);
            code.and_(tmp2, NZCV::arm_mask);
        }
        code.or_(result, tmp2);

        code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_jaifm)]);

        ctx.reg_alloc.DefineValue(inst, result);
    } else {
        ctx.reg_alloc.HostCall(inst);
        code.mov(code.ABI_PARAM1, code.r15);
        code.CallFunction(&GetCpsrImpl);
    }
}

static void SetCpsrImpl(u32 value, A32JitState* jit_state) {
    jit_state->SetCpsr(value);
}

void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    if (code.HasBMI2()) {
        const Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
        const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
        const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
@@ -472,8 +479,13 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
        // cpsr_nzcv
        code.mov(tmp, cpsr);
        code.shr(tmp, 28);
        if (code.HasFastBMI2()) {
            code.mov(tmp2, NZCV::x64_mask);
            code.pdep(tmp, tmp, tmp2);
        } else {
            code.imul(tmp, tmp, NZCV::to_x64_multiplier);
            code.and_(tmp, NZCV::x64_mask);
        }
        code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], tmp);

        // cpsr_jaifm
@@ -481,10 +493,12 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
        code.and_(tmp, 0x07F0FDDF);
        code.mov(dword[r15 + offsetof(A32JitState, cpsr_jaifm)], tmp);

        if (code.HasFastBMI2()) {
            // cpsr_et and cpsr_ge
            static_assert(offsetof(A32JitState, upper_location_descriptor) + 4 == offsetof(A32JitState, cpsr_ge));
            // This mask is 0x7FFF0000, because we do not want the MSB to be sign extended to the upper dword.
            static_assert((A32::LocationDescriptor::FPSCR_MODE_MASK & ~0x7FFF0000) == 0);

            code.and_(qword[r15 + offsetof(A32JitState, upper_location_descriptor)], u32(0x7FFF0000));
            code.mov(tmp, 0x000f0220);
            code.pext(cpsr, cpsr, tmp);
@@ -497,14 +511,21 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
            code.xor_(tmp.cvt64(), tmp2.cvt64());
            code.or_(qword[r15 + offsetof(A32JitState, upper_location_descriptor)], tmp.cvt64());
        } else {
            ctx.reg_alloc.HostCall(nullptr, args[0]);
            code.and_(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], u32(0xFFFF0000));
            code.mov(tmp, cpsr);
            code.and_(tmp, 0x00000220);
            code.imul(tmp, tmp, 0x00900000);
            code.shr(tmp, 28);
            code.or_(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], tmp);

            if (config.always_little_endian) {
                code.and_(code.ABI_PARAM1, 0xFFFFFDFF);
            }

            code.mov(code.ABI_PARAM2, code.r15);
            code.CallFunction(&SetCpsrImpl);
            code.and_(cpsr, 0x000f0000);
            code.shr(cpsr, 16);
            code.imul(cpsr, cpsr, 0x00204081);
            code.and_(cpsr, 0x01010101);
            code.mov(tmp, 0x80808080);
            code.sub(tmp, cpsr);
            code.xor_(tmp, 0x80808080);
            code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], tmp);
        }
    }
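
The non-pext path of EmitA32SetCpsr reverses that mapping. It isolates CPSR[19:16], copies each bit into the low bit of its own byte with the same 0x00204081 multiply, then converts each 0/1 byte into 0x00/0xFF via 0x80 - b followed by xor 0x80: 0x80 - 0 = 0x80 xors to 0x00 and 0x80 - 1 = 0x7F xors to 0xFF, with no borrow crossing byte boundaries. A scalar sketch of that trick, for illustration only:

#include <cstdint>

// Expand CPSR GE bits [19:16] into four bytes of 0x00 or 0xFF.
// Mirrors: and 0x000f0000; shr 16; imul 0x00204081; and 0x01010101;
//          tmp = 0x80808080 - x; tmp ^= 0x80808080.
inline uint32_t cpsr_bits_to_ge_bytes(uint32_t cpsr) {
    uint32_t x = (cpsr & 0x000f0000) >> 16;  // the four GE bits, now in bits 3..0
    x *= 0x00204081;                         // copy each bit into its own byte
    x &= 0x01010101;                         // keep only bit 0 of every byte
    const uint32_t t = 0x80808080 - x;       // per byte: 0x80 (clear) or 0x7F (set)
    return t ^ 0x80808080;                   // per byte: 0x00 (clear) or 0xFF (set)
}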
@@ -514,7 +535,7 @@ void A32EmitX64::EmitA32SetCpsrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
        const u32 imm = args[0].GetImmediateU32();

        code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
    } else if (code.HasBMI2()) {
    } else if (code.HasFastBMI2()) {
        const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
        const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
@@ -539,7 +560,7 @@ void A32EmitX64::EmitA32SetCpsrNZCVQ(A32EmitContext& ctx, IR::Inst* inst) {

        code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
        code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_q)], u8((imm & 0x08000000) != 0 ? 1 : 0));
    } else if (code.HasBMI2()) {
    } else if (code.HasFastBMI2()) {
        const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
        const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
@@ -666,7 +687,7 @@ void A32EmitX64::EmitA32SetGEFlagsCompressed(A32EmitContext& ctx, IR::Inst* inst
        ge |= Common::Bit<16>(imm) ? 0x000000FF : 0;

        code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], ge);
    } else if (code.HasBMI2()) {
    } else if (code.HasFastBMI2()) {
        const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
        const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
@@ -802,13 +823,24 @@ void A32EmitX64::EmitA32GetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) {

void A32EmitX64::EmitA32SetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    if (code.HasFastBMI2()) {
        const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(args[0]).cvt32();
        const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();

        code.mov(tmp, NZCV::x64_mask);
        code.pext(tmp, value, tmp);
        code.shl(tmp, 28);
        code.mov(dword[r15 + offsetof(A32JitState, fpsr_nzcv)], tmp);

        return;
    }

    const Xbyak::Reg32 value = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();

    code.and_(value, 0b11000001'00000001);
    code.imul(value, value, 0b00010000'00100001);
    code.shl(value, 16);
    code.and_(value, 0xF0000000);

    code.and_(value, NZCV::x64_mask);
    code.imul(value, value, NZCV::from_x64_multiplier);
    code.and_(value, NZCV::arm_mask);
    code.mov(dword[r15 + offsetof(A32JitState, fpsr_nzcv)], value);
}
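
The non-pext path of EmitA32SetFpscrNZCV is another one-multiply pack. The host-format NZCV word is masked with 0b11000001'00000001 (which places N, Z, C and V at bits 15, 14, 8 and 0), multiplied by 0b00010000'00100001 to produce copies shifted by 0, 5 and 12, and after shl 16 and the 0xF0000000 mask exactly the four flags survive at ARM bit positions 31-28. A scalar sketch of that step, assuming that flag layout, for illustration only:

#include <cstdint>

// Convert host-format NZCV (assumed layout: N=bit15, Z=bit14, C=bit8, V=bit0)
// to the ARM layout at bits 31..28.
// Mirrors: and 0b11000001'00000001; imul 0b00010000'00100001; shl 16; and 0xF0000000.
inline uint32_t host_nzcv_to_arm(uint32_t host) {
    uint32_t x = host & 0b11000001'00000001u;  // keep N, Z, C, V in host positions
    x *= 0b00010000'00100001u;                 // shifted copies at <<0, <<5 and <<12
    x <<= 16;                                  // N lands at 31, Z at 30, C at 29, V at 28
    return x & 0xF0000000;                     // drop the stray copies
}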
@@ -381,9 +381,18 @@ void A64EmitX64::EmitA64GetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.ScratchGpr().cvt32();

    code.mov(nzcv_raw, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]);

    if (code.HasFastBMI2()) {
        const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
        code.mov(tmp, NZCV::x64_mask);
        code.pext(nzcv_raw, nzcv_raw, tmp);
        code.shl(nzcv_raw, 28);
    } else {
        code.and_(nzcv_raw, NZCV::x64_mask);
        code.imul(nzcv_raw, nzcv_raw, NZCV::from_x64_multiplier);
        code.and_(nzcv_raw, NZCV::arm_mask);
    }

    ctx.reg_alloc.DefineValue(inst, nzcv_raw);
}
@@ -392,8 +401,14 @@ void A64EmitX64::EmitA64SetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();

    code.shr(nzcv_raw, 28);
    if (code.HasFastBMI2()) {
        const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
        code.mov(tmp, NZCV::x64_mask);
        code.pdep(nzcv_raw, nzcv_raw, tmp);
    } else {
        code.imul(nzcv_raw, nzcv_raw, NZCV::to_x64_multiplier);
        code.and_(nzcv_raw, NZCV::x64_mask);
    }
    code.mov(dword[r15 + offsetof(A64JitState, cpsr_nzcv)], nzcv_raw);
}
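
The A64 accessors above show the pattern at its simplest: setting NZCV is shr 28 followed by either pdep with NZCV::x64_mask or imul with NZCV::to_x64_multiplier plus a mask, and reading it back is the mirror image with pext or NZCV::from_x64_multiplier. The constants live in nzcv_util.h and are not shown in this diff; assuming x64_mask == 0xC101 and to_x64_multiplier == 0x1081, a small self-check that the pdep path and the multiply fallback agree for every 4-bit NZCV value could look like this:

#include <cassert>
#include <cstdint>

// Assumed values for NZCV::x64_mask and NZCV::to_x64_multiplier; not shown in this diff.
constexpr uint32_t x64_mask = 0xC101;           // N=bit15, Z=bit14, C=bit8, V=bit0
constexpr uint32_t to_x64_multiplier = 0x1081;  // shifted copies at <<0, <<7 and <<12

int main() {
    for (uint32_t nzcv = 0; nzcv < 16; ++nzcv) {
        // Multiply-and-mask fallback used on AMD and non-BMI2 hosts.
        const uint32_t via_mul = (nzcv * to_x64_multiplier) & x64_mask;

        // Reference scatter, equivalent to pdep(nzcv, x64_mask).
        uint32_t via_pdep = 0;
        via_pdep |= (nzcv & 0b1000) ? (1u << 15) : 0u;  // N
        via_pdep |= (nzcv & 0b0100) ? (1u << 14) : 0u;  // Z
        via_pdep |= (nzcv & 0b0010) ? (1u << 8) : 0u;   // C
        via_pdep |= (nzcv & 0b0001) ? (1u << 0) : 0u;   // V
        assert(via_mul == via_pdep);
    }
    return 0;
}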
@@ -355,6 +355,10 @@ bool BlockOfCode::HasBMI2() const {
    return DoesCpuSupport(Xbyak::util::Cpu::tBMI2);
}

bool BlockOfCode::HasFastBMI2() const {
    return DoesCpuSupport(Xbyak::util::Cpu::tBMI2) && !DoesCpuSupport(Xbyak::util::Cpu::tAMD);
}

bool BlockOfCode::HasFMA() const {
    return DoesCpuSupport(Xbyak::util::Cpu::tFMA);
}
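
HasFastBMI2() is deliberately conservative: pext and pdep are only emitted when BMI2 is present and the CPU is not an AMD part, since AMD implementations prior to Zen 3 microcode these instructions and the imul/and fallbacks are cheaper there. A rough standalone equivalent of the check using Xbyak's CPU detection directly, for illustration only (the real implementation goes through BlockOfCode::DoesCpuSupport):

#include <xbyak/xbyak_util.h>

// Report whether pext/pdep are worth emitting on the current host:
// BMI2 must be available and the vendor must not be AMD.
inline bool HostHasFastBMI2() {
    static const Xbyak::util::Cpu cpu;
    return cpu.has(Xbyak::util::Cpu::tBMI2) && !cpu.has(Xbyak::util::Cpu::tAMD);
}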
@@ -150,6 +150,7 @@ public:
    bool HasLZCNT() const;
    bool HasBMI1() const;
    bool HasBMI2() const;
    bool HasFastBMI2() const;
    bool HasFMA() const;
    bool HasAVX2() const;
    bool HasAVX512_Skylake() const;
@@ -9,6 +9,7 @@

#include "backend/x64/block_of_code.h"
#include "backend/x64/emit_x64.h"
#include "backend/x64/nzcv_util.h"
#include "backend/x64/perf_map.h"
#include "common/assert.h"
#include "common/bit_util.h"
@@ -158,13 +159,23 @@ void EmitX64::EmitNZCVFromPackedFlags(EmitContext& ctx, IR::Inst* inst) {
        value |= Common::Bit<29>(args[0].GetImmediateU32()) ? (1 << 8) : 0;
        value |= Common::Bit<28>(args[0].GetImmediateU32()) ? (1 << 0) : 0;
        code.mov(nzcv, value);
        ctx.reg_alloc.DefineValue(inst, nzcv);
    } else if (code.HasFastBMI2()) {
        const Xbyak::Reg32 nzcv = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
        const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();

        code.shr(nzcv, 28);
        code.mov(tmp, NZCV::x64_mask);
        code.pdep(nzcv, nzcv, tmp);

        ctx.reg_alloc.DefineValue(inst, nzcv);
    } else {
        const Xbyak::Reg32 nzcv = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
        // TODO: Optimize

        code.shr(nzcv, 28);
        code.imul(nzcv, nzcv, 0b00010000'10000001);
        code.and_(nzcv.cvt8(), 1);
        code.imul(nzcv, nzcv, NZCV::to_x64_multiplier);
        code.and_(nzcv, NZCV::x64_mask);

        ctx.reg_alloc.DefineValue(inst, nzcv);
    }
}