a32_jitstate: Optimize runtime location descriptor calculation
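The location descriptor is now computed with a single unaligned 64-bit load: PC, cpsr_et, a padding byte and fpcr_mode are laid out contiguously in A32JitState.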
This commit is contained in:
parent
0de3993373
commit
6e2cd35e4f
5 changed files with 67 additions and 76 deletions
|
@ -217,16 +217,14 @@ void A32EmitX64::GenMemoryAccessors() {
|
|||
}
|
||||
|
||||
void A32EmitX64::GenTerminalHandlers() {
|
||||
// PC ends up in ebp, location_descriptor ends up in rbx
|
||||
// location_descriptor ends up in rbx
|
||||
const auto calculate_location_descriptor = [this] {
|
||||
// This calculation has to match up with IREmitter::PushRSB
|
||||
// TODO: Optimization is available here based on known state of fpcr_mode and cpsr_et.
|
||||
code.mov(ecx, MJitStateReg(A32::Reg::PC));
|
||||
code.mov(ebp, ecx);
|
||||
code.shl(rcx, 32);
|
||||
code.mov(ebx, dword[r15 + offsetof(A32JitState, fpcr_mode)]);
|
||||
code.or_(ebx, dword[r15 + offsetof(A32JitState, cpsr_et)]);
|
||||
code.or_(rbx, rcx);
|
||||
constexpr size_t offsetof_pc = offsetof(A32JitState, Reg) + 15 * sizeof(u32);
|
||||
static_assert(offsetof_pc + 4 == offsetof(A32JitState, cpsr_et));
|
||||
static_assert(offsetof_pc + 5 == offsetof(A32JitState, padding));
|
||||
static_assert(offsetof_pc + 6 == offsetof(A32JitState, fpcr_mode));
|
||||
code.mov(rbx, qword[r15 + offsetof_pc]);
|
||||
};
|
||||
|
||||
Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss;
|
||||
|
@ -254,10 +252,11 @@ void A32EmitX64::GenTerminalHandlers() {
|
|||
calculate_location_descriptor();
|
||||
code.L(rsb_cache_miss);
|
||||
code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
|
||||
code.mov(rbp, rbx);
|
||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE42)) {
|
||||
code.crc32(ebp, r12d);
|
||||
code.crc32(rbp, r12);
|
||||
}
|
||||
code.and_(ebp, fast_dispatch_table_mask);
|
||||
code.and_(rbp, fast_dispatch_table_mask);
|
||||
code.lea(rbp, ptr[r12 + rbp]);
|
||||
code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]);
|
||||
code.jne(fast_dispatch_cache_miss);
|
||||
|
@ -349,18 +348,16 @@ void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
|
|||
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||
|
||||
// Here we observe that cpsr_et and cpsr_ge are right next to each other in memory,
|
||||
// so we load them both at the same time with one 64-bit read. This allows us to
|
||||
// extract all of their bits together at once with one pext.
|
||||
static_assert(offsetof(A32JitState, cpsr_et) + 4 == offsetof(A32JitState, cpsr_ge));
|
||||
code.mov(result.cvt64(), qword[r15 + offsetof(A32JitState, cpsr_et)]);
|
||||
code.mov(tmp.cvt64(), 0x80808080'00000003ull);
|
||||
code.pext(result.cvt64(), result.cvt64(), tmp.cvt64());
|
||||
code.mov(tmp, 0x000f0220);
|
||||
code.pdep(result, result, tmp);
|
||||
code.mov(result, dword[r15 + offsetof(A32JitState, cpsr_ge)]);
|
||||
code.mov(tmp, 0x80808080);
|
||||
code.pext(result, result, tmp);
|
||||
code.shr(result, 16);
|
||||
code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_q)]);
|
||||
code.shl(tmp, 27);
|
||||
code.or_(result, tmp);
|
||||
code.movzx(tmp, code.byte[r15 + offsetof(A32JitState, cpsr_et)]);
|
||||
code.shl(tmp, 5);
|
||||
code.or_(result, tmp);
|
||||
code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
|
||||
code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_jaifm)]);
|
||||
|
||||
|
@ -382,7 +379,6 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
|
|||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) {
|
||||
const Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
|
||||
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||
const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||
|
||||
// cpsr_q
|
||||
code.bt(cpsr, 27);
|
||||
|
@ -398,18 +394,21 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
|
|||
code.and_(tmp, 0x07F0FDDF);
|
||||
code.mov(dword[r15 + offsetof(A32JitState, cpsr_jaifm)], tmp);
|
||||
|
||||
// cpsr_et and cpsr_ge
|
||||
static_assert(offsetof(A32JitState, cpsr_et) + 4 == offsetof(A32JitState, cpsr_ge));
|
||||
code.mov(tmp, 0x000f0220);
|
||||
code.pext(cpsr, cpsr, tmp);
|
||||
code.mov(tmp.cvt64(), 0x01010101'00000003ull);
|
||||
code.pdep(cpsr.cvt64(), cpsr.cvt64(), tmp.cvt64());
|
||||
// cpsr_et
|
||||
code.mov(tmp, cpsr);
|
||||
code.shr(tmp, 5);
|
||||
code.and_(tmp, 0x11);
|
||||
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], tmp.cvt8());
|
||||
|
||||
// cpsr_ge
|
||||
code.shr(cpsr, 16);
|
||||
code.mov(tmp, 0x01010101);
|
||||
code.pdep(cpsr, cpsr, tmp);
|
||||
// We perform SWAR partitioned subtraction here, to negate the GE bytes.
|
||||
code.mov(tmp.cvt64(), 0x80808080'00000003ull);
|
||||
code.mov(tmp2.cvt64(), tmp.cvt64());
|
||||
code.sub(tmp.cvt64(), cpsr.cvt64());
|
||||
code.xor_(tmp.cvt64(), tmp2.cvt64());
|
||||
code.mov(qword[r15 + offsetof(A32JitState, cpsr_et)], tmp.cvt64());
|
||||
code.mov(tmp, 0x80808080);
|
||||
code.sub(tmp, cpsr);
|
||||
code.xor_(tmp, 0x80808080);
|
||||
code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], tmp);
|
||||
} else {
|
||||
ctx.reg_alloc.HostCall(nullptr, args[0]);
|
||||
code.mov(code.ABI_PARAM2, code.r15);
|
||||
|
@ -660,11 +659,11 @@ void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
|
|||
const u32 new_pc = arg.GetImmediateU32();
|
||||
const u32 mask = Common::Bit<0>(new_pc) ? 0xFFFFFFFE : 0xFFFFFFFC;
|
||||
u32 et = 0;
|
||||
et |= ctx.Location().EFlag() ? 2 : 0;
|
||||
et |= Common::Bit<0>(new_pc) ? 1 : 0;
|
||||
et |= ctx.Location().EFlag() ? 0x10 : 0;
|
||||
et |= Common::Bit<0>(new_pc) ? 0x01 : 0;
|
||||
|
||||
code.mov(MJitStateReg(A32::Reg::PC), new_pc & mask);
|
||||
code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], et);
|
||||
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], u8(et));
|
||||
} else {
|
||||
if (ctx.Location().EFlag()) {
|
||||
const Xbyak::Reg32 new_pc = ctx.reg_alloc.UseScratchGpr(arg).cvt32();
|
||||
|
@ -673,8 +672,8 @@ void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
|
|||
|
||||
code.mov(mask, new_pc);
|
||||
code.and_(mask, 1);
|
||||
code.lea(et, ptr[mask.cvt64() + 2]);
|
||||
code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], et);
|
||||
code.lea(et, ptr[mask.cvt64() + 0x10]);
|
||||
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], et.cvt8());
|
||||
code.lea(mask, ptr[mask.cvt64() + mask.cvt64() * 1 - 4]); // mask = pc & 1 ? 0xFFFFFFFE : 0xFFFFFFFC
|
||||
code.and_(new_pc, mask);
|
||||
code.mov(MJitStateReg(A32::Reg::PC), new_pc);
|
||||
|
@ -684,7 +683,7 @@ void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
|
|||
|
||||
code.mov(mask, new_pc);
|
||||
code.and_(mask, 1);
|
||||
code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], mask);
|
||||
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], mask.cvt8());
|
||||
code.lea(mask, ptr[mask.cvt64() + mask.cvt64() * 1 - 4]); // mask = pc & 1 ? 0xFFFFFFFE : 0xFFFFFFFC
|
||||
code.and_(new_pc, mask);
|
||||
code.mov(MJitStateReg(A32::Reg::PC), new_pc);
|
||||
|
@ -1261,17 +1260,17 @@ void A32EmitX64::EmitTerminalImpl(IR::Term::ReturnToDispatch, IR::LocationDescri
|
|||
code.ReturnFromRunCode();
|
||||
}
|
||||
|
||||
static u32 CalculateCpsr_et(const IR::LocationDescriptor& arg) {
|
||||
static u8 CalculateCpsr_et(const IR::LocationDescriptor& arg) {
|
||||
const A32::LocationDescriptor desc{arg};
|
||||
u32 et = 0;
|
||||
et |= desc.EFlag() ? 2 : 0;
|
||||
et |= desc.TFlag() ? 1 : 0;
|
||||
u8 et = 0;
|
||||
et |= desc.EFlag() ? 0x10 : 0;
|
||||
et |= desc.TFlag() ? 0x01 : 0;
|
||||
return et;
|
||||
}
|
||||
|
||||
void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location) {
|
||||
if (CalculateCpsr_et(terminal.next) != CalculateCpsr_et(initial_location)) {
|
||||
code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], CalculateCpsr_et(terminal.next));
|
||||
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], CalculateCpsr_et(terminal.next));
|
||||
}
|
||||
|
||||
code.cmp(qword[r15 + offsetof(A32JitState, cycles_remaining)], 0);
|
||||
|
@ -1296,7 +1295,7 @@ void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDesc
|
|||
|
||||
void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location) {
|
||||
if (CalculateCpsr_et(terminal.next) != CalculateCpsr_et(initial_location)) {
|
||||
code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], CalculateCpsr_et(terminal.next));
|
||||
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], CalculateCpsr_et(terminal.next));
|
||||
}
|
||||
|
||||
patch_information[terminal.next].jmp.emplace_back(code.getCurr());
|
||||
|
|
|
@ -262,16 +262,7 @@ void Context::SetFpscr(std::uint32_t value) {
|
|||
}
|
||||
|
||||
void TransferJitState(A32JitState& dest, const A32JitState& src, bool reset_rsb) {
|
||||
dest.cpsr_ge = src.cpsr_ge;
|
||||
dest.cpsr_et = src.cpsr_et;
|
||||
dest.cpsr_q = src.cpsr_q;
|
||||
dest.cpsr_nzcv = src.cpsr_nzcv;
|
||||
dest.cpsr_jaifm = src.cpsr_jaifm;
|
||||
dest.Reg = src.Reg;
|
||||
dest.ExtReg = src.ExtReg;
|
||||
dest.guest_MXCSR = src.guest_MXCSR;
|
||||
dest.fpcr_mode = src.fpcr_mode;
|
||||
dest.fpsr_nzcv = src.fpsr_nzcv;
|
||||
dest = src;
|
||||
if (reset_rsb) {
|
||||
dest.ResetRSB();
|
||||
} else {
|
||||
|
|
|
@ -46,7 +46,7 @@ namespace Dynarmic::BackendX64 {
|
|||
u32 A32JitState::Cpsr() const {
|
||||
ASSERT((cpsr_nzcv & ~0xF0000000) == 0);
|
||||
ASSERT((cpsr_q & ~1) == 0);
|
||||
ASSERT((cpsr_et & ~3) == 0);
|
||||
ASSERT((cpsr_et & ~0x11) == 0);
|
||||
ASSERT((cpsr_jaifm & ~0x010001DF) == 0);
|
||||
|
||||
u32 cpsr = 0;
|
||||
|
@ -61,8 +61,7 @@ u32 A32JitState::Cpsr() const {
|
|||
cpsr |= Common::Bit<15>(cpsr_ge) ? 1 << 17 : 0;
|
||||
cpsr |= Common::Bit<7>(cpsr_ge) ? 1 << 16 : 0;
|
||||
// E flag, T flag
|
||||
cpsr |= Common::Bit<1>(cpsr_et) ? 1 << 9 : 0;
|
||||
cpsr |= Common::Bit<0>(cpsr_et) ? 1 << 5 : 0;
|
||||
cpsr |= static_cast<u32>(cpsr_et) << 5;
|
||||
// Other flags
|
||||
cpsr |= cpsr_jaifm;
|
||||
|
||||
|
@ -81,9 +80,7 @@ void A32JitState::SetCpsr(u32 cpsr) {
|
|||
cpsr_ge |= Common::Bit<17>(cpsr) ? 0x0000FF00 : 0;
|
||||
cpsr_ge |= Common::Bit<16>(cpsr) ? 0x000000FF : 0;
|
||||
// E flag, T flag
|
||||
cpsr_et = 0;
|
||||
cpsr_et |= Common::Bit<9>(cpsr) ? 2 : 0;
|
||||
cpsr_et |= Common::Bit<5>(cpsr) ? 1 : 0;
|
||||
cpsr_et = static_cast<u8>((cpsr >> 5) & 0x11);
|
||||
// Other flags
|
||||
cpsr_jaifm = cpsr & 0x07F0FDDF;
|
||||
}
|
||||
|
@ -154,10 +151,12 @@ constexpr u32 FPSCR_MODE_MASK = A32::LocationDescriptor::FPSCR_MODE_MASK;
|
|||
constexpr u32 FPSCR_NZCV_MASK = 0xF0000000;
|
||||
|
||||
u32 A32JitState::Fpscr() const {
|
||||
ASSERT((fpcr_mode & ~FPSCR_MODE_MASK) == 0);
|
||||
const u32 fpcr_mode_shifted = static_cast<u32>(fpcr_mode) << 16;
|
||||
|
||||
ASSERT((fpcr_mode_shifted & ~FPSCR_MODE_MASK) == 0);
|
||||
ASSERT((fpsr_nzcv & ~FPSCR_NZCV_MASK) == 0);
|
||||
|
||||
u32 FPSCR = fpcr_mode | fpsr_nzcv;
|
||||
u32 FPSCR = fpcr_mode_shifted | fpsr_nzcv;
|
||||
FPSCR |= (guest_MXCSR & 0b0000000000001); // IOC = IE
|
||||
FPSCR |= (guest_MXCSR & 0b0000000111100) >> 1; // IXC, UFC, OFC, DZC = PE, UE, OE, ZE
|
||||
FPSCR |= fpsr_exc;
|
||||
|
@ -166,7 +165,7 @@ u32 A32JitState::Fpscr() const {
|
|||
}
|
||||
|
||||
void A32JitState::SetFpscr(u32 FPSCR) {
|
||||
fpcr_mode = FPSCR & FPSCR_MODE_MASK;
|
||||
fpcr_mode = static_cast<u16>((FPSCR & FPSCR_MODE_MASK) >> 16);
|
||||
fpsr_nzcv = FPSCR & FPSCR_NZCV_MASK;
|
||||
guest_MXCSR = 0;
|
||||
|
||||
|
@ -188,7 +187,7 @@ void A32JitState::SetFpscr(u32 FPSCR) {
|
|||
}
|
||||
|
||||
u64 A32JitState::GetUniqueHash() const noexcept {
|
||||
return cpsr_et | fpcr_mode | (static_cast<u64>(Reg[15]) << 32);
|
||||
return (static_cast<u64>(cpsr_et) << 32) | (static_cast<u64>(fpcr_mode) << 48) | Reg[15];
|
||||
}
|
||||
|
||||
} // namespace Dynarmic::BackendX64
|
||||
|
|
|
@ -29,12 +29,17 @@ struct A32JitState {
|
|||
std::array<u32, 16> Reg{}; // Current register file.
|
||||
// TODO: Mode-specific register sets unimplemented.
|
||||
|
||||
u32 cpsr_et = 0;
|
||||
// Location Descriptor related (the order of fields is important)
|
||||
u8 cpsr_et = 0; ///< Format: 000E000T
|
||||
u8 padding = 0;
|
||||
u16 fpcr_mode = 0; ///< Top 16 bits of FPCR
|
||||
u64 GetUniqueHash() const noexcept;
|
||||
|
||||
// CPSR fields
|
||||
u32 cpsr_ge = 0;
|
||||
u32 cpsr_q = 0;
|
||||
u32 cpsr_nzcv = 0;
|
||||
u32 cpsr_jaifm = 0;
|
||||
|
||||
u32 Cpsr() const;
|
||||
void SetCpsr(u32 cpsr);
|
||||
|
||||
|
@ -69,12 +74,9 @@ struct A32JitState {
|
|||
|
||||
u32 fpsr_exc = 0;
|
||||
u32 fpsr_qc = 0; // Dummy value
|
||||
u32 fpcr_mode = 0;
|
||||
u32 fpsr_nzcv = 0;
|
||||
u32 Fpscr() const;
|
||||
void SetFpscr(u32 FPSCR);
|
||||
|
||||
u64 GetUniqueHash() const noexcept;
|
||||
};
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
|
|
@ -32,10 +32,10 @@ public:
|
|||
: arm_pc(arm_pc), cpsr(cpsr.Value() & CPSR_MODE_MASK), fpscr(fpscr.Value() & FPSCR_MODE_MASK) {}
|
||||
|
||||
explicit LocationDescriptor(const IR::LocationDescriptor& o) {
|
||||
arm_pc = o.Value() >> 32;
|
||||
cpsr.T(o.Value() & 1);
|
||||
cpsr.E(o.Value() & 2);
|
||||
fpscr = o.Value() & FPSCR_MODE_MASK;
|
||||
arm_pc = static_cast<u32>(o.Value());
|
||||
cpsr.T(o.Value() & (u64(0x01) << 32));
|
||||
cpsr.E(o.Value() & (u64(0x10) << 32));
|
||||
fpscr = static_cast<u32>(o.Value() >> 32) & FPSCR_MODE_MASK;
|
||||
}
|
||||
|
||||
u32 PC() const { return arm_pc; }
|
||||
|
@ -82,10 +82,10 @@ public:
|
|||
u64 UniqueHash() const noexcept {
|
||||
// This value MUST BE UNIQUE.
|
||||
// This calculation has to match up with EmitX64::EmitTerminalPopRSBHint
|
||||
const u64 pc_u64 = u64(arm_pc) << 32;
|
||||
const u64 fpscr_u64 = u64(fpscr.Value());
|
||||
const u64 t_u64 = cpsr.T() ? 1 : 0;
|
||||
const u64 e_u64 = cpsr.E() ? 2 : 0;
|
||||
const u64 pc_u64 = u64(arm_pc);
|
||||
const u64 fpscr_u64 = u64(fpscr.Value()) << 32;
|
||||
const u64 t_u64 = cpsr.T() ? u64(0x01) << 32 : 0;
|
||||
const u64 e_u64 = cpsr.E() ? u64(0x10) << 32 : 0;
|
||||
return pc_u64 | fpscr_u64 | t_u64 | e_u64;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue