a32_jitstate: Consolidate upper bits of location descriptor into upper_location_descriptor

Also fixes a performance regression initially introduced by b6e8297e369f2dc4758bafe944e51efb8d1a2552,
primarily due to excessively mismatched load/store sizes causing suboptimal store-to-load forwarding.
This commit is contained in:
MerryMage 2019-07-25 11:40:40 +01:00
parent e41a7dc678
commit 2f2a859615
5 changed files with 86 additions and 97 deletions

View file

@ -219,11 +219,11 @@ void A32EmitX64::GenTerminalHandlers() {
// location_descriptor ends up in rbx // location_descriptor ends up in rbx
const auto calculate_location_descriptor = [this] { const auto calculate_location_descriptor = [this] {
// This calculation has to match up with IREmitter::PushRSB // This calculation has to match up with IREmitter::PushRSB
constexpr size_t offsetof_pc = offsetof(A32JitState, Reg) + 15 * sizeof(u32); code.mov(ebx, dword[r15 + offsetof(A32JitState, upper_location_descriptor)]);
static_assert(offsetof_pc + 4 == offsetof(A32JitState, cpsr_et)); code.shl(rbx, 32);
static_assert(offsetof_pc + 5 == offsetof(A32JitState, cpsr_it)); code.mov(ecx, MJitStateReg(A32::Reg::PC));
static_assert(offsetof_pc + 6 == offsetof(A32JitState, fpcr_mode)); code.mov(ebp, ecx);
code.mov(rbx, qword[r15 + offsetof_pc]); code.or_(rbx, rcx);
}; };
Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss; Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss;
@ -347,16 +347,18 @@ void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
code.mov(result, dword[r15 + offsetof(A32JitState, cpsr_ge)]); // Here we observe that cpsr_et and cpsr_ge are right next to each other in memory,
code.mov(tmp, 0x80808080); // so we load them both at the same time with one 64-bit read. This allows us to
code.pext(result, result, tmp); // extract all of their bits together at once with one pext.
code.shr(result, 16); static_assert(offsetof(A32JitState, upper_location_descriptor) + 4 == offsetof(A32JitState, cpsr_ge));
code.mov(result.cvt64(), qword[r15 + offsetof(A32JitState, upper_location_descriptor)]);
code.mov(tmp.cvt64(), 0x80808080'00000003ull);
code.pext(result.cvt64(), result.cvt64(), tmp.cvt64());
code.mov(tmp, 0x000f0220);
code.pdep(result, result, tmp);
code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_q)]); code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_q)]);
code.shl(tmp, 27); code.shl(tmp, 27);
code.or_(result, tmp); code.or_(result, tmp);
code.movzx(tmp, code.byte[r15 + offsetof(A32JitState, cpsr_et)]);
code.shl(tmp, 5);
code.or_(result, tmp);
code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]); code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_jaifm)]); code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_jaifm)]);
@ -378,6 +380,7 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) { if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) {
const Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); const Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
// cpsr_q // cpsr_q
code.bt(cpsr, 27); code.bt(cpsr, 27);
@ -393,21 +396,19 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
code.and_(tmp, 0x07F0FDDF); code.and_(tmp, 0x07F0FDDF);
code.mov(dword[r15 + offsetof(A32JitState, cpsr_jaifm)], tmp); code.mov(dword[r15 + offsetof(A32JitState, cpsr_jaifm)], tmp);
// cpsr_et // cpsr_et and cpsr_ge
code.mov(tmp, cpsr); static_assert(offsetof(A32JitState, upper_location_descriptor) + 4 == offsetof(A32JitState, cpsr_ge));
code.shr(tmp, 5); code.and_(qword[r15 + offsetof(A32JitState, upper_location_descriptor)], u32(0xFFFF0000));
code.and_(tmp, 0x11); code.mov(tmp, 0x000f0220);
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], tmp.cvt8()); code.pext(cpsr, cpsr, tmp);
code.mov(tmp.cvt64(), 0x01010101'00000003ull);
// cpsr_ge code.pdep(cpsr.cvt64(), cpsr.cvt64(), tmp.cvt64());
code.shr(cpsr, 16);
code.mov(tmp, 0x01010101);
code.pdep(cpsr, cpsr, tmp);
// We perform SWAR partitioned subtraction here, to negate the GE bytes. // We perform SWAR partitioned subtraction here, to negate the GE bytes.
code.mov(tmp, 0x80808080); code.mov(tmp.cvt64(), 0x80808080'00000003ull);
code.sub(tmp, cpsr); code.mov(tmp2.cvt64(), tmp.cvt64());
code.xor_(tmp, 0x80808080); code.sub(tmp.cvt64(), cpsr.cvt64());
code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], tmp); code.xor_(tmp.cvt64(), tmp2.cvt64());
code.or_(qword[r15 + offsetof(A32JitState, upper_location_descriptor)], tmp.cvt64());
} else { } else {
ctx.reg_alloc.HostCall(nullptr, args[0]); ctx.reg_alloc.HostCall(nullptr, args[0]);
code.mov(code.ABI_PARAM2, code.r15); code.mov(code.ABI_PARAM2, code.r15);
@ -644,6 +645,8 @@ void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst); auto args = ctx.reg_alloc.GetArgumentInfo(inst);
auto& arg = args[0]; auto& arg = args[0];
const u32 upper_without_t = (ctx.Location().UniqueHash() >> 32) & 0xFFFFFFFE;
// Pseudocode: // Pseudocode:
// if (new_pc & 1) { // if (new_pc & 1) {
// new_pc &= 0xFFFFFFFE; // new_pc &= 0xFFFFFFFE;
@ -657,36 +660,22 @@ void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
if (arg.IsImmediate()) { if (arg.IsImmediate()) {
const u32 new_pc = arg.GetImmediateU32(); const u32 new_pc = arg.GetImmediateU32();
const u32 mask = Common::Bit<0>(new_pc) ? 0xFFFFFFFE : 0xFFFFFFFC; const u32 mask = Common::Bit<0>(new_pc) ? 0xFFFFFFFE : 0xFFFFFFFC;
u32 et = 0; const u32 new_upper = upper_without_t | (Common::Bit<0>(new_pc) ? 1 : 0);
et |= ctx.Location().EFlag() ? 0x10 : 0;
et |= Common::Bit<0>(new_pc) ? 0x01 : 0;
code.mov(MJitStateReg(A32::Reg::PC), new_pc & mask); code.mov(MJitStateReg(A32::Reg::PC), new_pc & mask);
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], u8(et)); code.mov(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], new_upper);
} else {
if (ctx.Location().EFlag()) {
const Xbyak::Reg32 new_pc = ctx.reg_alloc.UseScratchGpr(arg).cvt32();
const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 et = ctx.reg_alloc.ScratchGpr().cvt32();
code.mov(mask, new_pc);
code.and_(mask, 1);
code.lea(et, ptr[mask.cvt64() + 0x10]);
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], et.cvt8());
code.lea(mask, ptr[mask.cvt64() + mask.cvt64() * 1 - 4]); // mask = pc & 1 ? 0xFFFFFFFE : 0xFFFFFFFC
code.and_(new_pc, mask);
code.mov(MJitStateReg(A32::Reg::PC), new_pc);
} else { } else {
const Xbyak::Reg32 new_pc = ctx.reg_alloc.UseScratchGpr(arg).cvt32(); const Xbyak::Reg32 new_pc = ctx.reg_alloc.UseScratchGpr(arg).cvt32();
const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32(); const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32();
const Xbyak::Reg32 new_upper = ctx.reg_alloc.ScratchGpr().cvt32();
code.mov(mask, new_pc); code.mov(mask, new_pc);
code.and_(mask, 1); code.and_(mask, 1);
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], mask.cvt8()); code.lea(new_upper, ptr[mask.cvt64() + upper_without_t]);
code.lea(mask, ptr[mask.cvt64() + mask.cvt64() * 1 - 4]); // mask = pc & 1 ? 0xFFFFFFFE : 0xFFFFFFFC code.lea(mask, ptr[mask.cvt64() + mask.cvt64() * 1 - 4]); // mask = pc & 1 ? 0xFFFFFFFE : 0xFFFFFFFC
code.and_(new_pc, mask); code.and_(new_pc, mask);
code.mov(MJitStateReg(A32::Reg::PC), new_pc); code.mov(MJitStateReg(A32::Reg::PC), new_pc);
} code.mov(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], new_upper);
} }
} }
@ -1259,17 +1248,13 @@ void A32EmitX64::EmitTerminalImpl(IR::Term::ReturnToDispatch, IR::LocationDescri
code.ReturnFromRunCode(); code.ReturnFromRunCode();
} }
static u8 CalculateCpsr_et(const IR::LocationDescriptor& arg) { static u32 CalculateUpper(const IR::LocationDescriptor& arg) {
const A32::LocationDescriptor desc{arg}; return static_cast<u32>(arg.Value() >> 32);
u8 et = 0;
et |= desc.EFlag() ? 0x10 : 0;
et |= desc.TFlag() ? 0x01 : 0;
return et;
} }
void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location) { void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location) {
if (CalculateCpsr_et(terminal.next) != CalculateCpsr_et(initial_location)) { if (CalculateUpper(terminal.next) != CalculateUpper(initial_location)) {
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], CalculateCpsr_et(terminal.next)); code.mov(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], CalculateUpper(terminal.next));
} }
code.cmp(qword[r15 + offsetof(A32JitState, cycles_remaining)], 0); code.cmp(qword[r15 + offsetof(A32JitState, cycles_remaining)], 0);
@ -1293,8 +1278,8 @@ void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDesc
} }
void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location) { void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location) {
if (CalculateCpsr_et(terminal.next) != CalculateCpsr_et(initial_location)) { if (CalculateUpper(terminal.next) != CalculateUpper(initial_location)) {
code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], CalculateCpsr_et(terminal.next)); code.mov(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], CalculateUpper(terminal.next));
} }
patch_information[terminal.next].jmp.emplace_back(code.getCurr()); patch_information[terminal.next].jmp.emplace_back(code.getCurr());

View file

@ -115,7 +115,7 @@ private:
u32 pc = jit_state.Reg[15]; u32 pc = jit_state.Reg[15];
A32::PSR cpsr{jit_state.Cpsr()}; A32::PSR cpsr{jit_state.Cpsr()};
A32::FPSCR fpscr{jit_state.fpcr_mode}; A32::FPSCR fpscr{jit_state.upper_location_descriptor};
A32::LocationDescriptor descriptor{pc, cpsr, fpscr}; A32::LocationDescriptor descriptor{pc, cpsr, fpscr};
return this_.GetBasicBlock(descriptor).entrypoint; return this_.GetBasicBlock(descriptor).entrypoint;

View file

@ -46,10 +46,9 @@ namespace Dynarmic::BackendX64 {
*/ */
u32 A32JitState::Cpsr() const { u32 A32JitState::Cpsr() const {
ASSERT((cpsr_nzcv & ~0xF0000000) == 0); DEBUG_ASSERT((cpsr_nzcv & ~0xF0000000) == 0);
ASSERT((cpsr_q & ~1) == 0); DEBUG_ASSERT((cpsr_q & ~1) == 0);
ASSERT((cpsr_et & ~0x11) == 0); DEBUG_ASSERT((cpsr_jaifm & ~0x010001DF) == 0);
ASSERT((cpsr_jaifm & ~0x010001DF) == 0);
u32 cpsr = 0; u32 cpsr = 0;
@ -63,10 +62,11 @@ u32 A32JitState::Cpsr() const {
cpsr |= Common::Bit<15>(cpsr_ge) ? 1 << 17 : 0; cpsr |= Common::Bit<15>(cpsr_ge) ? 1 << 17 : 0;
cpsr |= Common::Bit<7>(cpsr_ge) ? 1 << 16 : 0; cpsr |= Common::Bit<7>(cpsr_ge) ? 1 << 16 : 0;
// E flag, T flag // E flag, T flag
cpsr |= static_cast<u32>(cpsr_et) << 5; cpsr |= Common::Bit<1>(upper_location_descriptor) ? 1 << 9 : 0;
cpsr |= Common::Bit<0>(upper_location_descriptor) ? 1 << 5 : 0;
// IT state // IT state
cpsr |= static_cast<u32>(cpsr_it & 0b11111100) << 8; cpsr |= static_cast<u32>(upper_location_descriptor & 0b11111100'00000000);
cpsr |= static_cast<u32>(cpsr_it & 0b00000011) << 25; cpsr |= static_cast<u32>(upper_location_descriptor & 0b00000011'00000000) << 17;
// Other flags // Other flags
cpsr |= cpsr_jaifm; cpsr |= cpsr_jaifm;
@ -84,12 +84,15 @@ void A32JitState::SetCpsr(u32 cpsr) {
cpsr_ge |= Common::Bit<18>(cpsr) ? 0x00FF0000 : 0; cpsr_ge |= Common::Bit<18>(cpsr) ? 0x00FF0000 : 0;
cpsr_ge |= Common::Bit<17>(cpsr) ? 0x0000FF00 : 0; cpsr_ge |= Common::Bit<17>(cpsr) ? 0x0000FF00 : 0;
cpsr_ge |= Common::Bit<16>(cpsr) ? 0x000000FF : 0; cpsr_ge |= Common::Bit<16>(cpsr) ? 0x000000FF : 0;
upper_location_descriptor &= 0xFFFF0000;
// E flag, T flag // E flag, T flag
cpsr_et = static_cast<u8>((cpsr >> 5) & 0x11); upper_location_descriptor |= Common::Bit<9>(cpsr) ? 2 : 0;
upper_location_descriptor |= Common::Bit<5>(cpsr) ? 1 : 0;
// IT state // IT state
cpsr_it = 0; upper_location_descriptor |= (cpsr >> 0) & 0b11111100'00000000;
cpsr_it |= static_cast<u8>((cpsr >> 8) & 0b11111100); upper_location_descriptor |= (cpsr >> 17) & 0b00000011'00000000;
cpsr_it |= static_cast<u8>((cpsr >> 25) & 0b00000011);
// Other flags // Other flags
cpsr_jaifm = cpsr & 0x010001DF; cpsr_jaifm = cpsr & 0x010001DF;
} }
@ -160,12 +163,11 @@ constexpr u32 FPSCR_MODE_MASK = A32::LocationDescriptor::FPSCR_MODE_MASK;
constexpr u32 FPSCR_NZCV_MASK = 0xF0000000; constexpr u32 FPSCR_NZCV_MASK = 0xF0000000;
u32 A32JitState::Fpscr() const { u32 A32JitState::Fpscr() const {
const u32 fpcr_mode_shifted = static_cast<u32>(fpcr_mode) << 16; DEBUG_ASSERT((fpsr_nzcv & ~FPSCR_NZCV_MASK) == 0);
ASSERT((fpcr_mode_shifted & ~FPSCR_MODE_MASK) == 0); const u32 fpcr_mode = static_cast<u32>(upper_location_descriptor) & FPSCR_MODE_MASK;
ASSERT((fpsr_nzcv & ~FPSCR_NZCV_MASK) == 0);
u32 FPSCR = fpcr_mode_shifted | fpsr_nzcv; u32 FPSCR = fpcr_mode | fpsr_nzcv;
FPSCR |= (guest_MXCSR & 0b0000000000001); // IOC = IE FPSCR |= (guest_MXCSR & 0b0000000000001); // IOC = IE
FPSCR |= (guest_MXCSR & 0b0000000111100) >> 1; // IXC, UFC, OFC, DZC = PE, UE, OE, ZE FPSCR |= (guest_MXCSR & 0b0000000111100) >> 1; // IXC, UFC, OFC, DZC = PE, UE, OE, ZE
FPSCR |= fpsr_exc; FPSCR |= fpsr_exc;
@ -174,7 +176,12 @@ u32 A32JitState::Fpscr() const {
} }
void A32JitState::SetFpscr(u32 FPSCR) { void A32JitState::SetFpscr(u32 FPSCR) {
fpcr_mode = static_cast<u16>((FPSCR & FPSCR_MODE_MASK) >> 16); // Ensure that only upper half of upper_location_descriptor is used for FPSCR bits.
static_assert((FPSCR_MODE_MASK & 0xFFFF0000) == FPSCR_MODE_MASK);
upper_location_descriptor &= 0x0000FFFF;
upper_location_descriptor |= FPSCR & FPSCR_MODE_MASK;
fpsr_nzcv = FPSCR & FPSCR_NZCV_MASK; fpsr_nzcv = FPSCR & FPSCR_NZCV_MASK;
guest_MXCSR = 0; guest_MXCSR = 0;

View file

@ -29,16 +29,8 @@ struct A32JitState {
std::array<u32, 16> Reg{}; // Current register file. std::array<u32, 16> Reg{}; // Current register file.
// TODO: Mode-specific register sets unimplemented. // TODO: Mode-specific register sets unimplemented.
// Location Descriptor related (the order of fields is important) u32 upper_location_descriptor = 0;
u8 cpsr_et = 0; ///< Format: 000E000T
u8 cpsr_it = 0; ///< Format: ccccmmmm
u16 fpcr_mode = 0; ///< Top 16 bits of FPCR
u64 GetUniqueHash() const noexcept {
const u64 upper_half = u64(cpsr_et) | (u64(cpsr_it) << 8) | (u64(fpcr_mode) << 16);
return (upper_half << 32) | Reg[15];
}
// CPSR fields
u32 cpsr_ge = 0; u32 cpsr_ge = 0;
u32 cpsr_q = 0; u32 cpsr_q = 0;
u32 cpsr_nzcv = 0; u32 cpsr_nzcv = 0;
@ -80,6 +72,10 @@ struct A32JitState {
u32 fpsr_nzcv = 0; u32 fpsr_nzcv = 0;
u32 Fpscr() const; u32 Fpscr() const;
void SetFpscr(u32 FPSCR); void SetFpscr(u32 FPSCR);
u64 GetUniqueHash() const noexcept {
return (static_cast<u64>(upper_location_descriptor) << 32) | (static_cast<u64>(Reg[15]));
}
}; };
#ifdef _MSC_VER #ifdef _MSC_VER

View file

@ -34,10 +34,10 @@ public:
explicit LocationDescriptor(const IR::LocationDescriptor& o) { explicit LocationDescriptor(const IR::LocationDescriptor& o) {
arm_pc = static_cast<u32>(o.Value()); arm_pc = static_cast<u32>(o.Value());
cpsr.T(o.Value() & (u64(0x01) << 32)); cpsr.T((o.Value() >> 32) & 1);
cpsr.E(o.Value() & (u64(0x10) << 32)); cpsr.E((o.Value() >> 32) & 2);
fpscr = (o.Value() >> 32) & FPSCR_MODE_MASK;
cpsr.IT(ITState{static_cast<u8>(o.Value() >> 40)}); cpsr.IT(ITState{static_cast<u8>(o.Value() >> 40)});
fpscr = static_cast<u32>(o.Value() >> 32) & FPSCR_MODE_MASK;
} }
u32 PC() const { return arm_pc; } u32 PC() const { return arm_pc; }
@ -92,12 +92,13 @@ public:
u64 UniqueHash() const noexcept { u64 UniqueHash() const noexcept {
// This value MUST BE UNIQUE. // This value MUST BE UNIQUE.
// This calculation has to match up with EmitX64::EmitTerminalPopRSBHint // This calculation has to match up with EmitX64::EmitTerminalPopRSBHint
const u64 pc_u64 = u64(arm_pc); const u64 pc_u64 = arm_pc;
const u64 fpscr_u64 = u64(fpscr.Value()) << 32; const u64 fpscr_u64 = fpscr.Value();
const u64 it_u64 = u64(cpsr.IT().Value()) << 40; const u64 t_u64 = cpsr.T() ? 1 : 0;
const u64 t_u64 = cpsr.T() ? u64(0x01) << 32 : 0; const u64 e_u64 = cpsr.E() ? 2 : 0;
const u64 e_u64 = cpsr.E() ? u64(0x10) << 32 : 0; const u64 it_u64 = u64(cpsr.IT().Value()) << 8;
return pc_u64 | fpscr_u64 | it_u64 | t_u64 | e_u64; const u64 upper = (fpscr_u64 | t_u64 | e_u64 | it_u64) << 32;
return pc_u64 | upper;
} }
operator IR::LocationDescriptor() const { operator IR::LocationDescriptor() const {