From 6e2cd35e4f76822f6efa0b18950a2edd2e028196 Mon Sep 17 00:00:00 2001
From: MerryMage
Date: Sun, 5 May 2019 23:17:15 +0100
Subject: [PATCH] a32_jitstate: Optimize runtime location descriptor calculation

Calculation is now one unaligned 64-bit load.
---
 src/backend/x64/a32_emit_x64.cpp       | 85 +++++++++++++-------------
 src/backend/x64/a32_interface.cpp      | 11 +---
 src/backend/x64/a32_jitstate.cpp       | 19 +++---
 src/backend/x64/a32_jitstate.h         | 12 ++--
 src/frontend/A32/location_descriptor.h | 16 ++---
 5 files changed, 67 insertions(+), 76 deletions(-)

diff --git a/src/backend/x64/a32_emit_x64.cpp b/src/backend/x64/a32_emit_x64.cpp
index 2c1ea673..a9871870 100644
--- a/src/backend/x64/a32_emit_x64.cpp
+++ b/src/backend/x64/a32_emit_x64.cpp
@@ -217,16 +217,14 @@ void A32EmitX64::GenMemoryAccessors() {
 }
 
 void A32EmitX64::GenTerminalHandlers() {
-    // PC ends up in ebp, location_descriptor ends up in rbx
+    // location_descriptor ends up in rbx
     const auto calculate_location_descriptor = [this] {
         // This calculation has to match up with IREmitter::PushRSB
-        // TODO: Optimization is available here based on known state of fpcr_mode and cpsr_et.
-        code.mov(ecx, MJitStateReg(A32::Reg::PC));
-        code.mov(ebp, ecx);
-        code.shl(rcx, 32);
-        code.mov(ebx, dword[r15 + offsetof(A32JitState, fpcr_mode)]);
-        code.or_(ebx, dword[r15 + offsetof(A32JitState, cpsr_et)]);
-        code.or_(rbx, rcx);
+        constexpr size_t offsetof_pc = offsetof(A32JitState, Reg) + 15 * sizeof(u32);
+        static_assert(offsetof_pc + 4 == offsetof(A32JitState, cpsr_et));
+        static_assert(offsetof_pc + 5 == offsetof(A32JitState, padding));
+        static_assert(offsetof_pc + 6 == offsetof(A32JitState, fpcr_mode));
+        code.mov(rbx, qword[r15 + offsetof_pc]);
     };
 
     Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss;
@@ -254,10 +252,11 @@ void A32EmitX64::GenTerminalHandlers() {
         calculate_location_descriptor();
         code.L(rsb_cache_miss);
         code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
+        code.mov(rbp, rbx);
         if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE42)) {
-            code.crc32(ebp, r12d);
+            code.crc32(rbp, r12);
         }
-        code.and_(ebp, fast_dispatch_table_mask);
+        code.and_(rbp, fast_dispatch_table_mask);
         code.lea(rbp, ptr[r12 + rbp]);
         code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]);
         code.jne(fast_dispatch_cache_miss);
@@ -349,18 +348,16 @@ void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
         const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
         const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
 
-        // Here we observe that cpsr_et and cpsr_ge are right next to each other in memory,
-        // so we load them both at the same time with one 64-bit read. This allows us to
-        // extract all of their bits together at once with one pext.
-        static_assert(offsetof(A32JitState, cpsr_et) + 4 == offsetof(A32JitState, cpsr_ge));
-        code.mov(result.cvt64(), qword[r15 + offsetof(A32JitState, cpsr_et)]);
-        code.mov(tmp.cvt64(), 0x80808080'00000003ull);
-        code.pext(result.cvt64(), result.cvt64(), tmp.cvt64());
-        code.mov(tmp, 0x000f0220);
-        code.pdep(result, result, tmp);
+        code.mov(result, dword[r15 + offsetof(A32JitState, cpsr_ge)]);
+        code.mov(tmp, 0x80808080);
+        code.pext(result, result, tmp);
+        code.shr(result, 16);
         code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_q)]);
         code.shl(tmp, 27);
         code.or_(result, tmp);
+        code.movzx(tmp, code.byte[r15 + offsetof(A32JitState, cpsr_et)]);
+        code.shl(tmp, 5);
+        code.or_(result, tmp);
 
         code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
         code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_jaifm)]);
@@ -382,7 +379,6 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) {
         const Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
         const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
-        const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
 
         // cpsr_q
         code.bt(cpsr, 27);
@@ -398,18 +394,21 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
         code.and_(tmp, 0x07F0FDDF);
         code.mov(dword[r15 + offsetof(A32JitState, cpsr_jaifm)], tmp);
 
-        // cpsr_et and cpsr_ge
-        static_assert(offsetof(A32JitState, cpsr_et) + 4 == offsetof(A32JitState, cpsr_ge));
-        code.mov(tmp, 0x000f0220);
-        code.pext(cpsr, cpsr, tmp);
-        code.mov(tmp.cvt64(), 0x01010101'00000003ull);
-        code.pdep(cpsr.cvt64(), cpsr.cvt64(), tmp.cvt64());
+        // cpsr_et
+        code.mov(tmp, cpsr);
+        code.shr(tmp, 5);
+        code.and_(tmp, 0x11);
+        code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], tmp.cvt8());
+
+        // cpsr_ge
+        code.shr(cpsr, 16);
+        code.mov(tmp, 0x01010101);
+        code.pdep(cpsr, cpsr, tmp);
         // We perform SWAR partitioned subtraction here, to negate the GE bytes.
-        code.mov(tmp.cvt64(), 0x80808080'00000003ull);
-        code.mov(tmp2.cvt64(), tmp.cvt64());
-        code.sub(tmp.cvt64(), cpsr.cvt64());
-        code.xor_(tmp.cvt64(), tmp2.cvt64());
-        code.mov(qword[r15 + offsetof(A32JitState, cpsr_et)], tmp.cvt64());
+        code.mov(tmp, 0x80808080);
+        code.sub(tmp, cpsr);
+        code.xor_(tmp, 0x80808080);
+        code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], tmp);
     } else {
         ctx.reg_alloc.HostCall(nullptr, args[0]);
         code.mov(code.ABI_PARAM2, code.r15);
@@ -660,11 +659,11 @@ void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
         const u32 new_pc = arg.GetImmediateU32();
         const u32 mask = Common::Bit<0>(new_pc) ? 0xFFFFFFFE : 0xFFFFFFFC;
         u32 et = 0;
-        et |= ctx.Location().EFlag() ? 2 : 0;
-        et |= Common::Bit<0>(new_pc) ? 1 : 0;
+        et |= ctx.Location().EFlag() ? 0x10 : 0;
+        et |= Common::Bit<0>(new_pc) ? 0x01 : 0;
 
         code.mov(MJitStateReg(A32::Reg::PC), new_pc & mask);
-        code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], et);
+        code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], u8(et));
     } else {
         if (ctx.Location().EFlag()) {
             const Xbyak::Reg32 new_pc = ctx.reg_alloc.UseScratchGpr(arg).cvt32();
@@ -673,8 +672,8 @@ void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
 
             code.mov(mask, new_pc);
             code.and_(mask, 1);
-            code.lea(et, ptr[mask.cvt64() + 2]);
-            code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], et);
+            code.lea(et, ptr[mask.cvt64() + 0x10]);
+            code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], et.cvt8());
             code.lea(mask, ptr[mask.cvt64() + mask.cvt64() * 1 - 4]); // mask = pc & 1 ? 0xFFFFFFFE : 0xFFFFFFFC
             code.and_(new_pc, mask);
             code.mov(MJitStateReg(A32::Reg::PC), new_pc);
@@ -684,7 +683,7 @@ void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
 
             code.mov(mask, new_pc);
             code.and_(mask, 1);
-            code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], mask);
+            code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], mask.cvt8());
             code.lea(mask, ptr[mask.cvt64() + mask.cvt64() * 1 - 4]); // mask = pc & 1 ? 0xFFFFFFFE : 0xFFFFFFFC
             code.and_(new_pc, mask);
             code.mov(MJitStateReg(A32::Reg::PC), new_pc);
@@ -1261,17 +1260,17 @@ void A32EmitX64::EmitTerminalImpl(IR::Term::ReturnToDispatch, IR::LocationDescri
     code.ReturnFromRunCode();
 }
 
-static u32 CalculateCpsr_et(const IR::LocationDescriptor& arg) {
+static u8 CalculateCpsr_et(const IR::LocationDescriptor& arg) {
     const A32::LocationDescriptor desc{arg};
-    u32 et = 0;
-    et |= desc.EFlag() ? 2 : 0;
-    et |= desc.TFlag() ? 1 : 0;
+    u8 et = 0;
+    et |= desc.EFlag() ? 0x10 : 0;
+    et |= desc.TFlag() ? 0x01 : 0;
     return et;
 }
 
 void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location) {
     if (CalculateCpsr_et(terminal.next) != CalculateCpsr_et(initial_location)) {
-        code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], CalculateCpsr_et(terminal.next));
+        code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], CalculateCpsr_et(terminal.next));
     }
 
     code.cmp(qword[r15 + offsetof(A32JitState, cycles_remaining)], 0);
@@ -1296,7 +1295,7 @@ void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDesc
 
 void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location) {
     if (CalculateCpsr_et(terminal.next) != CalculateCpsr_et(initial_location)) {
-        code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], CalculateCpsr_et(terminal.next));
+        code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], CalculateCpsr_et(terminal.next));
     }
 
     patch_information[terminal.next].jmp.emplace_back(code.getCurr());
diff --git a/src/backend/x64/a32_interface.cpp b/src/backend/x64/a32_interface.cpp
index 511fb796..381aa7f1 100644
--- a/src/backend/x64/a32_interface.cpp
+++ b/src/backend/x64/a32_interface.cpp
@@ -262,16 +262,7 @@ void Context::SetFpscr(std::uint32_t value) {
 }
 
 void TransferJitState(A32JitState& dest, const A32JitState& src, bool reset_rsb) {
-    dest.cpsr_ge = src.cpsr_ge;
-    dest.cpsr_et = src.cpsr_et;
-    dest.cpsr_q = src.cpsr_q;
-    dest.cpsr_nzcv = src.cpsr_nzcv;
-    dest.cpsr_jaifm = src.cpsr_jaifm;
-    dest.Reg = src.Reg;
-    dest.ExtReg = src.ExtReg;
-    dest.guest_MXCSR = src.guest_MXCSR;
-    dest.fpcr_mode = src.fpcr_mode;
-    dest.fpsr_nzcv = src.fpsr_nzcv;
+    dest = src;
     if (reset_rsb) {
         dest.ResetRSB();
     } else {
diff --git a/src/backend/x64/a32_jitstate.cpp b/src/backend/x64/a32_jitstate.cpp
index 407c67bd..cb469424 100644
--- a/src/backend/x64/a32_jitstate.cpp
+++ b/src/backend/x64/a32_jitstate.cpp
@@ -46,7 +46,7 @@ namespace Dynarmic::BackendX64 {
 u32 A32JitState::Cpsr() const {
     ASSERT((cpsr_nzcv & ~0xF0000000) == 0);
     ASSERT((cpsr_q & ~1) == 0);
-    ASSERT((cpsr_et & ~3) == 0);
+    ASSERT((cpsr_et & ~0x11) == 0);
     ASSERT((cpsr_jaifm & ~0x010001DF) == 0);
 
     u32 cpsr = 0;
@@ -61,8 +61,7 @@ u32 A32JitState::Cpsr() const {
     cpsr |= Common::Bit<15>(cpsr_ge) ? 1 << 17 : 0;
     cpsr |= Common::Bit<7>(cpsr_ge) ? 1 << 16 : 0;
     // E flag, T flag
-    cpsr |= Common::Bit<1>(cpsr_et) ? 1 << 9 : 0;
-    cpsr |= Common::Bit<0>(cpsr_et) ? 1 << 5 : 0;
+    cpsr |= static_cast<u32>(cpsr_et) << 5;
     // Other flags
     cpsr |= cpsr_jaifm;
 
@@ -81,9 +80,7 @@ void A32JitState::SetCpsr(u32 cpsr) {
     cpsr_ge |= Common::Bit<17>(cpsr) ? 0x0000FF00 : 0;
     cpsr_ge |= Common::Bit<16>(cpsr) ? 0x000000FF : 0;
     // E flag, T flag
-    cpsr_et = 0;
-    cpsr_et |= Common::Bit<9>(cpsr) ? 2 : 0;
-    cpsr_et |= Common::Bit<5>(cpsr) ? 1 : 0;
+    cpsr_et = static_cast<u8>((cpsr >> 5) & 0x11);
     // Other flags
     cpsr_jaifm = cpsr & 0x07F0FDDF;
 }
@@ -154,10 +151,12 @@ constexpr u32 FPSCR_MODE_MASK = A32::LocationDescriptor::FPSCR_MODE_MASK;
 constexpr u32 FPSCR_NZCV_MASK = 0xF0000000;
 
 u32 A32JitState::Fpscr() const {
-    ASSERT((fpcr_mode & ~FPSCR_MODE_MASK) == 0);
+    const u32 fpcr_mode_shifted = static_cast<u32>(fpcr_mode) << 16;
+
+    ASSERT((fpcr_mode_shifted & ~FPSCR_MODE_MASK) == 0);
     ASSERT((fpsr_nzcv & ~FPSCR_NZCV_MASK) == 0);
 
-    u32 FPSCR = fpcr_mode | fpsr_nzcv;
+    u32 FPSCR = fpcr_mode_shifted | fpsr_nzcv;
     FPSCR |= (guest_MXCSR & 0b0000000000001); // IOC = IE
     FPSCR |= (guest_MXCSR & 0b0000000111100) >> 1; // IXC, UFC, OFC, DZC = PE, UE, OE, ZE
     FPSCR |= fpsr_exc;
@@ -166,7 +165,7 @@ u32 A32JitState::Fpscr() const {
 }
 
 void A32JitState::SetFpscr(u32 FPSCR) {
-    fpcr_mode = FPSCR & FPSCR_MODE_MASK;
+    fpcr_mode = static_cast<u16>((FPSCR & FPSCR_MODE_MASK) >> 16);
     fpsr_nzcv = FPSCR & FPSCR_NZCV_MASK;
 
     guest_MXCSR = 0;
@@ -188,7 +187,7 @@ void A32JitState::SetFpscr(u32 FPSCR) {
 }
 
 u64 A32JitState::GetUniqueHash() const noexcept {
-    return cpsr_et | fpcr_mode | (static_cast<u64>(Reg[15]) << 32);
+    return (static_cast<u64>(cpsr_et) << 32) | (static_cast<u64>(fpcr_mode) << 48) | Reg[15];
 }
 
 } // namespace Dynarmic::BackendX64
diff --git a/src/backend/x64/a32_jitstate.h b/src/backend/x64/a32_jitstate.h
index 0f6920b6..732bdccc 100644
--- a/src/backend/x64/a32_jitstate.h
+++ b/src/backend/x64/a32_jitstate.h
@@ -29,12 +29,17 @@ struct A32JitState {
     std::array<u32, 16> Reg{}; // Current register file.
     // TODO: Mode-specific register sets unimplemented.
 
-    u32 cpsr_et = 0;
+    // Location Descriptor related (the order of fields is important)
+    u8 cpsr_et = 0; ///< Format: 000E000T
+    u8 padding = 0;
+    u16 fpcr_mode = 0; ///< Top 16 bits of FPCR
+    u64 GetUniqueHash() const noexcept;
+
+    // CPSR fields
     u32 cpsr_ge = 0;
     u32 cpsr_q = 0;
     u32 cpsr_nzcv = 0;
     u32 cpsr_jaifm = 0;
-
     u32 Cpsr() const;
     void SetCpsr(u32 cpsr);
 
@@ -69,12 +74,9 @@ struct A32JitState {
 
     u32 fpsr_exc = 0;
     u32 fpsr_qc = 0; // Dummy value
-    u32 fpcr_mode = 0;
     u32 fpsr_nzcv = 0;
     u32 Fpscr() const;
     void SetFpscr(u32 FPSCR);
-
-    u64 GetUniqueHash() const noexcept;
 };
 
 #ifdef _MSC_VER
diff --git a/src/frontend/A32/location_descriptor.h b/src/frontend/A32/location_descriptor.h
index 0269b7a8..842e44bd 100644
--- a/src/frontend/A32/location_descriptor.h
+++ b/src/frontend/A32/location_descriptor.h
@@ -32,10 +32,10 @@ public:
         : arm_pc(arm_pc), cpsr(cpsr.Value() & CPSR_MODE_MASK), fpscr(fpscr.Value() & FPSCR_MODE_MASK) {}
 
     explicit LocationDescriptor(const IR::LocationDescriptor& o) {
-        arm_pc = o.Value() >> 32;
-        cpsr.T(o.Value() & 1);
-        cpsr.E(o.Value() & 2);
-        fpscr = o.Value() & FPSCR_MODE_MASK;
+        arm_pc = static_cast<u32>(o.Value());
+        cpsr.T(o.Value() & (u64(0x01) << 32));
+        cpsr.E(o.Value() & (u64(0x10) << 32));
+        fpscr = static_cast<u32>(o.Value() >> 32) & FPSCR_MODE_MASK;
     }
 
     u32 PC() const { return arm_pc; }
@@ -82,10 +82,10 @@ public:
     u64 UniqueHash() const noexcept {
         // This value MUST BE UNIQUE.
        // This calculation has to match up with EmitX64::EmitTerminalPopRSBHint
-        const u64 pc_u64 = u64(arm_pc) << 32;
-        const u64 fpscr_u64 = u64(fpscr.Value());
-        const u64 t_u64 = cpsr.T() ? 1 : 0;
-        const u64 e_u64 = cpsr.E() ? 2 : 0;
+        const u64 pc_u64 = u64(arm_pc);
+        const u64 fpscr_u64 = u64(fpscr.Value()) << 32;
+        const u64 t_u64 = cpsr.T() ? u64(0x01) << 32 : 0;
+        const u64 e_u64 = cpsr.E() ? u64(0x10) << 32 : 0;
         return pc_u64 | fpscr_u64 | t_u64 | e_u64;
     }
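
Note (not part of the patch): the following is a minimal standalone sketch of why the reordered fields make the location descriptor a single unaligned 64-bit load. The MockJitState struct and the helper names are invented for this illustration and only mirror the fields this commit touches; it assumes a little-endian host, which is always the case for the x64 backend.

// hash_layout_sketch.cpp -- illustrative only; build with: g++ -std=c++17 hash_layout_sketch.cpp
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

using u8 = std::uint8_t;
using u16 = std::uint16_t;
using u32 = std::uint32_t;
using u64 = std::uint64_t;

// Mirrors only the members this commit reorders; it is not the real A32JitState.
struct MockJitState {
    std::array<u32, 16> Reg{}; // Reg[15] is the PC.
    u8 cpsr_et = 0;            // Format: 000E000T
    u8 padding = 0;
    u16 fpcr_mode = 0;         // Top 16 bits of FPCR
};

// The same layout invariants the patch asserts in GenTerminalHandlers().
constexpr std::size_t offsetof_pc = offsetof(MockJitState, Reg) + 15 * sizeof(u32);
static_assert(offsetof_pc + 4 == offsetof(MockJitState, cpsr_et));
static_assert(offsetof_pc + 5 == offsetof(MockJitState, padding));
static_assert(offsetof_pc + 6 == offsetof(MockJitState, fpcr_mode));

// What the emitted code now does: one unaligned qword load starting at Reg[15].
// The memcpy stands in for `code.mov(rbx, qword[r15 + offsetof_pc])`.
u64 HashByLoad(const MockJitState& s) {
    u64 result;
    std::memcpy(&result, reinterpret_cast<const char*>(&s) + offsetof_pc, sizeof(result));
    return result;
}

// What A32JitState::GetUniqueHash() computes field by field after this patch.
u64 HashByFields(const MockJitState& s) {
    return (u64(s.cpsr_et) << 32) | (u64(s.fpcr_mode) << 48) | s.Reg[15];
}

int main() {
    MockJitState s;
    s.Reg[15] = 0x08000130; // example PC
    s.cpsr_et = 0x11;       // E and T both set
    s.fpcr_mode = 0x03F0;   // example FPCR mode bits

    // On a little-endian host the two computations must agree.
    assert(HashByLoad(s) == HashByFields(s));
    std::printf("location descriptor = 0x%016llx\n",
                static_cast<unsigned long long>(HashByFields(s)));
    return 0;
}

The equivalence only holds because cpsr_et, padding and fpcr_mode were shrunk to u8/u8/u16 and placed directly after Reg, so the PC, the E/T bits and the FPCR mode bits all sit in one contiguous 8-byte window; that is exactly what the new static_asserts in GenTerminalHandlers pin down, and why the location descriptor bit layout changed everywhere else in the patch.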