backend/x64: Reduce conversions required for cpsr_nzcv

Guest programs read the NZCV flags directly far less often than the JIT needs them
for conditional jumps and similar uses.

Therefore, we store our flags in cpsr_nzcv in an x64-friendly format.

This reduces the amount of code emitted for conditional jumps.
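
Concretely, the stored layout puts N in bit 15, Z in bit 14, C in bit 8 and V in bit 0 of cpsr_nzcv, i.e. exactly where `sahf` (which loads SF/ZF/CF from AH) and `add al, 0x7F` (which derives OF from bit 0 of AL) expect them, so a stored value can be turned into host flags without shifting or masking. A minimal standalone sketch of the conversion performed by the new backend/x64/nzcv_util.h helpers (illustrative only; the real helpers use dynarmic's own u32 type and appear in the diff below):

    #include <cstdint>

    // ARM layout: N=bit31, Z=bit30, C=bit29, V=bit28 of CPSR.
    // x64 layout: N=bit15 (SF), Z=bit14 (ZF), C=bit8 (CF), V=bit0.
    constexpr std::uint32_t arm_mask = 0xF000'0000;
    constexpr std::uint32_t x64_mask = 0xC101;

    constexpr std::uint32_t ToX64(std::uint32_t nzcv) {
        // Multiplying the 4-bit NZCV nibble by 0x1081 replicates it to
        // bits 15..12, 10..7 and 3..0; the mask keeps only N, Z, C and V.
        return ((nzcv >> 28) * 0x1081) & x64_mask;
    }

    constexpr std::uint32_t FromX64(std::uint32_t x64_flags) {
        return ((x64_flags & x64_mask) * 0x1021'0000) & arm_mask;
    }

    static_assert(ToX64(0xA000'0000) == 0x8100);      // N and C set
    static_assert(FromX64(0x8100) == 0xA000'0000);
    static_assert(FromX64(ToX64(0xF000'0000)) == 0xF000'0000);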
MerryMage 2020-05-06 22:08:38 +01:00
parent f4922a97f6
commit 8b3bc92bce
7 changed files with 213 additions and 189 deletions


@@ -19,6 +19,7 @@
 #include "backend/x64/block_of_code.h"
 #include "backend/x64/devirtualize.h"
 #include "backend/x64/emit_x64.h"
+#include "backend/x64/nzcv_util.h"
 #include "backend/x64/perf_map.h"
 #include "common/assert.h"
 #include "common/bit_util.h"
@@ -307,6 +308,12 @@ void A32EmitX64::GenTerminalHandlers() {
     }
 }
 
+void A32EmitX64::EmitA32SetCheckBit(A32EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8();
+    code.mov(code.byte[r15 + offsetof(A32JitState, check_bit)], to_store);
+}
+
 void A32EmitX64::EmitA32GetRegister(A32EmitContext& ctx, IR::Inst* inst) {
     const A32::Reg reg = inst->GetArg(0).GetA32RegRef();
     const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
@@ -384,6 +391,7 @@ void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) {
         const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
         const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+        const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
 
         // Here we observe that cpsr_et and cpsr_ge are right next to each other in memory,
         // so we load them both at the same time with one 64-bit read. This allows us to
@@ -397,7 +405,11 @@ void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
         code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_q)]);
         code.shl(tmp, 27);
         code.or_(result, tmp);
-        code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
+        code.mov(tmp2, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
+        code.mov(tmp, NZCV::x64_mask);
+        code.pext(tmp2, tmp2, tmp);
+        code.shl(tmp2, 28);
+        code.or_(result, tmp2);
         code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_jaifm)]);
 
         ctx.reg_alloc.DefineValue(inst, result);
@@ -430,7 +442,9 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
         // cpsr_nzcv
         code.mov(tmp, cpsr);
-        code.and_(tmp, 0xF0000000);
+        code.shr(tmp, 28);
+        code.mov(tmp2, NZCV::x64_mask);
+        code.pdep(tmp, tmp, tmp2);
         code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], tmp);
 
         // cpsr_jaifm
@@ -470,11 +484,21 @@ void A32EmitX64::EmitA32SetCpsrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
     if (args[0].IsImmediate()) {
         const u32 imm = args[0].GetImmediateU32();
 
-        code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], u32(imm & 0xF0000000));
+        code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
+    } else if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) {
+        const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+        const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
+
+        code.shr(a, 28);
+        code.mov(b, NZCV::x64_mask);
+        code.pdep(a, a, b);
+        code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a);
     } else {
         const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
 
-        code.and_(a, 0xF0000000);
+        code.shr(a, 28);
+        code.imul(a, a, NZCV::to_x64_multiplier);
+        code.and_(a, NZCV::x64_mask);
         code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a);
     }
 }
@@ -484,129 +508,90 @@ void A32EmitX64::EmitA32SetCpsrNZCVQ(A32EmitContext& ctx, IR::Inst* inst) {
     if (args[0].IsImmediate()) {
         const u32 imm = args[0].GetImmediateU32();
 
-        code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], u32(imm & 0xF0000000));
+        code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
         code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_q)], u8((imm & 0x08000000) != 0 ? 1 : 0));
+    } else if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) {
+        const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+        const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
+
+        code.shr(a, 28);
+        code.setc(code.byte[r15 + offsetof(A32JitState, cpsr_q)]);
+        code.mov(b, NZCV::x64_mask);
+        code.pdep(a, a, b);
+        code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a);
     } else {
         const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
 
-        code.bt(a, 27);
+        code.shr(a, 28);
         code.setc(code.byte[r15 + offsetof(A32JitState, cpsr_q)]);
-        code.and_(a, 0xF0000000);
+        code.imul(a, a, NZCV::to_x64_multiplier);
+        code.and_(a, NZCV::x64_mask);
         code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a);
     }
 }
 
-void A32EmitX64::EmitA32GetNFlag(A32EmitContext& ctx, IR::Inst* inst) {
+static void EmitGetFlag(BlockOfCode& code, A32EmitContext& ctx, IR::Inst* inst, size_t flag_bit) {
     const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
     code.mov(result, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
-    code.shr(result, 31);
+    if (flag_bit != 0) {
+        code.shr(result, static_cast<int>(flag_bit));
+    }
+    code.and_(result, 1);
     ctx.reg_alloc.DefineValue(inst, result);
 }
 
+static void EmitSetFlag(BlockOfCode& code, A32EmitContext& ctx, IR::Inst* inst, size_t flag_bit) {
+    const u32 flag_mask = 1u << flag_bit;
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    if (args[0].IsImmediate()) {
+        if (args[0].GetImmediateU1()) {
+            code.or_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], flag_mask);
+        } else {
+            code.and_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], ~flag_mask);
+        }
+    } else {
+        const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+        if (flag_bit != 0) {
+            code.shl(to_store, static_cast<int>(flag_bit));
+            code.and_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], ~flag_mask);
+            code.or_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], to_store);
+        } else {
+            code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv)], to_store.cvt8());
+        }
+    }
+}
+
+void A32EmitX64::EmitA32GetNFlag(A32EmitContext& ctx, IR::Inst* inst) {
+    EmitGetFlag(code, ctx, inst, NZCV::x64_n_flag_bit);
+}
+
 void A32EmitX64::EmitA32SetNFlag(A32EmitContext& ctx, IR::Inst* inst) {
-    constexpr size_t flag_bit = 31;
-    constexpr u32 flag_mask = 1u << flag_bit;
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    if (args[0].IsImmediate()) {
-        if (args[0].GetImmediateU1()) {
-            code.or_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], flag_mask);
-        } else {
-            code.and_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], ~flag_mask);
-        }
-    } else {
-        const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
-        code.shl(to_store, flag_bit);
-        code.and_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], ~flag_mask);
-        code.or_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], to_store);
-    }
+    EmitSetFlag(code, ctx, inst, NZCV::x64_n_flag_bit);
 }
 
 void A32EmitX64::EmitA32GetZFlag(A32EmitContext& ctx, IR::Inst* inst) {
-    const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
-    code.mov(result, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
-    code.shr(result, 30);
-    code.and_(result, 1);
-    ctx.reg_alloc.DefineValue(inst, result);
+    EmitGetFlag(code, ctx, inst, NZCV::x64_z_flag_bit);
 }
 
 void A32EmitX64::EmitA32SetZFlag(A32EmitContext& ctx, IR::Inst* inst) {
-    constexpr size_t flag_bit = 30;
-    constexpr u32 flag_mask = 1u << flag_bit;
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    if (args[0].IsImmediate()) {
-        if (args[0].GetImmediateU1()) {
-            code.or_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], flag_mask);
-        } else {
-            code.and_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], ~flag_mask);
-        }
-    } else {
-        const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
-        code.shl(to_store, flag_bit);
-        code.and_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], ~flag_mask);
-        code.or_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], to_store);
-    }
-}
-
-void A32EmitX64::EmitA32SetCheckBit(A32EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8();
-    code.mov(code.byte[r15 + offsetof(A32JitState, check_bit)], to_store);
+    EmitSetFlag(code, ctx, inst, NZCV::x64_z_flag_bit);
 }
 
 void A32EmitX64::EmitA32GetCFlag(A32EmitContext& ctx, IR::Inst* inst) {
-    const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
-    code.mov(result, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
-    code.shr(result, 29);
-    code.and_(result, 1);
-    ctx.reg_alloc.DefineValue(inst, result);
+    EmitGetFlag(code, ctx, inst, NZCV::x64_c_flag_bit);
 }
 
 void A32EmitX64::EmitA32SetCFlag(A32EmitContext& ctx, IR::Inst* inst) {
-    constexpr size_t flag_bit = 29;
-    constexpr u32 flag_mask = 1u << flag_bit;
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    if (args[0].IsImmediate()) {
-        if (args[0].GetImmediateU1()) {
-            code.or_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], flag_mask);
-        } else {
-            code.and_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], ~flag_mask);
-        }
-    } else {
-        const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
-        code.shl(to_store, flag_bit);
-        code.and_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], ~flag_mask);
-        code.or_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], to_store);
-    }
+    EmitSetFlag(code, ctx, inst, NZCV::x64_c_flag_bit);
 }
 
 void A32EmitX64::EmitA32GetVFlag(A32EmitContext& ctx, IR::Inst* inst) {
-    const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
-    code.mov(result, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
-    code.shr(result, 28);
-    code.and_(result, 1);
-    ctx.reg_alloc.DefineValue(inst, result);
+    EmitGetFlag(code, ctx, inst, NZCV::x64_v_flag_bit);
 }
 
 void A32EmitX64::EmitA32SetVFlag(A32EmitContext& ctx, IR::Inst* inst) {
-    constexpr size_t flag_bit = 28;
-    constexpr u32 flag_mask = 1u << flag_bit;
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    if (args[0].IsImmediate()) {
-        if (args[0].GetImmediateU1()) {
-            code.or_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], flag_mask);
-        } else {
-            code.and_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], ~flag_mask);
-        }
-    } else {
-        const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
-        code.shl(to_store, flag_bit);
-        code.and_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], ~flag_mask);
-        code.or_(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], to_store);
-    }
+    EmitSetFlag(code, ctx, inst, NZCV::x64_v_flag_bit);
 }
 
 void A32EmitX64::EmitA32OrQFlag(A32EmitContext& ctx, IR::Inst* inst) {


@@ -5,6 +5,7 @@
 #include "backend/x64/a32_jitstate.h"
 #include "backend/x64/block_of_code.h"
+#include "backend/x64/nzcv_util.h"
 #include "common/assert.h"
 #include "common/bit_util.h"
 #include "common/common_types.h"
@@ -45,14 +46,14 @@ namespace Dynarmic::Backend::X64 {
  */
 u32 A32JitState::Cpsr() const {
-    DEBUG_ASSERT((cpsr_nzcv & ~0xF0000000) == 0);
+    DEBUG_ASSERT((cpsr_nzcv & ~NZCV::x64_mask) == 0);
     DEBUG_ASSERT((cpsr_q & ~1) == 0);
     DEBUG_ASSERT((cpsr_jaifm & ~0x010001DF) == 0);
 
     u32 cpsr = 0;
 
     // NZCV flags
-    cpsr |= cpsr_nzcv;
+    cpsr |= NZCV::FromX64(cpsr_nzcv);
     // Q flag
     cpsr |= cpsr_q ? 1 << 27 : 0;
     // GE flags
@@ -74,7 +75,7 @@ u32 A32JitState::Cpsr() const {
 void A32JitState::SetCpsr(u32 cpsr) {
     // NZCV flags
-    cpsr_nzcv = cpsr & 0xF0000000;
+    cpsr_nzcv = NZCV::ToX64(cpsr);
     // Q flag
     cpsr_q = Common::Bit<27>(cpsr) ? 1 : 0;
     // GE flags


@@ -15,6 +15,7 @@
 #include "backend/x64/block_of_code.h"
 #include "backend/x64/devirtualize.h"
 #include "backend/x64/emit_x64.h"
+#include "backend/x64/nzcv_util.h"
 #include "backend/x64/perf_map.h"
 #include "common/assert.h"
 #include "common/bit_util.h"
@@ -371,7 +372,7 @@ void A64EmitX64::EmitA64SetCheckBit(A64EmitContext& ctx, IR::Inst* inst) {
 void A64EmitX64::EmitA64GetCFlag(A64EmitContext& ctx, IR::Inst* inst) {
     const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
     code.mov(result, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]);
-    code.shr(result, 29);
+    code.shr(result, NZCV::x64_c_flag_bit);
     code.and_(result, 1);
     ctx.reg_alloc.DefineValue(inst, result);
 }
@@ -380,6 +381,9 @@ void A64EmitX64::EmitA64GetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
     const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.ScratchGpr().cvt32();
 
     code.mov(nzcv_raw, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]);
+    code.and_(nzcv_raw, NZCV::x64_mask);
+    code.imul(nzcv_raw, nzcv_raw, NZCV::from_x64_multiplier);
+    code.and_(nzcv_raw, NZCV::arm_mask);
 
     ctx.reg_alloc.DefineValue(inst, nzcv_raw);
 }
@@ -387,17 +391,15 @@ void A64EmitX64::EmitA64SetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
 
-    code.and_(nzcv_raw, 0xF0000000);
+    code.shr(nzcv_raw, 28);
+    code.imul(nzcv_raw, nzcv_raw, NZCV::to_x64_multiplier);
+    code.and_(nzcv_raw, NZCV::x64_mask);
     code.mov(dword[r15 + offsetof(A64JitState, cpsr_nzcv)], nzcv_raw);
 }
 
 void A64EmitX64::EmitA64SetNZCV(A64EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
-    code.and_(to_store, 0b11000001'00000001);
-    code.imul(to_store, to_store, 0b00010000'00100001);
-    code.shl(to_store, 16);
-    code.and_(to_store, 0xF0000000);
     code.mov(dword[r15 + offsetof(A64JitState, cpsr_nzcv)], to_store);
 }


@@ -9,6 +9,7 @@
 #include <xbyak.h>
 
+#include "backend/x64/nzcv_util.h"
 #include "common/common_types.h"
 #include "frontend/A64/location_descriptor.h"
@@ -33,10 +34,10 @@ struct A64JitState {
     u32 cpsr_nzcv = 0;
 
     u32 GetPstate() const {
-        return cpsr_nzcv;
+        return NZCV::FromX64(cpsr_nzcv);
     }
     void SetPstate(u32 new_pstate) {
-        cpsr_nzcv = new_pstate & 0xF0000000;
+        cpsr_nzcv = NZCV::ToX64(new_pstate);
     }
 
     alignas(16) std::array<u64, 64> vec{}; // Extension registers.


@@ -174,115 +174,82 @@ void EmitX64::EmitAddCycles(size_t cycles) {
 }
 
 Xbyak::Label EmitX64::EmitCond(IR::Cond cond) {
-    Xbyak::Label label;
+    Xbyak::Label pass;
 
-    const Xbyak::Reg32 cpsr = eax;
-    code.mov(cpsr, dword[r15 + code.GetJitStateInfo().offsetof_cpsr_nzcv]);
+    code.mov(eax, dword[r15 + code.GetJitStateInfo().offsetof_cpsr_nzcv]);
 
-    constexpr size_t n_shift = 31;
-    constexpr size_t z_shift = 30;
-    constexpr size_t c_shift = 29;
-    constexpr size_t v_shift = 28;
-
-    constexpr u32 n_mask = 1u << n_shift;
-    constexpr u32 z_mask = 1u << z_shift;
-    constexpr u32 c_mask = 1u << c_shift;
-    constexpr u32 v_mask = 1u << v_shift;
+    // sahf restores SF, ZF, CF
+    // add al, 0x7F restores OF
 
     switch (cond) {
     case IR::Cond::EQ: //z
-        code.test(cpsr, z_mask);
-        code.jnz(label);
+        code.sahf();
+        code.jz(pass);
         break;
     case IR::Cond::NE: //!z
-        code.test(cpsr, z_mask);
-        code.jz(label);
+        code.sahf();
+        code.jnz(pass);
        break;
     case IR::Cond::CS: //c
-        code.test(cpsr, c_mask);
-        code.jnz(label);
+        code.sahf();
+        code.jc(pass);
         break;
     case IR::Cond::CC: //!c
-        code.test(cpsr, c_mask);
-        code.jz(label);
+        code.sahf();
+        code.jnc(pass);
         break;
     case IR::Cond::MI: //n
-        code.test(cpsr, n_mask);
-        code.jnz(label);
+        code.sahf();
+        code.js(pass);
         break;
     case IR::Cond::PL: //!n
-        code.test(cpsr, n_mask);
-        code.jz(label);
+        code.sahf();
+        code.jns(pass);
         break;
     case IR::Cond::VS: //v
-        code.test(cpsr, v_mask);
-        code.jnz(label);
+        code.add(al, 0x7F);
+        code.jo(pass);
         break;
     case IR::Cond::VC: //!v
-        code.test(cpsr, v_mask);
-        code.jz(label);
+        code.add(al, 0x7F);
+        code.jno(pass);
         break;
-    case IR::Cond::HI: { //c & !z
-        code.and_(cpsr, z_mask | c_mask);
-        code.cmp(cpsr, c_mask);
-        code.je(label);
+    case IR::Cond::HI: //c & !z
+        code.sahf();
+        code.cmc();
+        code.ja(pass);
         break;
-    }
-    case IR::Cond::LS: { //!c | z
-        code.and_(cpsr, z_mask | c_mask);
-        code.cmp(cpsr, c_mask);
-        code.jne(label);
+    case IR::Cond::LS: //!c | z
+        code.sahf();
+        code.cmc();
+        code.jna(pass);
         break;
-    }
-    case IR::Cond::GE: { // n == v
-        code.and_(cpsr, n_mask | v_mask);
-        code.jz(label);
-        code.cmp(cpsr, n_mask | v_mask);
-        code.je(label);
+    case IR::Cond::GE: // n == v
+        code.add(al, 0x7F);
+        code.sahf();
+        code.jge(pass);
         break;
-    }
-    case IR::Cond::LT: { // n != v
-        Xbyak::Label fail;
-        code.and_(cpsr, n_mask | v_mask);
-        code.jz(fail);
-        code.cmp(cpsr, n_mask | v_mask);
-        code.jne(label);
-        code.L(fail);
+    case IR::Cond::LT: // n != v
+        code.add(al, 0x7F);
+        code.sahf();
+        code.jl(pass);
         break;
-    }
-    case IR::Cond::GT: { // !z & (n == v)
-        const Xbyak::Reg32 tmp1 = ebx;
-        const Xbyak::Reg32 tmp2 = esi;
-        code.mov(tmp1, cpsr);
-        code.mov(tmp2, cpsr);
-        code.shr(tmp1, n_shift);
-        code.shr(tmp2, v_shift);
-        code.shr(cpsr, z_shift);
-        code.xor_(tmp1, tmp2);
-        code.or_(tmp1, cpsr);
-        code.test(tmp1, 1);
-        code.jz(label);
+    case IR::Cond::GT: // !z & (n == v)
+        code.add(al, 0x7F);
+        code.sahf();
+        code.jg(pass);
         break;
-    }
-    case IR::Cond::LE: { // z | (n != v)
-        const Xbyak::Reg32 tmp1 = ebx;
-        const Xbyak::Reg32 tmp2 = esi;
-        code.mov(tmp1, cpsr);
-        code.mov(tmp2, cpsr);
-        code.shr(tmp1, n_shift);
-        code.shr(tmp2, v_shift);
-        code.shr(cpsr, z_shift);
-        code.xor_(tmp1, tmp2);
-        code.or_(tmp1, cpsr);
-        code.test(tmp1, 1);
-        code.jnz(label);
+    case IR::Cond::LE: // z | (n != v)
+        code.add(al, 0x7F);
+        code.sahf();
+        code.jle(pass);
         break;
-    }
     default:
         ASSERT_MSG(false, "Unknown cond {}", static_cast<size_t>(cond));
         break;
     }
 
-    return label;
+    return pass;
 }
 
 EmitX64::BlockDescriptor EmitX64::RegisterBlock(const IR::LocationDescriptor& descriptor, CodePtr entrypoint, size_t size) {

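An aside on the flag-restoration idiom used above (a sketch of the reasoning, not code from this commit): after `mov eax, cpsr_nzcv`, AH holds N in bit 7, Z in bit 6 and C in bit 0, which is exactly the SF/ZF/CF layout that `sahf` loads from AH. AL holds only V in bit 0, so it is always 0 or 1, and adding 0x7F overflows into the sign bit precisely when that bit was 1, leaving OF equal to V:

    #include <cstdint>

    // Hypothetical helper modelling the OF result of `add al, 0x7F`
    // when AL can only be 0 or 1 (the stored V bit).
    constexpr bool of_after_add_7f(std::uint8_t al) {
        const std::uint8_t sum = static_cast<std::uint8_t>(al + 0x7F);
        // Signed overflow: both addends non-negative, 8-bit result negative.
        return (al & 0x80) == 0 && (sum & 0x80) != 0;
    }

    static_assert(!of_after_add_7f(0));  // V = 0 -> OF clear
    static_assert(of_after_add_7f(1));   // V = 1 -> OF set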

@@ -126,56 +126,71 @@ static void EmitConditionalSelect(BlockOfCode& code, EmitContext& ctx, IR::Inst*
     const Xbyak::Reg else_ = ctx.reg_alloc.UseScratchGpr(args[2]).changeBit(bitsize);
 
     code.mov(nzcv, dword[r15 + code.GetJitStateInfo().offsetof_cpsr_nzcv]);
-    // TODO: Flag optimization
-    code.shr(nzcv, 28);
-    code.imul(nzcv, nzcv, 0b00010000'10000001);
-    code.and_(nzcv.cvt8(), 1);
-    code.add(nzcv.cvt8(), 0x7F); // restore OF
-    code.sahf(); // restore SF, ZF, CF
+
+    // sahf restores SF, ZF, CF
+    // add al, 0x7F restores OF
 
     switch (args[0].GetImmediateCond()) {
     case IR::Cond::EQ: //z
+        code.sahf();
         code.cmovz(else_, then_);
         break;
     case IR::Cond::NE: //!z
+        code.sahf();
         code.cmovnz(else_, then_);
         break;
     case IR::Cond::CS: //c
+        code.sahf();
         code.cmovc(else_, then_);
         break;
     case IR::Cond::CC: //!c
+        code.sahf();
         code.cmovnc(else_, then_);
         break;
     case IR::Cond::MI: //n
+        code.sahf();
         code.cmovs(else_, then_);
         break;
     case IR::Cond::PL: //!n
+        code.sahf();
         code.cmovns(else_, then_);
         break;
     case IR::Cond::VS: //v
+        code.add(nzcv.cvt8(), 0x7F);
         code.cmovo(else_, then_);
         break;
     case IR::Cond::VC: //!v
+        code.add(nzcv.cvt8(), 0x7F);
         code.cmovno(else_, then_);
         break;
     case IR::Cond::HI: //c & !z
+        code.sahf();
         code.cmc();
         code.cmova(else_, then_);
         break;
     case IR::Cond::LS: //!c | z
+        code.sahf();
         code.cmc();
         code.cmovna(else_, then_);
         break;
     case IR::Cond::GE: // n == v
+        code.add(nzcv.cvt8(), 0x7F);
+        code.sahf();
         code.cmovge(else_, then_);
         break;
     case IR::Cond::LT: // n != v
+        code.add(nzcv.cvt8(), 0x7F);
+        code.sahf();
         code.cmovl(else_, then_);
         break;
     case IR::Cond::GT: // !z & (n == v)
+        code.add(nzcv.cvt8(), 0x7F);
+        code.sahf();
         code.cmovg(else_, then_);
         break;
     case IR::Cond::LE: // z | (n != v)
+        code.add(nzcv.cvt8(), 0x7F);
+        code.sahf();
         code.cmovle(else_, then_);
         break;
     case IR::Cond::AL:


@@ -0,0 +1,53 @@
/* This file is part of the dynarmic project.
 * Copyright (c) 2016 MerryMage
 * SPDX-License-Identifier: 0BSD
 */

#pragma once

#include "common/common_types.h"
#include "common/bit_util.h"

namespace Dynarmic::Backend::X64::NZCV {

constexpr u32 arm_mask = 0xF000'0000;
constexpr u32 x64_mask = 0xC101;

constexpr size_t x64_n_flag_bit = 15;
constexpr size_t x64_z_flag_bit = 14;
constexpr size_t x64_c_flag_bit = 8;
constexpr size_t x64_v_flag_bit = 0;

/// This is a constant used to create the x64 flags format from the ARM format.
/// NZCV * multiplier: NZCV0NZCV000NZCV
/// x64_flags format:  NZ-----C-------V
constexpr u32 to_x64_multiplier = 0x1081;

/// This is a constant used to create the ARM format from the x64 flags format.
constexpr u32 from_x64_multiplier = 0x1021'0000;

inline u32 ToX64(u32 nzcv) {
    /* Naive implementation:
    u32 x64_flags = 0;
    x64_flags |= Common::Bit<31>(nzcv) ? 1 << 15 : 0;
    x64_flags |= Common::Bit<30>(nzcv) ? 1 << 14 : 0;
    x64_flags |= Common::Bit<29>(nzcv) ? 1 << 8 : 0;
    x64_flags |= Common::Bit<28>(nzcv) ? 1 : 0;
    return x64_flags;
    */
    return ((nzcv >> 28) * to_x64_multiplier) & x64_mask;
}

inline u32 FromX64(u32 x64_flags) {
    /* Naive implementation:
    u32 nzcv = 0;
    nzcv |= Common::Bit<15>(x64_flags) ? 1 << 31 : 0;
    nzcv |= Common::Bit<14>(x64_flags) ? 1 << 30 : 0;
    nzcv |= Common::Bit<8>(x64_flags) ? 1 << 29 : 0;
    nzcv |= Common::Bit<0>(x64_flags) ? 1 << 28 : 0;
    return nzcv;
    */
    return ((x64_flags & x64_mask) * from_x64_multiplier) & arm_mask;
}

} // namespace Dynarmic::Backend::X64::NZCV
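
For reference, the multiplier constants above can be checked by expanding the products (a worked check, not part of the header):

    x * 0x1081      = x + (x << 7) + (x << 12)

so the 4-bit NZCV nibble is replicated to bits 3..0, 10..7 and 15..12, and masking with x64_mask = 0xC101 keeps N at bit 15, Z at 14, C at 8 and V at 0. Similarly

    x * 0x1021'0000 = (x << 16) + (x << 21) + (x << 28)

moves bit 15 -> 31 (N), bit 14 -> 30 (Z), bit 8 -> 29 (C) and bit 0 -> 28 (V); the remaining cross terms either shift out of the 32-bit result or are removed by the final arm_mask = 0xF000'0000.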