diff --git a/src/backend_x64/emit_x64_data_processing.cpp b/src/backend_x64/emit_x64_data_processing.cpp
index 489d60b1..8746ca09 100644
--- a/src/backend_x64/emit_x64_data_processing.cpp
+++ b/src/backend_x64/emit_x64_data_processing.cpp
@@ -28,6 +28,27 @@ void EmitX64::EmitPack2x32To1x64(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(inst, lo);
 }
 
+// Emits x64 code that packs two 64-bit GPR operands into one XMM register:
+// args[0] becomes the low quadword and args[1] the high quadword of the result.
+void EmitX64::EmitPack2x64To1x128(EmitContext& ctx, IR::Inst* inst) {
+    auto operands = ctx.reg_alloc.GetArgumentInfo(inst);
+    Xbyak::Reg64 low_half = ctx.reg_alloc.UseGpr(operands[0]);
+    Xbyak::Reg64 high_half = ctx.reg_alloc.UseGpr(operands[1]);
+    Xbyak::Xmm packed = ctx.reg_alloc.ScratchXmm();
+
+    if (!code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
+        Xbyak::Xmm high_xmm = ctx.reg_alloc.ScratchXmm();
+        code.movq(packed, low_half);
+        code.movq(high_xmm, high_half); // pinsrq is gated on SSE4.1, so stage the high half in an XMM register
+        code.punpcklqdq(packed, high_xmm);
+    } else {
+        code.movq(packed, low_half);
+        code.pinsrq(packed, high_half, 1);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, packed);
+}
+
 void EmitX64::EmitLeastSignificantWord(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     ctx.reg_alloc.DefineValue(inst, args[0]);
diff --git a/src/frontend/A64/decoder/a64.inc b/src/frontend/A64/decoder/a64.inc
index 4ce46a1d..0a44228b 100644
--- a/src/frontend/A64/decoder/a64.inc
+++ b/src/frontend/A64/decoder/a64.inc
@@ -137,12 +137,12 @@ INST(LDx_mult_2, "LDx (multiple structures)", "0Q001
 // Loads and stores - Load/Store Exclusive
 INST(STXR, "STXRB, STXRH, STXR", "zz001000000sssss011111nnnnnttttt")
 INST(STLXR, "STLXRB, STLXRH, STLXR", "zz001000000sssss111111nnnnnttttt")
-//INST(STXP, "STXP", "1z001000001sssss0uuuuunnnnnttttt")
-//INST(STLXP, "STLXP", "1z001000001sssss1uuuuunnnnnttttt")
+INST(STXP, "STXP", "1z001000001sssss0uuuuunnnnnttttt")
+INST(STLXP, "STLXP", "1z001000001sssss1uuuuunnnnnttttt")
 INST(LDXR, "LDXRB, LDXRH, LDXR", "zz00100001011111011111nnnnnttttt")
 INST(LDAXR, "LDAXRB, LDAXRH, LDAXR", "zz00100001011111111111nnnnnttttt")
-//INST(LDXP, "LDXP", "1z001000011111110uuuuunnnnnttttt")
-//INST(LDAXP, "LDAXP", "1z001000011111111uuuuunnnnnttttt")
+INST(LDXP, "LDXP", "1z001000011111110uuuuunnnnnttttt")
+INST(LDAXP, "LDAXP", "1z001000011111111uuuuunnnnnttttt")
 INST(STLLR, "STLLRB, STLLRH, STLLR", "zz00100010011111011111nnnnnttttt")
 INST(STLR, "STLRB, STLRH, STLR", "zz00100010011111111111nnnnnttttt")
 INST(LDLAR, "LDLARB, LDLARH, LDLAR", "zz00100011011111011111nnnnnttttt")
diff --git a/src/frontend/A64/translate/impl/load_store_exclusive.cpp b/src/frontend/A64/translate/impl/load_store_exclusive.cpp
index 3b4563c5..78316fac 100644
--- a/src/frontend/A64/translate/impl/load_store_exclusive.cpp
+++ b/src/frontend/A64/translate/impl/load_store_exclusive.cpp
@@ -10,20 +10,27 @@
 
 namespace Dynarmic::A64 {
 
-static bool ExclusiveSharedDecodeAndOperation(TranslatorVisitor& tv, IREmitter& ir, size_t size, bool L, bool o0, boost::optional<Reg> Rs, Reg Rn, Reg Rt) {
+// Shared decode and operation for the exclusive loads/stores below. `pair` selects the
+// register-pair forms: datasize is doubled and Rt2 is read, so Rt2 must be engaged when pair is set.
+// Rs (the status register) is only dereferenced for stores, so loads may pass an empty optional.
+static bool ExclusiveSharedDecodeAndOperation(TranslatorVisitor& tv, IREmitter& ir, bool pair, size_t size, bool L, bool o0, boost::optional<Reg> Rs, boost::optional<Reg> Rt2, Reg Rn, Reg Rt) {
     // Shared Decode
 
     const AccType acctype = o0 ? AccType::ORDERED : AccType::ATOMIC;
     const MemOp memop = L ? MemOp::LOAD : MemOp::STORE;
     const size_t elsize = 8 << size;
     const size_t regsize = elsize == 64 ? 64 : 32;
-    const size_t datasize = elsize;
+    const size_t datasize = pair ? elsize * 2 : elsize;
 
     // Operation
 
     const size_t dbytes = datasize / 8;
 
-    if (memop == MemOp::STORE && *Rs == Rn && Rn != Reg::R31) {
+    if (memop == MemOp::LOAD && pair && Rt == *Rt2) {
+        return tv.UnpredictableInstruction();
+    } else if (memop == MemOp::STORE && (*Rs == Rt || (pair && *Rs == *Rt2))) {
+        return tv.UnpredictableInstruction();
+    } else if (memop == MemOp::STORE && *Rs == Rn && Rn != Reg::R31) {
         return tv.UnpredictableInstruction();
     }
 
@@ -37,15 +44,32 @@ static bool ExclusiveSharedDecodeAndOperation(TranslatorVisitor& tv, IREmitter&
 
     switch (memop) {
     case MemOp::STORE: {
-        IR::UAny data = tv.X(datasize, Rt);
+        // Pair stores pack Rt into the low half and Rt2 into the high half of a single value.
+        IR::UAnyU128 data;
+        if (pair && elsize == 64) {
+            data = ir.Pack2x64To1x128(tv.X(64, Rt), tv.X(64, *Rt2));
+        } else if (pair && elsize == 32) {
+            data = ir.Pack2x32To1x64(tv.X(32, Rt), tv.X(32, *Rt2));
+        } else {
+            data = tv.X(datasize, Rt);
+        }
         IR::U32 status = tv.ExclusiveMem(address, dbytes, acctype, data);
         tv.X(32, *Rs, status);
         break;
     }
     case MemOp::LOAD: {
         ir.SetExclusive(address, dbytes);
-        IR::UAny data = tv.Mem(address, dbytes, acctype);
-        tv.X(regsize, Rt, tv.ZeroExtend(data, regsize));
+        // Pair loads split the single loaded value: low half to Rt, high half to Rt2.
+        IR::UAnyU128 data = tv.Mem(address, dbytes, acctype);
+        if (pair && elsize == 64) {
+            tv.X(64, Rt, ir.VectorGetElement(64, data, 0));
+            tv.X(64, *Rt2, ir.VectorGetElement(64, data, 1));
+        } else if (pair && elsize == 32) {
+            tv.X(32, Rt, ir.LeastSignificantWord(data));
+            tv.X(32, *Rt2, ir.MostSignificantWord(data).result);
+        } else {
+            tv.X(regsize, Rt, tv.ZeroExtend(data, regsize));
+        }
         break;
     }
     default:
@@ -56,31 +80,67 @@ static bool ExclusiveSharedDecodeAndOperation(TranslatorVisitor& tv, IREmitter&
 }
 
 bool TranslatorVisitor::STXR(Imm<2> sz, Reg Rs, Reg Rn, Reg Rt) {
+    const bool pair = false;
     const size_t size = sz.ZeroExtend();
     const bool L = 0;
     const bool o0 = 0;
-    return ExclusiveSharedDecodeAndOperation(*this, ir, size, L, o0, Rs, Rn, Rt);
+    return ExclusiveSharedDecodeAndOperation(*this, ir, pair, size, L, o0, Rs, {}, Rn, Rt);
 }
 
 bool TranslatorVisitor::STLXR(Imm<2> sz, Reg Rs, Reg Rn, Reg Rt) {
+    const bool pair = false;
     const size_t size = sz.ZeroExtend();
     const bool L = 0;
     const bool o0 = 1;
-    return ExclusiveSharedDecodeAndOperation(*this, ir, size, L, o0, Rs, Rn, Rt);
+    return ExclusiveSharedDecodeAndOperation(*this, ir, pair, size, L, o0, Rs, {}, Rn, Rt);
+}
+
+bool TranslatorVisitor::STXP(Imm<1> sz, Reg Rs, Reg Rt2, Reg Rn, Reg Rt) {
+    const bool pair = true;
+    const size_t size = concatenate(Imm<1>{1}, sz).ZeroExtend(); // 1:sz -> size is 2 or 3, i.e. 32- or 64-bit elements
+    const bool L = 0;
+    const bool o0 = 0;
+    return ExclusiveSharedDecodeAndOperation(*this, ir, pair, size, L, o0, Rs, Rt2, Rn, Rt);
+}
+
+bool TranslatorVisitor::STLXP(Imm<1> sz, Reg Rs, Reg Rt2, Reg Rn, Reg Rt) {
+    const bool pair = true;
+    const size_t size = concatenate(Imm<1>{1}, sz).ZeroExtend();
+    const bool L = 0;
+    const bool o0 = 1;
+    return ExclusiveSharedDecodeAndOperation(*this, ir, pair, size, L, o0, Rs, Rt2, Rn, Rt);
 }
 
 bool TranslatorVisitor::LDXR(Imm<2> sz, Reg Rn, Reg Rt) {
+    const bool pair = false;
     const size_t size = sz.ZeroExtend();
     const bool L = 1;
     const bool o0 = 0;
-    return ExclusiveSharedDecodeAndOperation(*this, ir, size, L, o0, {}, Rn, Rt);
+    return ExclusiveSharedDecodeAndOperation(*this, ir, pair, size, L, o0, {}, {}, Rn, Rt);
 }
 
 bool TranslatorVisitor::LDAXR(Imm<2> sz, Reg Rn, Reg Rt) {
+    const bool pair = false;
     const size_t size = sz.ZeroExtend();
     const bool L = 1;
     const bool o0 = 1;
-    return ExclusiveSharedDecodeAndOperation(*this, ir, size, L, o0, {}, Rn, Rt);
+    return ExclusiveSharedDecodeAndOperation(*this, ir, pair, size, L, o0, {}, {}, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDXP(Imm<1> sz, Reg Rt2, Reg Rn, Reg Rt) {
+    const bool pair = true;
+    const size_t size = concatenate(Imm<1>{1}, sz).ZeroExtend();
+    const bool L = 1;
+    const bool o0 = 0;
+    return ExclusiveSharedDecodeAndOperation(*this, ir, pair, size, L, o0, {}, Rt2, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDAXP(Imm<1> sz, Reg Rt2, Reg Rn, Reg Rt) {
+    const bool pair = true;
+    const size_t size = concatenate(Imm<1>{1}, sz).ZeroExtend();
+    const bool L = 1;
+    const bool o0 = 1;
+    return ExclusiveSharedDecodeAndOperation(*this, ir, pair, size, L, o0, {}, Rt2, Rn, Rt);
 }
 
 static bool OrderedSharedDecodeAndOperation(TranslatorVisitor& tv, size_t size, bool L, bool o0, Reg Rn, Reg Rt) {
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 4d33f33a..342815b1 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -38,6 +38,10 @@ U64 IREmitter::Pack2x32To1x64(const U32& lo, const U32& hi) {
     return Inst<U64>(Opcode::Pack2x32To1x64, lo, hi);
 }
 
+U128 IREmitter::Pack2x64To1x128(const U64& lo, const U64& hi) {
+    return Inst<U128>(Opcode::Pack2x64To1x128, lo, hi);
+}
+
 U32 IREmitter::LeastSignificantWord(const U64& value) {
     return Inst<U32>(Opcode::LeastSignificantWord, value);
 }
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 3fd1f569..1f94edff 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -68,6 +68,7 @@ public:
     void PushRSB(const LocationDescriptor& return_location);
 
     U64 Pack2x32To1x64(const U32& lo, const U32& hi);
+    U128 Pack2x64To1x128(const U64& lo, const U64& hi);
     U32 LeastSignificantWord(const U64& value);
     ResultAndCarry<U32> MostSignificantWord(const U64& value);
     U16 LeastSignificantHalf(U32U64 value);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 3796fbca..21a7003a 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -77,6 +77,7 @@ OPCODE(NZCVFromPackedFlags, T::NZCVFlags, T::U32
 
 // Calculations
 OPCODE(Pack2x32To1x64, T::U64, T::U32, T::U32 )
+OPCODE(Pack2x64To1x128, T::U128, T::U64, T::U64 )
 OPCODE(LeastSignificantWord, T::U32, T::U64 )
 OPCODE(MostSignificantWord, T::U32, T::U64 )
 OPCODE(LeastSignificantHalf, T::U16, T::U32 )