IR: replace VectorShuffleWords with VectorRotateWholeVectorRight

VectorShuffleWords took a raw pshufd-style word mask. Every caller touched by
this patch used one of three masks, and each is replaced by a rotation amount
given in bits:

    0b00111001 -> 32
    0b01001110 -> 64
    0b10010011 -> 96

The IR emitter asserts the amount is a multiple of 32. The x64 backend derives
the pshufd immediate by rotating the identity mask 0b11100100 right by two
bits per 32-bit word. The arm64 backend emits EXT with the amount converted to
a byte index, replacing an "Unimplemented" stub (presumably the
VectorShuffleWords one, judging by its position in the opcode order).

NOTE(review): indentation and blank context lines in this patch are
reconstructed from the hunk line counts; the four context #include names in
emit_x64_vector.cpp (<algorithm>, <bitset>, <cstdlib>, <type_traits>), the
Inst<U128> template argument and the column padding in opcodes.inc are
assumed. Confirm against the tree before applying.

diff --git a/src/dynarmic/backend/arm64/emit_arm64_vector.cpp b/src/dynarmic/backend/arm64/emit_arm64_vector.cpp
index 770bdc51..7f68ffec 100644
--- a/src/dynarmic/backend/arm64/emit_arm64_vector.cpp
+++ b/src/dynarmic/backend/arm64/emit_arm64_vector.cpp
@@ -1321,6 +1321,15 @@ void EmitIR<IR::Opcode::VectorReduceAdd64>(oaknut::CodeGenerator& code, EmitCont
     EmitReduce<64>(code, ctx, inst, [&](auto& Dresult, auto Voperand) { code.ADDP(Dresult, Voperand); });
 }
 
+template<>
+void EmitIR<IR::Opcode::VectorRotateWholeVectorRight>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+    EmitImmShift<8>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) {
+        ASSERT(shift_amount % 8 == 0);
+        const u8 ext_imm = (shift_amount % 128) / 8;
+        code.EXT(Vresult, Voperand, Voperand, ext_imm);
+    });
+}
+
 template<>
 void EmitIR<IR::Opcode::VectorRoundingHalvingAddS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
     EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRHADD(Vresult, Va, Vb); });
@@ -1391,14 +1400,6 @@ void EmitIR<IR::Opcode::VectorRoundingShiftLeftU64>(oaknut::CodeGenerator& code,
     EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URSHL(Vresult, Va, Vb); });
 }
 
-template<>
-void EmitIR<IR::Opcode::VectorShuffleWords>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
-    (void)code;
-    (void)ctx;
-    (void)inst;
-    ASSERT_FALSE("Unimplemented");
-}
-
 template<>
 void EmitIR<IR::Opcode::VectorSignExtend8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
     EmitTwoOpArrangedWiden<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SXTL(Vresult, Voperand); });
diff --git a/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/backend/x64/emit_x64_vector.cpp
index 156ee13d..4de61d47 100644
--- a/src/dynarmic/backend/x64/emit_x64_vector.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector.cpp
@@ -4,6 +4,7 @@
  */
 
 #include <algorithm>
+#include <bit>
 #include <bitset>
 #include <cstdlib>
 #include <type_traits>
@@ -3320,6 +3321,20 @@ void EmitX64::EmitVectorReduceAdd64(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(inst, data);
 }
 
+void EmitX64::EmitVectorRotateWholeVectorRight(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+    const u8 shift_amount = args[1].GetImmediateU8();
+    ASSERT(shift_amount % 32 == 0);
+    const u8 shuffle_imm = std::rotr<u8>(0b11100100, shift_amount / 32 * 2);
+
+    code.pshufd(result, operand, shuffle_imm);
+
+    ctx.reg_alloc.DefineValue(inst, result);
+}
+
 static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
@@ -3501,22 +3516,6 @@ void EmitX64::EmitVectorRoundingShiftLeftU64(EmitContext& ctx, IR::Inst* inst) {
     });
 }
 
-static void VectorShuffleImpl(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx&, const Xbyak::Operand&, u8)) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
-    const u8 mask = args[1].GetImmediateU8();
-
-    (code.*fn)(result, operand, mask);
-
-    ctx.reg_alloc.DefineValue(inst, result);
-}
-
-void EmitX64::EmitVectorShuffleWords(EmitContext& ctx, IR::Inst* inst) {
-    VectorShuffleImpl(code, ctx, inst, &Xbyak::CodeGenerator::pshufd);
-}
-
 void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::SSE41)) {
diff --git a/src/dynarmic/frontend/A64/translate/impl/simd_permute.cpp b/src/dynarmic/frontend/A64/translate/impl/simd_permute.cpp
index 95f3a37e..06e15273 100644
--- a/src/dynarmic/frontend/A64/translate/impl/simd_permute.cpp
+++ b/src/dynarmic/frontend/A64/translate/impl/simd_permute.cpp
@@ -103,7 +103,7 @@ bool TranslatorVisitor::ZIP2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
 
         // TODO: Urgh.
         const IR::U128 interleaved = ir.VectorInterleaveLower(esize, operand1, operand2);
-        return ir.VectorZeroUpper(ir.VectorShuffleWords(interleaved, 0b01001110));
+        return ir.VectorZeroUpper(ir.VectorRotateWholeVectorRight(interleaved, 64));
     }();
 
     V(datasize, Vd, result);
diff --git a/src/dynarmic/frontend/A64/translate/impl/simd_sha.cpp b/src/dynarmic/frontend/A64/translate/impl/simd_sha.cpp
index 7d586b3a..795bfcdb 100644
--- a/src/dynarmic/frontend/A64/translate/impl/simd_sha.cpp
+++ b/src/dynarmic/frontend/A64/translate/impl/simd_sha.cpp
@@ -39,7 +39,7 @@ IR::U128 SHA1HashUpdate(IREmitter& ir, Vec Vm, Vec Vn, Vec Vd, SHA1HashUpdateFun
 
         // Move each 32-bit element to the left once
         // e.g. [3, 2, 1, 0], becomes [2, 1, 0, 3]
-        const IR::U128 shuffled_x = ir.VectorShuffleWords(x, 0b10010011);
+        const IR::U128 shuffled_x = ir.VectorRotateWholeVectorRight(x, 96);
         x = ir.VectorSetElement(32, shuffled_x, 0, y);
         y = high_x;
     }
@@ -91,7 +91,7 @@ bool TranslatorVisitor::SHA1SU1(Vec Vn, Vec Vd) {
     const IR::U128 n = ir.GetQ(Vn);
 
     // Shuffle down the whole vector and zero out the top 32 bits
-    const IR::U128 shuffled_n = ir.VectorSetElement(32, ir.VectorShuffleWords(n, 0b00111001), 3, ir.Imm32(0));
+    const IR::U128 shuffled_n = ir.VectorSetElement(32, ir.VectorRotateWholeVectorRight(n, 32), 3, ir.Imm32(0));
     const IR::U128 t = ir.VectorEor(d, shuffled_n);
     const IR::U128 rotated_t = ir.VectorRotateLeft(32, t, 1);
 
diff --git a/src/dynarmic/frontend/A64/translate/impl/simd_sha512.cpp b/src/dynarmic/frontend/A64/translate/impl/simd_sha512.cpp
index 2c63a87c..766109bf 100644
--- a/src/dynarmic/frontend/A64/translate/impl/simd_sha512.cpp
+++ b/src/dynarmic/frontend/A64/translate/impl/simd_sha512.cpp
@@ -141,7 +141,7 @@ IR::U128 SM4Hash(IREmitter& ir, Vec Vn, Vec Vd, SM4RotationType type) {
         const IR::U32 intval_low_word = ir.VectorGetElement(32, intval_vec, 0);
         const IR::U32 round_result_low_word = ir.VectorGetElement(32, roundresult, 0);
         const IR::U32 intval = SM4Rotation(ir, intval_low_word, round_result_low_word, type);
-        roundresult = ir.VectorShuffleWords(roundresult, 0b00111001);
+        roundresult = ir.VectorRotateWholeVectorRight(roundresult, 32);
         roundresult = ir.VectorSetElement(32, roundresult, 3, intval);
     }
 
@@ -235,7 +235,7 @@ bool TranslatorVisitor::SM3PARTW1(Vec Vm, Vec Vn, Vec Vd) {
 
     const IR::U128 result_low_three_words = [&] {
         // Move the top-most 3 words down one element (i.e. [3, 2, 1, 0] -> [0, 3, 2, 1])
-        const IR::U128 shuffled_m = ir.VectorShuffleWords(m, 0b00111001);
+        const IR::U128 shuffled_m = ir.VectorRotateWholeVectorRight(m, 32);
 
         // We treat the uppermost word as junk data and don't touch/use it explicitly for now.
         // Given we don't do anything with it yet, the fact we EOR into it doesn't matter.
diff --git a/src/dynarmic/ir/ir_emitter.cpp b/src/dynarmic/ir/ir_emitter.cpp
index 525a78a3..8c12c3a5 100644
--- a/src/dynarmic/ir/ir_emitter.cpp
+++ b/src/dynarmic/ir/ir_emitter.cpp
@@ -1695,6 +1695,11 @@ U128 IREmitter::VectorRotateRight(size_t esize, const U128& a, u8 amount) {
                     VectorLogicalShiftLeft(esize, a, static_cast<u8>(esize - amount)));
 }
 
+U128 IREmitter::VectorRotateWholeVectorRight(const U128& a, u8 amount) {
+    ASSERT(amount % 32 == 0);
+    return Inst<U128>(Opcode::VectorRotateWholeVectorRight, a, Imm8(amount));
+}
+
 U128 IREmitter::VectorRoundingHalvingAddSigned(size_t esize, const U128& a, const U128& b) {
     switch (esize) {
     case 8:
@@ -1751,10 +1756,6 @@ U128 IREmitter::VectorRoundingShiftLeftUnsigned(size_t esize, const U128& a, con
     UNREACHABLE();
 }
 
-U128 IREmitter::VectorShuffleWords(const U128& a, u8 mask) {
-    return Inst<U128>(Opcode::VectorShuffleWords, a, mask);
-}
-
 U128 IREmitter::VectorSignExtend(size_t original_esize, const U128& a) {
     switch (original_esize) {
     case 8:
diff --git a/src/dynarmic/ir/ir_emitter.h b/src/dynarmic/ir/ir_emitter.h
index f679a344..8a12c940 100644
--- a/src/dynarmic/ir/ir_emitter.h
+++ b/src/dynarmic/ir/ir_emitter.h
@@ -291,11 +291,11 @@ public:
     U128 VectorReduceAdd(size_t esize, const U128& a);
     U128 VectorRotateLeft(size_t esize, const U128& a, u8 amount);
     U128 VectorRotateRight(size_t esize, const U128& a, u8 amount);
+    U128 VectorRotateWholeVectorRight(const U128& a, u8 amount);
     U128 VectorRoundingHalvingAddSigned(size_t esize, const U128& a, const U128& b);
     U128 VectorRoundingHalvingAddUnsigned(size_t esize, const U128& a, const U128& b);
     U128 VectorRoundingShiftLeftSigned(size_t esize, const U128& a, const U128& b);
     U128 VectorRoundingShiftLeftUnsigned(size_t esize, const U128& a, const U128& b);
-    U128 VectorShuffleWords(const U128& a, u8 mask);
     U128 VectorSignExtend(size_t original_esize, const U128& a);
     U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
     UpperAndLower VectorSignedMultiply(size_t esize, const U128& a, const U128& b);
diff --git a/src/dynarmic/ir/opcodes.inc b/src/dynarmic/ir/opcodes.inc
index add9aa93..114ad333 100644
--- a/src/dynarmic/ir/opcodes.inc
+++ b/src/dynarmic/ir/opcodes.inc
@@ -465,6 +465,7 @@ OPCODE(VectorReduceAdd8,                                    U128,           U128
 OPCODE(VectorReduceAdd16,                                   U128,           U128                                            )
 OPCODE(VectorReduceAdd32,                                   U128,           U128                                            )
 OPCODE(VectorReduceAdd64,                                   U128,           U128                                            )
+OPCODE(VectorRotateWholeVectorRight,                        U128,           U128,           U8                              )
 OPCODE(VectorRoundingHalvingAddS8,                          U128,           U128,           U128                            )
 OPCODE(VectorRoundingHalvingAddS16,                         U128,           U128,           U128                            )
 OPCODE(VectorRoundingHalvingAddS32,                         U128,           U128,           U128                            )
@@ -479,7 +480,6 @@ OPCODE(VectorRoundingShiftLeftU8,                           U128,           U128
 OPCODE(VectorRoundingShiftLeftU16,                          U128,           U128,           U128                            )
 OPCODE(VectorRoundingShiftLeftU32,                          U128,           U128,           U128                            )
 OPCODE(VectorRoundingShiftLeftU64,                          U128,           U128,           U128                            )
-OPCODE(VectorShuffleWords,                                  U128,           U128,           U8                              )
 OPCODE(VectorSignExtend8,                                   U128,           U128                                            )
 OPCODE(VectorSignExtend16,                                  U128,           U128                                            )
 OPCODE(VectorSignExtend32,                                  U128,           U128                                            )
diff --git a/src/dynarmic/ir/opt/polyfill_pass.cpp b/src/dynarmic/ir/opt/polyfill_pass.cpp
index 6aac3b39..1aa3aea9 100644
--- a/src/dynarmic/ir/opt/polyfill_pass.cpp
+++ b/src/dynarmic/ir/opt/polyfill_pass.cpp
@@ -45,7 +45,7 @@ void PolyfillSHA256MessageSchedule1(IR::IREmitter& ir, IR::Inst& inst) {
     const IR::U128 T0 = ir.VectorExtract(y, z, 32);
 
     const IR::U128 lower_half = [&] {
-        const IR::U128 T = ir.VectorShuffleWords(z, 0b01001110);
+        const IR::U128 T = ir.VectorRotateWholeVectorRight(z, 64);
         const IR::U128 tmp1 = ir.VectorRotateRight(32, T, 17);
         const IR::U128 tmp2 = ir.VectorRotateRight(32, T, 19);
         const IR::U128 tmp3 = ir.VectorLogicalShiftRight(32, T, 10);
@@ -61,8 +61,8 @@ void PolyfillSHA256MessageSchedule1(IR::IREmitter& ir, IR::Inst& inst) {
         const IR::U128 tmp4 = ir.VectorEor(tmp1, ir.VectorEor(tmp2, tmp3));
 
         // Shuffle the top two 32-bit elements downwards [3, 2, 1, 0] -> [1, 0, 3, 2]
-        const IR::U128 shuffled_d = ir.VectorShuffleWords(x, 0b01001110);
-        const IR::U128 shuffled_T0 = ir.VectorShuffleWords(T0, 0b01001110);
+        const IR::U128 shuffled_d = ir.VectorRotateWholeVectorRight(x, 64);
+        const IR::U128 shuffled_T0 = ir.VectorRotateWholeVectorRight(T0, 64);
 
         const IR::U128 tmp5 = ir.VectorAdd(32, tmp4, ir.VectorAdd(32, shuffled_d, shuffled_T0));
         return ir.VectorGetElement(64, tmp5, 0);
@@ -128,8 +128,8 @@ void PolyfillSHA256Hash(IR::IREmitter& ir, IR::Inst& inst) {
         const IR::U32 new_low_y = ir.Add(t, high_x);
 
         // Shuffle all words left by 1 element: [3, 2, 1, 0] -> [2, 1, 0, 3]
-        const IR::U128 shuffled_x = ir.VectorShuffleWords(x, 0b10010011);
-        const IR::U128 shuffled_y = ir.VectorShuffleWords(y, 0b10010011);
+        const IR::U128 shuffled_x = ir.VectorRotateWholeVectorRight(x, 96);
+        const IR::U128 shuffled_y = ir.VectorRotateWholeVectorRight(y, 96);
 
         x = ir.VectorSetElement(32, shuffled_x, 0, new_low_x);
         y = ir.VectorSetElement(32, shuffled_y, 0, new_low_y);