diff --git a/src/backend_x64/emit_x64.cpp b/src/backend_x64/emit_x64.cpp
index 3bd0e9b4..9f1f936a 100644
--- a/src/backend_x64/emit_x64.cpp
+++ b/src/backend_x64/emit_x64.cpp
@@ -7,8 +7,6 @@
 #include
 #include
 
-#include
-
 #include "backend_x64/abi.h"
 #include "backend_x64/emit_x64.h"
 #include "backend_x64/jitstate.h"
@@ -1258,6 +1256,68 @@ static void EmitPackedOperation(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst
     code->movd(result, xmm_scratch_a);
 }
 
+void EmitX64::EmitPackedHalvingAddU8(IR::Block& block, IR::Inst* inst) {
+    IR::Value a = inst->GetArg(0);
+    IR::Value b = inst->GetArg(1);
+
+    // This code path requires SSSE3 because of the PSHUFB instruction.
+    // A fallback implementation is provided below.
+    if (cpu_info.has(Xbyak::util::Cpu::tSSSE3)) {
+        Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
+        Xbyak::Reg32 arg = reg_alloc.UseGpr(b).cvt32();
+
+        // Load the operands into Xmm registers
+        Xbyak::Xmm xmm_scratch_a = reg_alloc.ScratchXmm();
+        Xbyak::Xmm xmm_scratch_b = reg_alloc.ScratchXmm();
+
+        Xbyak::Xmm xmm_mask = reg_alloc.ScratchXmm();
+        Xbyak::Reg64 mask = reg_alloc.ScratchGpr();
+
+        code->movd(xmm_scratch_a, result);
+        code->movd(xmm_scratch_b, arg);
+
+        // Set the mask to expand the values
+        // 0xAABBCCDD becomes 0x00AA00BB00CC00DD
+        code->mov(mask, 0x8003800280018000);
+        code->movq(xmm_mask, mask);
+
+        // Expand each 8-bit value to 16-bit
+        code->pshufb(xmm_scratch_a, xmm_mask);
+        code->pshufb(xmm_scratch_b, xmm_mask);
+
+        // Add the individual 16-bit values
+        code->paddw(xmm_scratch_a, xmm_scratch_b);
+
+        // Shift the 16-bit values to the right to halve them
+        code->psrlw(xmm_scratch_a, 1);
+
+        // Set the mask to pack the values again
+        // 0x00AA00BB00CC00DD becomes 0xAABBCCDD
+        code->mov(mask, 0x06040200);
+        code->movq(xmm_mask, mask);
+
+        // Shuffle them back to 8-bit values
+        code->pshufb(xmm_scratch_a, xmm_mask);
+
+        code->movd(result, xmm_scratch_a);
+        return;
+    }
+
+    // Fallback implementation in case the CPU doesn't support SSSE3
+    Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32();
+    Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32();
+    Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32();
+    Xbyak::Reg32 and_a_b = reg_a;
+    Xbyak::Reg32 result = reg_a;
+
+    code->mov(xor_a_b, reg_a);
+    code->and_(and_a_b, reg_b);
+    code->xor_(xor_a_b, reg_b);
+    code->shr(xor_a_b, 1);
+    code->and_(xor_a_b, 0x7F7F7F7F);
+    code->add(result, xor_a_b);
+}
+
 void EmitX64::EmitPackedSaturatedAddU8(IR::Block& block, IR::Inst* inst) {
     EmitPackedOperation(code, reg_alloc, inst, &Xbyak::CodeGenerator::paddusb);
 }
diff --git a/src/backend_x64/emit_x64.h b/src/backend_x64/emit_x64.h
index 2d046020..1d3d61e1 100644
--- a/src/backend_x64/emit_x64.h
+++ b/src/backend_x64/emit_x64.h
@@ -11,6 +11,8 @@
 #include
 
+#include <xbyak_util.h>
+
 #include "backend_x64/block_of_code.h"
 #include "backend_x64/reg_alloc.h"
 #include "dynarmic/callbacks.h"
@@ -76,6 +78,9 @@ private:
     void EmitTerminalCheckHalt(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location);
     void Patch(IR::LocationDescriptor desc, CodePtr bb);
 
+    // Global CPU information
+    Xbyak::util::Cpu cpu_info;
+
     // Per-block state
     RegAlloc reg_alloc;
 
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index ab459e3b..188c21c1 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -320,6 +320,10 @@ Value IREmitter::ByteReverseDual(const Value& a) {
     return Inst(Opcode::ByteReverseDual, {a});
 }
 
+Value IREmitter::PackedHalvingAddU8(const Value& a, const Value& b) {
+    return Inst(Opcode::PackedHalvingAddU8, {a, b});
+}
+
 Value IREmitter::PackedSaturatedAddU8(const Value& a, const Value& b) {
     return Inst(Opcode::PackedSaturatedAddU8, {a, b});
 }
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 32b1a51b..43a99089 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -121,6 +121,7 @@
     Value ByteReverseWord(const Value& a);
     Value ByteReverseHalf(const Value& a);
     Value ByteReverseDual(const Value& a);
+    Value PackedHalvingAddU8(const Value& a, const Value& b);
     Value PackedSaturatedAddU8(const Value& a, const Value& b);
     Value PackedSaturatedAddS8(const Value& a, const Value& b);
     Value PackedSaturatedSubU8(const Value& a, const Value& b);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 171e7a94..92745e9d 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -71,6 +71,7 @@ OPCODE(ZeroExtendByteToWord,    T::U32,    T::U8                  )
 OPCODE(ByteReverseWord,         T::U32,    T::U32                 )
 OPCODE(ByteReverseHalf,         T::U16,    T::U16                 )
 OPCODE(ByteReverseDual,         T::U64,    T::U64                 )
+OPCODE(PackedHalvingAddU8,      T::U32,    T::U32, T::U32         )
 OPCODE(PackedSaturatedAddU8,    T::U32,    T::U32, T::U32         )
 OPCODE(PackedSaturatedAddS8,    T::U32,    T::U32, T::U32         )
 OPCODE(PackedSaturatedSubU8,    T::U32,    T::U32, T::U32         )
diff --git a/src/frontend/translate/translate_arm/parallel.cpp b/src/frontend/translate/translate_arm/parallel.cpp
index 8dffd567..afad6823 100644
--- a/src/frontend/translate/translate_arm/parallel.cpp
+++ b/src/frontend/translate/translate_arm/parallel.cpp
@@ -179,7 +179,13 @@ bool ArmTranslatorVisitor::arm_SHSUB16(Cond cond, Reg n, Reg d, Reg m) {
 }
 
 bool ArmTranslatorVisitor::arm_UHADD8(Cond cond, Reg n, Reg d, Reg m) {
-    return InterpretThisInstruction();
+    if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
+        return UnpredictableInstruction();
+    if (ConditionPassed(cond)) {
+        auto result = ir.PackedHalvingAddU8(ir.GetRegister(n), ir.GetRegister(m));
+        ir.SetRegister(d, result);
+    }
+    return true;
 }
 
 bool ArmTranslatorVisitor::arm_UHADD16(Cond cond, Reg n, Reg d, Reg m) {
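
For reference, both code paths above implement the ARM UHADD8 semantics: each unsigned byte of the result is the truncated average (x + y) / 2 of the corresponding input bytes, computed without intermediate overflow. The sketch below is not part of the diff; the function names are illustrative, the SSSE3 variant assumes a compiler that provides <tmmintrin.h> and a host CPU that supports SSSE3, and it merely restates the two strategies in plain C++ so it can double as a test oracle for the emitted code.

#include <cstdint>
#include <tmmintrin.h> // SSSE3 intrinsics; also pulls in the SSE2 ones used below

// Scalar reference, mirroring the non-SSSE3 fallback:
//   (a & b) + (((a ^ b) >> 1) & 0x7F7F7F7F)
// Since x + y == (x ^ y) + 2*(x & y), per byte (x + y) >> 1 == (x & y) + ((x ^ y) >> 1);
// the 0x7F7F7F7F mask clears the bit that each byte would otherwise inherit
// from its neighbour when the whole 32-bit value is shifted right by one.
inline std::uint32_t HalvingAddU8Scalar(std::uint32_t a, std::uint32_t b) {
    const std::uint32_t common = a & b;     // bits set in both operands
    const std::uint32_t differing = a ^ b;  // bits set in exactly one operand
    return common + ((differing >> 1) & 0x7F7F7F7Fu);
}

// Intrinsics sketch of the SSSE3 strategy used by the JIT: PSHUFB widens each
// byte to a 16-bit lane, PADDW adds without overflow, PSRLW halves, and a
// second PSHUFB packs the low byte of each lane back into a 32-bit result.
inline std::uint32_t HalvingAddU8Ssse3(std::uint32_t a, std::uint32_t b) {
    const __m128i expand = _mm_set_epi64x(0, static_cast<long long>(0x8003800280018000ULL));
    const __m128i pack = _mm_set_epi64x(0, 0x0000000006040200LL);
    const __m128i va = _mm_shuffle_epi8(_mm_cvtsi32_si128(static_cast<int>(a)), expand);
    const __m128i vb = _mm_shuffle_epi8(_mm_cvtsi32_si128(static_cast<int>(b)), expand);
    const __m128i halved = _mm_srli_epi16(_mm_add_epi16(va, vb), 1);
    return static_cast<std::uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi8(halved, pack)));
}

For example, HalvingAddU8Scalar(0xFF01FF01, 0xFF03FF03) and the SSSE3 variant both yield 0xFF02FF02, whereas a plain 32-bit (a + b) / 2 would not, because the per-byte sums overflow into the neighbouring bytes.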