diff --git a/src/backend_x64/emit_x64.cpp b/src/backend_x64/emit_x64.cpp
index 5b9c8b7a..a5095e12 100644
--- a/src/backend_x64/emit_x64.cpp
+++ b/src/backend_x64/emit_x64.cpp
@@ -1242,22 +1242,51 @@ void EmitX64::EmitByteReverseDual(IR::Block&, IR::Inst* inst) {
     code->bswap(result);
 }
 
-static void EmitPackedOperation(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {
+void EmitX64::EmitPackedAddU8(IR::Block& block, IR::Inst* inst) {
+    auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
     IR::Value a = inst->GetArg(0);
     IR::Value b = inst->GetArg(1);
 
-    Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
-    Xbyak::Reg32 arg = reg_alloc.UseGpr(b).cvt32();
+    Xbyak::Reg32 reg_a = reg_alloc.UseScratchGpr(a).cvt32();
+    Xbyak::Reg32 reg_b = reg_alloc.UseScratchGpr(b).cvt32();
+    Xbyak::Reg32 result = reg_alloc.DefGpr(inst).cvt32();
+    Xbyak::Reg32 reg_ge, tmp;
 
-    Xbyak::Xmm xmm_scratch_a = reg_alloc.ScratchXmm();
-    Xbyak::Xmm xmm_scratch_b = reg_alloc.ScratchXmm();
+    if (ge_inst) {
+        EraseInstruction(block, ge_inst);
+        inst->DecrementRemainingUses();
 
-    code->movd(xmm_scratch_a, result);
-    code->movd(xmm_scratch_b, arg);
+        reg_ge = reg_alloc.DefGpr(ge_inst).cvt32();
+        tmp = reg_alloc.ScratchGpr().cvt32();
 
-    (code->*fn)(xmm_scratch_a, xmm_scratch_b);
+        code->mov(reg_ge, reg_a);
+        code->and_(reg_ge, reg_b);
+    }
 
-    code->movd(result, xmm_scratch_a);
+    // SWAR Arithmetic
+    code->mov(result, reg_a);
+    code->xor_(result, reg_b);
+    code->and_(result, 0x80808080);
+    code->and_(reg_a, 0x7F7F7F7F);
+    code->and_(reg_b, 0x7F7F7F7F);
+    code->add(reg_a, reg_b);
+    if (ge_inst) {
+        code->mov(tmp, result);
+        code->and_(tmp, reg_a);
+        code->or_(reg_ge, tmp);
+    }
+    code->xor_(result, reg_a);
+    if (ge_inst) {
+        if (cpu_info.has(Xbyak::util::Cpu::tBMI2)) {
+            code->mov(tmp, 0x80808080);
+            code->pext(reg_ge, reg_ge, tmp);
+        } else {
+            code->and_(reg_ge, 0x80808080);
+            code->imul(reg_ge, reg_ge, 0x0204081);
+            code->shr(reg_ge, 28);
+        }
+    }
 }
 
 void EmitX64::EmitPackedHalvingAddU8(IR::Block& block, IR::Inst* inst) {
@@ -1461,6 +1490,24 @@ void EmitX64::EmitPackedHalvingSubU16(IR::Block& block, IR::Inst* inst) {
     // minuend now contains the desired result.
 }
 
+static void EmitPackedOperation(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {
+    IR::Value a = inst->GetArg(0);
+    IR::Value b = inst->GetArg(1);
+
+    Xbyak::Reg32 result = reg_alloc.UseDefGpr(a, inst).cvt32();
+    Xbyak::Reg32 arg = reg_alloc.UseGpr(b).cvt32();
+
+    Xbyak::Xmm xmm_scratch_a = reg_alloc.ScratchXmm();
+    Xbyak::Xmm xmm_scratch_b = reg_alloc.ScratchXmm();
+
+    code->movd(xmm_scratch_a, result);
+    code->movd(xmm_scratch_b, arg);
+
+    (code->*fn)(xmm_scratch_a, xmm_scratch_b);
+
+    code->movd(result, xmm_scratch_a);
+}
+
 void EmitX64::EmitPackedSaturatedAddU8(IR::Block& block, IR::Inst* inst) {
     EmitPackedOperation(code, reg_alloc, inst, &Xbyak::CodeGenerator::paddusb);
 }
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index e557df1a..7d2d7120 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -324,6 +324,12 @@ Value IREmitter::ByteReverseDual(const Value& a) {
     return Inst(Opcode::ByteReverseDual, {a});
 }
 
+IREmitter::ResultAndGE IREmitter::PackedAddU8(const Value& a, const Value& b) {
+    auto result = Inst(Opcode::PackedAddU8, {a, b});
+    auto ge = Inst(Opcode::GetGEFromOp, {result});
+    return {result, ge};
+}
+
 Value IREmitter::PackedHalvingAddU8(const Value& a, const Value& b) {
     return Inst(Opcode::PackedHalvingAddU8, {a, b});
 }
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 5f8f7034..dca52ebc 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -49,6 +49,11 @@ public:
         Value overflow;
     };
 
+    struct ResultAndGE {
+        Value result;
+        Value ge;
+    };
+
     void Unimplemented();
     u32 PC();
     u32 AlignPC(size_t alignment);
@@ -122,6 +127,7 @@ public:
     Value ByteReverseWord(const Value& a);
     Value ByteReverseHalf(const Value& a);
     Value ByteReverseDual(const Value& a);
+    ResultAndGE PackedAddU8(const Value& a, const Value& b);
    Value PackedHalvingAddU8(const Value& a, const Value& b);
     Value PackedHalvingAddS8(const Value& a, const Value& b);
     Value PackedHalvingSubU8(const Value& a, const Value& b);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 4d742aa4..39eef49a 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -72,6 +72,7 @@ OPCODE(ZeroExtendByteToWord, T::U32, T::U8 )
 OPCODE(ByteReverseWord,      T::U32, T::U32         )
 OPCODE(ByteReverseHalf,      T::U16, T::U16         )
 OPCODE(ByteReverseDual,      T::U64, T::U64         )
+OPCODE(PackedAddU8,          T::U32, T::U32, T::U32 )
 OPCODE(PackedHalvingAddU8,   T::U32, T::U32, T::U32 )
 OPCODE(PackedHalvingAddS8,   T::U32, T::U32, T::U32 )
 OPCODE(PackedHalvingSubU8,   T::U32, T::U32, T::U32 )
diff --git a/src/frontend/translate/translate_arm/parallel.cpp b/src/frontend/translate/translate_arm/parallel.cpp
index 5930c7c4..a7ad61ab 100644
--- a/src/frontend/translate/translate_arm/parallel.cpp
+++ b/src/frontend/translate/translate_arm/parallel.cpp
@@ -35,7 +35,14 @@ bool ArmTranslatorVisitor::arm_SSUB16(Cond cond, Reg n, Reg d, Reg m) {
 }
 
 bool ArmTranslatorVisitor::arm_UADD8(Cond cond, Reg n, Reg d, Reg m) {
-    return InterpretThisInstruction();
+    if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
+        return UnpredictableInstruction();
+    if (ConditionPassed(cond)) {
+        auto result = ir.PackedAddU8(ir.GetRegister(n), ir.GetRegister(m));
+        ir.SetRegister(d, result.result);
+        ir.SetGEFlags(result.ge);
+    }
+    return true;
 }
 
 bool ArmTranslatorVisitor::arm_UADD16(Cond cond, Reg n, Reg d, Reg m) {
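
For anyone tracing the SWAR sequence in EmitPackedAddU8: masking both operands with 0x7F7F7F7F keeps the 7-bit partial sums from carrying across byte lanes; the lane MSBs are folded back in with XOR, and a lane's carry-out is majority(a7, b7, carry-into-bit-7), which is exactly what reg_ge accumulates as (a & b) | ((a ^ b) & low_sum). Below is a minimal standalone C++ sketch of the same arithmetic, for reference only — PackedAddU8Ref and PackedAddU8Reference are illustrative names, not code from this patch:

```cpp
#include <cassert>
#include <cstdint>

// Reference model of the SWAR byte-add emitted above. Illustrative only.
struct PackedAddU8Ref {
    std::uint32_t result; // four independent byte sums, modulo 256
    std::uint32_t ge;     // bits 0..3: carry-out of byte lanes 0..3 (the GE flags)
};

inline PackedAddU8Ref PackedAddU8Reference(std::uint32_t a, std::uint32_t b) {
    std::uint32_t ge = a & b;                                      // both MSBs set: guaranteed carry-out
    std::uint32_t msb_diff = (a ^ b) & 0x80808080u;                // exactly one lane MSB set
    std::uint32_t low_sum = (a & 0x7F7F7F7Fu) + (b & 0x7F7F7F7Fu); // 7-bit sums; carries stay in-lane
    ge |= msb_diff & low_sum;                                      // one MSB set plus a carry into bit 7
    std::uint32_t result = msb_diff ^ low_sum;                     // fold MSBs back in, discarding carry-out

    // Gather the carry bits from positions 7, 15, 23, 31 into bits 0..3
    // (the non-BMI2 path; PEXT with mask 0x80808080 does this in one step).
    ge = ((ge & 0x80808080u) * 0x0204081u) >> 28;
    return {result, ge};
}

int main() {
    // Lanes 0, 2 and 3 carry out (0x80+0x80, 0x01+0xFF, 0xFF+0x01); lane 1 (0x40+0x40) does not.
    PackedAddU8Ref r = PackedAddU8Reference(0xFF014080u, 0x01FF4080u);
    assert(r.result == 0x00008000u);
    assert(r.ge == 0xDu); // GE = 0b1101
    return 0;
}
```

The multiply constant works because each set bit of 0x0204081 (bits 0, 7, 14, 21) shifts a copy of the masked carry bits so that bits 7, 15, 23 and 31 all land in bits 28..31, where the final shr 28 picks them up in lane order.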
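
The relocated EmitPackedOperation helper is unchanged in behaviour: both operands are moved into XMM scratch registers, one packed SSE instruction does the per-lane work, and the low 32 bits are moved back out. For EmitPackedSaturatedAddU8 that instruction is PADDUSB, an unsigned saturating byte add. A hedged scalar sketch of the lane semantics on the four bytes the helper actually uses (function name illustrative, not from the patch):

```cpp
#include <algorithm>
#include <cstdint>

// Scalar model of what PADDUSB computes on the low four bytes; the patch
// emits the SSE instruction rather than scalar code like this.
inline std::uint32_t PackedSaturatedAddU8Reference(std::uint32_t a, std::uint32_t b) {
    std::uint32_t result = 0;
    for (int lane = 0; lane < 4; ++lane) {
        std::uint32_t lane_a = (a >> (lane * 8)) & 0xFFu;
        std::uint32_t lane_b = (b >> (lane * 8)) & 0xFFu;
        std::uint32_t sum = std::min<std::uint32_t>(lane_a + lane_b, 0xFFu); // clamp at 255
        result |= sum << (lane * 8);
    }
    return result;
}
```

For example, PackedSaturatedAddU8Reference(0xFF014080u, 0x01FF4080u) yields 0xFFFF80FFu: every lane except lane 1 saturates to 255 instead of wrapping.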