diff --git a/src/backend_x64/emit_x64.cpp b/src/backend_x64/emit_x64.cpp
index 0cccec22..61f9d006 100644
--- a/src/backend_x64/emit_x64.cpp
+++ b/src/backend_x64/emit_x64.cpp
@@ -1341,6 +1341,62 @@ void EmitX64::EmitPackedHalvingAddU16(IR::Block& block, IR::Inst* inst) {
     code->add(result, xor_a_b);
 }
 
+void EmitX64::EmitPackedHalvingAddS8(IR::Block& block, IR::Inst* inst) {
+    IR::Value a = inst->GetArg(0);
+    IR::Value b = inst->GetArg(1);
+
+    Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32();
+    Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32();
+    Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32();
+    Xbyak::Reg32 and_a_b = reg_a;
+    Xbyak::Reg32 result = reg_a;
+    Xbyak::Reg32 carry = reg_alloc.ScratchGpr().cvt32();
+
+    // This relies on the equality x+y == ((x&y) << 1) + (x^y).
+    // Note that x^y always contains the LSB of the result.
+    // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
+    // We mask by 0x7F to remove the LSB so that it doesn't leak into the field below.
+    // carry propagates the sign bit from (x^y)>>1 upwards by one.
+
+    code->mov(xor_a_b, reg_a);
+    code->and(and_a_b, reg_b);
+    code->xor(xor_a_b, reg_b);
+    code->mov(carry, xor_a_b);
+    code->and(carry, 0x80808080);
+    code->shr(xor_a_b, 1);
+    code->and(xor_a_b, 0x7F7F7F7F);
+    code->add(result, xor_a_b);
+    code->xor(result, carry);
+}
+
+void EmitX64::EmitPackedHalvingAddS16(IR::Block& block, IR::Inst* inst) {
+    IR::Value a = inst->GetArg(0);
+    IR::Value b = inst->GetArg(1);
+
+    Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32();
+    Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32();
+    Xbyak::Reg32 xor_a_b = reg_alloc.ScratchGpr().cvt32();
+    Xbyak::Reg32 and_a_b = reg_a;
+    Xbyak::Reg32 result = reg_a;
+    Xbyak::Reg32 carry = reg_alloc.ScratchGpr().cvt32();
+
+    // This relies on the equality x+y == ((x&y) << 1) + (x^y).
+    // Note that x^y always contains the LSB of the result.
+    // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
+    // We mask by 0x7FFF to remove the LSB so that it doesn't leak into the field below.
+    // carry propagates the sign bit from (x^y)>>1 upwards by one.
+
+    code->mov(xor_a_b, reg_a);
+    code->and(and_a_b, reg_b);
+    code->xor(xor_a_b, reg_b);
+    code->mov(carry, xor_a_b);
+    code->and(carry, 0x80008000);
+    code->shr(xor_a_b, 1);
+    code->and(xor_a_b, 0x7FFF7FFF);
+    code->add(result, xor_a_b);
+    code->xor(result, carry);
+}
+
 void EmitX64::EmitPackedSaturatedAddU8(IR::Block& block, IR::Inst* inst) {
     EmitPackedOperation(code, reg_alloc, inst, &Xbyak::CodeGenerator::paddusb);
 }
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 418f4111..86eaee17 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -321,13 +321,21 @@ Value IREmitter::ByteReverseDual(const Value& a) {
 }
 
 Value IREmitter::PackedHalvingAddU8(const Value& a, const Value& b) {
-    return Inst(Opcode::PackedHalvingAddU8, { a, b });
+    return Inst(Opcode::PackedHalvingAddU8, {a, b});
+}
+
+Value IREmitter::PackedHalvingAddS8(const Value& a, const Value& b) {
+    return Inst(Opcode::PackedHalvingAddS8, {a, b});
 }
 
 Value IREmitter::PackedHalvingAddU16(const Value& a, const Value& b) {
     return Inst(Opcode::PackedHalvingAddU16, {a, b});
 }
 
+Value IREmitter::PackedHalvingAddS16(const Value& a, const Value& b) {
+    return Inst(Opcode::PackedHalvingAddS16, {a, b});
+}
+
 Value IREmitter::PackedSaturatedAddU8(const Value& a, const Value& b) {
     return Inst(Opcode::PackedSaturatedAddU8, {a, b});
 }
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 7e2b66ee..0d421970 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -122,7 +122,9 @@ public:
     Value ByteReverseHalf(const Value& a);
     Value ByteReverseDual(const Value& a);
     Value PackedHalvingAddU8(const Value& a, const Value& b);
+    Value PackedHalvingAddS8(const Value& a, const Value& b);
     Value PackedHalvingAddU16(const Value& a, const Value& b);
+    Value PackedHalvingAddS16(const Value& a, const Value& b);
     Value PackedSaturatedAddU8(const Value& a, const Value& b);
     Value PackedSaturatedAddS8(const Value& a, const Value& b);
     Value PackedSaturatedSubU8(const Value& a, const Value& b);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index b9c9d6d0..7b9b0d52 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -72,7 +72,9 @@ OPCODE(ByteReverseWord, T::U32, T::U32
 OPCODE(ByteReverseHalf,         T::U16,     T::U16                          )
 OPCODE(ByteReverseDual,         T::U64,     T::U64                          )
 OPCODE(PackedHalvingAddU8,      T::U32,     T::U32,     T::U32              )
+OPCODE(PackedHalvingAddS8,      T::U32,     T::U32,     T::U32              )
 OPCODE(PackedHalvingAddU16,     T::U32,     T::U32,     T::U32              )
+OPCODE(PackedHalvingAddS16,     T::U32,     T::U32,     T::U32              )
 OPCODE(PackedSaturatedAddU8,    T::U32,     T::U32,     T::U32              )
 OPCODE(PackedSaturatedAddS8,    T::U32,     T::U32,     T::U32              )
 OPCODE(PackedSaturatedSubU8,    T::U32,     T::U32,     T::U32              )
diff --git a/src/frontend/translate/translate_arm/parallel.cpp b/src/frontend/translate/translate_arm/parallel.cpp
index dd7bd0b8..cb4dadef 100644
--- a/src/frontend/translate/translate_arm/parallel.cpp
+++ b/src/frontend/translate/translate_arm/parallel.cpp
@@ -155,11 +155,23 @@ bool ArmTranslatorVisitor::arm_UQSUB16(Cond cond, Reg n, Reg d, Reg m) {
 
 // Parallel Add/Subtract (Halving) instructions
 bool ArmTranslatorVisitor::arm_SHADD8(Cond cond, Reg n, Reg d, Reg m) {
-    return InterpretThisInstruction();
+    if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
+        return UnpredictableInstruction();
+    if (ConditionPassed(cond)) {
+        auto result = ir.PackedHalvingAddS8(ir.GetRegister(n), ir.GetRegister(m));
+        ir.SetRegister(d, result);
+    }
+    return true;
 }
 
 bool ArmTranslatorVisitor::arm_SHADD16(Cond cond, Reg n, Reg d, Reg m) {
-    return InterpretThisInstruction();
+    if (d == Reg::PC || n == Reg::PC || m == Reg::PC)
+        return UnpredictableInstruction();
+    if (ConditionPassed(cond)) {
+        auto result = ir.PackedHalvingAddS16(ir.GetRegister(n), ir.GetRegister(m));
+        ir.SetRegister(d, result);
+    }
+    return true;
 }
 
 bool ArmTranslatorVisitor::arm_SHASX(Cond cond, Reg n, Reg d, Reg m) {
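
An aside on the bit trick used by EmitPackedHalvingAddS8/EmitPackedHalvingAddS16 above: the scalar sketch below (not part of the patch; HalvingAddS8Model and HalvingAddS8Reference are made-up names for illustration) replays the emitted x64 sequence in plain C++ on four signed 8-bit lanes packed in a u32, and checks it against a straightforward per-lane signed reference.

#include <cstdint>
#include <cstdio>

// Scalar model of the emitted sequence: (a&b) + ((a^b)>>1) per lane,
// with the bits leaked from the lane above masked off and the lane
// sign bits patched back in by xoring with `carry`.
static uint32_t HalvingAddS8Model(uint32_t a, uint32_t b) {
    uint32_t xor_a_b = a ^ b;
    uint32_t and_a_b = a & b;
    uint32_t carry = xor_a_b & 0x80808080u;   // lanes where the sign bits of a and b differ
    xor_a_b = (xor_a_b >> 1) & 0x7F7F7F7Fu;   // per-lane (a^b)>>1
    return (and_a_b + xor_a_b) ^ carry;       // unsigned halving add, then fix lane sign bits
}

// Per-lane reference with SHADD8 semantics: (SInt(x) + SInt(y)) >> 1,
// truncated to 8 bits. Assumes >> on a negative int is an arithmetic
// shift, which holds on mainstream compilers.
static uint32_t HalvingAddS8Reference(uint32_t a, uint32_t b) {
    uint32_t result = 0;
    for (int lane = 0; lane < 4; ++lane) {
        int x = static_cast<int8_t>(a >> (lane * 8));
        int y = static_cast<int8_t>(b >> (lane * 8));
        result |= static_cast<uint32_t>(static_cast<uint8_t>((x + y) >> 1)) << (lane * 8);
    }
    return result;
}

int main() {
    const uint32_t tests[][2] = {
        {0x7F80FF01u, 0x0180FF01u},  // lanes: 127+1, -128+-128, -1+-1, 1+1
        {0x12345678u, 0x9ABCDEF0u},  // mixed-sign lanes
    };
    for (const auto& t : tests) {
        uint32_t got = HalvingAddS8Model(t[0], t[1]);
        uint32_t want = HalvingAddS8Reference(t[0], t[1]);
        std::printf("%08X + %08X -> %08X (ref %08X) %s\n",
                    t[0], t[1], got, want, got == want ? "ok" : "MISMATCH");
    }
    return 0;
}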
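As a worked trace of the carry fix-up, take one lane with x = 0xFF (-1) and y = 0x01 (+1): x^y = 0xFE, so carry keeps 0x80 (the sign bits differ); (x^y)>>1 masked by 0x7F is 0x7F; x&y = 0x01; 0x01 + 0x7F = 0x80; and the final xor with 0x80 gives 0x00, which is indeed (-1 + 1)/2. Without that xor, the purely unsigned halving used by the existing U8/U16 handlers would return 0x80 here. The S16 path is identical apart from the lane width of the masks (0x80008000 and 0x7FFF7FFF).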