diff --git a/src/backend_x64/emit_x64_vector.cpp b/src/backend_x64/emit_x64_vector.cpp
index 68c2e53c..2621a6a7 100644
--- a/src/backend_x64/emit_x64_vector.cpp
+++ b/src/backend_x64/emit_x64_vector.cpp
@@ -53,6 +53,170 @@ void EmitX64<JST>::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) {
     EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pand);
 }
 
+template <typename JST>
+void EmitX64<JST>::EmitVectorLowerPairedAdd8(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+    Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    code->punpcklqdq(xmm_a, xmm_b);
+    code->movdqa(tmp, xmm_a);
+    code->psllw(xmm_a, 8);
+    code->paddw(xmm_a, tmp);
+    code->pxor(tmp, tmp);
+    code->psrlw(xmm_a, 8);
+    code->packuswb(xmm_a, tmp);
+
+    ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+template <typename JST>
+void EmitX64<JST>::EmitVectorLowerPairedAdd16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+    Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    code->punpcklqdq(xmm_a, xmm_b);
+    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
+        code->pxor(tmp, tmp);
+        code->phaddw(xmm_a, tmp);
+    } else {
+        code->movdqa(tmp, xmm_a);
+        code->pslld(xmm_a, 16);
+        code->paddd(xmm_a, tmp);
+        code->pxor(tmp, tmp);
+        code->psrad(xmm_a, 16);
+        code->packssdw(xmm_a, tmp); // Note: packusdw is SSE4.1, hence the arithmetic shift above.
+    }
+
+    ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+template <typename JST>
+void EmitX64<JST>::EmitVectorLowerPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+    Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    code->punpcklqdq(xmm_a, xmm_b);
+    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
+        code->pxor(tmp, tmp);
+        code->phaddd(xmm_a, tmp);
+    } else {
+        code->movdqa(tmp, xmm_a);
+        code->psllq(xmm_a, 32);
+        code->paddq(xmm_a, tmp);
+        code->psrlq(xmm_a, 32);
+        code->pshufd(xmm_a, xmm_a, 0b11011000);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
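
For reference, the punpcklqdq/psllw/paddw/psrlw/packuswb sequence above is an SSE2-only way of producing pairwise byte sums: the low 64 bits of each operand are summed pair-by-pair, the results are packed into the low 64 bits of the output, and the upper half is zeroed. A minimal scalar sketch of that assumed behaviour follows; the Vec128 alias and the function name are illustrative only and are not part of this change.

    #include <array>
    #include <cstddef>
    #include <cstdint>

    using Vec128 = std::array<std::uint8_t, 16>;  // one 128-bit vector as 16 byte lanes

    Vec128 VectorLowerPairedAdd8Reference(const Vec128& a, const Vec128& b) {
        Vec128 result{};  // the upper 64 bits stay zero
        for (std::size_t i = 0; i < 4; ++i) {
            // Pairwise sums of a's low half, then of b's low half (wrapping modulo 256).
            result[i]     = static_cast<std::uint8_t>(a[2 * i] + a[2 * i + 1]);
            result[i + 4] = static_cast<std::uint8_t>(b[2 * i] + b[2 * i + 1]);
        }
        return result;
    }
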
+
+template <typename JST>
+void EmitX64<JST>::EmitVectorPairedAdd8(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+    Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+    Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm();
+
+    code->movdqa(c, a);
+    code->movdqa(d, b);
+    code->psllw(a, 8);
+    code->psllw(b, 8);
+    code->paddw(a, c);
+    code->paddw(b, d);
+    code->psrlw(a, 8);
+    code->psrlw(b, 8);
+    code->packuswb(a, b);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+template <typename JST>
+void EmitX64<JST>::EmitVectorPairedAdd16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
+        Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+        Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+
+        code->phaddw(a, b);
+
+        ctx.reg_alloc.DefineValue(inst, a);
+    } else {
+        Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+        Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+        Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+        Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm();
+
+        code->movdqa(c, a);
+        code->movdqa(d, b);
+        code->pslld(a, 16);
+        code->pslld(b, 16);
+        code->paddd(a, c);
+        code->paddd(b, d);
+        code->psrad(a, 16);
+        code->psrad(b, 16);
+        code->packssdw(a, b);
+
+        ctx.reg_alloc.DefineValue(inst, a);
+    }
+}
+
+template <typename JST>
+void EmitX64<JST>::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
+        Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+        Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+
+        code->phaddd(a, b);
+
+        ctx.reg_alloc.DefineValue(inst, a);
+    } else {
+        Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+        Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+        Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+        Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm();
+
+        code->movdqa(c, a);
+        code->movdqa(d, b);
+        code->psllq(a, 32);
+        code->psllq(b, 32);
+        code->paddq(a, c);
+        code->paddq(b, d);
+        code->shufps(a, b, 0b11011101);
+
+        ctx.reg_alloc.DefineValue(inst, a);
+    }
+}
+
+template <typename JST>
+void EmitX64<JST>::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+    Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+
+    code->movdqa(c, a);
+    code->punpcklqdq(a, b);
+    code->punpckhqdq(c, b);
+    code->paddq(a, c);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
 } // namespace BackendX64
 } // namespace Dynarmic
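
The full-width variants follow the same idea without zeroing the upper half: both the SSSE3 phaddw/phaddd path and the SSE2 shift-add-shuffle fallback are meant to place the pairwise sums of the first operand in the low half of the result and the pairwise sums of the second operand in the high half. A scalar sketch of that layout for the 32-bit case, assuming lane 0 is the least significant dword (the alias and function name are illustrative, not part of dynarmic):

    #include <array>
    #include <cstdint>

    using Vec4x32 = std::array<std::uint32_t, 4>;  // one 128-bit vector as four 32-bit lanes

    Vec4x32 VectorPairedAdd32Reference(const Vec4x32& a, const Vec4x32& b) {
        // Adjacent pairs of a fill lanes 0-1, adjacent pairs of b fill lanes 2-3;
        // additions wrap modulo 2^32, as phaddd's do.
        return {a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]};
    }
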
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 16b2b77f..97325027 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -656,6 +656,34 @@ U128 IREmitter::VectorAnd(const U128& a, const U128& b) {
     return Inst<U128>(Opcode::VectorAnd, a, b);
 }
 
+U128 IREmitter::VectorLowerPairedAdd8(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorLowerPairedAdd8, a, b);
+}
+
+U128 IREmitter::VectorLowerPairedAdd16(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorLowerPairedAdd16, a, b);
+}
+
+U128 IREmitter::VectorLowerPairedAdd32(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorLowerPairedAdd32, a, b);
+}
+
+U128 IREmitter::VectorPairedAdd8(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorPairedAdd8, a, b);
+}
+
+U128 IREmitter::VectorPairedAdd16(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorPairedAdd16, a, b);
+}
+
+U128 IREmitter::VectorPairedAdd32(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorPairedAdd32, a, b);
+}
+
+U128 IREmitter::VectorPairedAdd64(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorPairedAdd64, a, b);
+}
+
 U32 IREmitter::FPAbs32(const U32& a) {
     return Inst<U32>(Opcode::FPAbs32, a);
 }
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index da49cac7..72f1c5c3 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -188,6 +188,13 @@ public:
     U128 VectorAdd32(const U128& a, const U128& b);
     U128 VectorAdd64(const U128& a, const U128& b);
     U128 VectorAnd(const U128& a, const U128& b);
+    U128 VectorLowerPairedAdd8(const U128& a, const U128& b);
+    U128 VectorLowerPairedAdd16(const U128& a, const U128& b);
+    U128 VectorLowerPairedAdd32(const U128& a, const U128& b);
+    U128 VectorPairedAdd8(const U128& a, const U128& b);
+    U128 VectorPairedAdd16(const U128& a, const U128& b);
+    U128 VectorPairedAdd32(const U128& a, const U128& b);
+    U128 VectorPairedAdd64(const U128& a, const U128& b);
 
     U32 FPAbs32(const U32& a);
     U64 FPAbs64(const U64& a);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 1575b91c..66a51844 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -166,6 +166,13 @@ OPCODE(VectorAdd16,             T::U128,        T::U128,        T::U128
 OPCODE(VectorAdd32,             T::U128,        T::U128,        T::U128         )
 OPCODE(VectorAdd64,             T::U128,        T::U128,        T::U128         )
 OPCODE(VectorAnd,               T::U128,        T::U128,        T::U128         )
+OPCODE(VectorLowerPairedAdd8,   T::U128,        T::U128,        T::U128         )
+OPCODE(VectorLowerPairedAdd16,  T::U128,        T::U128,        T::U128         )
+OPCODE(VectorLowerPairedAdd32,  T::U128,        T::U128,        T::U128         )
+OPCODE(VectorPairedAdd8,        T::U128,        T::U128,        T::U128         )
+OPCODE(VectorPairedAdd16,       T::U128,        T::U128,        T::U128         )
+OPCODE(VectorPairedAdd32,       T::U128,        T::U128,        T::U128         )
+OPCODE(VectorPairedAdd64,       T::U128,        T::U128,        T::U128         )
 
 // Floating-point operations
 OPCODE(FPAbs32,                 T::U32,         T::U32                          )
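
Each OPCODE line above only declares a name, a result type, and the argument types; opcodes.inc is a macro table that other translation units expand. As a rough illustration of how such an X-macro table is commonly consumed (a sketch only, not code copied from dynarmic's opcode machinery; the enum name and include path are assumptions):

    // Hypothetical consumer of the table: generate one enumerator per opcode.
    enum class Opcode {
    #define OPCODE(name, type, ...) name,
    #include "frontend/ir/opcodes.inc"
    #undef OPCODE
        NumOpcodes,
    };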