diff --git a/src/backend_x64/emit_x64_vector.cpp b/src/backend_x64/emit_x64_vector.cpp
index ef23f212..17b74679 100644
--- a/src/backend_x64/emit_x64_vector.cpp
+++ b/src/backend_x64/emit_x64_vector.cpp
@@ -205,5 +205,94 @@ void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(inst, a);
 }
 
+void EmitX64::EmitVectorLowerBroadcast8(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
+        Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+        code->pxor(tmp, tmp);
+        code->pshufb(a, tmp);
+        code->movq(a, a);
+    } else {
+        code->punpcklbw(a, a);
+        code->pshuflw(a, a, 0);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorLowerBroadcast16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+    code->pshuflw(a, a, 0);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorLowerBroadcast32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+    code->pshuflw(a, a, 0b01000100);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
+        Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+        code->pxor(tmp, tmp);
+        code->pshufb(a, tmp);
+    } else {
+        code->punpcklbw(a, a);
+        code->pshuflw(a, a, 0);
+        code->punpcklqdq(a, a);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+    code->pshuflw(a, a, 0);
+    code->punpcklqdq(a, a);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+    code->pshufd(a, a, 0);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+    code->punpcklqdq(a, a);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
 } // namespace BackendX64
 } // namespace Dynarmic
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 825e3a57..83794313 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -652,6 +652,34 @@ U128 IREmitter::VectorAnd(const U128& a, const U128& b) {
     return Inst(Opcode::VectorAnd, a, b);
 }
 
+U128 IREmitter::VectorLowerBroadcast8(const U8& a) {
+    return Inst(Opcode::VectorLowerBroadcast8, a);
+}
+
+U128 IREmitter::VectorLowerBroadcast16(const U16& a) {
+    return Inst(Opcode::VectorLowerBroadcast16, a);
+}
+
+U128 IREmitter::VectorLowerBroadcast32(const U32& a) {
+    return Inst(Opcode::VectorLowerBroadcast32, a);
+}
+
+U128 IREmitter::VectorBroadcast8(const U8& a) {
+    return Inst(Opcode::VectorBroadcast8, a);
+}
+
+U128 IREmitter::VectorBroadcast16(const U16& a) {
+    return Inst(Opcode::VectorBroadcast16, a);
+}
+
+U128 IREmitter::VectorBroadcast32(const U32& a) {
+    return Inst(Opcode::VectorBroadcast32, a);
+}
+
+U128 IREmitter::VectorBroadcast64(const U64& a) {
+    return Inst(Opcode::VectorBroadcast64, a);
+}
+
 U128 IREmitter::VectorLowerPairedAdd8(const U128& a, const U128& b) {
     return Inst(Opcode::VectorLowerPairedAdd8, a, b);
 }
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 5f4adc71..e46abcb3 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -182,6 +182,13 @@ public:
     U128 VectorAdd32(const U128& a, const U128& b);
     U128 VectorAdd64(const U128& a, const U128& b);
     U128 VectorAnd(const U128& a, const U128& b);
+    U128 VectorLowerBroadcast8(const U8& a);
+    U128 VectorLowerBroadcast16(const U16& a);
+    U128 VectorLowerBroadcast32(const U32& a);
+    U128 VectorBroadcast8(const U8& a);
+    U128 VectorBroadcast16(const U16& a);
+    U128 VectorBroadcast32(const U32& a);
+    U128 VectorBroadcast64(const U64& a);
     U128 VectorLowerPairedAdd8(const U128& a, const U128& b);
     U128 VectorLowerPairedAdd16(const U128& a, const U128& b);
     U128 VectorLowerPairedAdd32(const U128& a, const U128& b);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 66a51844..90868828 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -166,6 +166,13 @@ OPCODE(VectorAdd16,          T::U128, T::U128, T::U128 )
 OPCODE(VectorAdd32,          T::U128, T::U128, T::U128 )
 OPCODE(VectorAdd64,          T::U128, T::U128, T::U128 )
 OPCODE(VectorAnd,            T::U128, T::U128, T::U128 )
+OPCODE(VectorLowerBroadcast8, T::U128, T::U8           )
+OPCODE(VectorLowerBroadcast16, T::U128, T::U16         )
+OPCODE(VectorLowerBroadcast32, T::U128, T::U32         )
+OPCODE(VectorBroadcast8,     T::U128, T::U8            )
+OPCODE(VectorBroadcast16,    T::U128, T::U16           )
+OPCODE(VectorBroadcast32,    T::U128, T::U32           )
+OPCODE(VectorBroadcast64,    T::U128, T::U64           )
 OPCODE(VectorLowerPairedAdd8, T::U128, T::U128, T::U128 )
 OPCODE(VectorLowerPairedAdd16, T::U128, T::U128, T::U128 )
 OPCODE(VectorLowerPairedAdd32, T::U128, T::U128, T::U128 )