IR: Implement Vector{Lower,}PairedAdd{8,16,32,64}
This commit is contained in:
parent a554e4a329
commit eaf545877a
4 changed files with 206 additions and 0 deletions
@@ -53,6 +53,170 @@ void EmitX64<JST>::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) {
    EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pand);
}

template <typename JST>
void EmitX64<JST>::EmitVectorLowerPairedAdd8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
    Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();

    code->punpcklqdq(xmm_a, xmm_b);
    code->movdqa(tmp, xmm_a);
    code->psllw(xmm_a, 8);
    code->paddw(xmm_a, tmp);
    code->pxor(tmp, tmp);
    code->psrlw(xmm_a, 8);
    code->packuswb(xmm_a, tmp);

    ctx.reg_alloc.DefineValue(inst, xmm_a);
}
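
// A rough scalar model of what the sequence above computes (an illustrative
// sketch, not part of the commit): the low 64 bits of a and b are
// concatenated, adjacent bytes are summed modulo 256, and the upper half of
// the result is left zeroed.
//
//     std::array<u8, 16> result{};  // bytes 8..15 stay zero
//     for (size_t i = 0; i < 8; i++) {
//         // x holds concat(a.bytes[0..7], b.bytes[0..7])
//         result[i] = static_cast<u8>(x[2 * i] + x[2 * i + 1]);
//     }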

template <typename JST>
void EmitX64<JST>::EmitVectorLowerPairedAdd16(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
    Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();

    code->punpcklqdq(xmm_a, xmm_b);
    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
        code->pxor(tmp, tmp);
        code->phaddw(xmm_a, tmp);
    } else {
        code->movdqa(tmp, xmm_a);
        code->pslld(xmm_a, 16);
        code->paddd(xmm_a, tmp);
        code->pxor(tmp, tmp);
        code->psrad(xmm_a, 16);
        code->packssdw(xmm_a, tmp); // Note: packusdw is SSE4.1, hence the arithmetic shift above.
    }

    ctx.reg_alloc.DefineValue(inst, xmm_a);
}
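
// Why the arithmetic shift in the SSE2 fallback: after pslld/paddd each
// pairwise sum sits in the upper 16 bits of a dword. psrad sign-extends it,
// so packssdw's signed saturation reproduces the 16-bit value exactly; the
// unsigned pack that would pair with a logical shift (packusdw) only exists
// from SSE4.1 onwards.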

template <typename JST>
void EmitX64<JST>::EmitVectorLowerPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
    Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();

    code->punpcklqdq(xmm_a, xmm_b);
    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
        code->pxor(tmp, tmp);
        code->phaddd(xmm_a, tmp);
    } else {
        code->movdqa(tmp, xmm_a);
        code->psllq(xmm_a, 32);
        code->paddq(xmm_a, tmp);
        code->psrlq(xmm_a, 32);
        code->pshufd(xmm_a, xmm_a, 0b11011000);
    }

    ctx.reg_alloc.DefineValue(inst, xmm_a);
}
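
// The pshufd immediate 0b11011000 selects dwords {0, 2, 1, 3}: after the
// qword shift/add the pairwise sums live in dwords 0 and 2 with zeroed upper
// halves, so this compacts both sums into the low 64 bits of the result.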

template <typename JST>
void EmitX64<JST>::EmitVectorPairedAdd8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
    Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
    Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
    Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm();

    code->movdqa(c, a);
    code->movdqa(d, b);
    code->psllw(a, 8);
    code->psllw(b, 8);
    code->paddw(a, c);
    code->paddw(b, d);
    code->psrlw(a, 8);
    code->psrlw(b, 8);
    code->packuswb(a, b);

    ctx.reg_alloc.DefineValue(inst, a);
}
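
// The same shift/add byte trick as the lower variant, applied to both full
// operands: after psrlw each word holds a pairwise sum reduced mod 256, i.e.
// a value in [0, 255], so the unsigned-saturating packuswb never clamps and
// simply narrows the sixteen byte sums of a and b into one 128-bit result.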

template <typename JST>
void EmitX64<JST>::EmitVectorPairedAdd16(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
        Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
        Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);

        code->phaddw(a, b);

        ctx.reg_alloc.DefineValue(inst, a);
    } else {
        Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
        Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
        Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
        Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm();

        code->movdqa(c, a);
        code->movdqa(d, b);
        code->pslld(a, 16);
        code->pslld(b, 16);
        code->paddd(a, c);
        code->paddd(b, d);
        code->psrad(a, 16);
        code->psrad(b, 16);
        code->packssdw(a, b);

        ctx.reg_alloc.DefineValue(inst, a);
    }
}
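
// On SSSE3 hardware a single phaddw performs the whole operation: the four
// pairwise word sums of a land in the low half of the result and the four
// pairwise word sums of b in the high half.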

template <typename JST>
void EmitX64<JST>::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
        Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
        Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);

        code->phaddd(a, b);

        ctx.reg_alloc.DefineValue(inst, a);
    } else {
        Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
        Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
        Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
        Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm();

        code->movdqa(c, a);
        code->movdqa(d, b);
        code->psllq(a, 32);
        code->psllq(b, 32);
        code->paddq(a, c);
        code->paddq(b, d);
        code->shufps(a, b, 0b11011101);

        ctx.reg_alloc.DefineValue(inst, a);
    }
}
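
// shufps with immediate 0b11011101 gathers dwords {a[1], a[3], b[1], b[3]}.
// After the qword shift/add, the odd dword of each 64-bit lane holds that
// lane's pairwise sum, so this packs all four 32-bit sums in order.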

template <typename JST>
void EmitX64<JST>::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
    Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
    Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();

    code->movdqa(c, a);
    code->punpcklqdq(a, b);
    code->punpckhqdq(c, b);
    code->paddq(a, c);

    ctx.reg_alloc.DefineValue(inst, a);
}
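
// punpcklqdq/punpckhqdq regroup the operands into {a[0], b[0]} and
// {a[1], b[1]}, so a single paddq yields {a[0] + a[1], b[0] + b[1]},
// the two 64-bit pairwise sums.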

} // namespace BackendX64
} // namespace Dynarmic

@@ -656,6 +656,34 @@ U128 IREmitter::VectorAnd(const U128& a, const U128& b) {
    return Inst<U128>(Opcode::VectorAnd, a, b);
}

U128 IREmitter::VectorLowerPairedAdd8(const U128& a, const U128& b) {
    return Inst<U128>(Opcode::VectorLowerPairedAdd8, a, b);
}

U128 IREmitter::VectorLowerPairedAdd16(const U128& a, const U128& b) {
    return Inst<U128>(Opcode::VectorLowerPairedAdd16, a, b);
}

U128 IREmitter::VectorLowerPairedAdd32(const U128& a, const U128& b) {
    return Inst<U128>(Opcode::VectorLowerPairedAdd32, a, b);
}

U128 IREmitter::VectorPairedAdd8(const U128& a, const U128& b) {
    return Inst<U128>(Opcode::VectorPairedAdd8, a, b);
}

U128 IREmitter::VectorPairedAdd16(const U128& a, const U128& b) {
    return Inst<U128>(Opcode::VectorPairedAdd16, a, b);
}

U128 IREmitter::VectorPairedAdd32(const U128& a, const U128& b) {
    return Inst<U128>(Opcode::VectorPairedAdd32, a, b);
}

U128 IREmitter::VectorPairedAdd64(const U128& a, const U128& b) {
    return Inst<U128>(Opcode::VectorPairedAdd64, a, b);
}

U32 IREmitter::FPAbs32(const U32& a) {
    return Inst<U32>(Opcode::FPAbs32, a);
}
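
// A hypothetical frontend use of the new emitter helpers (an illustration;
// the surrounding visitor shape and operand names are assumptions, not part
// of this commit). An A64 ADDP-style translator could select a width like so:
//
//     const U128 result = esize == 64 ? ir.VectorPairedAdd64(operand1, operand2)
//                       : esize == 32 ? ir.VectorPairedAdd32(operand1, operand2)
//                       : esize == 16 ? ir.VectorPairedAdd16(operand1, operand2)
//                       :               ir.VectorPairedAdd8(operand1, operand2);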

@@ -188,6 +188,13 @@ public:
    U128 VectorAdd32(const U128& a, const U128& b);
    U128 VectorAdd64(const U128& a, const U128& b);
    U128 VectorAnd(const U128& a, const U128& b);
    U128 VectorLowerPairedAdd8(const U128& a, const U128& b);
    U128 VectorLowerPairedAdd16(const U128& a, const U128& b);
    U128 VectorLowerPairedAdd32(const U128& a, const U128& b);
    U128 VectorPairedAdd8(const U128& a, const U128& b);
    U128 VectorPairedAdd16(const U128& a, const U128& b);
    U128 VectorPairedAdd32(const U128& a, const U128& b);
    U128 VectorPairedAdd64(const U128& a, const U128& b);

    U32 FPAbs32(const U32& a);
    U64 FPAbs64(const U64& a);

@@ -166,6 +166,13 @@ OPCODE(VectorAdd16, T::U128, T::U128, T::U128 )
OPCODE(VectorAdd32,             T::U128, T::U128, T::U128 )
OPCODE(VectorAdd64,             T::U128, T::U128, T::U128 )
OPCODE(VectorAnd,               T::U128, T::U128, T::U128 )
OPCODE(VectorLowerPairedAdd8,   T::U128, T::U128, T::U128 )
OPCODE(VectorLowerPairedAdd16,  T::U128, T::U128, T::U128 )
OPCODE(VectorLowerPairedAdd32,  T::U128, T::U128, T::U128 )
OPCODE(VectorPairedAdd8,        T::U128, T::U128, T::U128 )
OPCODE(VectorPairedAdd16,       T::U128, T::U128, T::U128 )
OPCODE(VectorPairedAdd32,       T::U128, T::U128, T::U128 )
OPCODE(VectorPairedAdd64,       T::U128, T::U128, T::U128 )

// Floating-point operations
OPCODE(FPAbs32,                 T::U32,  T::U32           )
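
// Each OPCODE entry lists the result type followed by the argument types:
// all of the new paired-add operations take two U128 vectors and return U128.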