From 089096948a5ce7ca49230082f4d67251c4a1ef1e Mon Sep 17 00:00:00 2001
From: Lioncash
Date: Fri, 4 May 2018 08:22:14 -0400
Subject: [PATCH] ir: Add opcodes for performing halving adds

---
Notes (commentary placed between the three-dash separator and the diffstat;
it is not part of the change itself).

Backend (src/backend_x64/emit_x64_vector.cpp)
  * The byte-wise arithmetic right shift sequence is moved out of
    EmitVectorArithmeticShiftRight8 into a file-local helper,
    ArithmeticShiftRightByte(ctx, code, result, shift_amount). The helper
    obtains its own temporary via ctx.reg_alloc.ScratchXmm() and operates on
    `result` in place. The middle of its instruction sequence lies between
    the first two hunks and is not shown in this patch.
    EmitVectorArithmeticShiftRight8 now fetches its operands, calls the
    helper, and defines the result for `inst`.
  * EmitVectorHalvingAddSigned emits, per lane,
        (a & b) + ((a ^ b) >> 1)
    with an arithmetic shift: ArithmeticShiftRightByte for esize 8, psraw
    for esize 16, psrad for esize 32. The value defined for `inst` is `a`.
  * EmitVectorHalvingAddUnsigned defines `tmp` for `inst`:
      - esize 8 / 16: pavgb / pavgw of the two operands, minus
        ((a ^ b) & 1) per lane. pavg rounds up according to the x86
        instruction reference, so the subtraction presumably converts that
        into a truncating halving add -- confirm against the reference.
      - esize 32: (a & b) + ((a ^ b) >> 1) with a logical shift (psrld).
  * Review notes:
      - Neither backend helper's switch has a default case. For an esize
        other than 8, 16 or 32 no shift/add is emitted, yet a value is still
        defined for `inst`. The IR emitter side reaches UNREACHABLE() for
        such sizes.
      - `b` is obtained with UseScratchXmm in both helpers but only appears
        as a source operand in the code shown. Whether a read-only
        allocation would be sufficient should be confirmed against the
        register allocator's API.

Frontend (src/frontend/ir/)
  * IREmitter::VectorHalvingAddSigned and VectorHalvingAddUnsigned select
    the opcode by esize (8, 16, 32); any other value falls through to
    UNREACHABLE() followed by `return {}`.
  * opcodes.inc gains VectorHalvingAdd{S,U}{8,16,32}. Every new entry lists
    T::U128 three times, matching the neighbouring vector opcodes.

 src/backend_x64/emit_x64_vector.cpp | 103 ++++++++++++++++++++++++++--
 src/frontend/ir/ir_emitter.cpp      |  26 +++++++
 src/frontend/ir/ir_emitter.h        |   2 +
 src/frontend/ir/opcodes.inc         |   6 ++
 4 files changed, 131 insertions(+), 6 deletions(-)

diff --git a/src/backend_x64/emit_x64_vector.cpp b/src/backend_x64/emit_x64_vector.cpp
index d5e34860..8e6b0034 100644
--- a/src/backend_x64/emit_x64_vector.cpp
+++ b/src/backend_x64/emit_x64_vector.cpp
@@ -357,12 +357,8 @@ void EmitX64::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) {
     EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pand);
 }
 
-void EmitX64::EmitVectorArithmeticShiftRight8(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
-    Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
-    const u8 shift_amount = args[1].GetImmediateU8();
+static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const Xbyak::Xmm& result, u8 shift_amount) {
+    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
 
     // TODO: Optimize
     code.movdqa(tmp, result);
@@ -372,6 +368,15 @@ void EmitX64::EmitVectorArithmeticShiftRight8(EmitContext& ctx, IR::Inst* inst)
     code.psllw(result, 8);
     code.psrlw(tmp, 8);
     code.por(result, tmp);
+}
+
+void EmitX64::EmitVectorArithmeticShiftRight8(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const u8 shift_amount = args[1].GetImmediateU8();
+
+    ArithmeticShiftRightByte(ctx, code, result, shift_amount);
 
     ctx.reg_alloc.DefineValue(inst, result);
 }
@@ -758,6 +763,92 @@ void EmitX64::EmitVectorGreaterS64(EmitContext& ctx, IR::Inst* inst) {
     });
 }
 
+static void EmitVectorHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    code.movdqa(tmp, b);
+    code.pand(tmp, a);
+    code.pxor(a, b);
+
+    switch (esize) {
+    case 8:
+        ArithmeticShiftRightByte(ctx, code, a, 1);
+        code.paddb(a, tmp);
+        break;
+    case 16:
+        code.psraw(a, 1);
+        code.paddw(a, tmp);
+        break;
+    case 32:
+        code.psrad(a, 1);
+        code.paddd(a, tmp);
+        break;
+    }
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorHalvingAddS8(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorHalvingAddSigned(8, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingAddS16(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorHalvingAddSigned(16, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingAddS32(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorHalvingAddSigned(32, ctx, inst, code);
+}
+
+static void EmitVectorHalvingAddUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    code.movdqa(tmp, b);
+
+    switch (esize) {
+    case 8:
+        code.pavgb(tmp, a);
+        code.pxor(a, b);
+        code.pand(a, code.MConst(xword, 0x0101010101010101, 0x0101010101010101));
+        code.psubb(tmp, a);
+        break;
+    case 16:
+        code.pavgw(tmp, a);
+        code.pxor(a, b);
+        code.pand(a, code.MConst(xword, 0x0001000100010001, 0x0001000100010001));
+        code.psubw(tmp, a);
+        break;
+    case 32:
+        code.pand(tmp, a);
+        code.pxor(a, b);
+        code.psrld(a, 1);
+        code.paddd(tmp, a);
+        break;
+    }
+
+    ctx.reg_alloc.DefineValue(inst, tmp);
+}
+
+void EmitX64::EmitVectorHalvingAddU8(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorHalvingAddUnsigned(8, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingAddU16(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorHalvingAddUnsigned(16, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingAddU32(EmitContext& ctx, IR::Inst* inst) {
+    EmitVectorHalvingAddUnsigned(32, ctx, inst, code);
+}
+
 static void EmitVectorInterleaveLower(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int size) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 0911d2dc..b100ae04 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -882,6 +882,32 @@ U128 IREmitter::VectorGreaterUnsigned(size_t esize, const U128& a, const U128& b
     return VectorNot(VectorEqual(esize, VectorMinUnsigned(esize, a, b), a));
 }
 
+U128 IREmitter::VectorHalvingAddSigned(size_t esize, const U128& a, const U128& b) {
+    switch (esize) {
+    case 8:
+        return Inst(Opcode::VectorHalvingAddS8, a, b);
+    case 16:
+        return Inst(Opcode::VectorHalvingAddS16, a, b);
+    case 32:
+        return Inst(Opcode::VectorHalvingAddS32, a, b);
+    }
+    UNREACHABLE();
+    return {};
+}
+
+U128 IREmitter::VectorHalvingAddUnsigned(size_t esize, const U128& a, const U128& b) {
+    switch (esize) {
+    case 8:
+        return Inst(Opcode::VectorHalvingAddU8, a, b);
+    case 16:
+        return Inst(Opcode::VectorHalvingAddU16, a, b);
+    case 32:
+        return Inst(Opcode::VectorHalvingAddU32, a, b);
+    }
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::VectorInterleaveLower(size_t esize, const U128& a, const U128& b) {
     switch (esize) {
     case 8:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index c4905af9..acdb3f17 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -206,6 +206,8 @@ public:
     U128 VectorGreaterEqualUnsigned(size_t esize, const U128& a, const U128& b);
     U128 VectorGreaterSigned(size_t esize, const U128& a, const U128& b);
     U128 VectorGreaterUnsigned(size_t esize, const U128& a, const U128& b);
+    U128 VectorHalvingAddSigned(size_t esize, const U128& a, const U128& b);
+    U128 VectorHalvingAddUnsigned(size_t esize, const U128& a, const U128& b);
     U128 VectorInterleaveLower(size_t esize, const U128& a, const U128& b);
     U128 VectorInterleaveUpper(size_t esize, const U128& a, const U128& b);
     U128 VectorLessEqualSigned(size_t esize, const U128& a, const U128& b);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index cb70f62a..7689b4ae 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -251,6 +251,12 @@ OPCODE(VectorGreaterS8, T::U128, T::U128, T::U
 OPCODE(VectorGreaterS16, T::U128, T::U128, T::U128 )
 OPCODE(VectorGreaterS32, T::U128, T::U128, T::U128 )
 OPCODE(VectorGreaterS64, T::U128, T::U128, T::U128 )
+OPCODE(VectorHalvingAddS8, T::U128, T::U128, T::U128 )
+OPCODE(VectorHalvingAddS16, T::U128, T::U128, T::U128 )
+OPCODE(VectorHalvingAddS32, T::U128, T::U128, T::U128 )
+OPCODE(VectorHalvingAddU8, T::U128, T::U128, T::U128 )
+OPCODE(VectorHalvingAddU16, T::U128, T::U128, T::U128 )
+OPCODE(VectorHalvingAddU32, T::U128, T::U128, T::U128 )
 OPCODE(VectorInterleaveLower8, T::U128, T::U128, T::U128 )
 OPCODE(VectorInterleaveLower16, T::U128, T::U128, T::U128 )
 OPCODE(VectorInterleaveLower32, T::U128, T::U128, T::U128 )