IR: Implement FPVector{Max,Min}
This commit is contained in:
parent
e76e1186bb
commit
7b03da86c2
4 changed files with 90 additions and 0 deletions
|
@ -502,6 +502,68 @@ void EmitX64::EmitFPVectorGreaterEqual64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
ctx.reg_alloc.DefineValue(inst, b);
|
ctx.reg_alloc.DefineValue(inst, b);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<size_t fsize>
|
||||||
|
static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
|
||||||
|
const Xbyak::Xmm neq_mask = ctx.reg_alloc.ScratchXmm();
|
||||||
|
const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
|
// What we are doing here is handling the case when the inputs are differently signed zeros.
|
||||||
|
// x86-64 treats differently signed zeros as equal while ARM does not.
|
||||||
|
// Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero.
|
||||||
|
|
||||||
|
code.movaps(neq_mask, result);
|
||||||
|
code.movaps(anded, result);
|
||||||
|
FCODE(cmpneqp)(neq_mask, xmm_b);
|
||||||
|
|
||||||
|
code.andps(anded, xmm_b);
|
||||||
|
FCODE(maxp)(result, xmm_b);
|
||||||
|
|
||||||
|
code.andps(result, neq_mask);
|
||||||
|
code.andnps(neq_mask, anded);
|
||||||
|
code.orps(result, neq_mask);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Backend entry point for the FPVectorMax32 IR opcode: forwards to the shared
// EmitFPVectorMax template instantiated with fsize = 32.
void EmitX64::EmitFPVectorMax32(EmitContext& ctx, IR::Inst* inst) {
    EmitFPVectorMax<32>(code, ctx, inst);
}
|
||||||
|
|
||||||
|
// Backend entry point for the FPVectorMax64 IR opcode: forwards to the shared
// EmitFPVectorMax template instantiated with fsize = 64.
void EmitX64::EmitFPVectorMax64(EmitContext& ctx, IR::Inst* inst) {
    EmitFPVectorMax<64>(code, ctx, inst);
}
|
||||||
|
|
||||||
|
template<size_t fsize>
|
||||||
|
static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
|
||||||
|
const Xbyak::Xmm neq_mask = ctx.reg_alloc.ScratchXmm();
|
||||||
|
const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
|
// What we are doing here is handling the case when the inputs are differently signed zeros.
|
||||||
|
// x86-64 treats differently signed zeros as equal while ARM does not.
|
||||||
|
// Thus if we OR together things that x86-64 thinks are equal we'll get the negative zero.
|
||||||
|
|
||||||
|
code.movaps(neq_mask, result);
|
||||||
|
code.movaps(ored, result);
|
||||||
|
FCODE(cmpneqp)(neq_mask, xmm_b);
|
||||||
|
|
||||||
|
code.orps(ored, xmm_b);
|
||||||
|
FCODE(minp)(result, xmm_b);
|
||||||
|
|
||||||
|
code.andps(result, neq_mask);
|
||||||
|
code.andnps(neq_mask, ored);
|
||||||
|
code.orps(result, neq_mask);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Backend entry point for the FPVectorMin32 IR opcode: forwards to the shared
// EmitFPVectorMin template instantiated with fsize = 32.
void EmitX64::EmitFPVectorMin32(EmitContext& ctx, IR::Inst* inst) {
    EmitFPVectorMin<32>(code, ctx, inst);
}
|
||||||
|
|
||||||
|
// Backend entry point for the FPVectorMin64 IR opcode: forwards to the shared
// EmitFPVectorMin template instantiated with fsize = 64.
void EmitX64::EmitFPVectorMin64(EmitContext& ctx, IR::Inst* inst) {
    EmitFPVectorMin<64>(code, ctx, inst);
}
|
||||||
|
|
||||||
// Backend entry point for FPVectorMul32: hands Xbyak's mulps member function to
// EmitThreeOpVectorOperation, instantiated with an fsize of 32 and DefaultIndexer.
void EmitX64::EmitFPVectorMul32(EmitContext& ctx, IR::Inst* inst) {
|
void EmitX64::EmitFPVectorMul32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulps);
|
EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulps);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1699,6 +1699,28 @@ U128 IREmitter::FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b)
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Appends a floating-point vector maximum instruction over operands a and b.
// esize picks the opcode: 32 -> FPVectorMax32, 64 -> FPVectorMax64.
// Any other esize hits UNREACHABLE(); the trailing empty U128 only exists to
// satisfy the return type.
U128 IREmitter::FPVectorMax(size_t esize, const U128& a, const U128& b) {
    switch (esize) {
    case 32:
        return Inst<U128>(Opcode::FPVectorMax32, a, b);
    case 64:
        return Inst<U128>(Opcode::FPVectorMax64, a, b);
    default:
        UNREACHABLE();
        return {};
    }
}
|
||||||
|
|
||||||
|
// Appends a floating-point vector minimum instruction over operands a and b.
// esize picks the opcode: 32 -> FPVectorMin32, 64 -> FPVectorMin64.
// Any other esize hits UNREACHABLE(); the trailing empty U128 only exists to
// satisfy the return type.
U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b) {
    switch (esize) {
    case 32:
        return Inst<U128>(Opcode::FPVectorMin32, a, b);
    case 64:
        return Inst<U128>(Opcode::FPVectorMin64, a, b);
    default:
        UNREACHABLE();
        return {};
    }
}
|
||||||
|
|
||||||
U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b) {
|
U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b) {
|
||||||
switch (esize) {
|
switch (esize) {
|
||||||
case 32:
|
case 32:
|
||||||
|
|
|
@ -301,6 +301,8 @@ public:
|
||||||
U128 FPVectorEqual(size_t esize, const U128& a, const U128& b);
|
U128 FPVectorEqual(size_t esize, const U128& a, const U128& b);
|
||||||
U128 FPVectorGreater(size_t esize, const U128& a, const U128& b);
|
U128 FPVectorGreater(size_t esize, const U128& a, const U128& b);
|
||||||
U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b);
|
U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b);
|
||||||
|
// Floating-point vector maximum of a and b; esize must be 32 or 64 and selects
// the FPVectorMax32 / FPVectorMax64 opcode.
U128 FPVectorMax(size_t esize, const U128& a, const U128& b);
|
||||||
|
// Floating-point vector minimum of a and b; esize must be 32 or 64 and selects
// the FPVectorMin32 / FPVectorMin64 opcode.
U128 FPVectorMin(size_t esize, const U128& a, const U128& b);
|
||||||
U128 FPVectorMul(size_t esize, const U128& a, const U128& b);
|
U128 FPVectorMul(size_t esize, const U128& a, const U128& b);
|
||||||
U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2);
|
U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2);
|
||||||
U128 FPVectorNeg(size_t esize, const U128& a);
|
U128 FPVectorNeg(size_t esize, const U128& a);
|
||||||
|
|
|
@ -443,6 +443,10 @@ OPCODE(FPVectorGreater32, T::U128, T::U128,
|
||||||
OPCODE(FPVectorGreater64, T::U128, T::U128, T::U128 )
|
OPCODE(FPVectorGreater64, T::U128, T::U128, T::U128 )
|
||||||
OPCODE(FPVectorGreaterEqual32, T::U128, T::U128, T::U128 )
|
OPCODE(FPVectorGreaterEqual32, T::U128, T::U128, T::U128 )
|
||||||
OPCODE(FPVectorGreaterEqual64, T::U128, T::U128, T::U128 )
|
OPCODE(FPVectorGreaterEqual64, T::U128, T::U128, T::U128 )
|
||||||
|
// Floating-point vector max/min opcodes for 32- and 64-bit element sizes.
// NOTE(review): the type columns appear to be the result type followed by the
// operand types - confirm against the OPCODE macro definition.
OPCODE(FPVectorMax32, T::U128, T::U128, T::U128 )
|
||||||
|
OPCODE(FPVectorMax64, T::U128, T::U128, T::U128 )
|
||||||
|
OPCODE(FPVectorMin32, T::U128, T::U128, T::U128 )
|
||||||
|
OPCODE(FPVectorMin64, T::U128, T::U128, T::U128 )
|
||||||
OPCODE(FPVectorMul32, T::U128, T::U128, T::U128 )
|
OPCODE(FPVectorMul32, T::U128, T::U128, T::U128 )
|
||||||
OPCODE(FPVectorMul64, T::U128, T::U128, T::U128 )
|
OPCODE(FPVectorMul64, T::U128, T::U128, T::U128 )
|
||||||
OPCODE(FPVectorMulAdd32, T::U128, T::U128, T::U128, T::U128 )
|
OPCODE(FPVectorMulAdd32, T::U128, T::U128, T::U128, T::U128 )
|
||||||
|
|
Loading…
Reference in a new issue