diff --git a/src/backend_x64/emit_x64_vector_floating_point.cpp b/src/backend_x64/emit_x64_vector_floating_point.cpp
index 53a638c0..a977be41 100644
--- a/src/backend_x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend_x64/emit_x64_vector_floating_point.cpp
@@ -502,6 +502,68 @@ void EmitX64::EmitFPVectorGreaterEqual64(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(inst, b);
 }
 
+template <size_t fsize>
+static void EmitFPVectorMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
+        const Xbyak::Xmm neq_mask = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Xmm anded = ctx.reg_alloc.ScratchXmm();
+
+        // What we are doing here is handling the case when the inputs are differently signed zeros.
+        // x86-64 treats differently signed zeros as equal while ARM does not.
+        // Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero.
+
+        code.movaps(neq_mask, result);
+        code.movaps(anded, result);
+        FCODE(cmpneqp)(neq_mask, xmm_b);
+
+        code.andps(anded, xmm_b);
+        FCODE(maxp)(result, xmm_b);
+
+        code.andps(result, neq_mask);
+        code.andnps(neq_mask, anded);
+        code.orps(result, neq_mask);
+    });
+}
+
+void EmitX64::EmitFPVectorMax32(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPVectorMax<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMax64(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPVectorMax<64>(code, ctx, inst);
+}
+
+template <size_t fsize>
+static void EmitFPVectorMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    EmitThreeOpVectorOperation<fsize, DefaultIndexer>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_b){
+        const Xbyak::Xmm neq_mask = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Xmm ored = ctx.reg_alloc.ScratchXmm();
+
+        // What we are doing here is handling the case when the inputs are differently signed zeros.
+        // x86-64 treats differently signed zeros as equal while ARM does not.
+        // Thus if we OR together things that x86-64 thinks are equal we'll get the negative zero.
+
+        code.movaps(neq_mask, result);
+        code.movaps(ored, result);
+        FCODE(cmpneqp)(neq_mask, xmm_b);
+
+        code.orps(ored, xmm_b);
+        FCODE(minp)(result, xmm_b);
+
+        code.andps(result, neq_mask);
+        code.andnps(neq_mask, ored);
+        code.orps(result, neq_mask);
+    });
+}
+
+void EmitX64::EmitFPVectorMin32(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPVectorMin<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMin64(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPVectorMin<64>(code, ctx, inst);
+}
+
 void EmitX64::EmitFPVectorMul32(EmitContext& ctx, IR::Inst* inst) {
     EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulps);
 }
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 41676fe1..e5cd6a72 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1699,6 +1699,28 @@ U128 IREmitter::FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b)
     return {};
 }
 
+U128 IREmitter::FPVectorMax(size_t esize, const U128& a, const U128& b) {
+    switch (esize) {
+    case 32:
+        return Inst<U128>(Opcode::FPVectorMax32, a, b);
+    case 64:
+        return Inst<U128>(Opcode::FPVectorMax64, a, b);
+    }
+    UNREACHABLE();
+    return {};
+}
+
+U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b) {
+    switch (esize) {
+    case 32:
+        return Inst<U128>(Opcode::FPVectorMin32, a, b);
+    case 64:
+        return Inst<U128>(Opcode::FPVectorMin64, a, b);
+    }
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b) {
     switch (esize) {
     case 32:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index fef3a97a..9eb575ce 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -301,6 +301,8 @@ public:
     U128 FPVectorEqual(size_t esize, const U128& a, const U128& b);
     U128 FPVectorGreater(size_t esize, const U128& a, const U128& b);
     U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b);
+    U128 FPVectorMax(size_t esize, const U128& a, const U128& b);
+    U128 FPVectorMin(size_t esize, const U128& a, const U128& b);
     U128 FPVectorMul(size_t esize, const U128& a, const U128& b);
     U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2);
     U128 FPVectorNeg(size_t esize, const U128& a);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 8ee80f97..eccd21d3 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -443,6 +443,10 @@ OPCODE(FPVectorGreater32,                                   T::U128,           T
 OPCODE(FPVectorGreater64,                                   T::U128,           T::U128, T::U128                         )
 OPCODE(FPVectorGreaterEqual32,                              T::U128,           T::U128, T::U128                         )
 OPCODE(FPVectorGreaterEqual64,                              T::U128,           T::U128, T::U128                         )
+OPCODE(FPVectorMax32,                                       T::U128,           T::U128, T::U128                         )
+OPCODE(FPVectorMax64,                                       T::U128,           T::U128, T::U128                         )
+OPCODE(FPVectorMin32,                                       T::U128,           T::U128, T::U128                         )
+OPCODE(FPVectorMin64,                                       T::U128,           T::U128, T::U128                         )
 OPCODE(FPVectorMul32,                                       T::U128,           T::U128, T::U128                         )
 OPCODE(FPVectorMul64,                                       T::U128,           T::U128, T::U128                         )
 OPCODE(FPVectorMulAdd32,                                    T::U128,           T::U128, T::U128, T::U128                )