diff --git a/src/backend_x64/emit_x64_floating_point.cpp b/src/backend_x64/emit_x64_floating_point.cpp index 5b45d5a0..8781c236 100644 --- a/src/backend_x64/emit_x64_floating_point.cpp +++ b/src/backend_x64/emit_x64_floating_point.cpp @@ -105,8 +105,8 @@ static void ZeroIfNaN64(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_ code.pand(xmm_value, xmm_scratch); } -static Xbyak::Label PreProcessNaNs32(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b) { - Xbyak::Label nan, end; +static void PreProcessNaNs32(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Label& end) { + Xbyak::Label nan; code.ucomiss(a, b); code.jp(nan, code.T_NEAR); @@ -128,7 +128,6 @@ static Xbyak::Label PreProcessNaNs32(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm code.jmp(end, code.T_NEAR); code.SwitchToNearCode(); - return end; } static void PostProcessNaNs32(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) { @@ -146,8 +145,8 @@ static void DefaultNaN32(BlockOfCode& code, Xbyak::Xmm xmm_value) { code.L(end); } -static Xbyak::Label PreProcessNaNs64(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b) { - Xbyak::Label nan, end; +static void PreProcessNaNs64(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Label& end) { + Xbyak::Label nan; code.ucomisd(a, b); code.jp(nan, code.T_NEAR); @@ -167,7 +166,6 @@ static Xbyak::Label PreProcessNaNs64(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm code.jmp(end, code.T_NEAR); code.SwitchToNearCode(); - return end; } static void PostProcessNaNs64(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) { @@ -215,8 +213,8 @@ static Xbyak::Label ProcessNaN64(BlockOfCode& code, Xbyak::Xmm a) { return end; } -template <typename Function> -static void FPThreeOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { +template <typename PreprocessFunction, typename Function> +static void FPThreeOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, PreprocessFunction preprocess, Function fn) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); Xbyak::Label end; @@ -225,12 +223,15 @@ static void 
FPThreeOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Fun Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]); Xbyak::Reg32 gpr_scratch = ctx.reg_alloc.ScratchGpr().cvt32(); + if constexpr(!std::is_same_v<PreprocessFunction, std::nullptr_t>) { + preprocess(result, operand, gpr_scratch, end); + } if (ctx.FPSCR_FTZ()) { DenormalsAreZero32(code, result, gpr_scratch); DenormalsAreZero32(code, operand, gpr_scratch); } if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) { - end = PreProcessNaNs32(code, result, operand); + PreProcessNaNs32(code, result, operand, end); } if constexpr (std::is_member_function_pointer_v<Function>) { (code.*fn)(result, operand); @@ -250,8 +251,8 @@ static void FPThreeOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Fun ctx.reg_alloc.DefineValue(inst, result); } -template <typename Function> -static void FPThreeOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { +template <typename PreprocessFunction, typename Function> +static void FPThreeOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, PreprocessFunction preprocess, Function fn) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); Xbyak::Label end; @@ -260,12 +261,15 @@ static void FPThreeOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Fun Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]); Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr(); + if constexpr(!std::is_same_v<PreprocessFunction, std::nullptr_t>) { + preprocess(result, operand, gpr_scratch, end); + } if (ctx.FPSCR_FTZ()) { DenormalsAreZero64(code, result, gpr_scratch); DenormalsAreZero64(code, operand, gpr_scratch); } if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) { - end = PreProcessNaNs64(code, result, operand); + PreProcessNaNs64(code, result, operand, end); } if constexpr (std::is_member_function_pointer_v<Function>) { (code.*fn)(result, operand); @@ -285,6 +289,16 @@ static void FPThreeOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Fun ctx.reg_alloc.DefineValue(inst, result); } +template <typename Function> +static void FPThreeOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { + FPThreeOp32(code, ctx, inst, 
nullptr, fn); +} + +template <typename Function> +static void FPThreeOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { + FPThreeOp64(code, ctx, inst, nullptr, fn); +} + template <typename Function> static void FPTwoOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); @@ -443,6 +457,74 @@ void EmitX64::EmitFPMax64(EmitContext& ctx, IR::Inst* inst) { }); } +void EmitX64::EmitFPMaxNumeric32(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp32(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand, Xbyak::Reg32 scratch, Xbyak::Label& end){ + Xbyak::Label normal, normal_or_equal, result_is_result; + + code.ucomiss(result, operand); + code.jnp(normal_or_equal); + // If operand == QNaN, result = result. + code.movd(scratch, operand); + code.shl(scratch, 1); + code.cmp(scratch, 0xff800000u); + code.jae(result_is_result); + // If operand == SNaN, let usual NaN code handle it. + code.cmp(scratch, 0xff000000u); + code.ja(normal); + // If result == SNaN && operand != NaN, result = result. + code.movd(scratch, result); + code.shl(scratch, 1); + code.cmp(scratch, 0xff800000u); + code.jnae(result_is_result); + // If result == QNaN && operand != NaN, result = operand. + code.movaps(result, operand); + code.jmp(end); + + code.L(result_is_result); + code.movaps(operand, result); + code.jmp(normal); + + code.L(normal_or_equal); + code.jnz(normal); + code.andps(operand, result); + code.L(normal); + }, &Xbyak::CodeGenerator::maxss); +} + +void EmitX64::EmitFPMaxNumeric64(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp64(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand, Xbyak::Reg64 scratch, Xbyak::Label& end){ + Xbyak::Label normal, normal_or_equal, result_is_result; + + code.ucomisd(result, operand); + code.jnp(normal_or_equal); + // If operand == QNaN, result = result. 
+ code.movq(scratch, operand); + code.shl(scratch, 1); + code.cmp(scratch, code.MConst(qword, 0xfff0'0000'0000'0000u)); + code.jae(result_is_result); + // If operand == SNaN, let usual NaN code handle it. + code.cmp(scratch, code.MConst(qword, 0xffe0'0000'0000'0000u)); + code.ja(normal); + // If result == SNaN && operand != NaN, result = result. + code.movq(scratch, result); + code.shl(scratch, 1); + code.cmp(scratch, code.MConst(qword, 0xfff0'0000'0000'0000u)); + code.jnae(result_is_result); + // If result == QNaN && operand != NaN, result = operand. + code.movaps(result, operand); + code.jmp(end); + + code.L(result_is_result); + code.movaps(operand, result); + code.jmp(normal); + + code.L(normal_or_equal); + code.jnz(normal); + code.andps(operand, result); + code.L(normal); + }, &Xbyak::CodeGenerator::maxsd); +} + void EmitX64::EmitFPMin32(EmitContext& ctx, IR::Inst* inst) { FPThreeOp32(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand){ Xbyak::Label normal, end; diff --git a/src/frontend/A64/decoder/a64.inc b/src/frontend/A64/decoder/a64.inc index e280b88f..2d837094 100644 --- a/src/frontend/A64/decoder/a64.inc +++ b/src/frontend/A64/decoder/a64.inc @@ -927,7 +927,7 @@ INST(FADD_float, "FADD (scalar)", "00011 INST(FSUB_float, "FSUB (scalar)", "00011110yy1mmmmm001110nnnnnddddd") INST(FMAX_float, "FMAX (scalar)", "00011110yy1mmmmm010010nnnnnddddd") INST(FMIN_float, "FMIN (scalar)", "00011110yy1mmmmm010110nnnnnddddd") -//INST(FMAXNM_float, "FMAXNM (scalar)", "00011110yy1mmmmm011010nnnnnddddd") +INST(FMAXNM_float, "FMAXNM (scalar)", "00011110yy1mmmmm011010nnnnnddddd") //INST(FMINNM_float, "FMINNM (scalar)", "00011110yy1mmmmm011110nnnnnddddd") INST(FNMUL_float, "FNMUL (scalar)", "00011110yy1mmmmm100010nnnnnddddd") diff --git a/src/frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp b/src/frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp index 13b48c7c..833e868d 100644 --- 
a/src/frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp +++ b/src/frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp @@ -113,6 +113,21 @@ bool TranslatorVisitor::FMIN_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) { return true; } +bool TranslatorVisitor::FMAXNM_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) { + auto datasize = GetDataSize(type); + if (!datasize) { + return UnallocatedEncoding(); + } + + const IR::U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U32U64 operand2 = V_scalar(*datasize, Vm); + + const IR::U32U64 result = ir.FPMaxNumeric(operand1, operand2, true); + + V_scalar(*datasize, Vd, result); + return true; +} + bool TranslatorVisitor::FNMUL_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) { auto datasize = GetDataSize(type); if (!datasize) { diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp index 1b59337e..8b8334b3 100644 --- a/src/frontend/ir/ir_emitter.cpp +++ b/src/frontend/ir/ir_emitter.cpp @@ -1185,6 +1185,16 @@ U32U64 IREmitter::FPMax(const U32U64& a, const U32U64& b, bool fpscr_controlled) } } +U32U64 IREmitter::FPMaxNumeric(const U32U64& a, const U32U64& b, bool fpscr_controlled) { + ASSERT(fpscr_controlled); + ASSERT(a.GetType() == b.GetType()); + if (a.GetType() == Type::U32) { + return Inst<U32>(Opcode::FPMaxNumeric32, a, b); + } else { + return Inst<U64>(Opcode::FPMaxNumeric64, a, b); + } +} + U32U64 IREmitter::FPMin(const U32U64& a, const U32U64& b, bool fpscr_controlled) { ASSERT(fpscr_controlled); ASSERT(a.GetType() == b.GetType()); diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h index 81a91d57..4d25c62b 100644 --- a/src/frontend/ir/ir_emitter.h +++ b/src/frontend/ir/ir_emitter.h @@ -247,6 +247,7 @@ public: NZCV FPCompare(const U32U64& a, const U32U64& b, bool exc_on_qnan, bool fpscr_controlled); U32U64 FPDiv(const U32U64& a, const U32U64& b, bool fpscr_controlled); U32U64 FPMax(const U32U64& a, const U32U64& b, bool fpscr_controlled); + U32U64 
FPMaxNumeric(const U32U64& a, const U32U64& b, bool fpscr_controlled); U32U64 FPMin(const U32U64& a, const U32U64& b, bool fpscr_controlled); U32U64 FPMul(const U32U64& a, const U32U64& b, bool fpscr_controlled); U32U64 FPNeg(const U32U64& a); diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc index 28192680..d75de2f6 100644 --- a/src/frontend/ir/opcodes.inc +++ b/src/frontend/ir/opcodes.inc @@ -300,6 +300,8 @@ OPCODE(FPDiv32, T::U32, T::U32, T::U32 OPCODE(FPDiv64, T::U64, T::U64, T::U64 ) OPCODE(FPMax32, T::U32, T::U32, T::U32 ) OPCODE(FPMax64, T::U64, T::U64, T::U64 ) +OPCODE(FPMaxNumeric32, T::U32, T::U32, T::U32 ) +OPCODE(FPMaxNumeric64, T::U64, T::U64, T::U64 ) OPCODE(FPMin32, T::U32, T::U32, T::U32 ) OPCODE(FPMin64, T::U64, T::U64, T::U64 ) OPCODE(FPMul32, T::U32, T::U32, T::U32 )