emit_x64_floating_point: Fix FP{Max,Min} when FPCR.DN = 1
This commit is contained in:
parent
21a28c2545
commit
7f27945411
1 changed files with 134 additions and 47 deletions
|
@ -51,6 +51,19 @@ constexpr u64 f64_max_s64_lim = 0x43e0000000000000u; // 2^63 as a double (actual
|
||||||
constexpr u64 f64_min_u64 = 0x0000000000000000u; // 0 as a double
|
constexpr u64 f64_min_u64 = 0x0000000000000000u; // 0 as a double
|
||||||
constexpr u64 f64_max_u64_lim = 0x43f0000000000000u; // 2^64 as a double (actual maximum unrepresentable)
|
constexpr u64 f64_max_u64_lim = 0x43f0000000000000u; // 2^64 as a double (actual maximum unrepresentable)
|
||||||
|
|
||||||
|
template<size_t fsize, typename T>
|
||||||
|
static T ChooseOnFsize([[maybe_unused]] T f32, [[maybe_unused]] T f64) {
|
||||||
|
static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
|
||||||
|
|
||||||
|
if constexpr (fsize == 32) {
|
||||||
|
return f32;
|
||||||
|
} else {
|
||||||
|
return f64;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define FCODE(NAME) (code.*ChooseOnFsize<fsize>(&Xbyak::CodeGenerator::NAME##s, &Xbyak::CodeGenerator::NAME##d))
|
||||||
|
|
||||||
static void DenormalsAreZero32(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Reg32 gpr_scratch) {
|
static void DenormalsAreZero32(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Reg32 gpr_scratch) {
|
||||||
Xbyak::Label end;
|
Xbyak::Label end;
|
||||||
|
|
||||||
|
@ -369,6 +382,15 @@ static void FPThreeOp64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Fun
|
||||||
FPThreeOp64(code, ctx, inst, nullptr, fn);
|
FPThreeOp64(code, ctx, inst, nullptr, fn);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <size_t fsize, typename Function>
|
||||||
|
static void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
|
||||||
|
if constexpr (fsize == 32) {
|
||||||
|
FPThreeOp32(code, ctx, inst, fn);
|
||||||
|
} else {
|
||||||
|
FPThreeOp64(code, ctx, inst, fn);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template <typename Function>
|
template <typename Function>
|
||||||
static void FPTwoOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
|
static void FPTwoOp32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
@ -553,44 +575,69 @@ void EmitX64::EmitFPDiv64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
FPThreeOp64(code, ctx, inst, &Xbyak::CodeGenerator::divsd);
|
FPThreeOp64(code, ctx, inst, &Xbyak::CodeGenerator::divsd);
|
||||||
}
|
}
|
||||||
|
|
||||||
void EmitX64::EmitFPMax32(EmitContext& ctx, IR::Inst* inst) {
|
template<size_t fsize>
|
||||||
FPThreeOp32(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand){
|
void EmitFPMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
Xbyak::Label normal, end;
|
if (ctx.FPSCR_DN()) {
|
||||||
code.ucomiss(result, operand);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
code.jnz(normal);
|
|
||||||
if (!ctx.AccurateNaN()) {
|
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
Xbyak::Label notnan;
|
const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||||
code.jnp(notnan);
|
|
||||||
code.addss(result, operand);
|
if (ctx.FPSCR_FTZ()) {
|
||||||
code.jmp(end);
|
if constexpr (fsize == 32) {
|
||||||
code.L(notnan);
|
const Xbyak::Reg32 gpr_scratch = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
DenormalsAreZero32(code, result, gpr_scratch);
|
||||||
|
DenormalsAreZero32(code, operand, gpr_scratch);
|
||||||
|
} else {
|
||||||
|
const Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr().cvt64();
|
||||||
|
DenormalsAreZero64(code, result, gpr_scratch);
|
||||||
|
DenormalsAreZero64(code, operand, gpr_scratch);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Xbyak::Label equal, end, nan;
|
||||||
|
|
||||||
|
FCODE(ucomis)(result, operand);
|
||||||
|
code.jz(equal, code.T_NEAR);
|
||||||
|
FCODE(maxs)(result, operand);
|
||||||
|
code.L(end);
|
||||||
|
|
||||||
|
code.SwitchToFarCode();
|
||||||
|
|
||||||
|
code.L(equal);
|
||||||
|
code.jp(nan);
|
||||||
code.andps(result, operand);
|
code.andps(result, operand);
|
||||||
code.jmp(end);
|
code.jmp(end);
|
||||||
code.L(normal);
|
|
||||||
code.maxss(result, operand);
|
code.L(nan);
|
||||||
|
code.movaps(result, code.MConst(xword, fsize == 32 ? f32_nan : f64_nan));
|
||||||
|
code.jmp(end);
|
||||||
|
|
||||||
|
code.SwitchToNearCode();
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
FPThreeOp<fsize>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand){
|
||||||
|
Xbyak::Label equal, end;
|
||||||
|
FCODE(ucomis)(result, operand);
|
||||||
|
code.jz(equal);
|
||||||
|
FCODE(maxs)(result, operand);
|
||||||
|
code.jmp(end);
|
||||||
|
code.L(equal);
|
||||||
|
code.andps(result, operand);
|
||||||
code.L(end);
|
code.L(end);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void EmitX64::EmitFPMax32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
EmitFPMax<32>(code, ctx, inst);
|
||||||
|
}
|
||||||
|
|
||||||
void EmitX64::EmitFPMax64(EmitContext& ctx, IR::Inst* inst) {
|
void EmitX64::EmitFPMax64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
FPThreeOp64(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand){
|
EmitFPMax<64>(code, ctx, inst);
|
||||||
Xbyak::Label normal, end;
|
|
||||||
code.ucomisd(result, operand);
|
|
||||||
code.jnz(normal);
|
|
||||||
if (!ctx.AccurateNaN()) {
|
|
||||||
Xbyak::Label notnan;
|
|
||||||
code.jnp(notnan);
|
|
||||||
code.addsd(result, operand);
|
|
||||||
code.jmp(end);
|
|
||||||
code.L(notnan);
|
|
||||||
}
|
|
||||||
code.andps(result, operand);
|
|
||||||
code.jmp(end);
|
|
||||||
code.L(normal);
|
|
||||||
code.maxsd(result, operand);
|
|
||||||
code.L(end);
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void EmitX64::EmitFPMaxNumeric32(EmitContext& ctx, IR::Inst* inst) {
|
void EmitX64::EmitFPMaxNumeric32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
@ -661,30 +708,70 @@ void EmitX64::EmitFPMaxNumeric64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
}, &Xbyak::CodeGenerator::maxsd);
|
}, &Xbyak::CodeGenerator::maxsd);
|
||||||
}
|
}
|
||||||
|
|
||||||
void EmitX64::EmitFPMin32(EmitContext& ctx, IR::Inst* inst) {
|
|
||||||
FPThreeOp32(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand){
|
template<size_t fsize>
|
||||||
Xbyak::Label normal, end;
|
void EmitFPMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.ucomiss(result, operand);
|
if (ctx.FPSCR_DN()) {
|
||||||
code.jnz(normal);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
|
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
|
const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||||
|
|
||||||
|
if (ctx.FPSCR_FTZ()) {
|
||||||
|
if constexpr (fsize == 32) {
|
||||||
|
const Xbyak::Reg32 gpr_scratch = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
DenormalsAreZero32(code, result, gpr_scratch);
|
||||||
|
DenormalsAreZero32(code, operand, gpr_scratch);
|
||||||
|
} else {
|
||||||
|
const Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr().cvt64();
|
||||||
|
DenormalsAreZero64(code, result, gpr_scratch);
|
||||||
|
DenormalsAreZero64(code, operand, gpr_scratch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Xbyak::Label equal, end, nan;
|
||||||
|
|
||||||
|
FCODE(ucomis)(result, operand);
|
||||||
|
code.jz(equal, code.T_NEAR);
|
||||||
|
FCODE(mins)(result, operand);
|
||||||
|
code.L(end);
|
||||||
|
|
||||||
|
code.SwitchToFarCode();
|
||||||
|
|
||||||
|
code.L(equal);
|
||||||
|
code.jp(nan);
|
||||||
code.orps(result, operand);
|
code.orps(result, operand);
|
||||||
code.jmp(end);
|
code.jmp(end);
|
||||||
code.L(normal);
|
|
||||||
code.minss(result, operand);
|
code.L(nan);
|
||||||
|
code.movaps(result, code.MConst(xword, fsize == 32 ? f32_nan : f64_nan));
|
||||||
|
code.jmp(end);
|
||||||
|
|
||||||
|
code.SwitchToNearCode();
|
||||||
|
|
||||||
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
FPThreeOp<fsize>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand){
|
||||||
|
Xbyak::Label equal, end;
|
||||||
|
FCODE(ucomis)(result, operand);
|
||||||
|
code.jz(equal);
|
||||||
|
FCODE(mins)(result, operand);
|
||||||
|
code.jmp(end);
|
||||||
|
code.L(equal);
|
||||||
|
code.orps(result, operand);
|
||||||
code.L(end);
|
code.L(end);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void EmitX64::EmitFPMin32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
EmitFPMin<32>(code, ctx, inst);
|
||||||
|
}
|
||||||
|
|
||||||
void EmitX64::EmitFPMin64(EmitContext& ctx, IR::Inst* inst) {
|
void EmitX64::EmitFPMin64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
FPThreeOp64(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand){
|
EmitFPMin<64>(code, ctx, inst);
|
||||||
Xbyak::Label normal, end;
|
|
||||||
code.ucomisd(result, operand);
|
|
||||||
code.jnz(normal);
|
|
||||||
code.orps(result, operand);
|
|
||||||
code.jmp(end);
|
|
||||||
code.L(normal);
|
|
||||||
code.minsd(result, operand);
|
|
||||||
code.L(end);
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue