diff --git a/src/backend_x64/emit_x64.cpp b/src/backend_x64/emit_x64.cpp
index 2add288e..cd01bbb3 100644
--- a/src/backend_x64/emit_x64.cpp
+++ b/src/backend_x64/emit_x64.cpp
@@ -1300,13 +1300,10 @@ static void DefaultNaN64(BlockOfCode* code, Xbyak::Xmm xmm_value) {
     code->L(end);
 }
 
-static void ZeroIfNaN64(BlockOfCode* code, Xbyak::Xmm xmm_value) {
-    Xbyak::Label end;
-
-    code->ucomisd(xmm_value, xmm_value);
-    code->jnp(end);
-    code->pxor(xmm_value, xmm_value);
-    code->L(end);
+static void ZeroIfNaN64(BlockOfCode* code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch) {
+    code->pxor(xmm_scratch, xmm_scratch);
+    code->cmpordsd(xmm_scratch, xmm_value); // true mask when ordered (i.e.: when not an NaN)
+    code->pand(xmm_value, xmm_scratch);
 }
 
 static void FPThreeOp32(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
@@ -1532,6 +1529,7 @@ void EmitX64::EmitFPSingleToS32(IR::Block& block, IR::Inst* inst) {
     Xbyak::Xmm from = reg_alloc.UseScratchXmm(a);
     Xbyak::Xmm to = reg_alloc.DefXmm(inst);
     Xbyak::Reg32 gpr_scratch = reg_alloc.ScratchGpr().cvt32();
+    Xbyak::Xmm xmm_scratch = reg_alloc.ScratchXmm();
 
     // ARM saturates on conversion; this differs from x64 which returns a sentinel value.
     // Conversion to double is lossless, and allows for clamping.
@@ -1547,7 +1545,7 @@ void EmitX64::EmitFPSingleToS32(IR::Block& block, IR::Inst* inst) {
         code->cvtsd2si(gpr_scratch, from); // 32 bit gpr
     }
     // Clamp to output range
-    ZeroIfNaN64(code, from);
+    ZeroIfNaN64(code, from, xmm_scratch);
     code->minsd(from, code->MFloatMaxS32());
     code->maxsd(from, code->MFloatMinS32());
     // Second time is for real
@@ -1566,6 +1564,7 @@ void EmitX64::EmitFPSingleToU32(IR::Block& block, IR::Inst* inst) {
     Xbyak::Xmm from = reg_alloc.UseScratchXmm(a);
     Xbyak::Xmm to = reg_alloc.DefXmm(inst);
     Xbyak::Reg32 gpr_scratch = reg_alloc.ScratchGpr().cvt32();
+    Xbyak::Xmm xmm_scratch = reg_alloc.ScratchXmm();
 
     // ARM saturates on conversion; this differs from x64 which returns a sentinel value.
     // Conversion to double is lossless, and allows for accurate clamping.
@@ -1579,7 +1578,7 @@ void EmitX64::EmitFPSingleToU32(IR::Block& block, IR::Inst* inst) {
             DenormalsAreZero32(code, from, gpr_scratch);
         }
         code->cvtss2sd(from, from);
-        ZeroIfNaN64(code, from);
+        ZeroIfNaN64(code, from, xmm_scratch);
         // Bring into SSE range
         code->addsd(from, code->MFloatMinS32());
         // First time is to set flags
@@ -1600,7 +1599,7 @@ void EmitX64::EmitFPSingleToU32(IR::Block& block, IR::Inst* inst) {
             DenormalsAreZero32(code, from, gpr_scratch);
         }
         code->cvtss2sd(from, from);
-        ZeroIfNaN64(code, from);
+        ZeroIfNaN64(code, from, xmm_scratch);
         // Generate masks if out-of-signed-range
         code->movaps(xmm_mask, code->MFloatMaxS32());
         code->cmpltsd(xmm_mask, from);
@@ -1629,6 +1628,7 @@ void EmitX64::EmitFPDoubleToS32(IR::Block& block, IR::Inst* inst) {
     Xbyak::Xmm from = reg_alloc.UseScratchXmm(a);
     Xbyak::Xmm to = reg_alloc.DefXmm(inst);
     Xbyak::Reg32 gpr_scratch = reg_alloc.ScratchGpr().cvt32();
+    Xbyak::Xmm xmm_scratch = reg_alloc.ScratchXmm();
 
     // ARM saturates on conversion; this differs from x64 which returns a sentinel value.
@@ -1642,7 +1642,7 @@ void EmitX64::EmitFPDoubleToS32(IR::Block& block, IR::Inst* inst) {
         code->cvtsd2si(gpr_scratch, from); // 32 bit gpr
     }
     // Clamp to output range
-    ZeroIfNaN64(code, from);
+    ZeroIfNaN64(code, from, xmm_scratch);
     code->minsd(from, code->MFloatMaxS32());
     code->maxsd(from, code->MFloatMinS32());
     // Second time is for real
@@ -1671,7 +1671,7 @@ void EmitX64::EmitFPDoubleToU32(IR::Block& block, IR::Inst* inst) {
         if (block.Location().FPSCR().FTZ()) {
             DenormalsAreZero64(code, from, gpr_scratch.cvt64());
         }
-        ZeroIfNaN64(code, from);
+        ZeroIfNaN64(code, from, xmm_scratch);
         // Bring into SSE range
         code->addsd(from, code->MFloatMinS32());
         // First time is to set flags
@@ -1691,7 +1691,7 @@ void EmitX64::EmitFPDoubleToU32(IR::Block& block, IR::Inst* inst) {
         if (block.Location().FPSCR().FTZ()) {
             DenormalsAreZero64(code, from, gpr_scratch.cvt64());
         }
-        ZeroIfNaN64(code, from);
+        ZeroIfNaN64(code, from, xmm_scratch);
         // Generate masks if out-of-signed-range
         code->movaps(xmm_mask, code->MFloatMaxS32());
         code->cmpltsd(xmm_mask, from);
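
Note on the ZeroIfNaN64 change: the old version did a ucomisd self-compare and branched on the parity flag, while the new version builds an all-ones mask with cmpordsd and ANDs it into the value, so no branch is emitted. For reference, here is a minimal C++ sketch (SSE2 intrinsics) of what the emitted pxor/cmpordsd/pand sequence computes; the helper name is illustrative only and is not part of this patch:

    #include <emmintrin.h> // SSE2 intrinsics

    // Returns 0.0 when `value` is a NaN, otherwise returns `value` unchanged.
    static double ZeroIfNaN64_reference(double value) {
        __m128d v = _mm_set_sd(value);
        // cmpordsd against zero: the low lane becomes all-ones iff `value` is ordered (i.e. not a NaN).
        __m128d mask = _mm_cmpord_sd(_mm_setzero_pd(), v);
        // ANDing with that mask zeroes a NaN and passes every other value through unchanged.
        return _mm_cvtsd_f64(_mm_and_pd(v, mask));
    }

This mirrors the emitted code: xmm_scratch is zeroed with pxor, cmpordsd writes the ordered-mask into xmm_scratch, and pand applies the mask to xmm_value.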