emit_x64_vector: Simplify fpsr_qc related code
Move the bool conversion into A64JitState::GetFpsr so we don't have to continuously pay the cost of conversion for every saturation instruction.
This commit is contained in:
parent
112cff9ab9
commit
a7c66d2d28
2 changed files with 16 additions and 73 deletions
|
@ -106,7 +106,7 @@ u32 A64JitState::GetFpsr() const {
|
|||
fpsr |= FPSCR_IDC;
|
||||
fpsr |= FPSCR_UFC;
|
||||
fpsr |= fpsr_exc;
|
||||
fpsr |= (fpsr_qc & 1) << 27;
|
||||
fpsr |= (fpsr_qc == 0 ? 0 : 1) << 27;
|
||||
return fpsr;
|
||||
}
|
||||
|
||||
|
|
|
@ -2724,22 +2724,6 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo
|
|||
}
|
||||
}();
|
||||
|
||||
const u32 test_mask = [esize] {
|
||||
switch (esize) {
|
||||
case 8:
|
||||
return 0b1111'1111'1111'1111;
|
||||
case 16:
|
||||
return 0b1010'1010'1010'1010;
|
||||
case 32:
|
||||
return 0b1000'1000'1000'1000;
|
||||
case 64:
|
||||
return 0b10000000'10000000;
|
||||
default:
|
||||
UNREACHABLE();
|
||||
return 0;
|
||||
}
|
||||
}();
|
||||
|
||||
const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const Xbyak::Xmm& y) {
|
||||
switch (esize) {
|
||||
case 8:
|
||||
|
@ -2786,10 +2770,7 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo
|
|||
code.movdqa(sign, mask);
|
||||
vector_equality(data_test, sign);
|
||||
code.pmovmskb(bit, data_test);
|
||||
code.test(bit, test_mask);
|
||||
code.setnz(bit.cvt8());
|
||||
|
||||
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit.cvt8());
|
||||
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, data);
|
||||
}
|
||||
|
@ -2914,11 +2895,9 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyReturnHigh16(EmitContext&
|
|||
|
||||
// Check if any saturation occurred (i.e. if any halfwords in x were
|
||||
// 0x8000 before saturating
|
||||
const Xbyak::Reg64 mask = ctx.reg_alloc.ScratchGpr();
|
||||
const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||
code.pmovmskb(mask, tmp);
|
||||
code.test(mask.cvt32(), 0b1010'1010'1010'1010);
|
||||
code.setnz(mask.cvt8());
|
||||
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask.cvt8());
|
||||
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, x);
|
||||
}
|
||||
|
@ -2958,11 +2937,9 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyReturnHigh32(EmitContext&
|
|||
|
||||
// Check if any saturation occurred (i.e. if any words in x were
|
||||
// 0x80000000 before saturating
|
||||
const Xbyak::Reg64 mask = ctx.reg_alloc.ScratchGpr();
|
||||
const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||
code.pmovmskb(mask, tmp2);
|
||||
code.test(mask.cvt32(), 0b1000'1000'1000'1000);
|
||||
code.setnz(mask.cvt8());
|
||||
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask.cvt8());
|
||||
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, x);
|
||||
}
|
||||
|
@ -2998,18 +2975,10 @@ static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, Block
|
|||
}
|
||||
|
||||
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||
|
||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
|
||||
code.pxor(reconstructed, src);
|
||||
code.ptest(reconstructed, reconstructed);
|
||||
} else {
|
||||
code.pcmpeqd(reconstructed, src);
|
||||
code.movmskps(bit, reconstructed);
|
||||
code.cmp(bit, 0xF);
|
||||
}
|
||||
|
||||
code.setnz(bit.cvt8());
|
||||
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit.cvt8());
|
||||
code.pcmpeqd(reconstructed, src);
|
||||
code.movmskps(bit, reconstructed);
|
||||
code.xor_(bit, 0b1111);
|
||||
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, dest);
|
||||
}
|
||||
|
@ -3062,18 +3031,10 @@ static void EmitVectorSignedSaturatedNarrowToUnsigned(size_t original_esize, Blo
|
|||
}
|
||||
|
||||
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||
|
||||
if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
|
||||
code.pxor(reconstructed, src);
|
||||
code.ptest(reconstructed, reconstructed);
|
||||
} else {
|
||||
code.pcmpeqd(reconstructed, src);
|
||||
code.movmskps(bit, reconstructed);
|
||||
code.cmp(bit, 0xF);
|
||||
}
|
||||
|
||||
code.setnz(bit.cvt8());
|
||||
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit.cvt8());
|
||||
code.pcmpeqd(reconstructed, src);
|
||||
code.movmskps(bit, reconstructed);
|
||||
code.xor_(bit, 0b1111);
|
||||
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, dest);
|
||||
}
|
||||
|
@ -3133,22 +3094,6 @@ static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitCo
|
|||
}
|
||||
}();
|
||||
|
||||
const u32 test_mask = [esize] {
|
||||
switch (esize) {
|
||||
case 8:
|
||||
return 0b1111'1111'1111'1111;
|
||||
case 16:
|
||||
return 0b1010'1010'1010'1010;
|
||||
case 32:
|
||||
return 0b1000'1000'1000'1000;
|
||||
case 64:
|
||||
return 0b10000000'10000000;
|
||||
default:
|
||||
UNREACHABLE();
|
||||
return 0;
|
||||
}
|
||||
}();
|
||||
|
||||
const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const auto& y) {
|
||||
switch (esize) {
|
||||
case 8:
|
||||
|
@ -3189,11 +3134,9 @@ static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitCo
|
|||
}
|
||||
|
||||
// Check if any elements matched the mask prior to performing saturation. If so, set the Q bit.
|
||||
const Xbyak::Reg64 bit = ctx.reg_alloc.ScratchGpr();
|
||||
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||
code.pmovmskb(bit, tmp);
|
||||
code.test(bit.cvt32(), test_mask);
|
||||
code.setnz(bit.cvt8());
|
||||
code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit.cvt8());
|
||||
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, zero);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue