Refactor Xmm{B}Const
to {,B}Const
This commit is contained in:
parent
917335ae8a
commit
00c6c00e86
9 changed files with 206 additions and 206 deletions
|
@ -500,7 +500,7 @@ void BlockOfCode::LoadRequiredFlagsForCondFromRax(IR::Cond cond) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Xbyak::Address BlockOfCode::XmmConst(const Xbyak::AddressFrame& frame, u64 lower, u64 upper) {
|
Xbyak::Address BlockOfCode::Const(const Xbyak::AddressFrame& frame, u64 lower, u64 upper) {
|
||||||
return constant_pool.GetConstant(frame, lower, upper);
|
return constant_pool.GetConstant(frame, lower, upper);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -123,12 +123,12 @@ public:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Xbyak::Address XmmConst(const Xbyak::AddressFrame& frame, u64 lower, u64 upper = 0);
|
Xbyak::Address Const(const Xbyak::AddressFrame& frame, u64 lower, u64 upper = 0);
|
||||||
|
|
||||||
template<size_t esize>
|
template<size_t esize>
|
||||||
Xbyak::Address XmmBConst(const Xbyak::AddressFrame& frame, u64 value) {
|
Xbyak::Address BConst(const Xbyak::AddressFrame& frame, u64 value) {
|
||||||
return XmmConst(frame, mcl::bit::replicate_element<u64>(esize, value),
|
return Const(frame, mcl::bit::replicate_element<u64>(esize, value),
|
||||||
mcl::bit::replicate_element<u64>(esize, value));
|
mcl::bit::replicate_element<u64>(esize, value));
|
||||||
}
|
}
|
||||||
|
|
||||||
CodePtr GetCodeBegin() const;
|
CodePtr GetCodeBegin() const;
|
||||||
|
|
|
@ -42,7 +42,7 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
|
||||||
const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm();
|
||||||
const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.movdqa(xmm_const, code.XmmConst(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641));
|
code.movdqa(xmm_const, code.Const(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641));
|
||||||
|
|
||||||
code.movzx(value.cvt32(), value.changeBit(data_size));
|
code.movzx(value.cvt32(), value.changeBit(data_size));
|
||||||
code.xor_(value.cvt32(), crc);
|
code.xor_(value.cvt32(), crc);
|
||||||
|
@ -72,7 +72,7 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
|
||||||
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
|
||||||
const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.movdqa(xmm_const, code.XmmConst(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641));
|
code.movdqa(xmm_const, code.Const(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641));
|
||||||
|
|
||||||
code.xor_(crc, value);
|
code.xor_(crc, value);
|
||||||
code.shl(crc.cvt64(), 32);
|
code.shl(crc.cvt64(), 32);
|
||||||
|
@ -93,7 +93,7 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
|
||||||
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
|
||||||
const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.movdqa(xmm_const, code.XmmConst(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641));
|
code.movdqa(xmm_const, code.Const(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641));
|
||||||
|
|
||||||
code.mov(crc, crc);
|
code.mov(crc, crc);
|
||||||
code.xor_(crc.cvt64(), value);
|
code.xor_(crc.cvt64(), value);
|
||||||
|
|
|
@ -91,7 +91,7 @@ void ForceDenormalsToZero(BlockOfCode& code, std::initializer_list<Xbyak::Xmm> t
|
||||||
FpFixup::Norm_Src);
|
FpFixup::Norm_Src);
|
||||||
|
|
||||||
const Xbyak::Xmm tmp = xmm16;
|
const Xbyak::Xmm tmp = xmm16;
|
||||||
FCODE(vmovap)(tmp, code.XmmBConst<fsize>(xword, denormal_to_zero));
|
FCODE(vmovap)(tmp, code.BConst<fsize>(xword, denormal_to_zero));
|
||||||
|
|
||||||
for (const Xbyak::Xmm& xmm : to_daz) {
|
for (const Xbyak::Xmm& xmm : to_daz) {
|
||||||
FCODE(vfixupimms)(xmm, xmm, tmp, u8(0));
|
FCODE(vfixupimms)(xmm, xmm, tmp, u8(0));
|
||||||
|
@ -100,17 +100,17 @@ void ForceDenormalsToZero(BlockOfCode& code, std::initializer_list<Xbyak::Xmm> t
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const Xbyak::Xmm& xmm : to_daz) {
|
for (const Xbyak::Xmm& xmm : to_daz) {
|
||||||
code.movaps(xmm0, code.XmmConst(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
|
code.movaps(xmm0, code.Const(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
|
||||||
code.andps(xmm0, xmm);
|
code.andps(xmm0, xmm);
|
||||||
if constexpr (fsize == 32) {
|
if constexpr (fsize == 32) {
|
||||||
code.pcmpgtd(xmm0, code.XmmConst(xword, f32_smallest_normal - 1));
|
code.pcmpgtd(xmm0, code.Const(xword, f32_smallest_normal - 1));
|
||||||
} else if (code.HasHostFeature(HostFeature::SSE42)) {
|
} else if (code.HasHostFeature(HostFeature::SSE42)) {
|
||||||
code.pcmpgtq(xmm0, code.XmmConst(xword, f64_smallest_normal - 1));
|
code.pcmpgtq(xmm0, code.Const(xword, f64_smallest_normal - 1));
|
||||||
} else {
|
} else {
|
||||||
code.pcmpgtd(xmm0, code.XmmConst(xword, f64_smallest_normal - 1));
|
code.pcmpgtd(xmm0, code.Const(xword, f64_smallest_normal - 1));
|
||||||
code.pshufd(xmm0, xmm0, 0b11100101);
|
code.pshufd(xmm0, xmm0, 0b11100101);
|
||||||
}
|
}
|
||||||
code.orps(xmm0, code.XmmConst(xword, fsize == 32 ? f32_negative_zero : f64_negative_zero));
|
code.orps(xmm0, code.Const(xword, fsize == 32 ? f32_negative_zero : f64_negative_zero));
|
||||||
code.andps(xmm, xmm0);
|
code.andps(xmm, xmm0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -127,7 +127,7 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch)
|
||||||
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
||||||
constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero,
|
constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero,
|
||||||
FpFixup::PosZero);
|
FpFixup::PosZero);
|
||||||
FCODE(vfixupimms)(xmm_value, xmm_value, code.XmmConst(ptr, u64(nan_to_zero)), u8(0));
|
FCODE(vfixupimms)(xmm_value, xmm_value, code.Const(ptr, u64(nan_to_zero)), u8(0));
|
||||||
} else if (code.HasHostFeature(HostFeature::AVX)) {
|
} else if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
FCODE(vcmpords)(xmm_scratch, xmm_value, xmm_value);
|
FCODE(vcmpords)(xmm_scratch, xmm_value, xmm_value);
|
||||||
FCODE(vandp)(xmm_value, xmm_value, xmm_scratch);
|
FCODE(vandp)(xmm_value, xmm_value, xmm_scratch);
|
||||||
|
@ -143,15 +143,15 @@ void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) {
|
||||||
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
||||||
const Xbyak::Opmask nan_mask = k1;
|
const Xbyak::Opmask nan_mask = k1;
|
||||||
FCODE(vfpclasss)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN));
|
FCODE(vfpclasss)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN));
|
||||||
FCODE(vblendmp)(result | nan_mask, result, code.XmmConst(ptr_b, fsize == 32 ? f32_nan : f64_nan));
|
FCODE(vblendmp)(result | nan_mask, result, code.Const(ptr_b, fsize == 32 ? f32_nan : f64_nan));
|
||||||
} else if (code.HasHostFeature(HostFeature::AVX)) {
|
} else if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
FCODE(vcmpunords)(xmm0, result, result);
|
FCODE(vcmpunords)(xmm0, result, result);
|
||||||
FCODE(blendvp)(result, code.XmmConst(xword, fsize == 32 ? f32_nan : f64_nan));
|
FCODE(blendvp)(result, code.Const(xword, fsize == 32 ? f32_nan : f64_nan));
|
||||||
} else {
|
} else {
|
||||||
Xbyak::Label end;
|
Xbyak::Label end;
|
||||||
FCODE(ucomis)(result, result);
|
FCODE(ucomis)(result, result);
|
||||||
code.jnp(end);
|
code.jnp(end);
|
||||||
code.movaps(result, code.XmmConst(xword, fsize == 32 ? f32_nan : f64_nan));
|
code.movaps(result, code.Const(xword, fsize == 32 ? f32_nan : f64_nan));
|
||||||
code.L(end);
|
code.L(end);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -165,7 +165,7 @@ SharedLabel ProcessNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a) {
|
||||||
|
|
||||||
ctx.deferred_emits.emplace_back([=, &code] {
|
ctx.deferred_emits.emplace_back([=, &code] {
|
||||||
code.L(*nan);
|
code.L(*nan);
|
||||||
code.orps(a, code.XmmConst(xword, fsize == 32 ? 0x00400000 : 0x0008'0000'0000'0000));
|
code.orps(a, code.Const(xword, fsize == 32 ? 0x00400000 : 0x0008'0000'0000'0000));
|
||||||
code.jmp(*end, code.T_NEAR);
|
code.jmp(*end, code.T_NEAR);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -261,10 +261,10 @@ void EmitPostProcessNaNs(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm op1, X
|
||||||
|
|
||||||
// Silence the SNaN as required by spec.
|
// Silence the SNaN as required by spec.
|
||||||
if (code.HasHostFeature(HostFeature::AVX)) {
|
if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
code.vorps(result, op2, code.XmmConst(xword, mantissa_msb));
|
code.vorps(result, op2, code.Const(xword, mantissa_msb));
|
||||||
} else {
|
} else {
|
||||||
code.movaps(result, op2);
|
code.movaps(result, op2);
|
||||||
code.orps(result, code.XmmConst(xword, mantissa_msb));
|
code.orps(result, code.Const(xword, mantissa_msb));
|
||||||
}
|
}
|
||||||
code.jmp(end, code.T_NEAR);
|
code.jmp(end, code.T_NEAR);
|
||||||
}
|
}
|
||||||
|
@ -345,7 +345,7 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn)
|
||||||
FCODE(ucomis)(op1, op2);
|
FCODE(ucomis)(op1, op2);
|
||||||
code.jp(op_are_nans);
|
code.jp(op_are_nans);
|
||||||
// Here we must return a positive NaN, because the indefinite value on x86 is a negative NaN!
|
// Here we must return a positive NaN, because the indefinite value on x86 is a negative NaN!
|
||||||
code.movaps(result, code.XmmConst(xword, FP::FPInfo<FPT>::DefaultNaN()));
|
code.movaps(result, code.Const(xword, FP::FPInfo<FPT>::DefaultNaN()));
|
||||||
code.jmp(*end, code.T_NEAR);
|
code.jmp(*end, code.T_NEAR);
|
||||||
code.L(op_are_nans);
|
code.L(op_are_nans);
|
||||||
EmitPostProcessNaNs<fsize>(code, result, op1, op2, tmp, *end);
|
EmitPostProcessNaNs<fsize>(code, result, op1, op2, tmp, *end);
|
||||||
|
@ -363,7 +363,7 @@ void FPAbs(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
const Xbyak::Address mask = code.XmmConst(xword, non_sign_mask);
|
const Xbyak::Address mask = code.Const(xword, non_sign_mask);
|
||||||
|
|
||||||
code.andps(result, mask);
|
code.andps(result, mask);
|
||||||
|
|
||||||
|
@ -389,7 +389,7 @@ void FPNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
const Xbyak::Address mask = code.XmmConst(xword, u64(sign_mask));
|
const Xbyak::Address mask = code.Const(xword, u64(sign_mask));
|
||||||
|
|
||||||
code.xorps(result, mask);
|
code.xorps(result, mask);
|
||||||
|
|
||||||
|
@ -460,7 +460,7 @@ static void EmitFPMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
code.L(nan);
|
code.L(nan);
|
||||||
if (ctx.FPCR().DN()) {
|
if (ctx.FPCR().DN()) {
|
||||||
code.movaps(result, code.XmmConst(xword, fsize == 32 ? f32_nan : f64_nan));
|
code.movaps(result, code.Const(xword, fsize == 32 ? f32_nan : f64_nan));
|
||||||
code.jmp(*end);
|
code.jmp(*end);
|
||||||
} else {
|
} else {
|
||||||
code.movaps(tmp, result);
|
code.movaps(tmp, result);
|
||||||
|
@ -492,7 +492,7 @@ static void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
|
||||||
|
|
||||||
if (ctx.FPCR().DN()) {
|
if (ctx.FPCR().DN()) {
|
||||||
FCODE(vcmps)(k1, op2, op2, Cmp::Unordered_Q);
|
FCODE(vcmps)(k1, op2, op2, Cmp::Unordered_Q);
|
||||||
FCODE(vmovs)(op2 | k1, code.XmmConst(xword, default_nan));
|
FCODE(vmovs)(op2 | k1, code.Const(xword, default_nan));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
Xbyak::Reg tmp = ctx.reg_alloc.ScratchGpr();
|
Xbyak::Reg tmp = ctx.reg_alloc.ScratchGpr();
|
||||||
|
@ -549,12 +549,12 @@ static void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
|
||||||
code.jc(maybe_both_nan);
|
code.jc(maybe_both_nan);
|
||||||
if (ctx.FPCR().DN()) {
|
if (ctx.FPCR().DN()) {
|
||||||
code.L(snan);
|
code.L(snan);
|
||||||
code.movaps(op2, code.XmmConst(xword, default_nan));
|
code.movaps(op2, code.Const(xword, default_nan));
|
||||||
code.jmp(*end);
|
code.jmp(*end);
|
||||||
} else {
|
} else {
|
||||||
code.movaps(op2, op1);
|
code.movaps(op2, op1);
|
||||||
code.L(snan);
|
code.L(snan);
|
||||||
code.orps(op2, code.XmmConst(xword, FP::FPInfo<FPT>::mantissa_msb));
|
code.orps(op2, code.Const(xword, FP::FPInfo<FPT>::mantissa_msb));
|
||||||
code.jmp(*end);
|
code.jmp(*end);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -660,12 +660,12 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
FCODE(vfmadd231s)(result, operand2, operand3);
|
FCODE(vfmadd231s)(result, operand2, operand3);
|
||||||
|
|
||||||
if (needs_rounding_correction && needs_nan_correction) {
|
if (needs_rounding_correction && needs_nan_correction) {
|
||||||
code.vandps(xmm0, result, code.XmmConst(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
|
code.vandps(xmm0, result, code.Const(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
|
||||||
FCODE(ucomis)(xmm0, code.XmmConst(xword, fsize == 32 ? f32_smallest_normal : f64_smallest_normal));
|
FCODE(ucomis)(xmm0, code.Const(xword, fsize == 32 ? f32_smallest_normal : f64_smallest_normal));
|
||||||
code.jz(*fallback, code.T_NEAR);
|
code.jz(*fallback, code.T_NEAR);
|
||||||
} else if (needs_rounding_correction) {
|
} else if (needs_rounding_correction) {
|
||||||
code.vandps(xmm0, result, code.XmmConst(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
|
code.vandps(xmm0, result, code.Const(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
|
||||||
code.vxorps(xmm0, xmm0, code.XmmConst(xword, fsize == 32 ? f32_smallest_normal : f64_smallest_normal));
|
code.vxorps(xmm0, xmm0, code.Const(xword, fsize == 32 ? f32_smallest_normal : f64_smallest_normal));
|
||||||
code.ptest(xmm0, xmm0);
|
code.ptest(xmm0, xmm0);
|
||||||
code.jz(*fallback, code.T_NEAR);
|
code.jz(*fallback, code.T_NEAR);
|
||||||
} else if (needs_nan_correction) {
|
} else if (needs_nan_correction) {
|
||||||
|
@ -723,7 +723,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
Xbyak::Label has_nan, indeterminate, op1_snan, op1_done, op2_done, op3_done;
|
Xbyak::Label has_nan, indeterminate, op1_snan, op1_done, op2_done, op3_done;
|
||||||
|
|
||||||
code.vmovaps(xmm0, code.XmmConst(xword, FP::FPInfo<FPT>::mantissa_msb));
|
code.vmovaps(xmm0, code.Const(xword, FP::FPInfo<FPT>::mantissa_msb));
|
||||||
|
|
||||||
FCODE(ucomis)(operand2, operand3);
|
FCODE(ucomis)(operand2, operand3);
|
||||||
code.jp(has_nan);
|
code.jp(has_nan);
|
||||||
|
@ -738,7 +738,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.jnp(*end);
|
code.jnp(*end);
|
||||||
|
|
||||||
code.L(indeterminate);
|
code.L(indeterminate);
|
||||||
code.vmovaps(result, code.XmmConst(xword, FP::FPInfo<FPT>::DefaultNaN()));
|
code.vmovaps(result, code.Const(xword, FP::FPInfo<FPT>::DefaultNaN()));
|
||||||
code.jmp(*end);
|
code.jmp(*end);
|
||||||
|
|
||||||
code.L(has_nan);
|
code.L(has_nan);
|
||||||
|
@ -853,12 +853,12 @@ static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.movaps(result, op1);
|
code.movaps(result, op1);
|
||||||
code.xorps(result, op2);
|
code.xorps(result, op2);
|
||||||
}
|
}
|
||||||
code.andps(result, code.XmmConst(xword, FP::FPInfo<FPT>::sign_mask));
|
code.andps(result, code.Const(xword, FP::FPInfo<FPT>::sign_mask));
|
||||||
code.orps(result, code.XmmConst(xword, FP::FPValue<FPT, false, 0, 2>()));
|
code.orps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 2>()));
|
||||||
code.jmp(*end, code.T_NEAR);
|
code.jmp(*end, code.T_NEAR);
|
||||||
code.L(op_are_nans);
|
code.L(op_are_nans);
|
||||||
if (do_default_nan) {
|
if (do_default_nan) {
|
||||||
code.movaps(result, code.XmmConst(xword, FP::FPInfo<FPT>::DefaultNaN()));
|
code.movaps(result, code.Const(xword, FP::FPInfo<FPT>::DefaultNaN()));
|
||||||
code.jmp(*end, code.T_NEAR);
|
code.jmp(*end, code.T_NEAR);
|
||||||
} else {
|
} else {
|
||||||
EmitPostProcessNaNs<fsize>(code, result, op1, op2, tmp, *end);
|
EmitPostProcessNaNs<fsize>(code, result, op1, op2, tmp, *end);
|
||||||
|
@ -959,7 +959,7 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
|
||||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.movaps(result, code.XmmConst(xword, FP::FPValue<FPT, false, 0, 2>()));
|
code.movaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 2>()));
|
||||||
FCODE(vfnmadd231s)(result, operand1, operand2);
|
FCODE(vfnmadd231s)(result, operand1, operand2);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -973,7 +973,7 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
|
||||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.movaps(result, code.XmmConst(xword, FP::FPValue<FPT, false, 0, 2>()));
|
code.movaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 2>()));
|
||||||
FCODE(vfnmadd231s)(result, operand1, operand2);
|
FCODE(vfnmadd231s)(result, operand1, operand2);
|
||||||
FCODE(ucomis)(result, result);
|
FCODE(ucomis)(result, result);
|
||||||
code.jp(*fallback, code.T_NEAR);
|
code.jp(*fallback, code.T_NEAR);
|
||||||
|
@ -1005,7 +1005,7 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
|
||||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.movaps(result, code.XmmConst(xword, FP::FPValue<FPT, false, 0, 2>()));
|
code.movaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 2>()));
|
||||||
FCODE(muls)(operand1, operand2);
|
FCODE(muls)(operand1, operand2);
|
||||||
FCODE(subs)(result, operand1);
|
FCODE(subs)(result, operand1);
|
||||||
|
|
||||||
|
@ -1134,19 +1134,19 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
|
||||||
|
|
||||||
code.movaps(value, operand);
|
code.movaps(value, operand);
|
||||||
|
|
||||||
code.movaps(xmm0, code.XmmConst(xword, fsize == 32 ? 0xFFFF8000 : 0xFFFF'F000'0000'0000));
|
code.movaps(xmm0, code.Const(xword, fsize == 32 ? 0xFFFF8000 : 0xFFFF'F000'0000'0000));
|
||||||
code.pand(value, xmm0);
|
code.pand(value, xmm0);
|
||||||
code.por(value, code.XmmConst(xword, fsize == 32 ? 0x00008000 : 0x0000'1000'0000'0000));
|
code.por(value, code.Const(xword, fsize == 32 ? 0x00008000 : 0x0000'1000'0000'0000));
|
||||||
|
|
||||||
// Detect NaNs, negatives, zeros, denormals and infinities
|
// Detect NaNs, negatives, zeros, denormals and infinities
|
||||||
FCODE(ucomis)(value, code.XmmConst(xword, FPT(1) << FP::FPInfo<FPT>::explicit_mantissa_width));
|
FCODE(ucomis)(value, code.Const(xword, FPT(1) << FP::FPInfo<FPT>::explicit_mantissa_width));
|
||||||
code.jna(*bad_values, code.T_NEAR);
|
code.jna(*bad_values, code.T_NEAR);
|
||||||
|
|
||||||
FCODE(sqrts)(value, value);
|
FCODE(sqrts)(value, value);
|
||||||
ICODE(mov)(result, code.XmmConst(xword, FP::FPValue<FPT, false, 0, 1>()));
|
ICODE(mov)(result, code.Const(xword, FP::FPValue<FPT, false, 0, 1>()));
|
||||||
FCODE(divs)(result, value);
|
FCODE(divs)(result, value);
|
||||||
|
|
||||||
ICODE(padd)(result, code.XmmConst(xword, fsize == 32 ? 0x00004000 : 0x0000'0800'0000'0000));
|
ICODE(padd)(result, code.Const(xword, fsize == 32 ? 0x00004000 : 0x0000'0800'0000'0000));
|
||||||
code.pand(result, xmm0);
|
code.pand(result, xmm0);
|
||||||
|
|
||||||
code.L(*end);
|
code.L(*end);
|
||||||
|
@ -1187,7 +1187,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
|
||||||
}
|
}
|
||||||
|
|
||||||
code.L(default_nan);
|
code.L(default_nan);
|
||||||
code.movd(result, code.XmmConst(xword, 0x7FC00000));
|
code.movd(result, code.Const(xword, 0x7FC00000));
|
||||||
code.jmp(*end, code.T_NEAR);
|
code.jmp(*end, code.T_NEAR);
|
||||||
} else {
|
} else {
|
||||||
Xbyak::Label nan, zero;
|
Xbyak::Label nan, zero;
|
||||||
|
@ -1216,26 +1216,26 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
|
||||||
|
|
||||||
code.L(zero);
|
code.L(zero);
|
||||||
if (code.HasHostFeature(HostFeature::AVX)) {
|
if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
code.vpor(result, value, code.XmmConst(xword, 0x7FF0'0000'0000'0000));
|
code.vpor(result, value, code.Const(xword, 0x7FF0'0000'0000'0000));
|
||||||
} else {
|
} else {
|
||||||
code.movaps(result, value);
|
code.movaps(result, value);
|
||||||
code.por(result, code.XmmConst(xword, 0x7FF0'0000'0000'0000));
|
code.por(result, code.Const(xword, 0x7FF0'0000'0000'0000));
|
||||||
}
|
}
|
||||||
code.jmp(*end, code.T_NEAR);
|
code.jmp(*end, code.T_NEAR);
|
||||||
|
|
||||||
code.L(nan);
|
code.L(nan);
|
||||||
if (!ctx.FPCR().DN()) {
|
if (!ctx.FPCR().DN()) {
|
||||||
if (code.HasHostFeature(HostFeature::AVX)) {
|
if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
code.vpor(result, operand, code.XmmConst(xword, 0x0008'0000'0000'0000));
|
code.vpor(result, operand, code.Const(xword, 0x0008'0000'0000'0000));
|
||||||
} else {
|
} else {
|
||||||
code.movaps(result, operand);
|
code.movaps(result, operand);
|
||||||
code.por(result, code.XmmConst(xword, 0x0008'0000'0000'0000));
|
code.por(result, code.Const(xword, 0x0008'0000'0000'0000));
|
||||||
}
|
}
|
||||||
code.jmp(*end, code.T_NEAR);
|
code.jmp(*end, code.T_NEAR);
|
||||||
}
|
}
|
||||||
|
|
||||||
code.L(default_nan);
|
code.L(default_nan);
|
||||||
code.movq(result, code.XmmConst(xword, 0x7FF8'0000'0000'0000));
|
code.movq(result, code.Const(xword, 0x7FF8'0000'0000'0000));
|
||||||
code.jmp(*end, code.T_NEAR);
|
code.jmp(*end, code.T_NEAR);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1288,9 +1288,9 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
|
||||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.vmovaps(result, code.XmmConst(xword, FP::FPValue<FPT, false, 0, 3>()));
|
code.vmovaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 3>()));
|
||||||
FCODE(vfnmadd231s)(result, operand1, operand2);
|
FCODE(vfnmadd231s)(result, operand1, operand2);
|
||||||
FCODE(vmuls)(result, result, code.XmmConst(xword, FP::FPValue<FPT, false, -1, 1>()));
|
FCODE(vmuls)(result, result, code.Const(xword, FP::FPValue<FPT, false, -1, 1>()));
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
return;
|
return;
|
||||||
|
@ -1303,7 +1303,7 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
|
||||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.vmovaps(result, code.XmmConst(xword, FP::FPValue<FPT, false, 0, 3>()));
|
code.vmovaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 3>()));
|
||||||
FCODE(vfnmadd231s)(result, operand1, operand2);
|
FCODE(vfnmadd231s)(result, operand1, operand2);
|
||||||
|
|
||||||
// Detect if the intermediate result is infinity or NaN or nearly an infinity.
|
// Detect if the intermediate result is infinity or NaN or nearly an infinity.
|
||||||
|
@ -1318,7 +1318,7 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
|
||||||
|
|
||||||
code.jae(*fallback, code.T_NEAR);
|
code.jae(*fallback, code.T_NEAR);
|
||||||
|
|
||||||
FCODE(vmuls)(result, result, code.XmmConst(xword, FP::FPValue<FPT, false, -1, 1>()));
|
FCODE(vmuls)(result, result, code.Const(xword, FP::FPValue<FPT, false, -1, 1>()));
|
||||||
code.L(*end);
|
code.L(*end);
|
||||||
|
|
||||||
ctx.deferred_emits.emplace_back([=, &code, &ctx] {
|
ctx.deferred_emits.emplace_back([=, &code, &ctx] {
|
||||||
|
@ -1347,10 +1347,10 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
|
||||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.movaps(result, code.XmmConst(xword, FP::FPValue<FPT, false, 0, 3>()));
|
code.movaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 3>()));
|
||||||
FCODE(muls)(operand1, operand2);
|
FCODE(muls)(operand1, operand2);
|
||||||
FCODE(subs)(result, operand1);
|
FCODE(subs)(result, operand1);
|
||||||
FCODE(muls)(result, code.XmmConst(xword, FP::FPValue<FPT, false, -1, 1>()));
|
FCODE(muls)(result, code.Const(xword, FP::FPValue<FPT, false, -1, 1>()));
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, operand1);
|
ctx.reg_alloc.DefineValue(inst, operand1);
|
||||||
return;
|
return;
|
||||||
|
@ -1602,7 +1602,7 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
if constexpr (fsize == 64) {
|
if constexpr (fsize == 64) {
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
const u64 scale_factor = static_cast<u64>((fbits + 1023) << 52);
|
const u64 scale_factor = static_cast<u64>((fbits + 1023) << 52);
|
||||||
code.mulsd(src, code.XmmConst(xword, scale_factor));
|
code.mulsd(src, code.Const(xword, scale_factor));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!truncating) {
|
if (!truncating) {
|
||||||
|
@ -1611,7 +1611,7 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
} else {
|
} else {
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
const u32 scale_factor = static_cast<u32>((fbits + 127) << 23);
|
const u32 scale_factor = static_cast<u32>((fbits + 127) << 23);
|
||||||
code.mulss(src, code.XmmConst(xword, scale_factor));
|
code.mulss(src, code.Const(xword, scale_factor));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!truncating) {
|
if (!truncating) {
|
||||||
|
@ -1629,7 +1629,7 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
ZeroIfNaN<64>(code, src, scratch);
|
ZeroIfNaN<64>(code, src, scratch);
|
||||||
|
|
||||||
code.movsd(scratch, code.XmmConst(xword, f64_max_s64_lim));
|
code.movsd(scratch, code.Const(xword, f64_max_s64_lim));
|
||||||
code.comisd(scratch, src);
|
code.comisd(scratch, src);
|
||||||
code.jna(*saturate_max, code.T_NEAR);
|
code.jna(*saturate_max, code.T_NEAR);
|
||||||
code.cvttsd2si(result, src); // 64 bit gpr
|
code.cvttsd2si(result, src); // 64 bit gpr
|
||||||
|
@ -1648,7 +1648,7 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.pxor(xmm0, xmm0);
|
code.pxor(xmm0, xmm0);
|
||||||
|
|
||||||
code.movaps(scratch, src);
|
code.movaps(scratch, src);
|
||||||
code.subsd(scratch, code.XmmConst(xword, f64_max_s64_lim));
|
code.subsd(scratch, code.Const(xword, f64_max_s64_lim));
|
||||||
|
|
||||||
// these both result in zero if src/scratch are NaN
|
// these both result in zero if src/scratch are NaN
|
||||||
code.maxsd(src, xmm0);
|
code.maxsd(src, xmm0);
|
||||||
|
@ -1670,21 +1670,21 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
ZeroIfNaN<64>(code, src, scratch);
|
ZeroIfNaN<64>(code, src, scratch);
|
||||||
code.minsd(src, code.XmmConst(xword, f64_max_s32));
|
code.minsd(src, code.Const(xword, f64_max_s32));
|
||||||
// maxsd not required as cvttsd2si results in 0x8000'0000 when out of range
|
// maxsd not required as cvttsd2si results in 0x8000'0000 when out of range
|
||||||
code.cvttsd2si(result.cvt32(), src); // 32 bit gpr
|
code.cvttsd2si(result.cvt32(), src); // 32 bit gpr
|
||||||
} else {
|
} else {
|
||||||
code.pxor(xmm0, xmm0);
|
code.pxor(xmm0, xmm0);
|
||||||
code.maxsd(src, xmm0); // results in a zero if src is NaN
|
code.maxsd(src, xmm0); // results in a zero if src is NaN
|
||||||
code.minsd(src, code.XmmConst(xword, f64_max_u32));
|
code.minsd(src, code.Const(xword, f64_max_u32));
|
||||||
code.cvttsd2si(result, src); // 64 bit gpr
|
code.cvttsd2si(result, src); // 64 bit gpr
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
ZeroIfNaN<64>(code, src, scratch);
|
ZeroIfNaN<64>(code, src, scratch);
|
||||||
code.maxsd(src, code.XmmConst(xword, unsigned_ ? f64_min_u16 : f64_min_s16));
|
code.maxsd(src, code.Const(xword, unsigned_ ? f64_min_u16 : f64_min_s16));
|
||||||
code.minsd(src, code.XmmConst(xword, unsigned_ ? f64_max_u16 : f64_max_s16));
|
code.minsd(src, code.Const(xword, unsigned_ ? f64_max_u16 : f64_max_s16));
|
||||||
code.cvttsd2si(result, src); // 64 bit gpr
|
code.cvttsd2si(result, src); // 64 bit gpr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1809,7 +1809,7 @@ void EmitX64::EmitFPFixedS16ToSingle(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
|
const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
|
||||||
code.mulss(result, code.XmmConst(xword, scale_factor));
|
code.mulss(result, code.Const(xword, scale_factor));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -1829,7 +1829,7 @@ void EmitX64::EmitFPFixedU16ToSingle(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
|
const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
|
||||||
code.mulss(result, code.XmmConst(xword, scale_factor));
|
code.mulss(result, code.Const(xword, scale_factor));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -1854,7 +1854,7 @@ void EmitX64::EmitFPFixedS32ToSingle(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
|
const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
|
||||||
code.mulss(result, code.XmmConst(xword, scale_factor));
|
code.mulss(result, code.Const(xword, scale_factor));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -1890,7 +1890,7 @@ void EmitX64::EmitFPFixedU32ToSingle(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
|
const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
|
||||||
code.mulss(result, code.XmmConst(xword, scale_factor));
|
code.mulss(result, code.Const(xword, scale_factor));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -1910,7 +1910,7 @@ void EmitX64::EmitFPFixedS16ToDouble(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
|
const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
|
||||||
code.mulsd(result, code.XmmConst(xword, scale_factor));
|
code.mulsd(result, code.Const(xword, scale_factor));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -1930,7 +1930,7 @@ void EmitX64::EmitFPFixedU16ToDouble(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
|
const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
|
||||||
code.mulsd(result, code.XmmConst(xword, scale_factor));
|
code.mulsd(result, code.Const(xword, scale_factor));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -1948,7 +1948,7 @@ void EmitX64::EmitFPFixedS32ToDouble(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
|
const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
|
||||||
code.mulsd(result, code.XmmConst(xword, scale_factor));
|
code.mulsd(result, code.Const(xword, scale_factor));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -1975,7 +1975,7 @@ void EmitX64::EmitFPFixedU32ToDouble(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
|
const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
|
||||||
code.mulsd(to, code.XmmConst(xword, scale_factor));
|
code.mulsd(to, code.Const(xword, scale_factor));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, to);
|
ctx.reg_alloc.DefineValue(inst, to);
|
||||||
|
@ -1994,7 +1994,7 @@ void EmitX64::EmitFPFixedS64ToDouble(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
|
const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
|
||||||
code.mulsd(result, code.XmmConst(xword, scale_factor));
|
code.mulsd(result, code.Const(xword, scale_factor));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -2013,7 +2013,7 @@ void EmitX64::EmitFPFixedS64ToSingle(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
|
const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
|
||||||
code.mulss(result, code.XmmConst(xword, scale_factor));
|
code.mulss(result, code.Const(xword, scale_factor));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -2034,18 +2034,18 @@ void EmitX64::EmitFPFixedU64ToDouble(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.movq(tmp, from);
|
code.movq(tmp, from);
|
||||||
code.punpckldq(tmp, code.XmmConst(xword, 0x4530000043300000, 0));
|
code.punpckldq(tmp, code.Const(xword, 0x4530000043300000, 0));
|
||||||
code.subpd(tmp, code.XmmConst(xword, 0x4330000000000000, 0x4530000000000000));
|
code.subpd(tmp, code.Const(xword, 0x4330000000000000, 0x4530000000000000));
|
||||||
code.pshufd(result, tmp, 0b01001110);
|
code.pshufd(result, tmp, 0b01001110);
|
||||||
code.addpd(result, tmp);
|
code.addpd(result, tmp);
|
||||||
if (ctx.FPCR().RMode() == FP::RoundingMode::TowardsMinusInfinity) {
|
if (ctx.FPCR().RMode() == FP::RoundingMode::TowardsMinusInfinity) {
|
||||||
code.pand(result, code.XmmConst(xword, f64_non_sign_mask));
|
code.pand(result, code.Const(xword, f64_non_sign_mask));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
|
const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
|
||||||
code.mulsd(result, code.XmmConst(xword, scale_factor));
|
code.mulsd(result, code.Const(xword, scale_factor));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -2089,7 +2089,7 @@ void EmitX64::EmitFPFixedU64ToSingle(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
if (fbits != 0) {
|
if (fbits != 0) {
|
||||||
const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
|
const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
|
||||||
code.mulss(result, code.XmmConst(xword, scale_factor));
|
code.mulss(result, code.Const(xword, scale_factor));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
|
|
@ -91,8 +91,8 @@ void EmitX64::EmitPackedAddU16(EmitContext& ctx, IR::Inst* inst) {
|
||||||
// !(b <= a+b) == b > a+b
|
// !(b <= a+b) == b > a+b
|
||||||
code.movdqa(tmp_a, xmm_a);
|
code.movdqa(tmp_a, xmm_a);
|
||||||
code.movdqa(tmp_b, xmm_b);
|
code.movdqa(tmp_b, xmm_b);
|
||||||
code.paddw(tmp_a, code.XmmConst(xword, 0x80008000));
|
code.paddw(tmp_a, code.Const(xword, 0x80008000));
|
||||||
code.paddw(tmp_b, code.XmmConst(xword, 0x80008000));
|
code.paddw(tmp_b, code.Const(xword, 0x80008000));
|
||||||
code.pcmpgtw(tmp_b, tmp_a); // *Signed* comparison!
|
code.pcmpgtw(tmp_b, tmp_a); // *Signed* comparison!
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(ge_inst, tmp_b);
|
ctx.reg_alloc.DefineValue(ge_inst, tmp_b);
|
||||||
|
@ -209,8 +209,8 @@ void EmitX64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
// (a >= b) == !(b > a)
|
// (a >= b) == !(b > a)
|
||||||
code.pcmpeqb(ones, ones);
|
code.pcmpeqb(ones, ones);
|
||||||
code.paddw(xmm_a, code.XmmConst(xword, 0x80008000));
|
code.paddw(xmm_a, code.Const(xword, 0x80008000));
|
||||||
code.paddw(xmm_b, code.XmmConst(xword, 0x80008000));
|
code.paddw(xmm_b, code.Const(xword, 0x80008000));
|
||||||
code.movdqa(xmm_ge, xmm_b);
|
code.movdqa(xmm_ge, xmm_b);
|
||||||
code.pcmpgtw(xmm_ge, xmm_a); // *Signed* comparison!
|
code.pcmpgtw(xmm_ge, xmm_a); // *Signed* comparison!
|
||||||
code.pxor(xmm_ge, ones);
|
code.pxor(xmm_ge, ones);
|
||||||
|
@ -643,7 +643,7 @@ void EmitX64::EmitPackedAbsDiffSumU8(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
// TODO: Optimize with zero-extension detection
|
// TODO: Optimize with zero-extension detection
|
||||||
code.movaps(tmp, code.XmmConst(xword, 0x0000'0000'ffff'ffff));
|
code.movaps(tmp, code.Const(xword, 0x0000'0000'ffff'ffff));
|
||||||
code.pand(xmm_a, tmp);
|
code.pand(xmm_a, tmp);
|
||||||
code.pand(xmm_b, tmp);
|
code.pand(xmm_b, tmp);
|
||||||
code.psadbw(xmm_a, xmm_b);
|
code.psadbw(xmm_a, xmm_b);
|
||||||
|
|
|
@ -486,7 +486,7 @@ static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const
|
||||||
const u64 shift_matrix = shift_amount < 8
|
const u64 shift_matrix = shift_amount < 8
|
||||||
? (0x0102040810204080 << (shift_amount * 8)) | (0x8080808080808080 >> (64 - shift_amount * 8))
|
? (0x0102040810204080 << (shift_amount * 8)) | (0x8080808080808080 >> (64 - shift_amount * 8))
|
||||||
: 0x8080808080808080;
|
: 0x8080808080808080;
|
||||||
code.gf2p8affineqb(result, code.XmmConst(xword, shift_matrix, shift_matrix), 0);
|
code.gf2p8affineqb(result, code.Const(xword, shift_matrix, shift_matrix), 0);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -547,7 +547,7 @@ void EmitX64::EmitVectorArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst)
|
||||||
|
|
||||||
code.pxor(tmp2, tmp2);
|
code.pxor(tmp2, tmp2);
|
||||||
code.psrlq(result, shift_amount);
|
code.psrlq(result, shift_amount);
|
||||||
code.movdqa(tmp1, code.XmmConst(xword, sign_bit, sign_bit));
|
code.movdqa(tmp1, code.Const(xword, sign_bit, sign_bit));
|
||||||
code.pand(tmp1, result);
|
code.pand(tmp1, result);
|
||||||
code.psubq(tmp2, tmp1);
|
code.psubq(tmp2, tmp1);
|
||||||
code.por(result, tmp2);
|
code.por(result, tmp2);
|
||||||
|
@ -599,7 +599,7 @@ void EmitX64::EmitVectorArithmeticVShift16(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm right_shift = xmm16;
|
const Xbyak::Xmm right_shift = xmm16;
|
||||||
const Xbyak::Xmm tmp = xmm17;
|
const Xbyak::Xmm tmp = xmm17;
|
||||||
|
|
||||||
code.vmovdqa32(tmp, code.XmmConst(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
|
code.vmovdqa32(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
|
||||||
code.vpxord(right_shift, right_shift, right_shift);
|
code.vpxord(right_shift, right_shift, right_shift);
|
||||||
code.vpsubw(right_shift, right_shift, left_shift);
|
code.vpsubw(right_shift, right_shift, left_shift);
|
||||||
|
|
||||||
|
@ -634,7 +634,7 @@ void EmitX64::EmitVectorArithmeticVShift32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm();
|
||||||
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.vmovdqa(tmp, code.XmmConst(xword, 0x000000FF000000FF, 0x000000FF000000FF));
|
code.vmovdqa(tmp, code.Const(xword, 0x000000FF000000FF, 0x000000FF000000FF));
|
||||||
code.vpxor(right_shift, right_shift, right_shift);
|
code.vpxor(right_shift, right_shift, right_shift);
|
||||||
code.vpsubd(right_shift, right_shift, left_shift);
|
code.vpsubd(right_shift, right_shift, left_shift);
|
||||||
|
|
||||||
|
@ -665,7 +665,7 @@ void EmitX64::EmitVectorArithmeticVShift64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm right_shift = xmm16;
|
const Xbyak::Xmm right_shift = xmm16;
|
||||||
const Xbyak::Xmm tmp = xmm17;
|
const Xbyak::Xmm tmp = xmm17;
|
||||||
|
|
||||||
code.vmovdqa32(tmp, code.XmmConst(xword, 0x00000000000000FF, 0x00000000000000FF));
|
code.vmovdqa32(tmp, code.Const(xword, 0x00000000000000FF, 0x00000000000000FF));
|
||||||
code.vpxorq(right_shift, right_shift, right_shift);
|
code.vpxorq(right_shift, right_shift, right_shift);
|
||||||
code.vpsubq(right_shift, right_shift, left_shift);
|
code.vpsubq(right_shift, right_shift, left_shift);
|
||||||
|
|
||||||
|
@ -953,15 +953,15 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
|
||||||
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.movdqa(tmp1, code.XmmConst(xword, 0x0101010102020304, 0x0000000000000000));
|
code.movdqa(tmp1, code.Const(xword, 0x0101010102020304, 0x0000000000000000));
|
||||||
code.movdqa(tmp2, tmp1);
|
code.movdqa(tmp2, tmp1);
|
||||||
|
|
||||||
code.pshufb(tmp2, data);
|
code.pshufb(tmp2, data);
|
||||||
code.psrlw(data, 4);
|
code.psrlw(data, 4);
|
||||||
code.pand(data, code.XmmConst(xword, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F));
|
code.pand(data, code.Const(xword, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F));
|
||||||
code.pshufb(tmp1, data);
|
code.pshufb(tmp1, data);
|
||||||
|
|
||||||
code.movdqa(data, code.XmmConst(xword, 0x0404040404040404, 0x0404040404040404));
|
code.movdqa(data, code.Const(xword, 0x0404040404040404, 0x0404040404040404));
|
||||||
|
|
||||||
code.pcmpeqb(data, tmp1);
|
code.pcmpeqb(data, tmp1);
|
||||||
code.pand(data, tmp2);
|
code.pand(data, tmp2);
|
||||||
|
@ -994,11 +994,11 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.vpcmpeqw(zeros, zeros, zeros);
|
code.vpcmpeqw(zeros, zeros, zeros);
|
||||||
code.vpcmpeqw(tmp, tmp, tmp);
|
code.vpcmpeqw(tmp, tmp, tmp);
|
||||||
code.vpcmpeqw(zeros, zeros, data);
|
code.vpcmpeqw(zeros, zeros, data);
|
||||||
code.vpmullw(data, data, code.XmmConst(xword, 0xf0d3f0d3f0d3f0d3, 0xf0d3f0d3f0d3f0d3));
|
code.vpmullw(data, data, code.Const(xword, 0xf0d3f0d3f0d3f0d3, 0xf0d3f0d3f0d3f0d3));
|
||||||
code.vpsllw(tmp, tmp, 15);
|
code.vpsllw(tmp, tmp, 15);
|
||||||
code.vpsllw(zeros, zeros, 7);
|
code.vpsllw(zeros, zeros, 7);
|
||||||
code.vpsrlw(data, data, 12);
|
code.vpsrlw(data, data, 12);
|
||||||
code.vmovdqa(result, code.XmmConst(xword, 0x0903060a040b0c10, 0x0f080e0207050d01));
|
code.vmovdqa(result, code.Const(xword, 0x0903060a040b0c10, 0x0f080e0207050d01));
|
||||||
code.vpor(tmp, tmp, zeros);
|
code.vpor(tmp, tmp, zeros);
|
||||||
code.vpor(data, data, tmp);
|
code.vpor(data, data, tmp);
|
||||||
code.vpshufb(result, result, data);
|
code.vpshufb(result, result, data);
|
||||||
|
@ -1030,11 +1030,11 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.pcmpeqw(zeros, zeros);
|
code.pcmpeqw(zeros, zeros);
|
||||||
code.pcmpeqw(tmp, tmp);
|
code.pcmpeqw(tmp, tmp);
|
||||||
code.pcmpeqw(zeros, data);
|
code.pcmpeqw(zeros, data);
|
||||||
code.pmullw(data, code.XmmConst(xword, 0xf0d3f0d3f0d3f0d3, 0xf0d3f0d3f0d3f0d3));
|
code.pmullw(data, code.Const(xword, 0xf0d3f0d3f0d3f0d3, 0xf0d3f0d3f0d3f0d3));
|
||||||
code.psllw(tmp, 15);
|
code.psllw(tmp, 15);
|
||||||
code.psllw(zeros, 7);
|
code.psllw(zeros, 7);
|
||||||
code.psrlw(data, 12);
|
code.psrlw(data, 12);
|
||||||
code.movdqa(result, code.XmmConst(xword, 0x0903060a040b0c10, 0x0f080e0207050d01));
|
code.movdqa(result, code.Const(xword, 0x0903060a040b0c10, 0x0f080e0207050d01));
|
||||||
code.por(tmp, zeros);
|
code.por(tmp, zeros);
|
||||||
code.por(data, tmp);
|
code.por(data, tmp);
|
||||||
code.pshufb(result, data);
|
code.pshufb(result, data);
|
||||||
|
@ -1066,7 +1066,7 @@ void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
|
const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||||
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.movdqa(tmp, code.XmmConst(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
|
code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
|
||||||
code.pand(lhs, tmp);
|
code.pand(lhs, tmp);
|
||||||
code.pand(rhs, tmp);
|
code.pand(rhs, tmp);
|
||||||
code.packuswb(lhs, rhs);
|
code.packuswb(lhs, rhs);
|
||||||
|
@ -1127,12 +1127,12 @@ void EmitX64::EmitVectorDeinterleaveEvenLower8(EmitContext& ctx, IR::Inst* inst)
|
||||||
const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
|
|
||||||
code.punpcklbw(lhs, rhs);
|
code.punpcklbw(lhs, rhs);
|
||||||
code.pshufb(lhs, code.XmmConst(xword, 0x0D'09'05'01'0C'08'04'00, 0x8080808080808080));
|
code.pshufb(lhs, code.Const(xword, 0x0D'09'05'01'0C'08'04'00, 0x8080808080808080));
|
||||||
} else {
|
} else {
|
||||||
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
|
const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||||
|
|
||||||
code.movdqa(tmp, code.XmmConst(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
|
code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
|
||||||
code.pand(lhs, tmp);
|
code.pand(lhs, tmp);
|
||||||
code.pand(rhs, tmp);
|
code.pand(rhs, tmp);
|
||||||
code.packuswb(lhs, rhs);
|
code.packuswb(lhs, rhs);
|
||||||
|
@ -1151,7 +1151,7 @@ void EmitX64::EmitVectorDeinterleaveEvenLower16(EmitContext& ctx, IR::Inst* inst
|
||||||
const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
|
|
||||||
code.punpcklwd(lhs, rhs);
|
code.punpcklwd(lhs, rhs);
|
||||||
code.pshufb(lhs, code.XmmConst(xword, 0x0B0A'0302'0908'0100, 0x8080'8080'8080'8080));
|
code.pshufb(lhs, code.Const(xword, 0x0B0A'0302'0908'0100, 0x8080'8080'8080'8080));
|
||||||
} else {
|
} else {
|
||||||
const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
|
const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||||
|
|
||||||
|
@ -1237,7 +1237,7 @@ void EmitX64::EmitVectorDeinterleaveOddLower8(EmitContext& ctx, IR::Inst* inst)
|
||||||
const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
|
|
||||||
code.punpcklbw(lhs, rhs);
|
code.punpcklbw(lhs, rhs);
|
||||||
code.pshufb(lhs, code.XmmConst(xword, 0x0F'0B'07'03'0E'0A'06'02, 0x8080808080808080));
|
code.pshufb(lhs, code.Const(xword, 0x0F'0B'07'03'0E'0A'06'02, 0x8080808080808080));
|
||||||
} else {
|
} else {
|
||||||
const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
|
const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||||
|
|
||||||
|
@ -1259,7 +1259,7 @@ void EmitX64::EmitVectorDeinterleaveOddLower16(EmitContext& ctx, IR::Inst* inst)
|
||||||
const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
|
|
||||||
code.punpcklwd(lhs, rhs);
|
code.punpcklwd(lhs, rhs);
|
||||||
code.pshufb(lhs, code.XmmConst(xword, 0x0F0E'0706'0D0C'0504, 0x8080'8080'8080'8080));
|
code.pshufb(lhs, code.Const(xword, 0x0F0E'0706'0D0C'0504, 0x8080'8080'8080'8080));
|
||||||
} else {
|
} else {
|
||||||
const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
|
const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||||
|
|
||||||
|
@ -1488,13 +1488,13 @@ static void EmitVectorHalvingAddUnsigned(size_t esize, EmitContext& ctx, IR::Ins
|
||||||
case 8:
|
case 8:
|
||||||
code.pavgb(tmp, a);
|
code.pavgb(tmp, a);
|
||||||
code.pxor(a, b);
|
code.pxor(a, b);
|
||||||
code.pand(a, code.XmmConst(xword, 0x0101010101010101, 0x0101010101010101));
|
code.pand(a, code.Const(xword, 0x0101010101010101, 0x0101010101010101));
|
||||||
code.psubb(tmp, a);
|
code.psubb(tmp, a);
|
||||||
break;
|
break;
|
||||||
case 16:
|
case 16:
|
||||||
code.pavgw(tmp, a);
|
code.pavgw(tmp, a);
|
||||||
code.pxor(a, b);
|
code.pxor(a, b);
|
||||||
code.pand(a, code.XmmConst(xword, 0x0001000100010001, 0x0001000100010001));
|
code.pand(a, code.Const(xword, 0x0001000100010001, 0x0001000100010001));
|
||||||
code.psubw(tmp, a);
|
code.psubw(tmp, a);
|
||||||
break;
|
break;
|
||||||
case 32:
|
case 32:
|
||||||
|
@ -1529,7 +1529,7 @@ static void EmitVectorHalvingSubSigned(size_t esize, EmitContext& ctx, IR::Inst*
|
||||||
switch (esize) {
|
switch (esize) {
|
||||||
case 8: {
|
case 8: {
|
||||||
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movdqa(tmp, code.XmmConst(xword, 0x8080808080808080, 0x8080808080808080));
|
code.movdqa(tmp, code.Const(xword, 0x8080808080808080, 0x8080808080808080));
|
||||||
code.pxor(a, tmp);
|
code.pxor(a, tmp);
|
||||||
code.pxor(b, tmp);
|
code.pxor(b, tmp);
|
||||||
code.pavgb(b, a);
|
code.pavgb(b, a);
|
||||||
|
@ -1538,7 +1538,7 @@ static void EmitVectorHalvingSubSigned(size_t esize, EmitContext& ctx, IR::Inst*
|
||||||
}
|
}
|
||||||
case 16: {
|
case 16: {
|
||||||
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movdqa(tmp, code.XmmConst(xword, 0x8000800080008000, 0x8000800080008000));
|
code.movdqa(tmp, code.Const(xword, 0x8000800080008000, 0x8000800080008000));
|
||||||
code.pxor(a, tmp);
|
code.pxor(a, tmp);
|
||||||
code.pxor(b, tmp);
|
code.pxor(b, tmp);
|
||||||
code.pavgw(b, a);
|
code.pavgw(b, a);
|
||||||
|
@ -1700,13 +1700,13 @@ void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.paddb(result, result);
|
code.paddb(result, result);
|
||||||
} else if (code.HasHostFeature(HostFeature::GFNI)) {
|
} else if (code.HasHostFeature(HostFeature::GFNI)) {
|
||||||
const u64 shift_matrix = 0x0102040810204080 >> (shift_amount * 8);
|
const u64 shift_matrix = 0x0102040810204080 >> (shift_amount * 8);
|
||||||
code.gf2p8affineqb(result, code.XmmConst(xword, shift_matrix, shift_matrix), 0);
|
code.gf2p8affineqb(result, code.Const(xword, shift_matrix, shift_matrix), 0);
|
||||||
} else {
|
} else {
|
||||||
const u64 replicand = (0xFFULL << shift_amount) & 0xFF;
|
const u64 replicand = (0xFFULL << shift_amount) & 0xFF;
|
||||||
const u64 mask = mcl::bit::replicate_element<u8, u64>(replicand);
|
const u64 mask = mcl::bit::replicate_element<u8, u64>(replicand);
|
||||||
|
|
||||||
code.psllw(result, shift_amount);
|
code.psllw(result, shift_amount);
|
||||||
code.pand(result, code.XmmConst(xword, mask, mask));
|
code.pand(result, code.Const(xword, mask, mask));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -1757,13 +1757,13 @@ void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.pxor(result, result);
|
code.pxor(result, result);
|
||||||
} else if (code.HasHostFeature(HostFeature::GFNI)) {
|
} else if (code.HasHostFeature(HostFeature::GFNI)) {
|
||||||
const u64 shift_matrix = 0x0102040810204080 << (shift_amount * 8);
|
const u64 shift_matrix = 0x0102040810204080 << (shift_amount * 8);
|
||||||
code.gf2p8affineqb(result, code.XmmConst(xword, shift_matrix, shift_matrix), 0);
|
code.gf2p8affineqb(result, code.Const(xword, shift_matrix, shift_matrix), 0);
|
||||||
} else {
|
} else {
|
||||||
const u64 replicand = 0xFEULL >> shift_amount;
|
const u64 replicand = 0xFEULL >> shift_amount;
|
||||||
const u64 mask = mcl::bit::replicate_element<u8, u64>(replicand);
|
const u64 mask = mcl::bit::replicate_element<u8, u64>(replicand);
|
||||||
|
|
||||||
code.psrlw(result, shift_amount);
|
code.psrlw(result, shift_amount);
|
||||||
code.pand(result, code.XmmConst(xword, mask, mask));
|
code.pand(result, code.Const(xword, mask, mask));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -1817,7 +1817,7 @@ void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm right_shift = xmm16;
|
const Xbyak::Xmm right_shift = xmm16;
|
||||||
const Xbyak::Xmm tmp = xmm17;
|
const Xbyak::Xmm tmp = xmm17;
|
||||||
|
|
||||||
code.vmovdqa32(tmp, code.XmmConst(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
|
code.vmovdqa32(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
|
||||||
code.vpxord(right_shift, right_shift, right_shift);
|
code.vpxord(right_shift, right_shift, right_shift);
|
||||||
code.vpsubw(right_shift, right_shift, left_shift);
|
code.vpsubw(right_shift, right_shift, left_shift);
|
||||||
code.vpandd(left_shift, left_shift, tmp);
|
code.vpandd(left_shift, left_shift, tmp);
|
||||||
|
@ -1845,7 +1845,7 @@ void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm();
|
||||||
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.vmovdqa(tmp, code.XmmConst(xword, 0x000000FF000000FF, 0x000000FF000000FF));
|
code.vmovdqa(tmp, code.Const(xword, 0x000000FF000000FF, 0x000000FF000000FF));
|
||||||
code.vpxor(right_shift, right_shift, right_shift);
|
code.vpxor(right_shift, right_shift, right_shift);
|
||||||
code.vpsubd(right_shift, right_shift, left_shift);
|
code.vpsubd(right_shift, right_shift, left_shift);
|
||||||
code.vpand(left_shift, left_shift, tmp);
|
code.vpand(left_shift, left_shift, tmp);
|
||||||
|
@ -1873,7 +1873,7 @@ void EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm();
|
||||||
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.vmovdqa(tmp, code.XmmConst(xword, 0x00000000000000FF, 0x00000000000000FF));
|
code.vmovdqa(tmp, code.Const(xword, 0x00000000000000FF, 0x00000000000000FF));
|
||||||
code.vpxor(right_shift, right_shift, right_shift);
|
code.vpxor(right_shift, right_shift, right_shift);
|
||||||
code.vpsubq(right_shift, right_shift, left_shift);
|
code.vpsubq(right_shift, right_shift, left_shift);
|
||||||
code.vpand(left_shift, left_shift, tmp);
|
code.vpand(left_shift, left_shift, tmp);
|
||||||
|
@ -1993,7 +1993,7 @@ void EmitX64::EmitVectorMaxU32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
|
|
||||||
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movdqa(tmp, code.XmmConst(xword, 0x8000000080000000, 0x8000000080000000));
|
code.movdqa(tmp, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
|
||||||
|
|
||||||
const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movdqa(tmp_b, b);
|
code.movdqa(tmp_b, b);
|
||||||
|
@ -2022,7 +2022,7 @@ void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.vmovdqa(xmm0, code.XmmConst(xword, 0x8000000000000000, 0x8000000000000000));
|
code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
|
||||||
code.vpsubq(tmp, y, xmm0);
|
code.vpsubq(tmp, y, xmm0);
|
||||||
code.vpsubq(xmm0, x, xmm0);
|
code.vpsubq(xmm0, x, xmm0);
|
||||||
code.vpcmpgtq(xmm0, tmp, xmm0);
|
code.vpcmpgtq(xmm0, tmp, xmm0);
|
||||||
|
@ -2141,7 +2141,7 @@ void EmitX64::EmitVectorMinU32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
|
const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
|
||||||
|
|
||||||
const Xbyak::Xmm sint_max_plus_one = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm sint_max_plus_one = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movdqa(sint_max_plus_one, code.XmmConst(xword, 0x8000000080000000, 0x8000000080000000));
|
code.movdqa(sint_max_plus_one, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
|
||||||
|
|
||||||
const Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movdqa(tmp_a, a);
|
code.movdqa(tmp_a, a);
|
||||||
|
@ -2172,7 +2172,7 @@ void EmitX64::EmitVectorMinU64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
|
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||||
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.vmovdqa(xmm0, code.XmmConst(xword, 0x8000000000000000, 0x8000000000000000));
|
code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
|
||||||
code.vpsubq(tmp, y, xmm0);
|
code.vpsubq(tmp, y, xmm0);
|
||||||
code.vpsubq(xmm0, x, xmm0);
|
code.vpsubq(xmm0, x, xmm0);
|
||||||
code.vpcmpgtq(xmm0, tmp, xmm0);
|
code.vpcmpgtq(xmm0, tmp, xmm0);
|
||||||
|
@ -2201,7 +2201,7 @@ void EmitX64::EmitVectorMultiply8(EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.psrlw(tmp_a, 8);
|
code.psrlw(tmp_a, 8);
|
||||||
code.psrlw(tmp_b, 8);
|
code.psrlw(tmp_b, 8);
|
||||||
code.pmullw(tmp_a, tmp_b);
|
code.pmullw(tmp_a, tmp_b);
|
||||||
code.pand(a, code.XmmConst(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
|
code.pand(a, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
|
||||||
code.psllw(tmp_a, 8);
|
code.psllw(tmp_a, 8);
|
||||||
code.por(a, tmp_a);
|
code.por(a, tmp_a);
|
||||||
|
|
||||||
|
@ -2327,7 +2327,7 @@ void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.pxor(zeros, zeros);
|
code.pxor(zeros, zeros);
|
||||||
code.pand(a, code.XmmConst(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
|
code.pand(a, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
|
||||||
code.packuswb(a, zeros);
|
code.packuswb(a, zeros);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, a);
|
ctx.reg_alloc.DefineValue(inst, a);
|
||||||
|
@ -2611,7 +2611,7 @@ void EmitX64::EmitVectorPairedAddSignedWiden32(EmitContext& ctx, IR::Inst* inst)
|
||||||
|
|
||||||
code.movdqa(c, a);
|
code.movdqa(c, a);
|
||||||
code.psllq(a, 32);
|
code.psllq(a, 32);
|
||||||
code.movdqa(tmp1, code.XmmConst(xword, 0x80000000'00000000, 0x80000000'00000000));
|
code.movdqa(tmp1, code.Const(xword, 0x80000000'00000000, 0x80000000'00000000));
|
||||||
code.movdqa(tmp2, tmp1);
|
code.movdqa(tmp2, tmp1);
|
||||||
code.pand(tmp1, a);
|
code.pand(tmp1, a);
|
||||||
code.pand(tmp2, c);
|
code.pand(tmp2, c);
|
||||||
|
@ -2860,7 +2860,7 @@ void EmitX64::EmitVectorPairedMaxU32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
ctx.reg_alloc.DefineValue(inst, x);
|
ctx.reg_alloc.DefineValue(inst, x);
|
||||||
} else {
|
} else {
|
||||||
const Xbyak::Xmm tmp3 = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp3 = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movdqa(tmp3, code.XmmConst(xword, 0x8000000080000000, 0x8000000080000000));
|
code.movdqa(tmp3, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
|
||||||
|
|
||||||
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movdqa(tmp2, x);
|
code.movdqa(tmp2, x);
|
||||||
|
@ -2948,7 +2948,7 @@ void EmitX64::EmitVectorPairedMinU32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
ctx.reg_alloc.DefineValue(inst, x);
|
ctx.reg_alloc.DefineValue(inst, x);
|
||||||
} else {
|
} else {
|
||||||
const Xbyak::Xmm tmp3 = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp3 = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movdqa(tmp3, code.XmmConst(xword, 0x8000000080000000, 0x8000000080000000));
|
code.movdqa(tmp3, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
|
||||||
|
|
||||||
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movdqa(tmp2, tmp1);
|
code.movdqa(tmp2, tmp1);
|
||||||
|
@ -3104,7 +3104,7 @@ void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
|
||||||
Xbyak::Label loop;
|
Xbyak::Label loop;
|
||||||
|
|
||||||
code.pxor(result, result);
|
code.pxor(result, result);
|
||||||
code.movdqa(mask, code.XmmConst(xword, 0x0101010101010101, 0x0101010101010101));
|
code.movdqa(mask, code.Const(xword, 0x0101010101010101, 0x0101010101010101));
|
||||||
code.mov(counter, 8);
|
code.mov(counter, 8);
|
||||||
|
|
||||||
code.L(loop);
|
code.L(loop);
|
||||||
|
@ -3148,7 +3148,7 @@ void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst
|
||||||
code.pmovzxbw(xmm_a, xmm_a);
|
code.pmovzxbw(xmm_a, xmm_a);
|
||||||
code.pmovzxbw(xmm_b, xmm_b);
|
code.pmovzxbw(xmm_b, xmm_b);
|
||||||
code.pxor(result, result);
|
code.pxor(result, result);
|
||||||
code.movdqa(mask, code.XmmConst(xword, 0x0001000100010001, 0x0001000100010001));
|
code.movdqa(mask, code.Const(xword, 0x0001000100010001, 0x0001000100010001));
|
||||||
code.mov(counter, 8);
|
code.mov(counter, 8);
|
||||||
|
|
||||||
code.L(loop);
|
code.L(loop);
|
||||||
|
@ -3231,11 +3231,11 @@ void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
code.movdqa(high_a, low_a);
|
code.movdqa(high_a, low_a);
|
||||||
code.psrlw(high_a, 4);
|
code.psrlw(high_a, 4);
|
||||||
code.movdqa(tmp1, code.XmmConst(xword, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F));
|
code.movdqa(tmp1, code.Const(xword, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F));
|
||||||
code.pand(high_a, tmp1); // High nibbles
|
code.pand(high_a, tmp1); // High nibbles
|
||||||
code.pand(low_a, tmp1); // Low nibbles
|
code.pand(low_a, tmp1); // Low nibbles
|
||||||
|
|
||||||
code.movdqa(tmp1, code.XmmConst(xword, 0x0302020102010100, 0x0403030203020201));
|
code.movdqa(tmp1, code.Const(xword, 0x0302020102010100, 0x0403030203020201));
|
||||||
code.movdqa(tmp2, tmp1);
|
code.movdqa(tmp2, tmp1);
|
||||||
code.pshufb(tmp1, low_a);
|
code.pshufb(tmp1, low_a);
|
||||||
code.pshufb(tmp2, high_a);
|
code.pshufb(tmp2, high_a);
|
||||||
|
@ -3259,10 +3259,10 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
|
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
|
|
||||||
if (code.HasHostFeature(HostFeature::GFNI)) {
|
if (code.HasHostFeature(HostFeature::GFNI)) {
|
||||||
code.gf2p8affineqb(data, code.XmmConst(xword, 0x8040201008040201, 0x8040201008040201), 0);
|
code.gf2p8affineqb(data, code.Const(xword, 0x8040201008040201, 0x8040201008040201), 0);
|
||||||
} else {
|
} else {
|
||||||
const Xbyak::Xmm high_nibble_reg = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm high_nibble_reg = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movdqa(high_nibble_reg, code.XmmConst(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
|
code.movdqa(high_nibble_reg, code.Const(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
|
||||||
code.pand(high_nibble_reg, data);
|
code.pand(high_nibble_reg, data);
|
||||||
code.pxor(data, high_nibble_reg);
|
code.pxor(data, high_nibble_reg);
|
||||||
code.psrld(high_nibble_reg, 4);
|
code.psrld(high_nibble_reg, 4);
|
||||||
|
@ -3270,25 +3270,25 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
|
||||||
if (code.HasHostFeature(HostFeature::SSSE3)) {
|
if (code.HasHostFeature(HostFeature::SSSE3)) {
|
||||||
// High lookup
|
// High lookup
|
||||||
const Xbyak::Xmm high_reversed_reg = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm high_reversed_reg = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movdqa(high_reversed_reg, code.XmmConst(xword, 0xE060A020C0408000, 0xF070B030D0509010));
|
code.movdqa(high_reversed_reg, code.Const(xword, 0xE060A020C0408000, 0xF070B030D0509010));
|
||||||
code.pshufb(high_reversed_reg, data);
|
code.pshufb(high_reversed_reg, data);
|
||||||
|
|
||||||
// Low lookup (low nibble equivalent of the above)
|
// Low lookup (low nibble equivalent of the above)
|
||||||
code.movdqa(data, code.XmmConst(xword, 0x0E060A020C040800, 0x0F070B030D050901));
|
code.movdqa(data, code.Const(xword, 0x0E060A020C040800, 0x0F070B030D050901));
|
||||||
code.pshufb(data, high_nibble_reg);
|
code.pshufb(data, high_nibble_reg);
|
||||||
code.por(data, high_reversed_reg);
|
code.por(data, high_reversed_reg);
|
||||||
} else {
|
} else {
|
||||||
code.pslld(data, 4);
|
code.pslld(data, 4);
|
||||||
code.por(data, high_nibble_reg);
|
code.por(data, high_nibble_reg);
|
||||||
|
|
||||||
code.movdqa(high_nibble_reg, code.XmmConst(xword, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC));
|
code.movdqa(high_nibble_reg, code.Const(xword, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC));
|
||||||
code.pand(high_nibble_reg, data);
|
code.pand(high_nibble_reg, data);
|
||||||
code.pxor(data, high_nibble_reg);
|
code.pxor(data, high_nibble_reg);
|
||||||
code.psrld(high_nibble_reg, 2);
|
code.psrld(high_nibble_reg, 2);
|
||||||
code.pslld(data, 2);
|
code.pslld(data, 2);
|
||||||
code.por(data, high_nibble_reg);
|
code.por(data, high_nibble_reg);
|
||||||
|
|
||||||
code.movdqa(high_nibble_reg, code.XmmConst(xword, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA));
|
code.movdqa(high_nibble_reg, code.Const(xword, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA));
|
||||||
code.pand(high_nibble_reg, data);
|
code.pand(high_nibble_reg, data);
|
||||||
code.pxor(data, high_nibble_reg);
|
code.pxor(data, high_nibble_reg);
|
||||||
code.psrld(high_nibble_reg, 1);
|
code.psrld(high_nibble_reg, 1);
|
||||||
|
@ -3421,7 +3421,7 @@ void EmitX64::EmitVectorReduceAdd16(EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.paddw(data, temp);
|
code.paddw(data, temp);
|
||||||
|
|
||||||
// Add pairs of 16-bit values into 32-bit lanes
|
// Add pairs of 16-bit values into 32-bit lanes
|
||||||
code.movdqa(temp, code.XmmConst(xword, 0x0001000100010001, 0x0001000100010001));
|
code.movdqa(temp, code.Const(xword, 0x0001000100010001, 0x0001000100010001));
|
||||||
code.pmaddwd(data, temp);
|
code.pmaddwd(data, temp);
|
||||||
|
|
||||||
// Sum adjacent 32-bit lanes
|
// Sum adjacent 32-bit lanes
|
||||||
|
@ -3498,7 +3498,7 @@ static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, I
|
||||||
switch (esize) {
|
switch (esize) {
|
||||||
case 8: {
|
case 8: {
|
||||||
const Xbyak::Xmm vec_128 = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm vec_128 = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movdqa(vec_128, code.XmmConst(xword, 0x8080808080808080, 0x8080808080808080));
|
code.movdqa(vec_128, code.Const(xword, 0x8080808080808080, 0x8080808080808080));
|
||||||
|
|
||||||
code.paddb(a, vec_128);
|
code.paddb(a, vec_128);
|
||||||
code.paddb(b, vec_128);
|
code.paddb(b, vec_128);
|
||||||
|
@ -3508,7 +3508,7 @@ static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, I
|
||||||
}
|
}
|
||||||
case 16: {
|
case 16: {
|
||||||
const Xbyak::Xmm vec_32768 = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm vec_32768 = ctx.reg_alloc.ScratchXmm();
|
||||||
code.movdqa(vec_32768, code.XmmConst(xword, 0x8000800080008000, 0x8000800080008000));
|
code.movdqa(vec_32768, code.Const(xword, 0x8000800080008000, 0x8000800080008000));
|
||||||
|
|
||||||
code.paddw(a, vec_32768);
|
code.paddw(a, vec_32768);
|
||||||
code.paddw(b, vec_32768);
|
code.paddw(b, vec_32768);
|
||||||
|
@ -3891,7 +3891,7 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.pand(tmp, y);
|
code.pand(tmp, y);
|
||||||
code.pand(sign_correction, x);
|
code.pand(sign_correction, x);
|
||||||
code.paddd(sign_correction, tmp);
|
code.paddd(sign_correction, tmp);
|
||||||
code.pand(sign_correction, code.XmmConst(xword, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF));
|
code.pand(sign_correction, code.Const(xword, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF));
|
||||||
|
|
||||||
// calculate unsigned multiply
|
// calculate unsigned multiply
|
||||||
code.movdqa(tmp, x);
|
code.movdqa(tmp, x);
|
||||||
|
@ -3930,13 +3930,13 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo
|
||||||
const Xbyak::Address mask = [esize, &code] {
|
const Xbyak::Address mask = [esize, &code] {
|
||||||
switch (esize) {
|
switch (esize) {
|
||||||
case 8:
|
case 8:
|
||||||
return code.XmmConst(xword, 0x8080808080808080, 0x8080808080808080);
|
return code.Const(xword, 0x8080808080808080, 0x8080808080808080);
|
||||||
case 16:
|
case 16:
|
||||||
return code.XmmConst(xword, 0x8000800080008000, 0x8000800080008000);
|
return code.Const(xword, 0x8000800080008000, 0x8000800080008000);
|
||||||
case 32:
|
case 32:
|
||||||
return code.XmmConst(xword, 0x8000000080000000, 0x8000000080000000);
|
return code.Const(xword, 0x8000000080000000, 0x8000000080000000);
|
||||||
case 64:
|
case 64:
|
||||||
return code.XmmConst(xword, 0x8000000000000000, 0x8000000000000000);
|
return code.Const(xword, 0x8000000000000000, 0x8000000000000000);
|
||||||
default:
|
default:
|
||||||
UNREACHABLE();
|
UNREACHABLE();
|
||||||
}
|
}
|
||||||
|
@ -4100,7 +4100,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
|
||||||
code.vpblendvb(xmm0, tmp, tmp2, xmm0);
|
code.vpblendvb(xmm0, tmp, tmp2, xmm0);
|
||||||
ctx.reg_alloc.Release(tmp2);
|
ctx.reg_alloc.Release(tmp2);
|
||||||
} else {
|
} else {
|
||||||
code.pand(xmm0, code.XmmConst(xword, 0x8080808080808080, 0x8080808080808080));
|
code.pand(xmm0, code.Const(xword, 0x8080808080808080, 0x8080808080808080));
|
||||||
code.movdqa(tmp, xmm0);
|
code.movdqa(tmp, xmm0);
|
||||||
code.psrlw(tmp, 7);
|
code.psrlw(tmp, 7);
|
||||||
code.pxor(xmm0, xmm0);
|
code.pxor(xmm0, xmm0);
|
||||||
|
@ -4201,27 +4201,27 @@ static void EmitVectorSignedSaturatedDoublingMultiply16(BlockOfCode& code, EmitC
|
||||||
if (code.HasHostFeature(HostFeature::AVX)) {
|
if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
if constexpr (is_rounding) {
|
if constexpr (is_rounding) {
|
||||||
code.vpsrlw(lower_tmp, lower_tmp, 14);
|
code.vpsrlw(lower_tmp, lower_tmp, 14);
|
||||||
code.vpaddw(lower_tmp, lower_tmp, code.XmmConst(xword, 0x0001000100010001, 0x0001000100010001));
|
code.vpaddw(lower_tmp, lower_tmp, code.Const(xword, 0x0001000100010001, 0x0001000100010001));
|
||||||
code.vpsrlw(lower_tmp, lower_tmp, 1);
|
code.vpsrlw(lower_tmp, lower_tmp, 1);
|
||||||
} else {
|
} else {
|
||||||
code.vpsrlw(lower_tmp, lower_tmp, 15);
|
code.vpsrlw(lower_tmp, lower_tmp, 15);
|
||||||
}
|
}
|
||||||
code.vpaddw(upper_tmp, upper_tmp, upper_tmp);
|
code.vpaddw(upper_tmp, upper_tmp, upper_tmp);
|
||||||
code.vpaddw(result, upper_tmp, lower_tmp);
|
code.vpaddw(result, upper_tmp, lower_tmp);
|
||||||
code.vpcmpeqw(upper_tmp, result, code.XmmConst(xword, 0x8000800080008000, 0x8000800080008000));
|
code.vpcmpeqw(upper_tmp, result, code.Const(xword, 0x8000800080008000, 0x8000800080008000));
|
||||||
code.vpxor(result, result, upper_tmp);
|
code.vpxor(result, result, upper_tmp);
|
||||||
} else {
|
} else {
|
||||||
code.paddw(upper_tmp, upper_tmp);
|
code.paddw(upper_tmp, upper_tmp);
|
||||||
if constexpr (is_rounding) {
|
if constexpr (is_rounding) {
|
||||||
code.psrlw(lower_tmp, 14);
|
code.psrlw(lower_tmp, 14);
|
||||||
code.paddw(lower_tmp, code.XmmConst(xword, 0x0001000100010001, 0x0001000100010001));
|
code.paddw(lower_tmp, code.Const(xword, 0x0001000100010001, 0x0001000100010001));
|
||||||
code.psrlw(lower_tmp, 1);
|
code.psrlw(lower_tmp, 1);
|
||||||
} else {
|
} else {
|
||||||
code.psrlw(lower_tmp, 15);
|
code.psrlw(lower_tmp, 15);
|
||||||
}
|
}
|
||||||
code.movdqa(result, upper_tmp);
|
code.movdqa(result, upper_tmp);
|
||||||
code.paddw(result, lower_tmp);
|
code.paddw(result, lower_tmp);
|
||||||
code.movdqa(upper_tmp, code.XmmConst(xword, 0x8000800080008000, 0x8000800080008000));
|
code.movdqa(upper_tmp, code.Const(xword, 0x8000800080008000, 0x8000800080008000));
|
||||||
code.pcmpeqw(upper_tmp, result);
|
code.pcmpeqw(upper_tmp, result);
|
||||||
code.pxor(result, upper_tmp);
|
code.pxor(result, upper_tmp);
|
||||||
}
|
}
|
||||||
|
@ -4265,7 +4265,7 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
|
||||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
if constexpr (is_rounding) {
|
if constexpr (is_rounding) {
|
||||||
code.vmovdqa(result, code.XmmConst(xword, 0x0000000080000000, 0x0000000080000000));
|
code.vmovdqa(result, code.Const(xword, 0x0000000080000000, 0x0000000080000000));
|
||||||
code.vpaddq(odds, odds, result);
|
code.vpaddq(odds, odds, result);
|
||||||
code.vpaddq(even, even, result);
|
code.vpaddq(even, even, result);
|
||||||
}
|
}
|
||||||
|
@ -4276,7 +4276,7 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
|
||||||
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
|
||||||
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
|
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
|
||||||
code.vpcmpeqd(mask, result, code.XmmConst(xword, 0x8000000080000000, 0x8000000080000000));
|
code.vpcmpeqd(mask, result, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
|
||||||
code.vpxor(result, result, mask);
|
code.vpxor(result, result, mask);
|
||||||
code.pmovmskb(bit, mask);
|
code.pmovmskb(bit, mask);
|
||||||
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
|
code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
|
||||||
|
@ -4316,7 +4316,7 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
|
||||||
code.paddq(x, x);
|
code.paddq(x, x);
|
||||||
|
|
||||||
if constexpr (is_rounding) {
|
if constexpr (is_rounding) {
|
||||||
code.movdqa(result, code.XmmConst(xword, 0x0000000080000000, 0x0000000080000000));
|
code.movdqa(result, code.Const(xword, 0x0000000080000000, 0x0000000080000000));
|
||||||
code.paddq(tmp, result);
|
code.paddq(tmp, result);
|
||||||
code.paddq(x, result);
|
code.paddq(x, result);
|
||||||
}
|
}
|
||||||
|
@ -4331,7 +4331,7 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
|
||||||
|
|
||||||
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
|
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
|
|
||||||
code.movdqa(tmp, code.XmmConst(xword, 0x8000000080000000, 0x8000000080000000));
|
code.movdqa(tmp, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
|
||||||
code.pcmpeqd(tmp, result);
|
code.pcmpeqd(tmp, result);
|
||||||
code.pxor(result, tmp);
|
code.pxor(result, tmp);
|
||||||
code.pmovmskb(bit, tmp);
|
code.pmovmskb(bit, tmp);
|
||||||
|
@ -4359,10 +4359,10 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx,
|
||||||
code.pmaddwd(x, y);
|
code.pmaddwd(x, y);
|
||||||
|
|
||||||
if (code.HasHostFeature(HostFeature::AVX)) {
|
if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
code.vpcmpeqd(y, x, code.XmmConst(xword, 0x8000000080000000, 0x8000000080000000));
|
code.vpcmpeqd(y, x, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
|
||||||
code.vpxor(x, x, y);
|
code.vpxor(x, x, y);
|
||||||
} else {
|
} else {
|
||||||
code.movdqa(y, code.XmmConst(xword, 0x8000000080000000, 0x8000000080000000));
|
code.movdqa(y, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
|
||||||
code.pcmpeqd(y, x);
|
code.pcmpeqd(y, x);
|
||||||
code.pxor(x, y);
|
code.pxor(x, y);
|
||||||
}
|
}
|
||||||
|
@ -4412,11 +4412,11 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx,
|
||||||
|
|
||||||
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
|
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
|
||||||
if (code.HasHostFeature(HostFeature::AVX)) {
|
if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
code.vpcmpeqq(y, x, code.XmmConst(xword, 0x8000000000000000, 0x8000000000000000));
|
code.vpcmpeqq(y, x, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
|
||||||
code.vpxor(x, x, y);
|
code.vpxor(x, x, y);
|
||||||
code.vpmovmskb(bit, y);
|
code.vpmovmskb(bit, y);
|
||||||
} else {
|
} else {
|
||||||
code.movdqa(y, code.XmmConst(xword, 0x8000000000000000, 0x8000000000000000));
|
code.movdqa(y, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
|
||||||
code.pcmpeqd(y, x);
|
code.pcmpeqd(y, x);
|
||||||
code.shufps(y, y, 0b11110101);
|
code.shufps(y, y, 0b11110101);
|
||||||
code.pxor(x, y);
|
code.pxor(x, y);
|
||||||
|
@ -4565,13 +4565,13 @@ static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitCo
|
||||||
const Xbyak::Address mask = [esize, &code] {
|
const Xbyak::Address mask = [esize, &code] {
|
||||||
switch (esize) {
|
switch (esize) {
|
||||||
case 8:
|
case 8:
|
||||||
return code.XmmConst(xword, 0x8080808080808080, 0x8080808080808080);
|
return code.Const(xword, 0x8080808080808080, 0x8080808080808080);
|
||||||
case 16:
|
case 16:
|
||||||
return code.XmmConst(xword, 0x8000800080008000, 0x8000800080008000);
|
return code.Const(xword, 0x8000800080008000, 0x8000800080008000);
|
||||||
case 32:
|
case 32:
|
||||||
return code.XmmConst(xword, 0x8000000080000000, 0x8000000080000000);
|
return code.Const(xword, 0x8000000080000000, 0x8000000080000000);
|
||||||
case 64:
|
case 64:
|
||||||
return code.XmmConst(xword, 0x8000000000000000, 0x8000000000000000);
|
return code.Const(xword, 0x8000000000000000, 0x8000000000000000);
|
||||||
default:
|
default:
|
||||||
UNREACHABLE();
|
UNREACHABLE();
|
||||||
}
|
}
|
||||||
|
@ -4806,7 +4806,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const u64 index_count64 = mcl::bit::replicate_element<u8, u64>(index_count);
|
const u64 index_count64 = mcl::bit::replicate_element<u8, u64>(index_count);
|
||||||
|
|
||||||
Xbyak::Opmask valid_indices = k1;
|
Xbyak::Opmask valid_indices = k1;
|
||||||
code.vpcmpb(valid_indices, indicies, code.XmmConst(xword, index_count64, 0), CmpInt::LessThan);
|
code.vpcmpb(valid_indices, indicies, code.Const(xword, index_count64, 0), CmpInt::LessThan);
|
||||||
|
|
||||||
if (is_defaults_zero) {
|
if (is_defaults_zero) {
|
||||||
defaults = defaults | valid_indices | T_z;
|
defaults = defaults | valid_indices | T_z;
|
||||||
|
@ -4868,7 +4868,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
code.xorps(result, result);
|
code.xorps(result, result);
|
||||||
code.movsd(result, xmm_table0);
|
code.movsd(result, xmm_table0);
|
||||||
code.paddusb(indicies, code.XmmConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
|
code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
|
||||||
code.pshufb(result, indicies);
|
code.pshufb(result, indicies);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, result);
|
ctx.reg_alloc.DefineValue(inst, result);
|
||||||
|
@ -4881,7 +4881,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
|
const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
|
||||||
|
|
||||||
code.punpcklqdq(xmm_table0, xmm_table0_upper);
|
code.punpcklqdq(xmm_table0, xmm_table0_upper);
|
||||||
code.paddusb(indicies, code.XmmConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
|
code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
|
||||||
code.pshufb(xmm_table0, indicies);
|
code.pshufb(xmm_table0, indicies);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, xmm_table0);
|
ctx.reg_alloc.DefineValue(inst, xmm_table0);
|
||||||
|
@ -4900,10 +4900,10 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (code.HasHostFeature(HostFeature::AVX)) {
|
if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
code.vpaddusb(xmm0, indicies, code.XmmConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
|
code.vpaddusb(xmm0, indicies, code.Const(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
|
||||||
} else {
|
} else {
|
||||||
code.movaps(xmm0, indicies);
|
code.movaps(xmm0, indicies);
|
||||||
code.paddusb(xmm0, code.XmmConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
|
code.paddusb(xmm0, code.Const(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
|
||||||
}
|
}
|
||||||
code.pshufb(xmm_table0, indicies);
|
code.pshufb(xmm_table0, indicies);
|
||||||
code.pblendvb(xmm_table0, defaults);
|
code.pblendvb(xmm_table0, defaults);
|
||||||
|
@ -4933,12 +4933,12 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (code.HasHostFeature(HostFeature::AVX)) {
|
if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
code.vpaddusb(xmm0, indicies, code.XmmConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
|
code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
|
||||||
} else {
|
} else {
|
||||||
code.movaps(xmm0, indicies);
|
code.movaps(xmm0, indicies);
|
||||||
code.paddusb(xmm0, code.XmmConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
|
code.paddusb(xmm0, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
|
||||||
}
|
}
|
||||||
code.paddusb(indicies, code.XmmConst(xword, 0x6060606060606060, 0xFFFFFFFFFFFFFFFF));
|
code.paddusb(indicies, code.Const(xword, 0x6060606060606060, 0xFFFFFFFFFFFFFFFF));
|
||||||
code.pshufb(xmm_table0, xmm0);
|
code.pshufb(xmm_table0, xmm0);
|
||||||
code.pshufb(xmm_table1, indicies);
|
code.pshufb(xmm_table1, indicies);
|
||||||
code.pblendvb(xmm_table0, xmm_table1);
|
code.pblendvb(xmm_table0, xmm_table1);
|
||||||
|
@ -4965,19 +4965,19 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (code.HasHostFeature(HostFeature::AVX)) {
|
if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
code.vpaddusb(xmm0, indicies, code.XmmConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
|
code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
|
||||||
} else {
|
} else {
|
||||||
code.movaps(xmm0, indicies);
|
code.movaps(xmm0, indicies);
|
||||||
code.paddusb(xmm0, code.XmmConst(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
|
code.paddusb(xmm0, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
|
||||||
}
|
}
|
||||||
code.pshufb(xmm_table0, indicies);
|
code.pshufb(xmm_table0, indicies);
|
||||||
code.pshufb(xmm_table1, indicies);
|
code.pshufb(xmm_table1, indicies);
|
||||||
code.pblendvb(xmm_table0, xmm_table1);
|
code.pblendvb(xmm_table0, xmm_table1);
|
||||||
if (code.HasHostFeature(HostFeature::AVX)) {
|
if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
code.vpaddusb(xmm0, indicies, code.XmmConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
|
code.vpaddusb(xmm0, indicies, code.Const(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
|
||||||
} else {
|
} else {
|
||||||
code.movaps(xmm0, indicies);
|
code.movaps(xmm0, indicies);
|
||||||
code.paddusb(xmm0, code.XmmConst(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
|
code.paddusb(xmm0, code.Const(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
|
||||||
}
|
}
|
||||||
code.pblendvb(xmm_table0, defaults);
|
code.pblendvb(xmm_table0, defaults);
|
||||||
|
|
||||||
|
@ -5042,7 +5042,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
|
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
|
||||||
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]);
|
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]);
|
||||||
|
|
||||||
code.vptestnmb(write_mask, indicies, code.XmmConst(xword, 0xE0E0E0E0E0E0E0E0, 0xE0E0E0E0E0E0E0E0));
|
code.vptestnmb(write_mask, indicies, code.Const(xword, 0xE0E0E0E0E0E0E0E0, 0xE0E0E0E0E0E0E0E0));
|
||||||
code.vpermi2b(indicies | write_mask, xmm_table0, xmm_table1);
|
code.vpermi2b(indicies | write_mask, xmm_table0, xmm_table1);
|
||||||
|
|
||||||
ctx.reg_alloc.Release(xmm_table0);
|
ctx.reg_alloc.Release(xmm_table0);
|
||||||
|
@ -5056,7 +5056,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
|
||||||
|
|
||||||
// Handle vector-table 2,3
|
// Handle vector-table 2,3
|
||||||
// vpcmpuble
|
// vpcmpuble
|
||||||
code.vpcmpub(upper_mask, indicies, code.XmmConst(xword, 0x3F3F3F3F3F3F3F3F, 0x3F3F3F3F3F3F3F3F), CmpInt::LessEqual);
|
code.vpcmpub(upper_mask, indicies, code.Const(xword, 0x3F3F3F3F3F3F3F3F, 0x3F3F3F3F3F3F3F3F), CmpInt::LessEqual);
|
||||||
code.kandnw(write_mask, write_mask, upper_mask);
|
code.kandnw(write_mask, write_mask, upper_mask);
|
||||||
|
|
||||||
const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseScratchXmm(table[2]);
|
const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseScratchXmm(table[2]);
|
||||||
|
@ -5076,7 +5076,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]);
|
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]);
|
||||||
const Xbyak::Opmask write_mask = k1;
|
const Xbyak::Opmask write_mask = k1;
|
||||||
|
|
||||||
code.vptestnmb(write_mask, indicies, code.XmmConst(xword, 0xE0E0E0E0E0E0E0E0, 0xE0E0E0E0E0E0E0E0));
|
code.vptestnmb(write_mask, indicies, code.Const(xword, 0xE0E0E0E0E0E0E0E0, 0xE0E0E0E0E0E0E0E0));
|
||||||
code.vpermi2b(indicies, xmm_table0, xmm_table1);
|
code.vpermi2b(indicies, xmm_table0, xmm_table1);
|
||||||
|
|
||||||
if (is_defaults_zero) {
|
if (is_defaults_zero) {
|
||||||
|
@ -5093,7 +5093,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
|
const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
|
||||||
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
|
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
|
||||||
|
|
||||||
code.paddusb(indicies, code.XmmConst(xword, 0x7070707070707070, 0x7070707070707070));
|
code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
|
||||||
code.pshufb(xmm_table0, indicies);
|
code.pshufb(xmm_table0, indicies);
|
||||||
|
|
||||||
ctx.reg_alloc.DefineValue(inst, xmm_table0);
|
ctx.reg_alloc.DefineValue(inst, xmm_table0);
|
||||||
|
@ -5106,10 +5106,10 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
|
const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
|
||||||
|
|
||||||
if (code.HasHostFeature(HostFeature::AVX)) {
|
if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
code.vpaddusb(xmm0, indicies, code.XmmConst(xword, 0x7070707070707070, 0x7070707070707070));
|
code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
|
||||||
} else {
|
} else {
|
||||||
code.movaps(xmm0, indicies);
|
code.movaps(xmm0, indicies);
|
||||||
code.paddusb(xmm0, code.XmmConst(xword, 0x7070707070707070, 0x7070707070707070));
|
code.paddusb(xmm0, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
|
||||||
}
|
}
|
||||||
code.pshufb(xmm_table0, indicies);
|
code.pshufb(xmm_table0, indicies);
|
||||||
code.pblendvb(xmm_table0, defaults);
|
code.pblendvb(xmm_table0, defaults);
|
||||||
|
@ -5124,12 +5124,12 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]);
|
const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]);
|
||||||
|
|
||||||
if (code.HasHostFeature(HostFeature::AVX)) {
|
if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
code.vpaddusb(xmm0, indicies, code.XmmConst(xword, 0x7070707070707070, 0x7070707070707070));
|
code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
|
||||||
} else {
|
} else {
|
||||||
code.movaps(xmm0, indicies);
|
code.movaps(xmm0, indicies);
|
||||||
code.paddusb(xmm0, code.XmmConst(xword, 0x7070707070707070, 0x7070707070707070));
|
code.paddusb(xmm0, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
|
||||||
}
|
}
|
||||||
code.paddusb(indicies, code.XmmConst(xword, 0x6060606060606060, 0x6060606060606060));
|
code.paddusb(indicies, code.Const(xword, 0x6060606060606060, 0x6060606060606060));
|
||||||
code.pshufb(xmm_table0, xmm0);
|
code.pshufb(xmm_table0, xmm0);
|
||||||
code.pshufb(xmm_table1, indicies);
|
code.pshufb(xmm_table1, indicies);
|
||||||
code.pblendvb(xmm_table0, xmm_table1);
|
code.pblendvb(xmm_table0, xmm_table1);
|
||||||
|
@ -5143,14 +5143,14 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
const Xbyak::Xmm masked = xmm16;
|
const Xbyak::Xmm masked = xmm16;
|
||||||
|
|
||||||
code.vpandd(masked, indicies, code.XmmConst(xword_b, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
|
code.vpandd(masked, indicies, code.Const(xword_b, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
|
||||||
|
|
||||||
for (size_t i = 0; i < table_size; ++i) {
|
for (size_t i = 0; i < table_size; ++i) {
|
||||||
const Xbyak::Xmm xmm_table = ctx.reg_alloc.UseScratchXmm(table[i]);
|
const Xbyak::Xmm xmm_table = ctx.reg_alloc.UseScratchXmm(table[i]);
|
||||||
const Xbyak::Opmask table_mask = k1;
|
const Xbyak::Opmask table_mask = k1;
|
||||||
const u64 table_index = mcl::bit::replicate_element<u8, u64>(i * 16);
|
const u64 table_index = mcl::bit::replicate_element<u8, u64>(i * 16);
|
||||||
|
|
||||||
code.vpcmpeqb(table_mask, masked, code.XmmConst(xword, table_index, table_index));
|
code.vpcmpeqb(table_mask, masked, code.Const(xword, table_index, table_index));
|
||||||
|
|
||||||
if (table_index == 0 && is_defaults_zero) {
|
if (table_index == 0 && is_defaults_zero) {
|
||||||
code.vpshufb(result | table_mask | T_z, xmm_table, indicies);
|
code.vpshufb(result | table_mask | T_z, xmm_table, indicies);
|
||||||
|
@ -5170,7 +5170,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
code.movaps(masked, code.XmmConst(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
|
code.movaps(masked, code.Const(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
|
||||||
code.pand(masked, indicies);
|
code.pand(masked, indicies);
|
||||||
|
|
||||||
for (size_t i = 0; i < table_size; ++i) {
|
for (size_t i = 0; i < table_size; ++i) {
|
||||||
|
@ -5182,9 +5182,9 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
|
||||||
code.pxor(xmm0, xmm0);
|
code.pxor(xmm0, xmm0);
|
||||||
code.pcmpeqb(xmm0, masked);
|
code.pcmpeqb(xmm0, masked);
|
||||||
} else if (code.HasHostFeature(HostFeature::AVX)) {
|
} else if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
code.vpcmpeqb(xmm0, masked, code.XmmConst(xword, table_index, table_index));
|
code.vpcmpeqb(xmm0, masked, code.Const(xword, table_index, table_index));
|
||||||
} else {
|
} else {
|
||||||
code.movaps(xmm0, code.XmmConst(xword, table_index, table_index));
|
code.movaps(xmm0, code.Const(xword, table_index, table_index));
|
||||||
code.pcmpeqb(xmm0, masked);
|
code.pcmpeqb(xmm0, masked);
|
||||||
}
|
}
|
||||||
code.pshufb(xmm_table, indicies);
|
code.pshufb(xmm_table, indicies);
|
||||||
|
@ -5242,11 +5242,11 @@ void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const bool part = args[2].GetImmediateU1();
|
const bool part = args[2].GetImmediateU1();
|
||||||
|
|
||||||
if (!part) {
|
if (!part) {
|
||||||
code.pand(lower, code.XmmConst(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
|
code.pand(lower, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
|
||||||
code.psllw(upper, 8);
|
code.psllw(upper, 8);
|
||||||
} else {
|
} else {
|
||||||
code.psrlw(lower, 8);
|
code.psrlw(lower, 8);
|
||||||
code.pand(upper, code.XmmConst(xword, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00));
|
code.pand(upper, code.Const(xword, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00));
|
||||||
}
|
}
|
||||||
code.por(lower, upper);
|
code.por(lower, upper);
|
||||||
|
|
||||||
|
@ -5261,11 +5261,11 @@ void EmitX64::EmitVectorTranspose16(EmitContext& ctx, IR::Inst* inst) {
|
||||||
const bool part = args[2].GetImmediateU1();
|
const bool part = args[2].GetImmediateU1();
|
||||||
|
|
||||||
if (!part) {
|
if (!part) {
|
||||||
code.pand(lower, code.XmmConst(xword, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF));
|
code.pand(lower, code.Const(xword, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF));
|
||||||
code.pslld(upper, 16);
|
code.pslld(upper, 16);
|
||||||
} else {
|
} else {
|
||||||
code.psrld(lower, 16);
|
code.psrld(lower, 16);
|
||||||
code.pand(upper, code.XmmConst(xword, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000));
|
code.pand(upper, code.Const(xword, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000));
|
||||||
}
|
}
|
||||||
code.por(lower, upper);
|
code.por(lower, upper);
|
||||||
|
|
||||||
|
@ -5336,7 +5336,7 @@ static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx,
|
||||||
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
|
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
|
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||||
|
|
||||||
code.movdqa(temp, code.XmmConst(xword, 0x8000000080000000, 0x8000000080000000));
|
code.movdqa(temp, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
|
||||||
code.pxor(x, temp);
|
code.pxor(x, temp);
|
||||||
code.pxor(y, temp);
|
code.pxor(y, temp);
|
||||||
code.movdqa(temp, x);
|
code.movdqa(temp, x);
|
||||||
|
|
|
@ -145,12 +145,12 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, std::
|
||||||
|
|
||||||
template<size_t fsize>
|
template<size_t fsize>
|
||||||
Xbyak::Address GetVectorOf(BlockOfCode& code, u64 value) {
|
Xbyak::Address GetVectorOf(BlockOfCode& code, u64 value) {
|
||||||
return code.XmmBConst<fsize>(xword, value);
|
return code.BConst<fsize>(xword, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<size_t fsize, u64 value>
|
template<size_t fsize, u64 value>
|
||||||
Xbyak::Address GetVectorOf(BlockOfCode& code) {
|
Xbyak::Address GetVectorOf(BlockOfCode& code) {
|
||||||
return code.XmmBConst<fsize>(xword, value);
|
return code.BConst<fsize>(xword, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<size_t fsize>
|
template<size_t fsize>
|
||||||
|
@ -213,7 +213,7 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) {
|
||||||
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
||||||
constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero,
|
constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero,
|
||||||
FpFixup::PosZero);
|
FpFixup::PosZero);
|
||||||
FCODE(vfixupimmp)(result, result, code.XmmBConst<32>(ptr_b, nan_to_zero), u8(0));
|
FCODE(vfixupimmp)(result, result, code.BConst<32>(ptr_b, nan_to_zero), u8(0));
|
||||||
} else if (code.HasHostFeature(HostFeature::AVX)) {
|
} else if (code.HasHostFeature(HostFeature::AVX)) {
|
||||||
FCODE(vcmpordp)(nan_mask, result, result);
|
FCODE(vcmpordp)(nan_mask, result, result);
|
||||||
FCODE(vandp)(result, result, nan_mask);
|
FCODE(vandp)(result, result, nan_mask);
|
||||||
|
@ -238,7 +238,7 @@ void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xb
|
||||||
FpFixup::Norm_Src,
|
FpFixup::Norm_Src,
|
||||||
FpFixup::Norm_Src);
|
FpFixup::Norm_Src);
|
||||||
|
|
||||||
FCODE(vmovap)(tmp, code.XmmBConst<fsize>(xword, denormal_to_zero));
|
FCODE(vmovap)(tmp, code.BConst<fsize>(xword, denormal_to_zero));
|
||||||
|
|
||||||
for (const Xbyak::Xmm& xmm : to_daz) {
|
for (const Xbyak::Xmm& xmm : to_daz) {
|
||||||
FCODE(vfixupimmp)(xmm, xmm, tmp, u8(0));
|
FCODE(vfixupimmp)(xmm, xmm, tmp, u8(0));
|
||||||
|
@ -785,9 +785,9 @@ void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst)
|
||||||
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
|
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
|
||||||
code.vcvtudq2ps(xmm, xmm);
|
code.vcvtudq2ps(xmm, xmm);
|
||||||
} else {
|
} else {
|
||||||
const Xbyak::Address mem_4B000000 = code.XmmBConst<32>(xword, 0x4B000000);
|
const Xbyak::Address mem_4B000000 = code.BConst<32>(xword, 0x4B000000);
|
||||||
const Xbyak::Address mem_53000000 = code.XmmBConst<32>(xword, 0x53000000);
|
const Xbyak::Address mem_53000000 = code.BConst<32>(xword, 0x53000000);
|
||||||
const Xbyak::Address mem_D3000080 = code.XmmBConst<32>(xword, 0xD3000080);
|
const Xbyak::Address mem_D3000080 = code.BConst<32>(xword, 0xD3000080);
|
||||||
|
|
||||||
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
|
||||||
|
|
||||||
|
@ -798,7 +798,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst)
|
||||||
code.vaddps(xmm, xmm, mem_D3000080);
|
code.vaddps(xmm, xmm, mem_D3000080);
|
||||||
code.vaddps(xmm, tmp, xmm);
|
code.vaddps(xmm, tmp, xmm);
|
||||||
} else {
|
} else {
|
||||||
const Xbyak::Address mem_0xFFFF = code.XmmBConst<32>(xword, 0x0000FFFF);
|
const Xbyak::Address mem_0xFFFF = code.BConst<32>(xword, 0x0000FFFF);
|
||||||
|
|
||||||
code.movdqa(tmp, mem_0xFFFF);
|
code.movdqa(tmp, mem_0xFFFF);
|
||||||
|
|
||||||
|
@ -816,7 +816,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx.FPCR(fpcr_controlled).RMode() == FP::RoundingMode::TowardsMinusInfinity) {
|
if (ctx.FPCR(fpcr_controlled).RMode() == FP::RoundingMode::TowardsMinusInfinity) {
|
||||||
code.pand(xmm, code.XmmBConst<32>(xword, 0x7FFFFFFF));
|
code.pand(xmm, code.BConst<32>(xword, 0x7FFFFFFF));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -835,8 +835,8 @@ void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst)
|
||||||
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
|
||||||
code.vcvtuqq2pd(xmm, xmm);
|
code.vcvtuqq2pd(xmm, xmm);
|
||||||
} else {
|
} else {
|
||||||
const Xbyak::Address unpack = code.XmmConst(xword, 0x4530000043300000, 0);
|
const Xbyak::Address unpack = code.Const(xword, 0x4530000043300000, 0);
|
||||||
const Xbyak::Address subtrahend = code.XmmConst(xword, 0x4330000000000000, 0x4530000000000000);
|
const Xbyak::Address subtrahend = code.Const(xword, 0x4330000000000000, 0x4530000000000000);
|
||||||
|
|
||||||
const Xbyak::Xmm unpack_reg = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm unpack_reg = ctx.reg_alloc.ScratchXmm();
|
||||||
const Xbyak::Xmm subtrahend_reg = ctx.reg_alloc.ScratchXmm();
|
const Xbyak::Xmm subtrahend_reg = ctx.reg_alloc.ScratchXmm();
|
||||||
|
@ -883,7 +883,7 @@ void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx.FPCR(fpcr_controlled).RMode() == FP::RoundingMode::TowardsMinusInfinity) {
|
if (ctx.FPCR(fpcr_controlled).RMode() == FP::RoundingMode::TowardsMinusInfinity) {
|
||||||
code.pand(xmm, code.XmmBConst<64>(xword, 0x7FFFFFFFFFFFFFFF));
|
code.pand(xmm, code.BConst<64>(xword, 0x7FFFFFFFFFFFFFFF));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -1493,7 +1493,7 @@ void FPVectorNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||||
|
|
||||||
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||||
const Xbyak::Address mask = code.XmmBConst<fsize>(xword, sign_mask);
|
const Xbyak::Address mask = code.BConst<fsize>(xword, sign_mask);
|
||||||
|
|
||||||
code.xorps(a, mask);
|
code.xorps(a, mask);
|
||||||
|
|
||||||
|
|
|
@ -97,7 +97,7 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
|
||||||
code.vpmovq2m(k1, xmm0);
|
code.vpmovq2m(k1, xmm0);
|
||||||
}
|
}
|
||||||
ICODE(vpsra)(result | k1, result, u8(esize - 1));
|
ICODE(vpsra)(result | k1, result, u8(esize - 1));
|
||||||
ICODE(vpxor)(result | k1, result, code.XmmBConst<esize>(xword_b, msb_mask));
|
ICODE(vpxor)(result | k1, result, code.BConst<esize>(xword_b, msb_mask));
|
||||||
|
|
||||||
code.ktestb(k1, k1);
|
code.ktestb(k1, k1);
|
||||||
code.setnz(overflow);
|
code.setnz(overflow);
|
||||||
|
@ -148,10 +148,10 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
|
||||||
if constexpr (esize == 64) {
|
if constexpr (esize == 64) {
|
||||||
code.pshufd(tmp, tmp, 0b11110101);
|
code.pshufd(tmp, tmp, 0b11110101);
|
||||||
}
|
}
|
||||||
code.pxor(tmp, code.XmmBConst<esize>(xword, msb_mask));
|
code.pxor(tmp, code.BConst<esize>(xword, msb_mask));
|
||||||
|
|
||||||
if (code.HasHostFeature(HostFeature::SSE41)) {
|
if (code.HasHostFeature(HostFeature::SSE41)) {
|
||||||
code.ptest(xmm0, code.XmmConst(xword, msb_mask, msb_mask));
|
code.ptest(xmm0, code.Const(xword, msb_mask, msb_mask));
|
||||||
} else {
|
} else {
|
||||||
FCODE(movmskp)(overflow.cvt32(), xmm0);
|
FCODE(movmskp)(overflow.cvt32(), xmm0);
|
||||||
code.test(overflow.cvt32(), overflow.cvt32());
|
code.test(overflow.cvt32(), overflow.cvt32());
|
||||||
|
|
|
@ -589,7 +589,7 @@ HostLoc RegAlloc::LoadImmediate(IR::Value imm, HostLoc host_loc) {
|
||||||
if (imm_value == 0) {
|
if (imm_value == 0) {
|
||||||
MAYBE_AVX(xorps, reg, reg);
|
MAYBE_AVX(xorps, reg, reg);
|
||||||
} else {
|
} else {
|
||||||
MAYBE_AVX(movaps, reg, code.XmmConst(code.xword, imm_value));
|
MAYBE_AVX(movaps, reg, code.Const(code.xword, imm_value));
|
||||||
}
|
}
|
||||||
return host_loc;
|
return host_loc;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue