emit_x64_vector: Reduce gf2p8affineqb requirement to GFNI

Currently, every usage of `gf2p8affineqb` is guarded by the
`AVX512F + AVX512VL + GFNI` requirement, when really
we only need `GFNI` on its own.

This will allow `GFNI`-only chips to get emit GFNI features without
needing to have AVX512 as well.
There _are_ chips in existance currently that strictly ship with GFNI and
have no implementation of AVX1/AVX2/AVX512(and thus no VEX/EVEX
encoding) such as Tremont(Lakefield) chips.
This commit is contained in:
Wunkolo 2021-06-07 16:48:52 -07:00 committed by merry
parent e47d0d11c3
commit c157dfcc4c

View file

@ -440,11 +440,11 @@ void EmitX64::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) {
}
static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const Xbyak::Xmm& result, u8 shift_amount) {
if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::GFNI)) {
if (code.HasHostFeature(HostFeature::GFNI)) {
const u64 shift_matrix = shift_amount < 8
? (0x0102040810204080 << (shift_amount * 8)) | (0x8080808080808080 >> (64 - shift_amount * 8))
: 0x8080808080808080;
code.vgf2p8affineqb(result, result, code.MConst(xword_b, shift_matrix), 0);
code.gf2p8affineqb(result, code.MConst(xword, shift_matrix, shift_matrix), 0);
return;
}
@ -1472,9 +1472,9 @@ void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) {
code.pxor(result, result);
} else if (shift_amount == 1) {
code.paddb(result, result);
} else if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::GFNI)) {
} else if (code.HasHostFeature(HostFeature::GFNI)) {
const u64 shift_matrix = 0x0102040810204080 >> (shift_amount * 8);
code.vgf2p8affineqb(result, result, code.MConst(xword_b, shift_matrix), 0);
code.gf2p8affineqb(result, code.MConst(xword, shift_matrix, shift_matrix), 0);
} else {
const u64 replicand = (0xFFULL << shift_amount) & 0xFF;
const u64 mask = Common::Replicate(replicand, Common::BitSize<u8>());
@ -1529,9 +1529,9 @@ void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) {
// Do nothing
} else if (shift_amount >= 8) {
code.pxor(result, result);
} else if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::GFNI)) {
} else if (code.HasHostFeature(HostFeature::GFNI)) {
const u64 shift_matrix = 0x0102040810204080 << (shift_amount * 8);
code.vgf2p8affineqb(result, result, code.MConst(xword_b, shift_matrix), 0);
code.gf2p8affineqb(result, code.MConst(xword, shift_matrix, shift_matrix), 0);
} else {
const u64 replicand = 0xFEULL >> shift_amount;
const u64 mask = Common::Replicate(replicand, Common::BitSize<u8>());
@ -2795,8 +2795,8 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::GFNI)) {
code.vgf2p8affineqb(data, data, code.MConst(xword_b, 0x8040201008040201), 0);
if (code.HasHostFeature(HostFeature::GFNI)) {
code.gf2p8affineqb(data, code.MConst(xword, 0x8040201008040201, 0x8040201008040201), 0);
} else {
const Xbyak::Xmm high_nibble_reg = ctx.reg_alloc.ScratchXmm();
code.movdqa(high_nibble_reg, code.MConst(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));