emit_x64_vector: Reduce gf2p8affineqb requirement to GFNI

Currently, every usage of `gf2p8affineqb` is guarded by the
`AVX512F + AVX512VL + GFNI` requirement, when really
we only need `GFNI` on its own.

This will allow `GFNI`-only chips to emit GFNI instructions without
needing to have AVX512 as well.
There _are_ chips in existence currently that strictly ship with GFNI and
have no implementation of AVX1/AVX2/AVX512 (and thus no VEX/EVEX
encoding), such as Tremont (Lakefield) chips.
This commit is contained in:
Wunkolo 2021-06-07 16:48:52 -07:00 committed by merry
parent e47d0d11c3
commit c157dfcc4c

View file

@ -440,11 +440,11 @@ void EmitX64::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) {
} }
static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const Xbyak::Xmm& result, u8 shift_amount) { static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const Xbyak::Xmm& result, u8 shift_amount) {
if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::GFNI)) { if (code.HasHostFeature(HostFeature::GFNI)) {
const u64 shift_matrix = shift_amount < 8 const u64 shift_matrix = shift_amount < 8
? (0x0102040810204080 << (shift_amount * 8)) | (0x8080808080808080 >> (64 - shift_amount * 8)) ? (0x0102040810204080 << (shift_amount * 8)) | (0x8080808080808080 >> (64 - shift_amount * 8))
: 0x8080808080808080; : 0x8080808080808080;
code.vgf2p8affineqb(result, result, code.MConst(xword_b, shift_matrix), 0); code.gf2p8affineqb(result, code.MConst(xword, shift_matrix, shift_matrix), 0);
return; return;
} }
@ -1472,9 +1472,9 @@ void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) {
code.pxor(result, result); code.pxor(result, result);
} else if (shift_amount == 1) { } else if (shift_amount == 1) {
code.paddb(result, result); code.paddb(result, result);
} else if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::GFNI)) { } else if (code.HasHostFeature(HostFeature::GFNI)) {
const u64 shift_matrix = 0x0102040810204080 >> (shift_amount * 8); const u64 shift_matrix = 0x0102040810204080 >> (shift_amount * 8);
code.vgf2p8affineqb(result, result, code.MConst(xword_b, shift_matrix), 0); code.gf2p8affineqb(result, code.MConst(xword, shift_matrix, shift_matrix), 0);
} else { } else {
const u64 replicand = (0xFFULL << shift_amount) & 0xFF; const u64 replicand = (0xFFULL << shift_amount) & 0xFF;
const u64 mask = Common::Replicate(replicand, Common::BitSize<u8>()); const u64 mask = Common::Replicate(replicand, Common::BitSize<u8>());
@ -1529,9 +1529,9 @@ void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) {
// Do nothing // Do nothing
} else if (shift_amount >= 8) { } else if (shift_amount >= 8) {
code.pxor(result, result); code.pxor(result, result);
} else if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::GFNI)) { } else if (code.HasHostFeature(HostFeature::GFNI)) {
const u64 shift_matrix = 0x0102040810204080 << (shift_amount * 8); const u64 shift_matrix = 0x0102040810204080 << (shift_amount * 8);
code.vgf2p8affineqb(result, result, code.MConst(xword_b, shift_matrix), 0); code.gf2p8affineqb(result, code.MConst(xword, shift_matrix, shift_matrix), 0);
} else { } else {
const u64 replicand = 0xFEULL >> shift_amount; const u64 replicand = 0xFEULL >> shift_amount;
const u64 mask = Common::Replicate(replicand, Common::BitSize<u8>()); const u64 mask = Common::Replicate(replicand, Common::BitSize<u8>());
@ -2795,8 +2795,8 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::GFNI)) { if (code.HasHostFeature(HostFeature::GFNI)) {
code.vgf2p8affineqb(data, data, code.MConst(xword_b, 0x8040201008040201), 0); code.gf2p8affineqb(data, code.MConst(xword, 0x8040201008040201, 0x8040201008040201), 0);
} else { } else {
const Xbyak::Xmm high_nibble_reg = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm high_nibble_reg = ctx.reg_alloc.ScratchXmm();
code.movdqa(high_nibble_reg, code.MConst(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0)); code.movdqa(high_nibble_reg, code.MConst(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));