Squashed 'externals/xbyak/' changes from 671fc805..4a6fac8a

4a6fac8a update version to 5.77
801cf3fd cosmetic change of getNumCores
d397e824 fix number of cores that share LLC cache
a669e092 support non-intel-cpu visual studio
af5f422e Merge branch 'fenghaitao-guard_x86' into develop
9b98dc17 Guard x86 specific codes with "#if defined(__i386__) || defined(__x86_64__)"
dd4173e1 move some member variables input private
f72646a7 update version
4612528f format change
4b95e862 Merge branch 'shelleygoel-master'
4c262fa6 add functionality to get num of cores using x2APIC ID
bc70e7e1 recover Xbyak::CastTo
d09a230f unlink Label when LabelManager is destroyed
973e8597 update version
afdb9fe9 Xbyak::CastTo is removed
b011aca4 add RegRip +/- int
acae93cd increase max temp regs for StackFrame
ea4e3562 util::StackFrame uses push/pop instead of mov
42462ef9 use evex encoding for vpslld/vpslldq/vpsraw/...(reg, mem, imm);
da9117a9 update version of readme.md
d35f4fb7 fix the encoding of vinsertps for disp8N
1de435ed bf uses Label class
613922bd add Label L() for convenience
43e15583 fix typo
93579ee6 add protect-re.cpp
60004b5c fix url of protect-re.cpp
348b2709 fix typo of doc
f34f6ed5 update manual
232110be update test
82b78bf0 add setProtectMode
dd8b290f put warning message if pageSize != 4096
64775ca2 a little refactoring
7c3e7b85 fix wrong VSIB encoding with idx >= 16

git-subtree-dir: externals/xbyak
git-subtree-split: 4a6fac8ade404f667b94170f713367fe7da2a852
This commit is contained in:
MerryMage 2020-04-22 20:59:14 +01:00
parent dbb1f8cf37
commit 080b4b3aff
17 changed files with 994 additions and 489 deletions

View file

@ -37,6 +37,7 @@
T_B64 = 1 << 27, // m64bcst
T_M_K = 1 << 28, // mem{k}
T_VSIB = 1 << 29,
T_MEM_EVEX = 1 << 30, // use evex if mem
T_XXX
};
@ -161,5 +162,9 @@ std::string type2String(int type)
if (!str.empty()) str += " | ";
str += "T_VSIB";
}
if (type & T_MEM_EVEX) {
if (!str.empty()) str += " | ";
str += "T_MEM_EVEX";
}
return str;
}

View file

@ -76,7 +76,7 @@ void putX_X_XM(bool omitOnly)
{ 0xC2, "cmpss", T_0F | T_F3, true, true, 2 },
{ 0x5A, "cvtsd2ss", T_0F | T_F2 | T_EVEX | T_EW1 | T_N8 | T_ER_X, false, true, 2 },
{ 0x5A, "cvtss2sd", T_0F | T_F3 | T_EVEX | T_EW0 | T_N4 | T_SAE_X, false, true, 2 },
{ 0x21, "insertps", T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0, true, true, 2 },
{ 0x21, "insertps", T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, true, true, 2 },
{ 0x63, "packsswb", T_0F | T_66 | T_YMM | T_EVEX, false, true, 2 },
{ 0x6B, "packssdw", T_0F | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32, false, true, 2 },
{ 0x67, "packuswb", T_0F | T_66 | T_YMM | T_EVEX, false, true, 2 },
@ -1491,16 +1491,16 @@ void put()
int idx;
int type;
} tbl[] = {
{ "pslldq", 0x73, 7, T_0F | T_66 | T_YMM | T_EVEX },
{ "psrldq", 0x73, 3, T_0F | T_66 | T_YMM | T_EVEX },
{ "psllw", 0x71, 6, T_0F | T_66 | T_YMM | T_EVEX },
{ "pslld", 0x72, 6, T_0F | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32 },
{ "psllq", 0x73, 6, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 },
{ "psraw", 0x71, 4, T_0F | T_66 | T_YMM | T_EVEX },
{ "psrad", 0x72, 4, T_0F | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32 },
{ "psrlw", 0x71, 2, T_0F | T_66 | T_YMM | T_EVEX },
{ "psrld", 0x72, 2, T_0F | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32 },
{ "psrlq", 0x73, 2, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 },
{ "pslldq", 0x73, 7, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX },
{ "psrldq", 0x73, 3, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX },
{ "psllw", 0x71, 6, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX },
{ "pslld", 0x72, 6, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW0 | T_B32 },
{ "psllq", 0x73, 6, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW1 | T_B64 },
{ "psraw", 0x71, 4, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX },
{ "psrad", 0x72, 4, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW0 | T_B32 },
{ "psrlw", 0x71, 2, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX },
{ "psrld", 0x72, 2, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW0 | T_B32 },
{ "psrlq", 0x73, 2, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW1 | T_B64 },
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];

405
readme.md
View file

@ -1,107 +1,121 @@
Xbyak 5.67 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
=============
# Xbyak 5.77 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
Abstract
-------------
## Abstract
This is a header file which enables dynamically to assemble x86(IA32), x64(AMD64, x86-64) mnemonic.
Feature
-------------
header file only
you can use Xbyak's functions at once if xbyak.h is included.
## Feature
* header file only
* Intel/MASM like syntax
* fully support AVX-512
### Supported Instructions Sets
**Note**: Xbyak uses and(), or(), xor(), not() functions, so `-fno-operator-names` option is necessary for gcc/clang.
MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(*partial*)/AVX/AVX2/FMA/VEX-encoded GPR/AVX-512
Or define `XBYAK_NO_OP_NAMES` before including `xbyak.h` and use and_(), or_(), xor_(), not_() instead of them.
and_(), or_(), xor_(), not_() are always available.
`XBYAK_NO_OP_NAMES` will be defined in the feature version.
### Supported OS
* Windows Xp, Vista, Windows 7(32bit, 64bit)
* Windows Xp, Vista, Windows 7, Windows 10(32bit, 64bit)
* Linux(32bit, 64bit)
* Intel Mac OSX
* Intel macOS
### Supported Compilers
* Visual Studio C++ VC2012 or later
* gcc 4.7 or later
* clang 3.3
* cygwin gcc 4.5.3
* icc 7.2
Almost C++03 or later compilers for x86/x64 such as Visual Studio, g++, clang++, Intel C++ compiler and g++ on mingw/cygwin.
>Note: Xbyak uses and(), or(), xor(), not() functions, so "-fno-operator-names" option is required on gcc.
Or define XBYAK_NO_OP_NAMES and use and_(), or_(), xor_(), not_() instead of them.
and_(), or_(), xor_(), not_() are always available.
## Install
Install
-------------
The following files are necessary. Please add the path to your compile directories.
The following files are necessary. Please add the path to your compile directory.
* xbyak.h
* xbyak_mnemonic.h
* xbyak_util.h
Linux:
```
make install
```
make install
These files are copied into `/usr/local/include/xbyak`.
These files are copied into /usr/local/include/xbyak
## How to use it
New Feature
-------------
Inherit `Xbyak::CodeGenerator` class and make the class method.
```
#define XBYAK_NO_OP_NAMES
#include <xbyak/xbyak.h>
Add support for AVX-512 instruction set.
struct Code : Xbyak::CodeGenerator {
Code(int x)
{
mov(eax, x);
ret();
}
};
```
Make an instance of the class and get the function
pointer by calling `getCode()` and call it.
```
Code c(5);
int (*f)() = c.getCode<int (*)()>();
printf("ret=%d\n", f()); // ret = 5
```
Syntax
-------------
## Syntax
Similar to MASM/NASM syntax with parentheses.
Make Xbyak::CodeGenerator and make the class method and get the function
pointer by calling cgetCode() and casting the return value.
```
NASM Xbyak
mov eax, ebx --> mov(eax, ebx);
inc ecx inc(ecx);
ret --> ret();
```
NASM Xbyak
mov eax, ebx --> mov(eax, ebx);
inc ecx inc(ecx);
ret --> ret();
## Addressing
Use `qword`, `dword`, `word` and `byte` if it is necessary to specify the size of memory,
otherwise use `ptr`.
### Addressing
(ptr|dword|word|byte) [base + index * (1|2|4|8) + displacement]
```
(ptr|qword|dword|word|byte) [base + index * (1|2|4|8) + displacement]
[rip + 32bit disp] ; x64 only
NASM Xbyak
mov eax, [ebx+ecx] --> mov (eax, ptr[ebx+ecx]);
test byte [esp], 4 --> test (byte [esp], 4);
How to use Selector(Segment Register)
>Note: Segment class is not derived from Operand.
NASM Xbyak
mov eax, [ebx+ecx] --> mov(eax, ptr [ebx+ecx]);
mov al, [ebx+ecx] --> mov(al, ptr [ebx + ecx]);
test byte [esp], 4 --> test(byte [esp], 4);
inc qword [rax] --> inc(qword [rax]);
```
mov eax, [fs:eax] --> putSeg(fs); mov(eax, ptr [eax]);
**Note**: `qword`, ... are member variables, then don't use `dword` as unsigned int type.
### How to use Selector (Segment Register)
```
mov eax, [fs:eax] --> putSeg(fs);
mov(eax, ptr [eax]);
mov ax, cs --> mov(ax, cs);
```
**Note**: Segment class is not derived from `Operand`.
>you can use ptr for almost memory access unless you specify the size of memory.
## AVX
>dword, word and byte are member variables, then don't use dword as unsigned int, for example.
### AVX
vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3
vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory
vgatherdpd(xmm1, ptr [ebp+123+xmm2*4], xmm3);
*Remark*
The omitted destination syntax as the following ss disabled.
```
vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3
vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3
vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory
vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3);
```
define `XBYAK_ENABLE_OMITTED_OPERAND` if you use it for backward compatibility.
**Note**:
If `XBYAK_ENABLE_OMITTED_OPERAND` is defined, then you can use two operand version for backward compatibility.
But the newer version will not support it.
```
vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3
```
### AVX-512
## AVX-512
```
vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30);
@ -130,97 +144,122 @@ vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5)
vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit
```
Remark
* k1, ..., k7 are new opmask registers.
### Remark
* `k1`, ..., `k7` are opmask registers.
* use `| T_z`, `| T_sae`, `| T_rn_sae`, `| T_rd_sae`, `| T_ru_sae`, `| T_rz_sae` instead of `,{z}`, `,{sae}`, `,{rn-sae}`, `,{rd-sae}`, `,{ru-sae}`, `,{rz-sae}` respectively.
* `k4 | k3` is different from `k3 | k4`.
* use `ptr_b` for broadcast `{1toX}`. X is automatically determined.
* specify xword/yword/zword(_b) for m128/m256/m512 if necessary.
* specify `xword`/`yword`/`zword(_b)` for m128/m256/m512 if necessary.
### Label
## Label
Two kinds of Label are supported. (String literal and Label class).
L("L1");
jmp ("L1");
### String literal
```
L("L1");
jmp("L1");
jmp ("L2");
jmp("L2");
...
a few mnemonics(8-bit displacement jmp)
a few mnemonics (8-bit displacement jmp)
...
L("L2");
L("L2");
jmp ("L3", T_NEAR);
jmp("L3", T_NEAR);
...
a lot of mnemonics(32-bit displacement jmp)
a lot of mnemonics (32-bit displacement jmp)
...
L("L3");
L("L3");
```
>Call hasUndefinedLabel() to verify your code has no undefined label.
> you can use a label for immediate value of mov like as mov (eax, "L2");
* Call `hasUndefinedLabel()` to verify your code has no undefined label.
* you can use a label for immediate value of mov like as `mov(eax, "L2")`.
#### 1. support @@, @f, @b like MASM
### Support `@@`, `@f`, `@b` like MASM
L("@@"); // <A>
```
L("@@"); // <A>
jmp("@b"); // jmp to <A>
jmp("@f"); // jmp to <B>
L("@@"); // <B>
L("@@"); // <B>
jmp("@b"); // jmp to <B>
mov(eax, "@b");
jmp(eax); // jmp to <B>
```
#### 2. localization of label by calling inLocalLabel(), outLocallabel().
### Local label
labels begining of period between inLocalLabel() and outLocalLabel()
are dealed with local label.
inLocalLabel() and outLocalLabel() can be nested.
Label symbols beginning with a period between `inLocalLabel()` and `outLocalLabel()`
are treated as a local label.
`inLocalLabel()` and `outLocalLabel()` can be nested.
void func1()
{
```
void func1()
{
inLocalLabel();
L(".lp"); // <A> ; local label
...
jmp(".lp"); // jmpt to <A>
L("aaa"); // global label
jmp(".lp"); // jmp to <A>
L("aaa"); // global label <C>
outLocalLabel();
}
void func2()
{
inLocalLabel();
L(".lp"); // <B> ; local label
func1();
jmp(".lp"); // jmp to <B>
inLocalLabel();
}
jmp("aaa"); // jmp to <C>
}
```
### Label class
L() and jxx() functions support a new Label class.
`L()` and `jxx()` support Label class.
Label label1, label2;
L(label1);
```
Xbyak::Label label1, label2;
L(label1);
...
jmp(label1);
...
jmp(label2);
...
L(label2);
L(label2);
```
Moreover, assignL(dstLabel, srcLabel) method binds dstLabel with srcLabel.
Use `putL` for jmp table
```
Label labelTbl, L0, L1, L2;
mov(rax, labelTbl);
// rdx is an index of jump table
jmp(ptr [rax + rdx * sizeof(void*)]);
L(labelTbl);
putL(L0);
putL(L1);
putL(L2);
L(L0);
....
L(L1);
....
```
Label label1, label2;
L(label1);
`assignL(dstLabel, srcLabel)` binds dstLabel with srcLabel.
```
Label label2;
Label label1 = L(); // make label1 ; same to Label label1; L(label1);
...
jmp(label2);
jmp(label2); // label2 is not determined here
...
assignL(label2, label1); // label2 <= label1
assignL(label2, label1); // label2 <- label1
```
The `jmp` in the above code jumps to label1 assigned by `assignL`.
The above jmp opecode jumps label1.
**Note**:
* srcLabel must be used in `L()`.
* dstLabel must not be used in `L()`.
* Restriction:
* srcLabel must be used in L().
* dstLabel must not be used in L().
Label::getAddress() returns the address specified by the label instance and 0 if not specified.
`Label::getAddress()` returns the address specified by the label instance and 0 if not specified.
```
// not AutoGrow mode
Label label;
@ -229,7 +268,7 @@ L(label);
assert(label.getAddress() == getCurr());
```
### Rip
### Rip ; relative addressing
```
Label label;
mov(eax, ptr [rip + label]); // eax = 4
@ -243,92 +282,127 @@ int x;
...
mov(eax, ptr[rip + &x]); // throw exception if the difference between &x and current position is larger than 2GiB
```
### Code size
The default max code size is 4096 bytes. Please set it in constructor of CodeGenerator() if you want to use large size.
class Quantize : public Xbyak::CodeGenerator {
public:
## Code size
The default max code size is 4096 bytes.
Specify the size in constructor of `CodeGenerator()` if necessary.
```
class Quantize : public Xbyak::CodeGenerator {
public:
Quantize()
: CodeGenerator(8192)
{
}
...
};
};
```
### use user allocated memory
## User allocated memory
You can make jit code on prepaired memory.
class Sample : public Xbyak::CodeGenerator {
public:
Sample(void *userPtr, size_t size)
: Xbyak::CodeGenerator(size, userPtr)
{
...
}
};
Call `setProtectModeRE` yourself to change memory mode if using the prepaired memory.
const size_t codeSize = 1024;
uint8 buf[codeSize + 16];
// get 16-byte aligned address
uint8 *p = Xbyak::CodeArray::getAlignedAddress(buf);
// append executable attribute to the memory
Xbyak::CodeArray::protect(p, codeSize, true);
// construct your jit code on the memory
Sample s(p, codeSize);
>See *sample/test0.cpp*
AutoGrow
-------------
Under `AutoGrow` mode, Xbyak extends memory automatically if necessary.
Call ready() before calling getCode() to calc address of jmp.
```
struct Code : Xbyak::CodeGenerator {
uint8_t alignas(4096) buf[8192]; // C++11 or later
struct Code : Xbyak::CodeGenerator {
Code() : Xbyak::CodeGenerator(sizeof(buf), buf)
{
mov(rax, 123);
ret();
}
};
int main()
{
Code c;
c.setProtectModeRE(); // set memory to Read/Exec
printf("%d\n", c.getCode<int(*)()>()());
}
```
**Note**: See [sample/test0.cpp](sample/test0.cpp).
### AutoGrow
The memory region for jit is automatically extended if necessary when `AutoGrow` is specified in a constructor of `CodeGenerator`.
Call `ready()` or `readyRE()` before calling `getCode()` to fix jump address.
```
struct Code : Xbyak::CodeGenerator {
Code()
: Xbyak::CodeGenerator(<default memory size>, Xbyak::AutoGrow)
{
...
}
};
Code c;
c.ready(); // Don't forget to call this function
};
Code c;
// generate code for jit
c.ready(); // mode = Read/Write/Exec
```
>Don't use the address returned by getCurr() before calling ready().
>It may be invalid address.
>RESTRICTION : rip addressing is not supported in AutoGrow
Macro
-------------
**Note**:
* Don't use the address returned by `getCurr()` before calling `ready()` because it may be invalid address.
### Read/Exec mode
Xbyak set Read/Write/Exec mode to memory to run jit code.
If you want to use Read/Exec mode for security, then specify `DontSetProtectRWE` for `CodeGenerator` and
call `setProtectModeRE()` after generating jit code.
```
struct Code : Xbyak::CodeGenerator {
Code()
: Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE)
{
mov(eax, 123);
ret();
}
};
Code c;
c.setProtectModeRE();
...
```
Call `readyRE()` instead of `ready()` when using `AutoGrow` mode.
See [protect-re.cpp](sample/protect-re.cpp).
## Macro
* **XBYAK32** is defined on 32bit.
* **XBYAK64** is defined on 64bit.
* **XBYAK64_WIN** is defined on 64bit Windows(VC)
* **XBYAK64_GCC** is defined on 64bit gcc, cygwin
* define **XBYAK_NO_OP_NAMES** on gcc without `-fno-operator-names`
* define **XBYAK_ENABLE_OMITTED_OPERAND** if you use omitted destination such as `vaddps(xmm2, xmm3);`(duplicated in the future)
* define **XBYAK_ENABLE_OMITTED_OPERAND** if you use omitted destination such as `vaddps(xmm2, xmm3);`(deprecated in the future)
* define **XBYAK_UNDEF_JNL** if Bessel function jnl is defined as macro
Sample
-------------
## Sample
* test0.cpp ; tiny sample of Xbyak(x86, x64)
* quantize.cpp ; JIT optimized quantization by fast division(x86 only)
* calc.cpp ; assemble and estimate a given polynomial(x86, x64)
* bf.cpp ; JIT brainfuck(x86, x64)
* [test0.cpp](sample/test0.cpp) ; tiny sample (x86, x64)
* [quantize.cpp](sample/quantize.cpp) ; JIT optimized quantization by fast division (x86 only)
* [calc.cpp](sample/calc.cpp) ; assemble and estimate a given polynomial (x86, x64)
* [bf.cpp](sample/bf.cpp) ; JIT brainfuck (x86, x64)
License
-------------
## License
modified new BSD License
http://opensource.org/licenses/BSD-3-Clause
History
-------------
## History
* 2019/Mar/06 ver 5.77 fix number of cores that share LLC cache by densamoilov
* 2019/Jan/17 ver 5.76 add Cpu::getNumCores() by shelleygoel
* 2018/Oct/31 ver 5.751 recover Xbyak::CastTo for compatibility
* 2018/Oct/29 ver 5.75 unlink LabelManager from Label when msg is destroyed
* 2018/Oct/21 ver 5.74 support RegRip +/- int. Xbyak::CastTo is removed
* 2018/Oct/15 util::AddressFrame uses push/pop instead of mov
* 2018/Sep/19 ver 5.73 fix evex encoding of vpslld, vpslldq, vpsllw, etc for (reg, mem, imm8)
* 2018/Sep/19 ver 5.72 fix the encoding of vinsertps for disp8N(Thanks to petercaday)
* 2018/Sep/04 ver 5.71 L() returns a new label instance
* 2018/Aug/27 ver 5.70 support setProtectMode() and DontUseProtect for read/exec setting
* 2018/Aug/24 ver 5.68 fix wrong VSIB encoding with vector index >= 16(thanks to petercaday)
* 2018/Aug/14 ver 5.67 remove mutable in Address ; fix setCacheHierarchy for cloud vm
* 2018/Jul/26 ver 5.661 support mingw64
* 2018/Jul/24 ver 5.66 add CodeArray::PROTECT_RE to mode of protect()
@ -392,8 +466,7 @@ History
* 2013/Jul/30 ver 4.20 [break backward compatibility] split Reg32e class into RegExp(base+index*scale+disp) and Reg32e(means Reg32 or Reg64)
* 2013/Jul/04 ver 4.10 [break backward compatibility] change the type of Xbyak::Error from enum to a class
* 2013/Jun/21 ver 4.02 add putL(LABEL) function to put the address of the label
* 2013/Jun/21 ver 4.01 vpsllw, vpslld, vpsllq, vpsraw, vpsrad, vpsrlw, vpsrld, vpsrlq support (ymm, ymm, xmm).
support vpbroadcastb, vpbroadcastw, vpbroadcastd, vpbroadcastq(thanks to Gabest).
* 2013/Jun/21 ver 4.01 vpsllw, vpslld, vpsllq, vpsraw, vpsrad, vpsrlw, vpsrld, vpsrlq support (ymm, ymm, xmm). support vpbroadcastb, vpbroadcastw, vpbroadcastd, vpbroadcastq(thanks to Gabest).
* 2013/May/30 ver 4.00 support AVX2, VEX-encoded GPR-instructions
* 2013/Mar/27 ver 3.80 support mov(reg, "label");
* 2013/Mar/13 ver 3.76 add cqo(), jcxz(), jecxz(), jrcxz()
@ -453,8 +526,6 @@ History
* 2007/Jan/21 fix the bug to create address like [disp] select smaller representation for mov (eax|ax|al, [disp])
* 2007/Jan/4 first version
Author
-------------
## Author
MITSUNARI Shigeo(herumi@nifty.com)

View file

@ -1,5 +1,5 @@
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.67
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.77
-----------------------------------------------------------------------------
◎概要
@ -245,8 +245,8 @@ void func2()
更にラベルの割り当てを行うassignL(dstLabel, srcLabel)という命令も追加されました。
Label label1, label2;
L(label1);
Label label2;
Label label1 = L(); // Label label1; L(label1);と同じ意味
...
jmp(label2);
...
@ -309,6 +309,41 @@ bool CodeArray::protect(const void *addr, size_t size, bool canExec);
*/
uint8 *CodeArray::getAlignedAddress(uint8 *addr, size_t alignedSize = ALIGN_SIZE);
・read/execモード
デフォルトのCodeGeneratorはコンストラクト時にJIT用の領域をread/write/execモードに設定して利用します。
コード生成時はread/writeでコード実行時にはread/execにしたい場合、次のようにしてください。
struct Code : Xbyak::CodeGenerator {
Code()
: Xbyak::CodeGenerator(4096, Xbyak::DontUseProtect) // JIT領域をread/writeのままコード生成
{
mov(eax, 123);
ret();
}
};
Code c;
c.setProtectModeRE(); // read/execモードに変更
// JIT領域を実行
AutoGrowの場合はreadyの代わりにreadyRE()を読んでください。
struct Code : Xbyak::CodeGenerator {
Code()
: Xbyak::CodeGenerator(4096, Xbyak::AutoGrow) // JIT領域をread/writeのままコード生成
{
mov(eax, 123);
ret();
}
};
Code c;
c.readyRE(); // read/exeモードに変更
// JIT領域を実行
setProtectModeRW()を呼ぶと領域が元のread/execモードに戻ります。
その他詳細は各種サンプルを参照してください。
-----------------------------------------------------------------------------
◎マクロ
@ -338,6 +373,17 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
-----------------------------------------------------------------------------
◎履歴
2019/03/06 ver 5.77 LLCキャッシュを共有数CPU数の修整(by densamoilov)
2019/01/17 ver 5.76 Cpu::getNumCores()追加(by shelleygoel)
2018/10/31 ver 5.751 互換性のためにXbyak::CastToの復元
2018/10/29 ver 5.75 LabelManagerのデストラクタでLabelから参照を切り離す
2018/10/21 ver 5.74 RegRip +/intの形をサポート Xbyak::CastToを削除
2018/10/15 util::StackFrameでmovの代わりにpush/popを使う
2018/09/19 ver 5.73 vpslld, vpslldq, vpsllwなどの(reg, mem, imm8)に対するevexエンコーディング修整
2018/09/19 ver 5.72 fix the encoding of vinsertps for disp8N(Thanks to petercaday)
2018/08/27 ver 5.71 新しいlabelインスタンスを返すL()を追加
2018/08/27 ver 5.70 read/exec設定のためのsetProtectMode()とDontUseProtectの追加
2018/08/24 ver 5.68 indexが16以上のVSIBエンコーディングのバグ修正(thanks to petercaday)
2018/08/14 ver 5.67 Addressクラス内のmutableを削除 ; fix setCacheHierarchy for cloud vm
2018/07/26 ver 5.661 mingw64対応
2018/07/24 ver 5.66 protect()のmodeにCodeArray::PROTECT_REを追加

View file

@ -10,12 +10,6 @@
#endif
class Brainfuck : public Xbyak::CodeGenerator {
private:
enum Direction { B, F };
std::string toStr(int labelNo, Direction dir)
{
return Xbyak::Label::toStr(labelNo) + (dir == B ? 'B' : 'F');
}
public:
int getContinuousChar(std::istream& is, char c)
{
@ -67,8 +61,7 @@ public:
mov(pGetchar, rsi); // getchar
mov(stack, rdx); // stack
#endif
int labelNo = 0;
std::stack<int> keepLabelNo;
std::stack<Label> labelF, labelB;
char c;
while (is >> c) {
switch (c) {
@ -116,17 +109,22 @@ public:
mov(cur, eax);
break;
case '[':
L(toStr(labelNo, B));
{
Label B = L();
labelB.push(B);
mov(eax, cur);
test(eax, eax);
jz(toStr(labelNo, F), T_NEAR);
keepLabelNo.push(labelNo++);
Label F;
jz(F, T_NEAR);
labelF.push(F);
}
break;
case ']':
{
int no = keepLabelNo.top(); keepLabelNo.pop();
jmp(toStr(no, B));
L(toStr(no, F));
Label B = labelB.top(); labelB.pop();
jmp(B);
Label F = labelF.top(); labelF.pop();
L(F);
}
break;
default:
@ -200,7 +198,7 @@ int main(int argc, char *argv[])
Brainfuck bf(ifs);
if (mode == 0) {
static int stack[128 * 1024];
bf.getCode<void (*)(void*, void*, int *)>()(Xbyak::CastTo<void*>(putchar), Xbyak::CastTo<void*>(getchar), stack);
bf.getCode<void (*)(const void*, const void*, int *)>()(reinterpret_cast<const void*>(putchar), reinterpret_cast<const void*>(getchar), stack);
} else {
dump(bf.getCode(), bf.getSize());
}

70
sample/protect-re.cpp Normal file
View file

@ -0,0 +1,70 @@
#define XBYAK_NO_OP_NAMES
#include <xbyak/xbyak.h>
struct Code1 : Xbyak::CodeGenerator {
Code1()
: Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE)
{
mov(eax, 123);
ret();
}
void update()
{
db(0);
}
};
void test1(bool updateCode)
{
Code1 c;
c.setProtectModeRE();
if (updateCode) c.update(); // segmentation fault
int (*f)() = c.getCode<int (*)()>();
printf("f=%d\n", f());
c.setProtectModeRW();
c.update();
puts("ok");
}
struct Code2 : Xbyak::CodeGenerator {
Code2()
: Xbyak::CodeGenerator(4096, Xbyak::AutoGrow)
{
mov(eax, 123);
ret();
}
void update()
{
db(0);
}
};
void test2(bool updateCode)
{
Code2 c;
c.readyRE();
if (updateCode) c.update(); // segmentation fault
int (*f)() = c.getCode<int (*)()>();
printf("f=%d\n", f());
c.setProtectModeRW();
c.update();
puts("ok");
}
int main(int argc, char *argv[])
{
if (argc < 2) {
fprintf(stderr, "%s <testNum> [update]\n", argv[0]);
return 0;
}
bool update = argc == 3;
int n = atoi(argv[1]);
printf("n=%d update=%d\n", n, update);
switch (n) {
case 1: test1(update); break;
case 2: test2(update); break;
default: fprintf(stderr, "no test %d\n", n); break;
}
}

View file

@ -32,7 +32,7 @@ struct Code : Xbyak::CodeGenerator {
inline int add(int a, int b)
{
return Xbyak::CastTo<int (*)(int,int)>(buf)(a, b);
return reinterpret_cast<int (*)(int, int)>(buf)(a, b);
}
int main()

View file

@ -77,7 +77,7 @@ public:
#ifdef XBYAK_VARIADIC_TEMPLATE
call(atoi);
#else
call(Xbyak::CastTo<void*>(atoi));
call(reinterpret_cast<const void*>(atoi));
#endif
add(esp, 4);
#endif
@ -96,7 +96,7 @@ public:
mov(rax, (size_t)atoi);
jmp(rax);
#else
jmp(Xbyak::CastTo<void*>(atoi));
jmp(reinterpret_cast<const void*>(atoi));
#endif
}
int (*get() const)(const char *) { return getCode<int (*)(const char *)>(); }
@ -171,8 +171,9 @@ int main()
return 1;
}
int (*func)(int) = s.getCode<int (*)(int)>();
if (Xbyak::CastTo<uint8*>(func) != p) {
fprintf(stderr, "internal error %p %p\n", p, Xbyak::CastTo<uint8*>(func));
const uint8 *funcp = reinterpret_cast<const uint8*>(func);
if (funcp != p) {
fprintf(stderr, "internal error %p %p\n", p, funcp);
return 1;
}
printf("0 + ... + %d = %d\n", 100, func(100));

View file

@ -104,9 +104,12 @@ void putCPUinfo()
Core i7-3930K 6 2D
*/
cpu.putFamily();
if (!cpu.has(Cpu::tINTEL)) return;
for (unsigned int i = 0; i < cpu.getDataCacheLevels(); i++) {
printf("cache level=%u data cache size=%u cores sharing data cache=%u\n", i, cpu.getDataCacheSize(i), cpu.getCoresSharingDataCache(i));
}
printf("SmtLevel =%u\n", cpu.getNumCores(Xbyak::util::SmtLevel));
printf("CoreLevel=%u\n", cpu.getNumCores(Xbyak::util::CoreLevel));
}
int main()

View file

@ -204,7 +204,7 @@ public:
push(reg[r]);
push('A' + r);
push((int)str);
call(Xbyak::CastTo<void*>(printf));
call(reinterpret_cast<const void*>(printf));
add(esp, 4 * 4);
pop(ecx);
pop(edx);

View file

@ -889,6 +889,34 @@ CYBOZU_TEST_AUTO(testNewLabel)
}
}
CYBOZU_TEST_AUTO(returnLabel)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
xor_(eax, eax);
Label L1 = L();
test(eax, eax);
Label exit;
jnz(exit);
inc(eax); // 1
Label L2;
call(L2);
jmp(L1);
L(L2);
inc(eax); // 2
ret();
L(exit);
inc(eax); // 3
ret();
}
};
Code code;
int (*f)() = code.getCode<int (*)()>();
int r = f();
CYBOZU_TEST_EQUAL(r, 3);
}
CYBOZU_TEST_AUTO(testAssign)
{
struct Code : Xbyak::CodeGenerator {
@ -987,6 +1015,52 @@ struct GetAddressCode1 : Xbyak::CodeGenerator {
}
};
struct CodeLabelTable : Xbyak::CodeGenerator {
enum { ret0 = 3 };
enum { ret1 = 5 };
enum { ret2 = 8 };
CodeLabelTable()
{
using namespace Xbyak;
#ifdef XBYAK64_WIN
const Reg64& p0 = rcx;
const Reg64& a = rax;
#elif defined (XBYAK64_GCC)
const Reg64& p0 = rdi;
const Reg64& a = rax;
#else
const Reg32& p0 = edx;
const Reg32& a = eax;
mov(edx, ptr [esp + 4]);
#endif
Label labelTbl, L0, L1, L2;
mov(a, labelTbl);
jmp(ptr [a + p0 * sizeof(void*)]);
L(labelTbl);
putL(L0);
putL(L1);
putL(L2);
L(L0);
mov(a, ret0);
ret();
L(L1);
mov(a, ret1);
ret();
L(L2);
mov(a, ret2);
ret();
}
};
CYBOZU_TEST_AUTO(LabelTable)
{
CodeLabelTable c;
int (*f)(int) = c.getCode<int (*)(int)>();
CYBOZU_TEST_EQUAL(f(0), c.ret0);
CYBOZU_TEST_EQUAL(f(1), c.ret1);
CYBOZU_TEST_EQUAL(f(2), c.ret2);
}
CYBOZU_TEST_AUTO(getAddress1)
{
GetAddressCode1 c;
@ -1143,11 +1217,56 @@ CYBOZU_TEST_AUTO(rip_addr_with_fixed_buf)
ret();
}
} code;
Xbyak::CodeArray::protect(p, 4096, Xbyak::CodeArray::PROTECT_RE);
code.setProtectModeRE();
code.getCode<void (*)()>()();
CYBOZU_TEST_EQUAL(*x0, 123);
CYBOZU_TEST_EQUAL(*x1, 456);
CYBOZU_TEST_EQUAL(buf[8], 99);
Xbyak::CodeArray::protect(p, 4096, Xbyak::CodeArray::PROTECT_RW);
code.setProtectModeRW();
}
#endif
struct ReleaseTestCode : Xbyak::CodeGenerator {
ReleaseTestCode(Label& L1, Label& L2, Label& L3)
{
L(L1);
jmp(L1);
L(L2);
jmp(L3); // not assigned
}
};
/*
code must unlink label if code is destroyed
*/
CYBOZU_TEST_AUTO(release_label_after_code)
{
puts("---");
{
Label L1, L2, L3, L4, L5;
{
ReleaseTestCode code(L1, L2, L3);
CYBOZU_TEST_ASSERT(L1.getId() > 0);
CYBOZU_TEST_ASSERT(L1.getAddress() != 0);
CYBOZU_TEST_ASSERT(L2.getId() > 0);
CYBOZU_TEST_ASSERT(L2.getAddress() != 0);
CYBOZU_TEST_ASSERT(L3.getId() > 0);
CYBOZU_TEST_ASSERT(L3.getAddress() == 0); // L3 is not assigned
code.assignL(L4, L1);
L5 = L1;
printf("id=%d %d %d %d %d\n", L1.getId(), L2.getId(), L3.getId(), L4.getId(), L5.getId());
}
puts("code is released");
CYBOZU_TEST_ASSERT(L1.getId() == 0);
CYBOZU_TEST_ASSERT(L1.getAddress() == 0);
CYBOZU_TEST_ASSERT(L2.getId() == 0);
CYBOZU_TEST_ASSERT(L2.getAddress() == 0);
// CYBOZU_TEST_ASSERT(L3.getId() == 0); // L3 is not assigned so not cleared
CYBOZU_TEST_ASSERT(L3.getAddress() == 0);
CYBOZU_TEST_ASSERT(L4.getId() == 0);
CYBOZU_TEST_ASSERT(L4.getAddress() == 0);
CYBOZU_TEST_ASSERT(L5.getId() == 0);
CYBOZU_TEST_ASSERT(L5.getAddress() == 0);
printf("id=%d %d %d %d %d\n", L1.getId(), L2.getId(), L3.getId(), L4.getId(), L5.getId());
}
}

View file

@ -73,7 +73,6 @@ const uint64 YMM_ER = 1ULL << 36;
const uint64 VM32Y_K = 1ULL << 37;
const uint64 IMM_2 = 1ULL << 38;
const uint64 IMM = IMM_1 | IMM_2;
const uint64 XMM = _XMM | _XMM2;
const uint64 YMM = _YMM | _YMM2;
const uint64 K = 1ULL << 43;
const uint64 _ZMM = 1ULL << 44;
@ -90,7 +89,10 @@ const uint64 ZMM_SAE = 1ULL << 48;
const uint64 ZMM_ER = 1ULL << 49;
#ifdef XBYAK64
const uint64 _XMM3 = 1ULL << 50;
#else
const uint64 _XMM3 = 0;
#endif
const uint64 XMM = _XMM | _XMM2 | _XMM3;
const uint64 XMM_SAE = 1ULL << 51;
#ifdef XBYAK64
const uint64 XMM_KZ = 1ULL << 52;
@ -352,7 +354,8 @@ class Test {
case VM32Y_K:
return isXbyak_ ? "ptr [64+ymm13*2+r13] | k6" : "[64+ymm13*2+r13]{k6}";
case VM32Z_K:
return isXbyak_ ? "ptr [64+zmm13*2+r13] | k6" : "[64+zmm13*2+r13]{k6}";
if (idx & 1) return isXbyak_ ? "ptr [64+zmm10*8+r9] | k6" : "[64+zmm10*8+r9]{k6}";
return isXbyak_ ? "ptr [64+zmm30*2+r13] | k6" : "[64+zmm30*2+r13]{k6}";
case VM32Z:
return isXbyak_ ? "ptr [64+zmm13*2+rcx]" : "[64+zmm13*2+rcx]";
case M_1to2: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to2}";
@ -607,7 +610,7 @@ public:
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
put(p->name, K, _XMM, _XMM | MEM, IMM8);
put(p->name, K, XMM, _XMM | MEM, IMM8);
if (!p->supportYMM) continue;
put(p->name, K, _YMM, _YMM | MEM, IMM8);
put(p->name, K, _ZMM, _ZMM | MEM, IMM8);
@ -626,10 +629,10 @@ public:
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i];
put(p->name, XMM | _XMM3, XMM_SAE | XMM | MEM);
put(p->name, XMM, XMM_SAE | XMM | MEM);
}
}
put("vcomiss", _XMM3, XMM | MEM);
put("vcomiss", XMM, _XMM3 | MEM);
put("vcomiss", XMM, XMM_SAE);
#endif
}
@ -673,10 +676,10 @@ public:
"vpbroadcastq",
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
put(tbl[i], XMM_KZ | ZMM_KZ, _XMM | _MEM);
put(tbl[i], XMM_KZ | ZMM_KZ, XMM | _MEM);
}
}
put("vbroadcasti32x2", XMM_KZ | YMM_KZ | ZMM_KZ, _XMM | _MEM);
put("vbroadcasti32x2", XMM_KZ | YMM_KZ | ZMM_KZ, XMM | _MEM);
put("vbroadcasti32x4", YMM_KZ | ZMM_KZ, _MEM);
put("vbroadcasti64x2", YMM_KZ | ZMM_KZ, _MEM);
put("vbroadcasti32x8", ZMM_KZ, _MEM);
@ -684,14 +687,14 @@ public:
}
void putMisc1()
{
put("vmaskmovps", XMM, XMM, MEM);
put("vmaskmovps", _XMM, _XMM, MEM);
put("vmaskmovps", YMM, YMM, MEM);
put("vmaskmovpd", YMM, YMM, MEM);
put("vmaskmovpd", XMM, XMM, MEM);
put("vmaskmovpd", _XMM, _XMM, MEM);
put("vmaskmovps", MEM, XMM, XMM);
put("vmaskmovpd", MEM, XMM, XMM);
put("vmaskmovps", MEM, _XMM, _XMM);
put("vmaskmovpd", MEM, _XMM, _XMM);
put("vbroadcastf128", YMM, MEM);
put("vbroadcasti128", YMM, MEM);
@ -710,8 +713,8 @@ public:
}
}
put("vinsertf128", YMM, YMM, XMM | MEM, IMM8);
put("vinserti128", YMM, YMM, XMM | MEM, IMM8);
put("vinsertf128", YMM, YMM, _XMM | _XMM2 | MEM, IMM8);
put("vinserti128", YMM, YMM, _XMM | _XMM2 | MEM, IMM8);
put("vperm2f128", YMM, YMM, YMM | MEM, IMM8);
put("vperm2i128", YMM, YMM, YMM | MEM, IMM8);
@ -721,9 +724,9 @@ public:
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const char *name = tbl[i];
put(name, XMM, XMM, MEM);
put(name, _XMM, _XMM, MEM);
put(name, YMM, YMM, MEM);
put(name, MEM, XMM, XMM);
put(name, MEM, _XMM, _XMM);
put(name, MEM, YMM, YMM);
}
}
@ -760,29 +763,29 @@ public:
put(name, MEM, ZMM);
put(name, ZMM, MEM);
#ifdef XBYAK64
put(name, MEM, _XMM3);
put(name, _XMM3, MEM);
put(name, MEM, XMM);
put(name, XMM, MEM);
#endif
}
}
void put_vmov()
{
#ifdef XBYAK64
put("vmovd", _XMM3, MEM|REG32);
put("vmovd", MEM|REG32, _XMM3);
put("vmovq", _XMM3, MEM|REG64|XMM);
put("vmovq", MEM|REG64|XMM, _XMM3);
put("vmovhlps", _XMM3, _XMM3, _XMM3);
put("vmovlhps", _XMM3, _XMM3, _XMM3);
put("vmovntdqa", _XMM3|_YMM3|ZMM, MEM);
put("vmovntdq", MEM, _XMM3 | _YMM3 | ZMM);
put("vmovntpd", MEM, _XMM3 | _YMM3 | ZMM);
put("vmovntps", MEM, _XMM3 | _YMM3 | ZMM);
put("vmovd", XMM, MEM|REG32);
put("vmovd", MEM|REG32, XMM);
put("vmovq", XMM, MEM|REG64|XMM);
put("vmovq", MEM|REG64|XMM, XMM);
put("vmovhlps", XMM, _XMM3, _XMM3);
put("vmovlhps", XMM, _XMM3, _XMM3);
put("vmovntdqa", XMM|_YMM3|ZMM, MEM);
put("vmovntdq", MEM, XMM | _YMM3 | ZMM);
put("vmovntpd", MEM, XMM | _YMM3 | ZMM);
put("vmovntps", MEM, XMM | _YMM3 | ZMM);
put("vmovsd", XMM_KZ, _XMM3, _XMM3);
put("vmovsd", XMM_KZ, XMM, _XMM3);
put("vmovsd", XMM_KZ, MEM);
put("vmovsd", MEM_K, XMM);
put("vmovss", XMM_KZ, _XMM3, _XMM3);
put("vmovss", XMM_KZ, XMM, _XMM3);
put("vmovss", XMM_KZ, MEM);
put("vmovss", MEM_K, XMM);
@ -797,7 +800,7 @@ public:
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const char *name = tbl[i];
put(name, XMM_KZ, _XMM, _XMM | MEM, IMM);
put(name, XMM_KZ, XMM, _XMM | MEM, IMM);
put(name, _YMM3, _YMM3, _YMM3 | _MEM, IMM);
put(name, _ZMM, _ZMM, _ZMM | _MEM, IMM);
}
@ -810,7 +813,7 @@ public:
"vmovlps",
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
put(tbl[i], _XMM3, _XMM3, MEM);
put(tbl[i], XMM, _XMM3, MEM);
put(tbl[i], MEM, _XMM3);
}
}
@ -836,11 +839,11 @@ public:
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
put(p.name, _XMM|XMM_KZ, _XMM|MEM);
put(p.name, XMM|XMM_KZ, _XMM|MEM);
put(p.name, _YMM|YMM_KZ, _YMM|MEM);
put(p.name, _ZMM|ZMM_KZ, _ZMM|MEM);
if (!p.M_X) continue;
put(p.name, MEM|MEM_K, _XMM);
put(p.name, MEM|MEM_K, XMM);
put(p.name, MEM|MEM_K, _YMM);
put(p.name, MEM|MEM_K, _ZMM);
}
@ -857,7 +860,7 @@ public:
put("vpabsd", ZMM_KZ, M_1to16 | _MEM);
put("vpabsq", ZMM_KZ, M_1to8 | _MEM);
put("vbroadcastf32x2", YMM_KZ | ZMM_KZ, _XMM | _MEM);
put("vbroadcastf32x2", YMM_KZ | ZMM_KZ, XMM | _MEM);
put("vbroadcastf32x4", YMM_KZ | ZMM_KZ, _MEM);
put("vbroadcastf64x2", YMM_KZ | ZMM_KZ, _MEM);
@ -879,7 +882,7 @@ public:
};
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i];
put(p.name, XMM_KZ, _XMM, _XMM|p.mem);
put(p.name, XMM_KZ, XMM, _XMM|p.mem);
}
}
void put512_X3()
@ -891,54 +894,54 @@ public:
uint64_t x2;
uint64_t xm;
} tbl[] = {
{ "vpacksswb", XMM_KZ, _XMM, _XMM | _MEM },
{ "vpacksswb", XMM_KZ, XMM, _XMM | _MEM },
{ "vpacksswb", YMM_KZ, _YMM, _YMM | _MEM },
{ "vpacksswb", ZMM_KZ, _ZMM, _ZMM | _MEM },
{ "vpackssdw", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
{ "vpackssdw", XMM_KZ, XMM, _XMM | M_1to4 | _MEM },
{ "vpackssdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM },
{ "vpackssdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
{ "vpackusdw", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
{ "vpackusdw", XMM_KZ, XMM, _XMM | M_1to4 | _MEM },
{ "vpackusdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM },
{ "vpackusdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
{ "vpackuswb", XMM_KZ, _XMM, _XMM | _MEM },
{ "vpackuswb", XMM_KZ, XMM, _XMM | _MEM },
{ "vpackuswb", YMM_KZ, _YMM, _YMM | _MEM },
{ "vpackuswb", ZMM_KZ, _ZMM, _ZMM | _MEM },
{ "vpaddb", XMM_KZ, _XMM, _XMM | _MEM },
{ "vpaddb", XMM_KZ, XMM, _XMM | _MEM },
{ "vpaddw", XMM_KZ, _XMM, _XMM | _MEM },
{ "vpaddd", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
{ "vpaddq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
{ "vpaddsb", XMM_KZ, _XMM, _XMM | _MEM },
{ "vpaddsb", XMM_KZ, XMM, _XMM | _MEM },
{ "vpaddsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
{ "vpaddsw", XMM_KZ, _XMM, _XMM | _MEM },
{ "vpaddsw", XMM_KZ, XMM, _XMM | _MEM },
{ "vpaddsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
{ "vpaddusb", XMM_KZ, _XMM, _XMM | MEM },
{ "vpaddusb", XMM_KZ, XMM, _XMM | MEM },
{ "vpaddusb", ZMM_KZ, _ZMM, _ZMM | MEM },
{ "vpaddusw", XMM_KZ, _XMM, _XMM | MEM },
{ "vpaddusw", XMM_KZ, XMM, _XMM | MEM },
{ "vpaddusw", ZMM_KZ, _ZMM, _ZMM | MEM },
{ "vpsubb", XMM_KZ, _XMM, _XMM | _MEM },
{ "vpsubw", XMM_KZ, _XMM, _XMM | _MEM },
{ "vpsubd", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
{ "vpsubb", XMM_KZ, XMM, _XMM | _MEM },
{ "vpsubw", XMM_KZ, XMM, _XMM | _MEM },
{ "vpsubd", XMM_KZ, XMM, _XMM | M_1to4 | _MEM },
{ "vpsubq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
{ "vpsubsb", XMM_KZ, _XMM, _XMM | _MEM },
{ "vpsubsb", XMM_KZ, XMM, _XMM | _MEM },
{ "vpsubsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
{ "vpsubsw", XMM_KZ, _XMM, _XMM | _MEM },
{ "vpsubsw", XMM_KZ, XMM, _XMM | _MEM },
{ "vpsubsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
{ "vpsubusb", XMM_KZ, _XMM, _XMM | MEM },
{ "vpsubusb", XMM_KZ, XMM, _XMM | MEM },
{ "vpsubusb", ZMM_KZ, _ZMM, _ZMM | MEM },
{ "vpsubusw", XMM_KZ, _XMM, _XMM | MEM },
{ "vpsubusw", XMM_KZ, XMM, _XMM | MEM },
{ "vpsubusw", ZMM_KZ, _ZMM, _ZMM | MEM },
{ "vpandd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
@ -983,137 +986,137 @@ public:
{ "vpminud", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
{ "vpminuq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },
{ "vpslldq", _XMM3, _XMM3 | _MEM, IMM8 },
{ "vpslldq", XMM, _XMM3 | _MEM, IMM8 },
{ "vpslldq", _YMM3, _YMM3 | _MEM, IMM8 },
{ "vpslldq", _ZMM, _ZMM | _MEM, IMM8 },
{ "vpsrldq", _XMM3, _XMM3 | _MEM, IMM8 },
{ "vpsrldq", XMM, _XMM3 | _MEM, IMM8 },
{ "vpsrldq", _YMM3, _YMM3 | _MEM, IMM8 },
{ "vpsrldq", _ZMM, _ZMM | _MEM, IMM8 },
{ "vpsraw", XMM_KZ, _XMM | _MEM, IMM8 },
{ "vpsraw", XMM_KZ, XMM | _MEM, IMM8 },
{ "vpsraw", ZMM_KZ, _ZMM | _MEM, IMM8 },
{ "vpsrad", XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
{ "vpsrad", XMM_KZ, XMM | M_1to4 | _MEM, IMM8 },
{ "vpsrad", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
{ "vpsraq", XMM, XMM, IMM8 },
{ "vpsraq", XMM_KZ, _XMM | M_1to2 | _MEM, IMM8 },
{ "vpsraq", XMM_KZ, XMM | M_1to2 | _MEM, IMM8 },
{ "vpsraq", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 },
{ "vpsllw", _XMM3, _XMM3 | _MEM, IMM8 },
{ "vpslld", _XMM3, _XMM3 | _MEM | M_1to4, IMM8 },
{ "vpsllq", _XMM3, _XMM3 | _MEM | M_1to2, IMM8 },
{ "vpsllw", XMM, _XMM3 | _MEM, IMM8 },
{ "vpslld", XMM, _XMM3 | _MEM | M_1to4, IMM8 },
{ "vpsllq", XMM, _XMM3 | _MEM | M_1to2, IMM8 },
{ "vpsrlw", XMM_KZ, _XMM | _MEM, IMM8 },
{ "vpsrlw", XMM_KZ, XMM | _MEM, IMM8 },
{ "vpsrlw", ZMM_KZ, _ZMM | _MEM, IMM8 },
{ "vpsrld", XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
{ "vpsrld", XMM_KZ, XMM | M_1to4 | _MEM, IMM8 },
{ "vpsrld", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
{ "vpsrlq", _XMM3, _XMM3 | _MEM | M_1to2, IMM8 },
{ "vpsrlq", XMM, _XMM3 | _MEM | M_1to2, IMM8 },
{ "vpsrlq", _ZMM, _ZMM | _MEM | M_1to8, IMM8 },
{ "vpsravw", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
{ "vpsravw", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsravw", _ZMM, _ZMM, _MEM },
{ "vpsravd", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
{ "vpsravd", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsravd", _ZMM, _ZMM, M_1to16 | _MEM },
{ "vpsravq", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
{ "vpsravq", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsravq", _ZMM, _ZMM, M_1to8 | _MEM },
{ "vpsllvw", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
{ "vpsllvw", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsllvw", _ZMM, _ZMM, _MEM },
{ "vpsllvd", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
{ "vpsllvd", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsllvd", _ZMM, _ZMM, M_1to16 | _MEM },
{ "vpsllvq", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
{ "vpsllvq", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsllvq", _ZMM, _ZMM, M_1to8 | _MEM },
{ "vpsrlvw", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
{ "vpsrlvw", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsrlvw", _ZMM, _ZMM, _MEM },
{ "vpsrlvd", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
{ "vpsrlvd", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsrlvd", _ZMM, _ZMM, M_1to16 | _MEM },
{ "vpsrlvq", XMM_KZ | _XMM, _XMM, _XMM | _MEM },
{ "vpsrlvq", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsrlvq", _ZMM, _ZMM, M_1to8 | _MEM },
{ "vpshufb", _XMM | XMM_KZ, _XMM, _XMM | _MEM },
{ "vpshufb", XMM | XMM_KZ, _XMM, _XMM | _MEM },
{ "vpshufb", ZMM_KZ, _ZMM, _MEM },
{ "vpshufhw", _XMM | XMM_KZ, _XMM | _MEM, IMM8 },
{ "vpshufhw", XMM | XMM_KZ, _XMM | _MEM, IMM8 },
{ "vpshufhw", ZMM_KZ, _MEM, IMM8 },
{ "vpshuflw", _XMM | XMM_KZ, _XMM | _MEM, IMM8 },
{ "vpshuflw", XMM | XMM_KZ, _XMM | _MEM, IMM8 },
{ "vpshuflw", ZMM_KZ, _MEM, IMM8 },
{ "vpshufd", _XMM | XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
{ "vpshufd", XMM | XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
{ "vpshufd", _ZMM | ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
{ "vpord", _XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
{ "vpord", XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
{ "vpord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM },
{ "vporq", _XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
{ "vporq", XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
{ "vporq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM },
{ "vpxord", _XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
{ "vpxord", XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
{ "vpxord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM },
{ "vpxorq", _XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
{ "vpxorq", XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
{ "vpxorq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM },
{ "vpsadbw", _XMM3, _XMM, _XMM | _MEM },
{ "vpsadbw", XMM, _XMM, _XMM | _MEM },
{ "vpsadbw", _ZMM, _ZMM, _MEM },
{ "vpmuldq", _XMM3, _XMM, _XMM | M_1to2 | _MEM },
{ "vpmuldq", XMM, _XMM, _XMM | M_1to2 | _MEM },
{ "vpmuldq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
{ "vpmulhrsw", _XMM3, _XMM, _XMM | _MEM },
{ "vpmulhrsw", XMM, _XMM, _XMM | _MEM },
{ "vpmulhrsw", ZMM_KZ, _ZMM, _MEM },
{ "vpmulhuw", _XMM3, _XMM, _XMM | _MEM },
{ "vpmulhuw", XMM, _XMM, _XMM | _MEM },
{ "vpmulhuw", ZMM_KZ, _ZMM, _MEM },
{ "vpmulhw", _XMM3, _XMM, _XMM | _MEM },
{ "vpmulhw", XMM, _XMM, _XMM | _MEM },
{ "vpmulhw", ZMM_KZ, _ZMM, _MEM },
{ "vpmullw", _XMM3, _XMM, _XMM | _MEM },
{ "vpmullw", XMM, _XMM, _XMM | _MEM },
{ "vpmullw", ZMM_KZ, _ZMM, _MEM },
{ "vpmulld", _XMM3, _XMM, M_1to4 | _MEM },
{ "vpmulld", XMM, _XMM, M_1to4 | _MEM },
{ "vpmulld", ZMM_KZ, _ZMM, M_1to16 | _MEM },
{ "vpmullq", _XMM3, _XMM, M_1to2 | _MEM },
{ "vpmullq", XMM, _XMM, M_1to2 | _MEM },
{ "vpmullq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
{ "vpmuludq", _XMM3, _XMM, M_1to2 | _MEM },
{ "vpmuludq", XMM, _XMM, M_1to2 | _MEM },
{ "vpmuludq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
{ "vpunpckhbw", _XMM3, _XMM, _XMM | _MEM },
{ "vpunpckhbw", XMM, _XMM, _XMM | _MEM },
{ "vpunpckhbw", _ZMM, _ZMM, _MEM },
{ "vpunpckhwd", _XMM3, _XMM, _XMM | _MEM },
{ "vpunpckhwd", XMM, _XMM, _XMM | _MEM },
{ "vpunpckhwd", _ZMM, _ZMM, _MEM },
{ "vpunpckhdq", _XMM3, _XMM, M_1to4 | _MEM },
{ "vpunpckhdq", XMM, _XMM, M_1to4 | _MEM },
{ "vpunpckhdq", _ZMM, _ZMM, M_1to16 | _MEM },
{ "vpunpckhqdq", _XMM3, _XMM, M_1to2 | _MEM },
{ "vpunpckhqdq", XMM, _XMM, M_1to2 | _MEM },
{ "vpunpckhqdq", _ZMM, _ZMM, M_1to8 | _MEM },
{ "vpunpcklbw", _XMM3, _XMM, _XMM | _MEM },
{ "vpunpcklbw", XMM, _XMM, _XMM | _MEM },
{ "vpunpcklbw", _ZMM, _ZMM, _MEM },
{ "vpunpcklwd", _XMM3, _XMM, _XMM | _MEM },
{ "vpunpcklwd", XMM, _XMM, _XMM | _MEM },
{ "vpunpcklwd", _ZMM, _ZMM, _MEM },
{ "vpunpckldq", _XMM3, _XMM, M_1to4 | _MEM },
{ "vpunpckldq", XMM, _XMM, M_1to4 | _MEM },
{ "vpunpckldq", _ZMM, _ZMM, M_1to16 | _MEM },
{ "vpunpcklqdq", _XMM3, _XMM, M_1to2 | _MEM },
{ "vpunpcklqdq", XMM, _XMM, M_1to2 | _MEM },
{ "vpunpcklqdq", _ZMM, _ZMM, M_1to8 | _MEM },
{ "vextractf32x4", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
@ -1126,7 +1129,7 @@ public:
{ "vextracti32x8", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
{ "vextracti64x4", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
{ "vextractps", REG32 | _MEM, _XMM3, IMM8 },
{ "vextractps", REG32 | _MEM, XMM, IMM8 },
{ "vpermb", XMM_KZ, _XMM, _XMM | _MEM },
{ "vpermb", ZMM_KZ, _ZMM, _ZMM | _MEM },
@ -1175,7 +1178,7 @@ public:
uint64_t xm;
} tbl[] = {
#ifdef XBYAK64
{ "vinsertps", _XMM, _XMM, _XMM3 | _MEM },
{ "vinsertps", XMM, _XMM, _XMM3 | _MEM },
{ "vshufpd", XMM_KZ, _XMM, M_1to2 | _MEM },
{ "vshufpd", ZMM_KZ, _ZMM, M_1to8 | _MEM },
@ -1208,14 +1211,14 @@ public:
put(p.name, p.x1, p.x2, p.xm, IMM8);
}
#ifdef XBYAK64
put("vpextrb", _REG64 | _MEM, _XMM3, IMM8);
put("vpextrw", _REG64 | _MEM, _XMM3, IMM8);
put("vpextrd", _REG32 | _MEM, _XMM3, IMM8);
put("vpextrq", _REG64 | _MEM, _XMM3, IMM8);
put("vpinsrb", _XMM3, _XMM3, _REG32 | _MEM, IMM8);
put("vpinsrw", _XMM3, _XMM3, _REG32 | _MEM, IMM8);
put("vpinsrd", _XMM3, _XMM3, _REG32 | _MEM, IMM8);
put("vpinsrq", _XMM3, _XMM3, _REG64 | _MEM, IMM8);
put("vpextrb", _REG64 | _MEM, XMM, IMM8);
put("vpextrw", _REG64 | _MEM, XMM, IMM8);
put("vpextrd", _REG32 | _MEM, XMM, IMM8);
put("vpextrq", _REG64 | _MEM, XMM, IMM8);
put("vpinsrb", XMM, _XMM3, _REG32 | _MEM, IMM8);
put("vpinsrw", XMM, _XMM3, _REG32 | _MEM, IMM8);
put("vpinsrd", XMM, _XMM3, _REG32 | _MEM, IMM8);
put("vpinsrq", XMM, _XMM3, _REG64 | _MEM, IMM8);
#endif
}
void put512_FMA()
@ -1345,7 +1348,7 @@ public:
} else if (suf == "ps") {
mem = M_1to4;
}
put(p, _XMM3 | XMM_KZ, _XMM, mem | _MEM);
put(p, XMM | XMM_KZ, _XMM, mem | _MEM);
if (!sufTbl[j].supportYMM) continue;
mem = 0;
if (suf == "pd") {
@ -1466,23 +1469,23 @@ public:
put("vcvtqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4);
put("vcvtqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);
put("vcvtsd2si", REG32 | REG64, _XMM3 | _MEM | XMM_ER);
put("vcvtsd2si", REG32 | REG64, XMM | _MEM | XMM_ER);
put("vcvtsd2usi", REG32 | REG64, _XMM3 | _MEM | XMM_ER);
put("vcvtsd2usi", REG32 | REG64, XMM | _MEM | XMM_ER);
put("vcvtsd2ss", XMM_KZ, _XMM3, _XMM3 | _MEM | XMM_ER);
put("vcvtsd2ss", XMM_KZ, XMM, _XMM3 | _MEM | XMM_ER);
put("vcvtsi2sd", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64);
put("vcvtsi2sd", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64);
put("vcvtsi2sd", XMM, XMM_ER, REG64);
put("vcvtsi2ss", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64);
put("vcvtsi2ss", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64);
put("vcvtsi2ss", XMM, XMM_ER, REG32 | REG64);
put("vcvtss2sd", XMM_KZ, _XMM3, _XMM3 | _MEM | XMM_SAE);
put("vcvtss2sd", XMM_KZ, XMM, _XMM3 | _MEM | XMM_SAE);
put("vcvtss2si", REG32 | REG64, _XMM3 | _MEM | XMM_ER);
put("vcvtss2si", REG32 | REG64, XMM | _MEM | XMM_ER);
put("vcvtss2usi", REG32 | REG64, _XMM3 | _MEM | XMM_ER);
put("vcvtss2usi", REG32 | REG64, XMM | _MEM | XMM_ER);
put("vcvtpd2dq", XMM_KZ, _XMM | M_xword | M_1to2);
put("vcvtpd2dq", XMM_KZ, _YMM | M_yword | MY_1to4);
@ -1516,13 +1519,13 @@ public:
put("vcvttps2uqq", YMM_KZ, _XMM | _MEM | M_1to4);
put("vcvttps2uqq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE);
put("vcvttsd2si", REG32 | REG64, _XMM3 | _MEM | XMM_SAE);
put("vcvttsd2si", REG32 | REG64, XMM | _MEM | XMM_SAE);
put("vcvttsd2usi", REG32 | REG64, _XMM3 | _MEM | XMM_SAE);
put("vcvttsd2usi", REG32 | REG64, XMM | _MEM | XMM_SAE);
put("vcvttss2si", REG32 | REG64, _XMM3 | _MEM | XMM_SAE);
put("vcvttss2si", REG32 | REG64, XMM | _MEM | XMM_SAE);
put("vcvttss2usi", REG32 | REG64, _XMM3 | _MEM | XMM_SAE);
put("vcvttss2usi", REG32 | REG64, XMM | _MEM | XMM_SAE);
put("vcvtudq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
put("vcvtudq2pd", YMM_KZ, _XMM | _MEM | M_1to4);
@ -1540,10 +1543,10 @@ public:
put("vcvtuqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4);
put("vcvtuqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);
put("vcvtusi2sd", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64);
put("vcvtusi2sd", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64);
put("vcvtusi2sd", XMM, XMM_ER, REG64);
put("vcvtusi2ss", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64);
put("vcvtusi2ss", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64);
put("vcvtusi2ss", XMM, XMM_ER, REG32 | REG64);
#endif
}

View file

@ -40,8 +40,8 @@ struct Code : Xbyak::CodeGenerator {
cmpss(xmm0, ptr[rip + label], 0);
test(dword[rip + label], 33);
bt(dword[rip + label ], 3);
vblendpd(xmm0, dword[rip + label], 3);
vpalignr(xmm0, qword[rip + label], 4);
vblendpd(xmm0, xmm0, dword[rip + label], 3);
vpalignr(xmm0, xmm0, qword[rip + label], 4);
vextractf128(dword[rip + label], ymm3, 12);
vperm2i128(ymm0, ymm1, qword[rip + label], 13);
vcvtps2ph(ptr[rip + label], xmm2, 44);

View file

@ -129,6 +129,55 @@ struct Code : public Xbyak::CodeGenerator {
add(rax, sf.p[2]);
add(rax, sf.p[3]);
}
/*
int64_t f(const int64_t a[13]) { return sum-of-a[]; }
*/
void gen13()
{
StackFrame sf(this, 1, 13);
for (int i = 0; i < 13; i++) {
mov(sf.t[i], ptr[sf.p[0] + i * 8]);
}
mov(rax, sf.t[0]);
for (int i = 1; i < 13; i++) {
add(rax, sf.t[i]);
}
}
/*
same as gen13
*/
void gen14()
{
StackFrame sf(this, 1, 11 | UseRCX | UseRDX);
Pack t = sf.t;
t.append(rcx);
t.append(rdx);
for (int i = 0; i < 13; i++) {
mov(t[i], ptr[sf.p[0] + i * 8]);
}
mov(rax, t[0]);
for (int i = 1; i < 13; i++) {
add(rax, t[i]);
}
}
/*
return (1 << 15) - 1;
*/
void gen15()
{
StackFrame sf(this, 0, 14, 8);
Pack t = sf.t;
t.append(rax);
for (int i = 0; i < 15; i++) {
mov(t[i], 1 << i);
}
mov(qword[rsp], 0);
for (int i = 0; i < 15; i++) {
add(ptr[rsp], t[i]);
}
mov(rax, ptr[rsp]);
}
};
struct Code2 : Xbyak::CodeGenerator {
@ -152,8 +201,14 @@ struct Code2 : Xbyak::CodeGenerator {
add(rax, sf.p[i]);
}
}
void gen2(int pNum, int tNum, int stackSizeByte)
{
StackFrame sf(this, pNum, tNum, stackSizeByte);
mov(rax, rsp);
}
};
static int errNum = 0;
void check(int x, int y)
{
@ -167,19 +222,19 @@ void verify(const Xbyak::uint8 *f, int pNum)
{
switch (pNum) {
case 0:
check(1, Xbyak::CastTo<int (*)()>(f)());
check(1, reinterpret_cast<int (*)()>(f)());
return;
case 1:
check(11, Xbyak::CastTo<int (*)(int)>(f)(10));
check(11, reinterpret_cast<int (*)(int)>(f)(10));
return;
case 2:
check(111, Xbyak::CastTo<int (*)(int, int)>(f)(10, 100));
check(111, reinterpret_cast<int (*)(int, int)>(f)(10, 100));
return;
case 3:
check(1111, Xbyak::CastTo<int (*)(int, int, int)>(f)(10, 100, 1000));
check(1111, reinterpret_cast<int (*)(int, int, int)>(f)(10, 100, 1000));
return;
case 4:
check(11111, Xbyak::CastTo<int (*)(int, int, int, int)>(f)(10, 100, 1000, 10000));
check(11111, reinterpret_cast<int (*)(int, int, int, int)>(f)(10, 100, 1000, 10000));
return;
default:
printf("ERR pNum=%d\n", pNum);
@ -212,6 +267,15 @@ void testAll()
const Xbyak::uint8 *f = code.getCurr();
code.gen(pNum, tNum | opt, stackSize);
verify(f, pNum);
/*
check rsp is 16-byte aligned if stackSize > 0
*/
if (stackSize > 0) {
Code2 c2;
c2.gen2(pNum, tNum | opt, stackSize);
uint64_t addr = c2.getCode<uint64_t (*)()>()();
check(addr % 16, 0);
}
}
}
}
@ -268,6 +332,20 @@ void testPartial()
int (*f12)(int, int, int, int) = code.getCurr<int (*)(int, int, int, int)>();
code.gen12();
check(24, f12(3, 5, 7, 9));
{
int64_t tbl[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 };
int64_t (*f13)(const int64_t*) = code.getCurr<int64_t (*)(const int64_t*)>();
code.gen13();
check(91, f13(tbl));
int64_t (*f14)(const int64_t*) = code.getCurr<int64_t (*)(const int64_t*)>();
code.gen14();
check(91, f14(tbl));
}
int (*f15)() = code.getCurr<int (*)()>();
code.gen15();
check((1 << 15) - 1, f15());
}
void put(const Xbyak::util::Pack& p)

View file

@ -40,6 +40,8 @@
// This covers -std=(gnu|c)++(0x|11|1y), -stdlib=libc++, and modern Microsoft.
#if ((defined(_MSC_VER) && (_MSC_VER >= 1600)) || defined(_LIBCPP_VERSION) ||\
((__cplusplus >= 201103) || defined(__GXX_EXPERIMENTAL_CXX0X__)))
#include <unordered_set>
#define XBYAK_STD_UNORDERED_SET std::unordered_set
#include <unordered_map>
#define XBYAK_STD_UNORDERED_MAP std::unordered_map
#define XBYAK_STD_UNORDERED_MULTIMAP std::unordered_multimap
@ -49,16 +51,22 @@
libstdcxx 20070719 (from GCC 4.2.1, the last GPL 2 version).
*/
#elif XBYAK_GNUC_PREREQ(4, 5) || (XBYAK_GNUC_PREREQ(4, 2) && __GLIBCXX__ >= 20070719) || defined(__INTEL_COMPILER) || defined(__llvm__)
#include <tr1/unordered_set>
#define XBYAK_STD_UNORDERED_SET std::tr1::unordered_set
#include <tr1/unordered_map>
#define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map
#define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap
#elif defined(_MSC_VER) && (_MSC_VER >= 1500) && (_MSC_VER < 1600)
#include <unordered_set>
#define XBYAK_STD_UNORDERED_SET std::tr1::unordered_set
#include <unordered_map>
#define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map
#define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap
#else
#include <set>
#define XBYAK_STD_UNORDERED_SET std::set
#include <map>
#define XBYAK_STD_UNORDERED_MAP std::map
#define XBYAK_STD_UNORDERED_MULTIMAP std::multimap
@ -105,7 +113,7 @@ namespace Xbyak {
enum {
DEFAULT_MAX_CODE_SIZE = 4096,
VERSION = 0x5670 /* 0xABCD = A.BC(D) */
VERSION = 0x5770 /* 0xABCD = A.BC(D) */
};
#ifndef MIE_INTEGER_TYPE_DEFINED
@ -178,7 +186,8 @@ enum {
ERR_INVALID_ZERO,
ERR_INVALID_RIP_IN_AUTO_GROW,
ERR_INVALID_MIB_ADDRESS,
ERR_INTERNAL
ERR_INTERNAL,
ERR_X2APIC_IS_NOT_SUPPORTED
};
class Error : public std::exception {
@ -240,6 +249,7 @@ public:
"invalid rip in AutoGrow",
"invalid mib address",
"internal error",
"x2APIC is not supported"
};
assert((size_t)err_ < sizeof(errTbl) / sizeof(*errTbl));
return errTbl[err_];
@ -617,6 +627,12 @@ struct RegRip {
const Label* label_;
bool isAddr_;
explicit RegRip(sint64 disp = 0, const Label* label = 0, bool isAddr = false) : disp_(disp), label_(label), isAddr_(isAddr) {}
friend const RegRip operator+(const RegRip& r, int disp) {
return RegRip(r.disp_ + disp, r.label_, r.isAddr_);
}
friend const RegRip operator-(const RegRip& r, int disp) {
return RegRip(r.disp_ - disp, r.label_, r.isAddr_);
}
friend const RegRip operator+(const RegRip& r, sint64 disp) {
return RegRip(r.disp_ + disp, r.label_, r.isAddr_);
}
@ -786,6 +802,7 @@ inline RegExp operator-(const RegExp& e, size_t disp)
// 2nd parameter for constructor of CodeArray(maxSize, userPtr, alloc)
void *const AutoGrow = (void*)1; //-V566
void *const DontSetProtectRWE = (void*)2; //-V566
class CodeArray {
enum Type {
@ -825,6 +842,7 @@ protected:
size_t size_;
bool isCalledCalcJmpAddress_;
bool useProtect() const { return alloc_->useProtect(); }
/*
allocate new memory and copy old data to the new area
*/
@ -848,7 +866,6 @@ protected:
uint64 disp = i->getVal(top_);
rewrite(i->codeOffset, disp, i->jmpSize);
}
if (alloc_->useProtect() && !protect(top_, size_, PROTECT_RWE)) throw Error(ERR_CANT_PROTECT);
isCalledCalcJmpAddress_ = true;
}
public:
@ -858,7 +875,7 @@ public:
PROTECT_RE = 2 // read/exec
};
explicit CodeArray(size_t maxSize, void *userPtr = 0, Allocator *allocator = 0)
: type_(userPtr == AutoGrow ? AUTO_GROW : userPtr ? USER_BUF : ALLOC_BUF)
: type_(userPtr == AutoGrow ? AUTO_GROW : (userPtr == 0 || userPtr == DontSetProtectRWE) ? ALLOC_BUF : USER_BUF)
, alloc_(allocator ? allocator : (Allocator*)&defaultAllocator_)
, maxSize_(maxSize)
, top_(type_ == USER_BUF ? reinterpret_cast<uint8*>(userPtr) : alloc_->alloc((std::max<size_t>)(maxSize, 1)))
@ -866,7 +883,7 @@ public:
, isCalledCalcJmpAddress_(false)
{
if (maxSize_ > 0 && top_ == 0) throw Error(ERR_CANT_ALLOC);
if ((type_ == ALLOC_BUF && alloc_->useProtect()) && !protect(top_, maxSize, PROTECT_RWE)) {
if ((type_ == ALLOC_BUF && userPtr != DontSetProtectRWE && useProtect()) && !setProtectMode(PROTECT_RWE, false)) {
alloc_->free(top_);
throw Error(ERR_CANT_PROTECT);
}
@ -874,10 +891,19 @@ public:
virtual ~CodeArray()
{
if (isAllocType()) {
if (alloc_->useProtect()) protect(top_, maxSize_, PROTECT_RW);
if (useProtect()) setProtectModeRW(false);
alloc_->free(top_);
}
}
bool setProtectMode(ProtectMode mode, bool throwException = true)
{
bool isOK = protect(top_, maxSize_, mode);
if (isOK) return true;
if (throwException) throw Error(ERR_CANT_PROTECT);
return false;
}
bool setProtectModeRE(bool throwException = true) { return setProtectMode(PROTECT_RE, throwException); }
bool setProtectModeRW(bool throwException = true) { return setProtectMode(PROTECT_RW, throwException); }
void resetSize()
{
size_ = 0;
@ -909,10 +935,10 @@ public:
void dq(uint64 code) { db(code, 8); }
const uint8 *getCode() const { return top_; }
template<class F>
const F getCode() const { return CastTo<F>(top_); }
const F getCode() const { return reinterpret_cast<F>(top_); }
const uint8 *getCurr() const { return &top_[size_]; }
template<class F>
const F getCurr() const { return CastTo<F>(&top_[size_]); }
const F getCurr() const { return reinterpret_cast<F>(&top_[size_]); }
size_t getSize() const { return size_; }
void setSize(size_t size)
{
@ -995,6 +1021,9 @@ public:
size_t pageSize = sysconf(_SC_PAGESIZE);
size_t iaddr = reinterpret_cast<size_t>(addr);
size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1));
#ifndef NDEBUG
if (pageSize != 4096) fprintf(stderr, "large page(%zd) is used. not tested enough.\n", pageSize);
#endif
return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0;
#else
return true;
@ -1115,6 +1144,7 @@ public:
Label(const Label& rhs);
Label& operator=(const Label& rhs);
~Label();
void clear() { mgr = 0; id = 0; }
int getId() const { return id; }
const uint8 *getAddress() const;
@ -1153,6 +1183,7 @@ class LabelManager {
};
typedef XBYAK_STD_UNORDERED_MAP<int, ClabelVal> ClabelDefList;
typedef XBYAK_STD_UNORDERED_MULTIMAP<int, const JmpLabel> ClabelUndefList;
typedef XBYAK_STD_UNORDERED_SET<Label*> LabelPtrList;
CodeArray *base_;
// global : stateList_.front(), local : stateList_.back()
@ -1160,6 +1191,7 @@ class LabelManager {
mutable int labelId_;
ClabelDefList clabelDefList_;
ClabelUndefList clabelUndefList_;
LabelPtrList labelPtrList_;
int getId(const Label& label) const
{
@ -1208,9 +1240,14 @@ class LabelManager {
return true;
}
friend class Label;
void incRefCount(int id) { clabelDefList_[id].refCount++; }
void decRefCount(int id)
void incRefCount(int id, Label *label)
{
clabelDefList_[id].refCount++;
labelPtrList_.insert(label);
}
void decRefCount(int id, Label *label)
{
labelPtrList_.erase(label);
ClabelDefList::iterator i = clabelDefList_.find(id);
if (i == clabelDefList_.end()) return;
if (i->second.refCount == 1) {
@ -1229,11 +1266,23 @@ class LabelManager {
#endif
return !list.empty();
}
// detach all labels linked to LabelManager
void resetLabelPtrList()
{
for (LabelPtrList::iterator i = labelPtrList_.begin(), ie = labelPtrList_.end(); i != ie; ++i) {
(*i)->clear();
}
labelPtrList_.clear();
}
public:
LabelManager()
{
reset();
}
~LabelManager()
{
resetLabelPtrList();
}
void reset()
{
base_ = 0;
@ -1243,6 +1292,7 @@ public:
stateList_.push_back(SlabelState());
clabelDefList_.clear();
clabelUndefList_.clear();
resetLabelPtrList();
}
void enterLocal()
{
@ -1275,10 +1325,11 @@ public:
SlabelState& st = *label.c_str() == '.' ? stateList_.back() : stateList_.front();
define_inner(st.defList, st.undefList, label, base_->getSize());
}
void defineClabel(const Label& label)
void defineClabel(Label& label)
{
define_inner(clabelDefList_, clabelUndefList_, getId(label), base_->getSize());
label.mgr = this;
labelPtrList_.insert(&label);
}
void assign(Label& dst, const Label& src)
{
@ -1286,6 +1337,7 @@ public:
if (i == clabelDefList_.end()) throw Error(ERR_LABEL_ISNOT_SET_BY_L);
define_inner(clabelDefList_, clabelUndefList_, dst.id, i->second.offset);
dst.mgr = this;
labelPtrList_.insert(&dst);
}
bool getOffset(size_t *offset, std::string& label) const
{
@ -1333,19 +1385,19 @@ inline Label::Label(const Label& rhs)
{
id = rhs.id;
mgr = rhs.mgr;
if (mgr) mgr->incRefCount(id);
if (mgr) mgr->incRefCount(id, this);
}
inline Label& Label::operator=(const Label& rhs)
{
if (id) throw Error(ERR_LABEL_IS_ALREADY_SET_BY_L);
id = rhs.id;
mgr = rhs.mgr;
if (mgr) mgr->incRefCount(id);
if (mgr) mgr->incRefCount(id, this);
return *this;
}
inline Label::~Label()
{
if (id && mgr) mgr->decRefCount(id);
if (id && mgr) mgr->decRefCount(id, this);
}
inline const uint8* Label::getAddress() const
{
@ -1463,6 +1515,7 @@ private:
T_B64 = 1 << 27, // m64bcst
T_M_K = 1 << 28, // mem{k}
T_VSIB = 1 << 29,
T_MEM_EVEX = 1 << 30, // use evex if mem
T_XXX
};
void vex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false)
@ -1500,7 +1553,7 @@ private:
if ((a > 0 && a != v) + (b > 0 && b != v) + (c > 0 && c != v) > 0) return Error(err);
return v;
}
int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0, uint32 VL = 0)
int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0, uint32 VL = 0, bool Hi16Vidx = false)
{
if (!(type & (T_EVEX | T_MUST_EVEX))) throw Error(ERR_EVEX_IS_INVALID);
int w = (type & T_EW1) ? 1 : 0;
@ -1543,7 +1596,7 @@ private:
}
}
}
bool Vp = !(v ? v->isExtIdx2() : 0);
bool Vp = !((v ? v->isExtIdx2() : 0) | Hi16Vidx);
bool z = reg.hasZero() || base.hasZero() || (v ? v->hasZero() : false);
if (aaa == 0) aaa = verifyDuplicate(base.getOpmaskIdx(), reg.getOpmaskIdx(), (v ? v->getOpmaskIdx() : 0), ERR_OPMASK_IS_ALREADY_SET);
db(0x62);
@ -1935,10 +1988,11 @@ private:
const Address& addr = op2.getAddress();
const RegExp& regExp = addr.getRegExp();
const Reg& base = regExp.getBase();
const Reg& index = regExp.getIndex();
if (BIT == 64 && addr.is32bit()) db(0x67);
int disp8N = 0;
bool x = regExp.getIndex().isExtIdx();
if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() || addr.getOpmaskIdx()) {
bool x = index.isExtIdx();
if ((type & (T_MUST_EVEX|T_MEM_EVEX)) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() || addr.getOpmaskIdx()) {
int aaa = addr.getOpmaskIdx();
if (aaa && !(type & T_M_K)) throw Error(ERR_INVALID_OPMASK_WITH_MEMORY);
bool b = false;
@ -1946,8 +2000,8 @@ private:
if (!(type & (T_B32 | T_B64))) throw Error(ERR_INVALID_BROADCAST);
b = true;
}
int VL = regExp.isVsib() ? regExp.getIndex().getBit() : 0;
disp8N = evex(r, base, p1, type, code, x, b, aaa, VL);
int VL = regExp.isVsib() ? index.getBit() : 0;
disp8N = evex(r, base, p1, type, code, x, b, aaa, VL, index.isExtIdx2());
} else {
vex(r, base, p1, type, code, x);
}
@ -2147,7 +2201,8 @@ public:
const Segment es, cs, ss, ds, fs, gs;
#endif
void L(const std::string& label) { labelMgr_.defineSlabel(label); }
void L(const Label& label) { labelMgr_.defineClabel(label); }
void L(Label& label) { labelMgr_.defineClabel(label); }
Label L() { Label label; L(label); return label; }
void inLocalLabel() { labelMgr_.enterLocal(); }
void outLocalLabel() { labelMgr_.leaveLocal(); }
/*
@ -2178,7 +2233,7 @@ public:
// call(function pointer)
#ifdef XBYAK_VARIADIC_TEMPLATE
template<class Ret, class... Params>
void call(Ret(*func)(Params...)) { call(CastTo<const void*>(func)); }
void call(Ret(*func)(Params...)) { call(reinterpret_cast<const void*>(func)); }
#endif
void call(const void *addr) { opJmpAbs(addr, T_NEAR, 0, 0xE8); }
@ -2436,11 +2491,16 @@ public:
MUST call ready() to complete generating code if you use AutoGrow mode.
It is not necessary for the other mode if hasUndefinedLabel() is true.
*/
void ready()
void ready(ProtectMode mode = PROTECT_RWE)
{
if (hasUndefinedLabel()) throw Error(ERR_LABEL_IS_NOT_FOUND);
if (isAutoGrow()) calcJmpAddress();
if (isAutoGrow()) {
calcJmpAddress();
if (useProtect()) setProtectMode(mode);
}
}
// set read/exec
void readyRE() { return ready(PROTECT_RE); }
#ifdef XBYAK_TEST
void dump(bool doClear = true)
{

View file

@ -1,4 +1,4 @@
const char *getVersionString() const { return "5.67"; }
const char *getVersionString() const { return "5.77"; }
void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); }
void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@ -1023,7 +1023,7 @@ void vhsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
void vhsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_F2 | T_0F | T_YMM, 0x7D); }
void vinsertf128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { if (!(y1.isYMM() && y2.isYMM() && op.isXMEM())) throw Error(ERR_BAD_COMBINATION); opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x18, imm); }
void vinserti128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { if (!(y1.isYMM() && y2.isYMM() && op.isXMEM())) throw Error(ERR_BAD_COMBINATION); opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x38, imm); }
void vinsertps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0x21, imm); }
void vinsertps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0x21, imm); }
void vlddqu(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, cvtIdx0(x), addr, T_0F | T_F2 | T_W0 | T_YMM, 0xF0); }
void vldmxcsr(const Address& addr) { opAVX_X_X_XM(xm2, xm0, addr, T_0F, 0xAE); }
void vmaskmovdqu(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, T_0F | T_66, 0xF7); }
@ -1206,28 +1206,28 @@ void vpshuflw(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm,
void vpsignb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x08); }
void vpsignd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x0A); }
void vpsignw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x09); }
void vpslld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); }
void vpslld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); }
void vpslld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xF2); }
void vpslldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 7), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x73, imm); }
void vpsllq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0x73, imm); }
void vpslldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 7), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x73, imm); }
void vpsllq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64 | T_MEM_EVEX, 0x73, imm); }
void vpsllq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0xF3); }
void vpsllvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x47); }
void vpsllvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x47); }
void vpsllw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); }
void vpsllw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm); }
void vpsllw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xF1); }
void vpsrad(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); }
void vpsrad(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); }
void vpsrad(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xE2); }
void vpsravd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x46); }
void vpsraw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); }
void vpsraw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm); }
void vpsraw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xE1); }
void vpsrld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); }
void vpsrld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); }
void vpsrld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xD2); }
void vpsrldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 3), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x73, imm); }
void vpsrlq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0x73, imm); }
void vpsrldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 3), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x73, imm); }
void vpsrlq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64 | T_MEM_EVEX, 0x73, imm); }
void vpsrlq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0xD3); }
void vpsrlvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x45); }
void vpsrlvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x45); }
void vpsrlw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); }
void vpsrlw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm); }
void vpsrlw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xD1); }
void vpsubb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF8); }
void vpsubd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0xFA); }

View file

@ -9,6 +9,11 @@
*/
#include "xbyak.h"
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#define XBYAK_INTEL_CPU_SPECIFIC
#endif
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
#if (_MSC_VER < 1400) && defined(XBYAK32)
static inline __declspec(naked) void __cpuid(int[4], int)
@ -47,14 +52,30 @@
#endif
#endif
#endif
#endif
namespace Xbyak { namespace util {
typedef enum {
SmtLevel = 1,
CoreLevel = 2
} IntelCpuTopologyLevel;
/**
CPU detection class
*/
class Cpu {
uint64 type_;
//system topology
bool x2APIC_supported_;
static const size_t maxTopologyLevels = 2;
unsigned int numCores_[maxTopologyLevels];
static const unsigned int maxNumberCacheLevels = 10;
unsigned int dataCacheSize_[maxNumberCacheLevels];
unsigned int coresSharignDataCache_[maxNumberCacheLevels];
unsigned int dataCacheLevels_;
unsigned int get32bitAsBE(const char *x) const
{
return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
@ -65,7 +86,7 @@ class Cpu {
}
void setFamily()
{
unsigned int data[4];
unsigned int data[4] = {};
getCpuid(1, data);
stepping = data[0] & mask(4);
model = (data[0] >> 4) & mask(4);
@ -88,6 +109,39 @@ class Cpu {
{
return (val >> base) & ((1u << (end - base)) - 1);
}
void setNumCores()
{
if ((type_ & tINTEL) == 0) return;
unsigned int data[4] = {};
/* CAUTION: These numbers are configuration as shipped by Intel. */
getCpuidEx(0x0, 0, data);
if (data[0] >= 0xB) {
/*
if leaf 11 exists(x2APIC is supported),
we use it to get the number of smt cores and cores on socket
leaf 0xB can be zeroed-out by a hypervisor
*/
x2APIC_supported_ = true;
for (unsigned int i = 0; i < maxTopologyLevels; i++) {
getCpuidEx(0xB, i, data);
IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15);
if (level == SmtLevel || level == CoreLevel) {
numCores_[level - 1] = extractBit(data[1], 0, 15);
}
}
} else {
/*
Failed to deremine num of cores without x2APIC support.
TODO: USE initial APIC ID to determine ncores.
*/
numCores_[SmtLevel - 1] = 0;
numCores_[CoreLevel - 1] = 0;
}
}
void setCacheHierarchy()
{
if ((type_ & tINTEL) == 0) return;
@ -96,21 +150,12 @@ class Cpu {
// const unsigned int INSTRUCTION_CACHE = 2;
const unsigned int UNIFIED_CACHE = 3;
unsigned int smt_width = 0;
unsigned int n_cores = 0;
unsigned int data[4];
unsigned int logical_cores = 0;
unsigned int data[4] = {};
/*
if leaf 11 exists, we use it to get the number of smt cores and cores on socket
If x2APIC is supported, these are the only correct numbers.
leaf 0xB can be zeroed-out by a hypervisor
*/
getCpuidEx(0x0, 0, data);
if (data[0] >= 0xB) {
getCpuidEx(0xB, 0, data); // CPUID for SMT Level
smt_width = data[1] & 0x7FFF;
getCpuidEx(0xB, 1, data); // CPUID for CORE Level
n_cores = data[1] & 0x7FFF;
if (x2APIC_supported_) {
smt_width = numCores_[0];
logical_cores = numCores_[1];
}
/*
@ -118,29 +163,29 @@ class Cpu {
the first level of data cache is not shared (which is the
case for every existing architecture) and use this to
determine the SMT width for arch not supporting leaf 11.
when leaf 4 reports a number of core less than n_cores
when leaf 4 reports a number of core less than numCores_
on socket reported by leaf 11, then it is a correct number
of cores not an upperbound.
*/
for (int i = 0; data_cache_levels < maxNumberCacheLevels; i++) {
for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) {
getCpuidEx(0x4, i, data);
unsigned int cacheType = extractBit(data[0], 0, 4);
if (cacheType == NO_CACHE) break;
if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
unsigned int nb_logical_cores = extractBit(data[0], 14, 25) + 1;
if (n_cores != 0) { // true only if leaf 0xB is supported and valid
nb_logical_cores = (std::min)(nb_logical_cores, n_cores);
unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1;
if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
}
assert(nb_logical_cores != 0);
data_cache_size[data_cache_levels] =
assert(actual_logical_cores != 0);
dataCacheSize_[dataCacheLevels_] =
(extractBit(data[1], 22, 31) + 1)
* (extractBit(data[1], 12, 21) + 1)
* (extractBit(data[1], 0, 11) + 1)
* (data[2] + 1);
if (cacheType == DATA_CACHE && smt_width == 0) smt_width = nb_logical_cores;
if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
assert(smt_width != 0);
cores_sharing_data_cache[data_cache_levels] = (std::max)(nb_logical_cores / smt_width, 1u);
data_cache_levels++;
coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u);
dataCacheLevels_++;
}
}
}
@ -154,22 +199,25 @@ public:
int displayFamily; // family + extFamily
int displayModel; // model + extModel
// may I move these members into private?
static const unsigned int maxNumberCacheLevels = 10;
unsigned int data_cache_size[maxNumberCacheLevels];
unsigned int cores_sharing_data_cache[maxNumberCacheLevels];
unsigned int data_cache_levels;
unsigned int getNumCores(IntelCpuTopologyLevel level) {
if (!x2APIC_supported_) throw Error(ERR_X2APIC_IS_NOT_SUPPORTED);
switch (level) {
case SmtLevel: return numCores_[level - 1];
case CoreLevel: return numCores_[level - 1] / numCores_[SmtLevel - 1];
default: throw Error(ERR_X2APIC_IS_NOT_SUPPORTED);
}
}
unsigned int getDataCacheLevels() const { return data_cache_levels; }
unsigned int getDataCacheLevels() const { return dataCacheLevels_; }
unsigned int getCoresSharingDataCache(unsigned int i) const
{
if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER);
return cores_sharing_data_cache[i];
if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER);
return coresSharignDataCache_[i];
}
unsigned int getDataCacheSize(unsigned int i) const
{
if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER);
return data_cache_size[i];
if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER);
return dataCacheSize_[i];
}
/*
@ -177,30 +225,45 @@ public:
*/
static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
{
#ifdef _MSC_VER
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
__cpuid(reinterpret_cast<int*>(data), eaxIn);
#else
#else
__cpuid(eaxIn, data[0], data[1], data[2], data[3]);
#endif
#else
(void)eaxIn;
(void)data;
#endif
}
static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
{
#ifdef _MSC_VER
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
#else
#else
__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
#endif
#else
(void)eaxIn;
(void)ecxIn;
(void)data;
#endif
}
static inline uint64 getXfeature()
{
#ifdef _MSC_VER
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
return _xgetbv(0);
#else
#else
unsigned int eax, edx;
// xgetvb is not support on gcc 4.2
// __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
__asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
return ((uint64)edx << 32) | eax;
#endif
#else
return 0;
#endif
}
typedef uint64 Type;
@ -271,9 +334,13 @@ public:
Cpu()
: type_(NONE)
, data_cache_levels(0)
, x2APIC_supported_(false)
, numCores_()
, dataCacheSize_()
, coresSharignDataCache_()
, dataCacheLevels_(0)
{
unsigned int data[4];
unsigned int data[4] = {};
const unsigned int& EAX = data[0];
const unsigned int& EBX = data[1];
const unsigned int& ECX = data[2];
@ -363,6 +430,7 @@ public:
if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
}
setFamily();
setNumCores();
setCacheHierarchy();
}
void putFamily() const
@ -381,12 +449,17 @@ class Clock {
public:
static inline uint64 getRdtsc()
{
#ifdef _MSC_VER
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
return __rdtsc();
#else
#else
unsigned int eax, edx;
__asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
return ((uint64)edx << 32) | eax;
#endif
#else
// TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu
return 0;
#endif
}
Clock()
@ -416,7 +489,7 @@ const int UseRCX = 1 << 6;
const int UseRDX = 1 << 7;
class Pack {
static const size_t maxTblNum = 10;
static const size_t maxTblNum = 15;
const Xbyak::Reg64 *tbl_[maxTblNum];
size_t n_;
public:
@ -476,7 +549,7 @@ public:
const Xbyak::Reg64& operator[](size_t n) const
{
if (n >= n_) {
fprintf(stderr, "ERR Pack bad n=%d\n", (int)n);
fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_);
throw Error(ERR_BAD_PARAMETER);
}
return *tbl_[n];
@ -518,6 +591,7 @@ class StackFrame {
static const int rcxPos = 3;
static const int rdxPos = 2;
#endif
static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax
Xbyak::CodeGenerator *code_;
int pNum_;
int tNum_;
@ -527,7 +601,7 @@ class StackFrame {
int P_;
bool makeEpilog_;
Xbyak::Reg64 pTbl_[4];
Xbyak::Reg64 tTbl_[10];
Xbyak::Reg64 tTbl_[maxRegNum];
Pack p_;
Pack t_;
StackFrame(const StackFrame&);
@ -539,7 +613,7 @@ public:
make stack frame
@param sf [in] this
@param pNum [in] num of function parameter(0 <= pNum <= 4)
@param tNum [in] num of temporary register(0 <= tNum <= 10, with UseRCX, UseRDX)
@param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14
@param stackSizeByte [in] local stack size
@param makeEpilog [in] automatically call close() if true
@ -566,27 +640,17 @@ public:
using namespace Xbyak;
if (pNum < 0 || pNum > 4) throw Error(ERR_BAD_PNUM);
const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
if (allRegNum < pNum || allRegNum > 14) throw Error(ERR_BAD_TNUM);
if (tNum_ < 0 || allRegNum > maxRegNum) throw Error(ERR_BAD_TNUM);
const Reg64& _rsp = code->rsp;
const AddressFrame& _ptr = code->ptr;
saveNum_ = (std::max)(0, allRegNum - noSaveNum);
const int *tbl = getOrderTbl() + noSaveNum;
P_ = saveNum_ + (stackSizeByte + 7) / 8;
if (P_ > 0 && (P_ & 1) == 0) P_++; // here (rsp % 16) == 8, then increment P_ for 16 byte alignment
for (int i = 0; i < saveNum_; i++) {
code->push(Reg64(tbl[i]));
}
P_ = (stackSizeByte + 7) / 8;
if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // (rsp % 16) == 8, then increment P_ for 16 byte alignment
P_ *= 8;
if (P_ > 0) code->sub(_rsp, P_);
#ifdef XBYAK64_WIN
for (int i = 0; i < (std::min)(saveNum_, 4); i++) {
code->mov(_ptr [_rsp + P_ + (i + 1) * 8], Reg64(tbl[i]));
}
for (int i = 4; i < saveNum_; i++) {
code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i]));
}
#else
for (int i = 0; i < saveNum_; i++) {
code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i]));
}
#endif
int pos = 0;
for (int i = 0; i < pNum; i++) {
pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
@ -607,21 +671,11 @@ public:
{
using namespace Xbyak;
const Reg64& _rsp = code_->rsp;
const AddressFrame& _ptr = code_->ptr;
const int *tbl = getOrderTbl() + noSaveNum;
#ifdef XBYAK64_WIN
for (int i = 0; i < (std::min)(saveNum_, 4); i++) {
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ + (i + 1) * 8]);
}
for (int i = 4; i < saveNum_; i++) {
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
}
#else
for (int i = 0; i < saveNum_; i++) {
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
}
#endif
if (P_ > 0) code_->add(_rsp, P_);
for (int i = 0; i < saveNum_; i++) {
code_->pop(Reg64(tbl[saveNum_ - 1 - i]));
}
if (callRet) code_->ret();
}
@ -633,9 +687,6 @@ public:
} catch (std::exception& e) {
printf("ERR:StackFrame %s\n", e.what());
exit(1);
} catch (...) {
printf("ERR:StackFrame otherwise\n");
exit(1);
}
}
private:
@ -654,7 +705,7 @@ private:
}
int getRegIdx(int& pos) const
{
assert(pos < 14);
assert(pos < maxRegNum);
using namespace Xbyak;
const int *tbl = getOrderTbl();
int r = tbl[pos++];