Squashed 'externals/xbyak/' changes from 671fc805..4a6fac8a

4a6fac8a update version to 5.77
801cf3fd cosmetic change of getNumCores
d397e824 fix number of cores that share LLC cache
a669e092 support non-intel-cpu visual studio
af5f422e Merge branch 'fenghaitao-guard_x86' into develop
9b98dc17 Guard x86 specific codes with "#if defined(__i386__) || defined(__x86_64__)"
dd4173e1 move some member variables input private
f72646a7 update version
4612528f format change
4b95e862 Merge branch 'shelleygoel-master'
4c262fa6 add functionality to get num of cores using x2APIC ID
bc70e7e1 recover Xbyak::CastTo
d09a230f unlink Label when LabelManager is destroyed
973e8597 update version
afdb9fe9 Xbyak::CastTo is removed
b011aca4 add RegRip +/- int
acae93cd increase max temp regs for StackFrame
ea4e3562 util::StackFrame uses push/pop instead of mov
42462ef9 use evex encoding for vpslld/vpslldq/vpsraw/...(reg, mem, imm);
da9117a9 update version of readme.md
d35f4fb7 fix the encoding of vinsertps for disp8N
1de435ed bf uses Label class
613922bd add Label L() for convenience
43e15583 fix typo
93579ee6 add protect-re.cpp
60004b5c fix url of protect-re.cpp
348b2709 fix typo of doc
f34f6ed5 update manual
232110be update test
82b78bf0 add setProtectMode
dd8b290f put warning message if pageSize != 4096
64775ca2 a little refactoring
7c3e7b85 fix wrong VSIB encoding with idx >= 16

git-subtree-dir: externals/xbyak
git-subtree-split: 4a6fac8ade404f667b94170f713367fe7da2a852
This commit is contained in:
MerryMage 2020-04-22 20:59:14 +01:00
parent dbb1f8cf37
commit 080b4b3aff
17 changed files with 994 additions and 489 deletions

View file

@ -37,6 +37,7 @@
T_B64 = 1 << 27, // m64bcst T_B64 = 1 << 27, // m64bcst
T_M_K = 1 << 28, // mem{k} T_M_K = 1 << 28, // mem{k}
T_VSIB = 1 << 29, T_VSIB = 1 << 29,
T_MEM_EVEX = 1 << 30, // use evex if mem
T_XXX T_XXX
}; };
@ -161,5 +162,9 @@ std::string type2String(int type)
if (!str.empty()) str += " | "; if (!str.empty()) str += " | ";
str += "T_VSIB"; str += "T_VSIB";
} }
if (type & T_MEM_EVEX) {
if (!str.empty()) str += " | ";
str += "T_MEM_EVEX";
}
return str; return str;
} }

View file

@ -76,7 +76,7 @@ void putX_X_XM(bool omitOnly)
{ 0xC2, "cmpss", T_0F | T_F3, true, true, 2 }, { 0xC2, "cmpss", T_0F | T_F3, true, true, 2 },
{ 0x5A, "cvtsd2ss", T_0F | T_F2 | T_EVEX | T_EW1 | T_N8 | T_ER_X, false, true, 2 }, { 0x5A, "cvtsd2ss", T_0F | T_F2 | T_EVEX | T_EW1 | T_N8 | T_ER_X, false, true, 2 },
{ 0x5A, "cvtss2sd", T_0F | T_F3 | T_EVEX | T_EW0 | T_N4 | T_SAE_X, false, true, 2 }, { 0x5A, "cvtss2sd", T_0F | T_F3 | T_EVEX | T_EW0 | T_N4 | T_SAE_X, false, true, 2 },
{ 0x21, "insertps", T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0, true, true, 2 }, { 0x21, "insertps", T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, true, true, 2 },
{ 0x63, "packsswb", T_0F | T_66 | T_YMM | T_EVEX, false, true, 2 }, { 0x63, "packsswb", T_0F | T_66 | T_YMM | T_EVEX, false, true, 2 },
{ 0x6B, "packssdw", T_0F | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32, false, true, 2 }, { 0x6B, "packssdw", T_0F | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32, false, true, 2 },
{ 0x67, "packuswb", T_0F | T_66 | T_YMM | T_EVEX, false, true, 2 }, { 0x67, "packuswb", T_0F | T_66 | T_YMM | T_EVEX, false, true, 2 },
@ -1491,16 +1491,16 @@ void put()
int idx; int idx;
int type; int type;
} tbl[] = { } tbl[] = {
{ "pslldq", 0x73, 7, T_0F | T_66 | T_YMM | T_EVEX }, { "pslldq", 0x73, 7, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX },
{ "psrldq", 0x73, 3, T_0F | T_66 | T_YMM | T_EVEX }, { "psrldq", 0x73, 3, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX },
{ "psllw", 0x71, 6, T_0F | T_66 | T_YMM | T_EVEX }, { "psllw", 0x71, 6, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX },
{ "pslld", 0x72, 6, T_0F | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32 }, { "pslld", 0x72, 6, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW0 | T_B32 },
{ "psllq", 0x73, 6, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 }, { "psllq", 0x73, 6, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW1 | T_B64 },
{ "psraw", 0x71, 4, T_0F | T_66 | T_YMM | T_EVEX }, { "psraw", 0x71, 4, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX },
{ "psrad", 0x72, 4, T_0F | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32 }, { "psrad", 0x72, 4, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW0 | T_B32 },
{ "psrlw", 0x71, 2, T_0F | T_66 | T_YMM | T_EVEX }, { "psrlw", 0x71, 2, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX },
{ "psrld", 0x72, 2, T_0F | T_66 | T_YMM | T_EVEX | T_EW0 | T_B32 }, { "psrld", 0x72, 2, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW0 | T_B32 },
{ "psrlq", 0x73, 2, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 }, { "psrlq", 0x73, 2, T_0F | T_66 | T_YMM | T_EVEX | T_MEM_EVEX | T_EW1 | T_B64 },
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i]; const Tbl& p = tbl[i];

479
readme.md
View file

@ -1,107 +1,121 @@
Xbyak 5.67 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++ # Xbyak 5.77 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
=============
Abstract ## Abstract
-------------
This is a header file which enables dynamically to assemble x86(IA32), x64(AMD64, x86-64) mnemonic. This is a header file which enables dynamically to assemble x86(IA32), x64(AMD64, x86-64) mnemonic.
Feature ## Feature
------------- * header file only
header file only * Intel/MASM like syntax
you can use Xbyak's functions at once if xbyak.h is included. * fully support AVX-512
### Supported Instructions Sets **Note**: Xbyak uses and(), or(), xor(), not() functions, so `-fno-operator-names` option is necessary for gcc/clang.
MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(*partial*)/AVX/AVX2/FMA/VEX-encoded GPR/AVX-512 Or define `XBYAK_NO_OP_NAMES` before including `xbyak.h` and use and_(), or_(), xor_(), not_() instead of them.
and_(), or_(), xor_(), not_() are always available.
`XBYAK_NO_OP_NAMES` will be defined by default in a future version.
### Supported OS ### Supported OS
* Windows Xp, Vista, Windows 7(32bit, 64bit) * Windows Xp, Vista, Windows 7, Windows 10(32bit, 64bit)
* Linux(32bit, 64bit) * Linux(32bit, 64bit)
* Intel Mac OSX * Intel macOS
### Supported Compilers ### Supported Compilers
* Visual Studio C++ VC2012 or later Almost C++03 or later compilers for x86/x64 such as Visual Studio, g++, clang++, Intel C++ compiler and g++ on mingw/cygwin.
* gcc 4.7 or later
* clang 3.3
* cygwin gcc 4.5.3
* icc 7.2
>Note: Xbyak uses and(), or(), xor(), not() functions, so "-fno-operator-names" option is required on gcc. ## Install
Or define XBYAK_NO_OP_NAMES and use and_(), or_(), xor_(), not_() instead of them.
and_(), or_(), xor_(), not_() are always available.
Install The following files are necessary. Please add the path to your compile directory.
-------------
The following files are necessary. Please add the path to your compile directories.
* xbyak.h * xbyak.h
* xbyak_mnemonic.h * xbyak_mnemonic.h
* xbyak_util.h
Linux: Linux:
```
make install
```
make install These files are copied into `/usr/local/include/xbyak`.
These files are copied into /usr/local/include/xbyak ## How to use it
New Feature Inherit `Xbyak::CodeGenerator` class and make the class method.
------------- ```
#define XBYAK_NO_OP_NAMES
#include <xbyak/xbyak.h>
Add support for AVX-512 instruction set. struct Code : Xbyak::CodeGenerator {
Code(int x)
{
mov(eax, x);
ret();
}
};
```
Make an instance of the class and get the function
pointer by calling `getCode()` and call it.
```
Code c(5);
int (*f)() = c.getCode<int (*)()>();
printf("ret=%d\n", f()); // ret = 5
```
Syntax ## Syntax
------------- Similar to MASM/NASM syntax with parentheses.
Make Xbyak::CodeGenerator and make the class method and get the function
pointer by calling cgetCode() and casting the return value.
NASM Xbyak
mov eax, ebx --> mov(eax, ebx);
inc ecx inc(ecx);
ret --> ret();
### Addressing
(ptr|dword|word|byte) [base + index * (1|2|4|8) + displacement]
[rip + 32bit disp] ; x64 only
NASM Xbyak
mov eax, [ebx+ecx] --> mov (eax, ptr[ebx+ecx]);
test byte [esp], 4 --> test (byte [esp], 4);
How to use Selector(Segment Register)
>Note: Segment class is not derived from Operand.
``` ```
mov eax, [fs:eax] --> putSeg(fs); mov(eax, ptr [eax]); NASM Xbyak
mov eax, ebx --> mov(eax, ebx);
inc ecx inc(ecx);
ret --> ret();
```
## Addressing
Use `qword`, `dword`, `word` and `byte` if it is necessary to specify the size of memory,
otherwise use `ptr`.
```
(ptr|qword|dword|word|byte) [base + index * (1|2|4|8) + displacement]
[rip + 32bit disp] ; x64 only
NASM Xbyak
mov eax, [ebx+ecx] --> mov(eax, ptr [ebx+ecx]);
mov al, [ebx+ecx] --> mov(al, ptr [ebx + ecx]);
test byte [esp], 4 --> test(byte [esp], 4);
inc qword [rax] --> inc(qword [rax]);
```
**Note**: `qword`, `dword`, ... are member variables, so do not use `dword` as the name of an unsigned int type.
### How to use Selector (Segment Register)
```
mov eax, [fs:eax] --> putSeg(fs);
mov(eax, ptr [eax]);
mov ax, cs --> mov(ax, cs); mov ax, cs --> mov(ax, cs);
``` ```
**Note**: Segment class is not derived from `Operand`.
>you can use ptr for almost memory access unless you specify the size of memory. ## AVX
>dword, word and byte are member variables, then don't use dword as unsigned int, for example.
### AVX
vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3
vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory
vgatherdpd(xmm1, ptr [ebp+123+xmm2*4], xmm3);
*Remark*
The omitted destination syntax as the following ss disabled.
``` ```
vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3 vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3
vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory
vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3);
``` ```
define `XBYAK_ENABLE_OMITTED_OPERAND` if you use it for backward compatibility.
**Note**:
If `XBYAK_ENABLE_OMITTED_OPERAND` is defined, then you can use the two-operand version for backward compatibility.
But the newer version will not support it. But the newer version will not support it.
```
vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3
```
### AVX-512 ## AVX-512
``` ```
vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30); vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30);
@ -130,97 +144,122 @@ vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5)
vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit
``` ```
Remark ### Remark
* k1, ..., k7 are new opmask registers. * `k1`, ..., `k7` are opmask registers.
* use `| T_z`, `| T_sae`, `| T_rn_sae`, `| T_rd_sae`, `| T_ru_sae`, `| T_rz_sae` instead of `,{z}`, `,{sae}`, `,{rn-sae}`, `,{rd-sae}`, `,{ru-sae}`, `,{rz-sae}` respectively. * use `| T_z`, `| T_sae`, `| T_rn_sae`, `| T_rd_sae`, `| T_ru_sae`, `| T_rz_sae` instead of `,{z}`, `,{sae}`, `,{rn-sae}`, `,{rd-sae}`, `,{ru-sae}`, `,{rz-sae}` respectively.
* `k4 | k3` is different from `k3 | k4`. * `k4 | k3` is different from `k3 | k4`.
* use `ptr_b` for broadcast `{1toX}`. X is automatically determined. * use `ptr_b` for broadcast `{1toX}`. X is automatically determined.
* specify xword/yword/zword(_b) for m128/m256/m512 if necessary. * specify `xword`/`yword`/`zword(_b)` for m128/m256/m512 if necessary.
### Label ## Label
Two kinds of Label are supported. (String literal and Label class).
L("L1"); ### String literal
jmp ("L1"); ```
L("L1");
jmp("L1");
jmp ("L2"); jmp("L2");
... ...
a few mnemonics(8-bit displacement jmp) a few mnemonics (8-bit displacement jmp)
... ...
L("L2"); L("L2");
jmp ("L3", T_NEAR); jmp("L3", T_NEAR);
... ...
a lot of mnemonics(32-bit displacement jmp) a lot of mnemonics (32-bit displacement jmp)
... ...
L("L3"); L("L3");
```
>Call hasUndefinedLabel() to verify your code has no undefined label. * Call `hasUndefinedLabel()` to verify your code has no undefined label.
> you can use a label for immediate value of mov like as mov (eax, "L2"); * you can use a label for immediate value of mov like as `mov(eax, "L2")`.
#### 1. support @@, @f, @b like MASM ### Support `@@`, `@f`, `@b` like MASM
L("@@"); // <A> ```
jmp("@b"); // jmp to <A> L("@@"); // <A>
jmp("@f"); // jmp to <B> jmp("@b"); // jmp to <A>
L("@@"); // <B> jmp("@f"); // jmp to <B>
jmp("@b"); // jmp to <B> L("@@"); // <B>
mov(eax, "@b"); jmp("@b"); // jmp to <B>
jmp(eax); // jmp to <B> mov(eax, "@b");
jmp(eax); // jmp to <B>
```
#### 2. localization of label by calling inLocalLabel(), outLocallabel(). ### Local label
labels begining of period between inLocalLabel() and outLocalLabel() Label symbols beginning with a period between `inLocalLabel()` and `outLocalLabel()`
are dealed with local label. are treated as a local label.
inLocalLabel() and outLocalLabel() can be nested. `inLocalLabel()` and `outLocalLabel()` can be nested.
void func1() ```
{ void func1()
inLocalLabel(); {
L(".lp"); // <A> ; local label inLocalLabel();
... L(".lp"); // <A> ; local label
jmp(".lp"); // jmpt to <A> ...
L("aaa"); // global label jmp(".lp"); // jmp to <A>
outLocalLabel(); L("aaa"); // global label <C>
} outLocalLabel();
void func2() inLocalLabel();
{ L(".lp"); // <B> ; local label
inLocalLabel(); func1();
L(".lp"); // <B> ; local label jmp(".lp"); // jmp to <B>
func1(); inLocalLabel();
jmp(".lp"); // jmp to <B> jmp("aaa"); // jmp to <C>
inLocalLabel(); }
} ```
### Label class ### Label class
L() and jxx() functions support a new Label class. `L()` and `jxx()` support Label class.
Label label1, label2; ```
L(label1); Xbyak::Label label1, label2;
... L(label1);
jmp(label1); ...
... jmp(label1);
jmp(label2); ...
... jmp(label2);
L(label2); ...
L(label2);
```
Moreover, assignL(dstLabel, srcLabel) method binds dstLabel with srcLabel. Use `putL` for jmp table
```
Label labelTbl, L0, L1, L2;
mov(rax, labelTbl);
// rdx is an index of jump table
jmp(ptr [rax + rdx * sizeof(void*)]);
L(labelTbl);
putL(L0);
putL(L1);
putL(L2);
L(L0);
....
L(L1);
....
```
Label label1, label2; `assignL(dstLabel, srcLabel)` binds dstLabel with srcLabel.
L(label1);
...
jmp(label2);
...
assignL(label2, label1); // label2 <= label1
The above jmp opecode jumps label1. ```
Label label2;
Label label1 = L(); // make label1 ; same to Label label1; L(label1);
...
jmp(label2); // label2 is not determined here
...
assignL(label2, label1); // label2 <- label1
```
The `jmp` in the above code jumps to label1 assigned by `assignL`.
* Restriction: **Note**:
* srcLabel must be used in L(). * srcLabel must be used in `L()`.
* dstLabel must not be used in L(). * dstLabel must not be used in `L()`.
Label::getAddress() returns the address specified by the label instance and 0 if not specified. `Label::getAddress()` returns the address specified by the label instance and 0 if not specified.
``` ```
// not AutoGrow mode // not AutoGrow mode
Label label; Label label;
@ -229,7 +268,7 @@ L(label);
assert(label.getAddress() == getCurr()); assert(label.getAddress() == getCurr());
``` ```
### Rip ### Rip ; relative addressing
``` ```
Label label; Label label;
mov(eax, ptr [rip + label]); // eax = 4 mov(eax, ptr [rip + label]); // eax = 4
@ -243,92 +282,127 @@ int x;
... ...
mov(eax, ptr[rip + &x]); // throw exception if the difference between &x and current position is larger than 2GiB mov(eax, ptr[rip + &x]); // throw exception if the difference between &x and current position is larger than 2GiB
``` ```
### Code size
The default max code size is 4096 bytes. Please set it in constructor of CodeGenerator() if you want to use large size.
class Quantize : public Xbyak::CodeGenerator { ## Code size
public: The default max code size is 4096 bytes.
Quantize() Specify the size in constructor of `CodeGenerator()` if necessary.
: CodeGenerator(8192)
{
}
...
};
### use user allocated memory ```
class Quantize : public Xbyak::CodeGenerator {
public:
Quantize()
: CodeGenerator(8192)
{
}
...
};
```
## User allocated memory
You can make jit code on prepaired memory. You can make jit code on prepaired memory.
class Sample : public Xbyak::CodeGenerator { Call `setProtectModeRE` yourself to change memory mode if using the prepaired memory.
public:
Sample(void *userPtr, size_t size)
: Xbyak::CodeGenerator(size, userPtr)
{
...
}
};
const size_t codeSize = 1024;
uint8 buf[codeSize + 16];
// get 16-byte aligned address
uint8 *p = Xbyak::CodeArray::getAlignedAddress(buf);
// append executable attribute to the memory
Xbyak::CodeArray::protect(p, codeSize, true);
// construct your jit code on the memory
Sample s(p, codeSize);
>See *sample/test0.cpp*
AutoGrow
-------------
Under `AutoGrow` mode, Xbyak extends memory automatically if necessary.
Call ready() before calling getCode() to calc address of jmp.
``` ```
struct Code : Xbyak::CodeGenerator { uint8_t alignas(4096) buf[8192]; // C++11 or later
Code()
: Xbyak::CodeGenerator(<default memory size>, Xbyak::AutoGrow) struct Code : Xbyak::CodeGenerator {
{ Code() : Xbyak::CodeGenerator(sizeof(buf), buf)
... {
} mov(rax, 123);
}; ret();
}
};
int main()
{
Code c; Code c;
c.ready(); // Don't forget to call this function c.setProtectModeRE(); // set memory to Read/Exec
printf("%d\n", c.getCode<int(*)()>()());
}
``` ```
>Don't use the address returned by getCurr() before calling ready().
>It may be invalid address.
>RESTRICTION : rip addressing is not supported in AutoGrow
Macro **Note**: See [sample/test0.cpp](sample/test0.cpp).
-------------
### AutoGrow
The memory region for jit is automatically extended if necessary when `AutoGrow` is specified in a constructor of `CodeGenerator`.
Call `ready()` or `readyRE()` before calling `getCode()` to fix jump address.
```
struct Code : Xbyak::CodeGenerator {
Code()
: Xbyak::CodeGenerator(<default memory size>, Xbyak::AutoGrow)
{
...
}
};
Code c;
// generate code for jit
c.ready(); // mode = Read/Write/Exec
```
**Note**:
* Don't use the address returned by `getCurr()` before calling `ready()` because it may be an invalid address.
### Read/Exec mode
Xbyak sets the memory to Read/Write/Exec mode by default in order to run jit code.
If you want to use Read/Exec mode for security, then specify `DontSetProtectRWE` for `CodeGenerator` and
call `setProtectModeRE()` after generating jit code.
```
struct Code : Xbyak::CodeGenerator {
Code()
: Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE)
{
mov(eax, 123);
ret();
}
};
Code c;
c.setProtectModeRE();
...
```
Call `readyRE()` instead of `ready()` when using `AutoGrow` mode.
See [protect-re.cpp](sample/protect-re.cpp).
## Macro
* **XBYAK32** is defined on 32bit. * **XBYAK32** is defined on 32bit.
* **XBYAK64** is defined on 64bit. * **XBYAK64** is defined on 64bit.
* **XBYAK64_WIN** is defined on 64bit Windows(VC) * **XBYAK64_WIN** is defined on 64bit Windows(VC)
* **XBYAK64_GCC** is defined on 64bit gcc, cygwin * **XBYAK64_GCC** is defined on 64bit gcc, cygwin
* define **XBYAK_NO_OP_NAMES** on gcc without `-fno-operator-names` * define **XBYAK_NO_OP_NAMES** on gcc without `-fno-operator-names`
* define **XBYAK_ENABLE_OMITTED_OPERAND** if you use omitted destination such as `vaddps(xmm2, xmm3);`(duplicated in the future) * define **XBYAK_ENABLE_OMITTED_OPERAND** if you use omitted destination such as `vaddps(xmm2, xmm3);`(deprecated in the future)
* define **XBYAK_UNDEF_JNL** if Bessel function jnl is defined as macro * define **XBYAK_UNDEF_JNL** if Bessel function jnl is defined as macro
Sample ## Sample
-------------
* test0.cpp ; tiny sample of Xbyak(x86, x64) * [test0.cpp](sample/test0.cpp) ; tiny sample (x86, x64)
* quantize.cpp ; JIT optimized quantization by fast division(x86 only) * [quantize.cpp](sample/quantize.cpp) ; JIT optimized quantization by fast division (x86 only)
* calc.cpp ; assemble and estimate a given polynomial(x86, x64) * [calc.cpp](sample/calc.cpp) ; assemble and estimate a given polynomial (x86, x64)
* bf.cpp ; JIT brainfuck(x86, x64) * [bf.cpp](sample/bf.cpp) ; JIT brainfuck (x86, x64)
License ## License
-------------
modified new BSD License modified new BSD License
http://opensource.org/licenses/BSD-3-Clause http://opensource.org/licenses/BSD-3-Clause
History ## History
------------- * 2019/Mar/06 ver 5.77 fix number of cores that share LLC cache by densamoilov
* 2019/Jan/17 ver 5.76 add Cpu::getNumCores() by shelleygoel
* 2018/Oct/31 ver 5.751 recover Xbyak::CastTo for compatibility
* 2018/Oct/29 ver 5.75 unlink LabelManager from Label when LabelManager is destroyed
* 2018/Oct/21 ver 5.74 support RegRip +/- int. Xbyak::CastTo is removed
* 2018/Oct/15 util::StackFrame uses push/pop instead of mov
* 2018/Sep/19 ver 5.73 fix evex encoding of vpslld, vpslldq, vpsllw, etc for (reg, mem, imm8)
* 2018/Sep/19 ver 5.72 fix the encoding of vinsertps for disp8N(Thanks to petercaday)
* 2018/Sep/04 ver 5.71 L() returns a new label instance
* 2018/Aug/27 ver 5.70 support setProtectMode() and DontUseProtect for read/exec setting
* 2018/Aug/24 ver 5.68 fix wrong VSIB encoding with vector index >= 16(thanks to petercaday)
* 2018/Aug/14 ver 5.67 remove mutable in Address ; fix setCacheHierarchy for cloud vm * 2018/Aug/14 ver 5.67 remove mutable in Address ; fix setCacheHierarchy for cloud vm
* 2018/Jul/26 ver 5.661 support mingw64 * 2018/Jul/26 ver 5.661 support mingw64
* 2018/Jul/24 ver 5.66 add CodeArray::PROTECT_RE to mode of protect() * 2018/Jul/24 ver 5.66 add CodeArray::PROTECT_RE to mode of protect()
@ -392,8 +466,7 @@ History
* 2013/Jul/30 ver 4.20 [break backward compatibility] split Reg32e class into RegExp(base+index*scale+disp) and Reg32e(means Reg32 or Reg64) * 2013/Jul/30 ver 4.20 [break backward compatibility] split Reg32e class into RegExp(base+index*scale+disp) and Reg32e(means Reg32 or Reg64)
* 2013/Jul/04 ver 4.10 [break backward compatibility] change the type of Xbyak::Error from enum to a class * 2013/Jul/04 ver 4.10 [break backward compatibility] change the type of Xbyak::Error from enum to a class
* 2013/Jun/21 ver 4.02 add putL(LABEL) function to put the address of the label * 2013/Jun/21 ver 4.02 add putL(LABEL) function to put the address of the label
* 2013/Jun/21 ver 4.01 vpsllw, vpslld, vpsllq, vpsraw, vpsrad, vpsrlw, vpsrld, vpsrlq support (ymm, ymm, xmm). * 2013/Jun/21 ver 4.01 vpsllw, vpslld, vpsllq, vpsraw, vpsrad, vpsrlw, vpsrld, vpsrlq support (ymm, ymm, xmm). support vpbroadcastb, vpbroadcastw, vpbroadcastd, vpbroadcastq(thanks to Gabest).
support vpbroadcastb, vpbroadcastw, vpbroadcastd, vpbroadcastq(thanks to Gabest).
* 2013/May/30 ver 4.00 support AVX2, VEX-encoded GPR-instructions * 2013/May/30 ver 4.00 support AVX2, VEX-encoded GPR-instructions
* 2013/Mar/27 ver 3.80 support mov(reg, "label"); * 2013/Mar/27 ver 3.80 support mov(reg, "label");
* 2013/Mar/13 ver 3.76 add cqo(), jcxz(), jecxz(), jrcxz() * 2013/Mar/13 ver 3.76 add cqo(), jcxz(), jecxz(), jrcxz()
@ -453,8 +526,6 @@ History
* 2007/Jan/21 fix the bug to create address like [disp] select smaller representation for mov (eax|ax|al, [disp]) * 2007/Jan/21 fix the bug to create address like [disp] select smaller representation for mov (eax|ax|al, [disp])
* 2007/Jan/4 first version * 2007/Jan/4 first version
Author ## Author
-------------
MITSUNARI Shigeo(herumi@nifty.com) MITSUNARI Shigeo(herumi@nifty.com)

View file

@ -1,5 +1,5 @@
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.67 C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.77
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
◎概要 ◎概要
@ -245,8 +245,8 @@ void func2()
更にラベルの割り当てを行うassignL(dstLabel, srcLabel)という命令も追加されました。 更にラベルの割り当てを行うassignL(dstLabel, srcLabel)という命令も追加されました。
Label label1, label2; Label label2;
L(label1); Label label1 = L(); // Label label1; L(label1);と同じ意味
... ...
jmp(label2); jmp(label2);
... ...
@ -309,6 +309,41 @@ bool CodeArray::protect(const void *addr, size_t size, bool canExec);
*/ */
uint8 *CodeArray::getAlignedAddress(uint8 *addr, size_t alignedSize = ALIGN_SIZE); uint8 *CodeArray::getAlignedAddress(uint8 *addr, size_t alignedSize = ALIGN_SIZE);
・read/execモード
デフォルトのCodeGeneratorはコンストラクト時にJIT用の領域をread/write/execモードに設定して利用します。
コード生成時はread/writeでコード実行時にはread/execにしたい場合、次のようにしてください。
struct Code : Xbyak::CodeGenerator {
Code()
: Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE) // JIT領域をread/writeのままコード生成
{
mov(eax, 123);
ret();
}
};
Code c;
c.setProtectModeRE(); // read/execモードに変更
// JIT領域を実行
AutoGrowの場合はready()の代わりにreadyRE()を呼んでください。
struct Code : Xbyak::CodeGenerator {
Code()
: Xbyak::CodeGenerator(4096, Xbyak::AutoGrow) // JIT領域をread/writeのままコード生成
{
mov(eax, 123);
ret();
}
};
Code c;
c.readyRE(); // read/execモードに変更
// JIT領域を実行
setProtectModeRW()を呼ぶと領域が元のread/writeモードに戻ります。
その他詳細は各種サンプルを参照してください。 その他詳細は各種サンプルを参照してください。
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
◎マクロ ◎マクロ
@ -338,6 +373,17 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
◎履歴 ◎履歴
2019/03/06 ver 5.77 LLCキャッシュを共有するCPU数の修正(by densamoilov)
2019/01/17 ver 5.76 Cpu::getNumCores()追加(by shelleygoel)
2018/10/31 ver 5.751 互換性のためにXbyak::CastToの復元
2018/10/29 ver 5.75 LabelManagerのデストラクタでLabelから参照を切り離す
2018/10/21 ver 5.74 RegRip +/- intの形をサポート Xbyak::CastToを削除
2018/10/15 util::StackFrameでmovの代わりにpush/popを使う
2018/09/19 ver 5.73 vpslld, vpslldq, vpsllwなどの(reg, mem, imm8)に対するevexエンコーディング修整
2018/09/19 ver 5.72 fix the encoding of vinsertps for disp8N(Thanks to petercaday)
2018/08/27 ver 5.71 新しいlabelインスタンスを返すL()を追加
2018/08/27 ver 5.70 read/exec設定のためのsetProtectMode()とDontUseProtectの追加
2018/08/24 ver 5.68 indexが16以上のVSIBエンコーディングのバグ修正(thanks to petercaday)
2018/08/14 ver 5.67 Addressクラス内のmutableを削除 ; fix setCacheHierarchy for cloud vm 2018/08/14 ver 5.67 Addressクラス内のmutableを削除 ; fix setCacheHierarchy for cloud vm
2018/07/26 ver 5.661 mingw64対応 2018/07/26 ver 5.661 mingw64対応
2018/07/24 ver 5.66 protect()のmodeにCodeArray::PROTECT_REを追加 2018/07/24 ver 5.66 protect()のmodeにCodeArray::PROTECT_REを追加

View file

@ -10,12 +10,6 @@
#endif #endif
class Brainfuck : public Xbyak::CodeGenerator { class Brainfuck : public Xbyak::CodeGenerator {
private:
enum Direction { B, F };
std::string toStr(int labelNo, Direction dir)
{
return Xbyak::Label::toStr(labelNo) + (dir == B ? 'B' : 'F');
}
public: public:
int getContinuousChar(std::istream& is, char c) int getContinuousChar(std::istream& is, char c)
{ {
@ -67,8 +61,7 @@ public:
mov(pGetchar, rsi); // getchar mov(pGetchar, rsi); // getchar
mov(stack, rdx); // stack mov(stack, rdx); // stack
#endif #endif
int labelNo = 0; std::stack<Label> labelF, labelB;
std::stack<int> keepLabelNo;
char c; char c;
while (is >> c) { while (is >> c) {
switch (c) { switch (c) {
@ -116,17 +109,22 @@ public:
mov(cur, eax); mov(cur, eax);
break; break;
case '[': case '[':
L(toStr(labelNo, B)); {
mov(eax, cur); Label B = L();
test(eax, eax); labelB.push(B);
jz(toStr(labelNo, F), T_NEAR); mov(eax, cur);
keepLabelNo.push(labelNo++); test(eax, eax);
Label F;
jz(F, T_NEAR);
labelF.push(F);
}
break; break;
case ']': case ']':
{ {
int no = keepLabelNo.top(); keepLabelNo.pop(); Label B = labelB.top(); labelB.pop();
jmp(toStr(no, B)); jmp(B);
L(toStr(no, F)); Label F = labelF.top(); labelF.pop();
L(F);
} }
break; break;
default: default:
@ -200,7 +198,7 @@ int main(int argc, char *argv[])
Brainfuck bf(ifs); Brainfuck bf(ifs);
if (mode == 0) { if (mode == 0) {
static int stack[128 * 1024]; static int stack[128 * 1024];
bf.getCode<void (*)(void*, void*, int *)>()(Xbyak::CastTo<void*>(putchar), Xbyak::CastTo<void*>(getchar), stack); bf.getCode<void (*)(const void*, const void*, int *)>()(reinterpret_cast<const void*>(putchar), reinterpret_cast<const void*>(getchar), stack);
} else { } else {
dump(bf.getCode(), bf.getSize()); dump(bf.getCode(), bf.getSize());
} }

70
sample/protect-re.cpp Normal file
View file

@ -0,0 +1,70 @@
#define XBYAK_NO_OP_NAMES
#include <xbyak/xbyak.h>
// Fixed-size (4096-byte) code generator created with DontSetProtectRWE, so the
// caller is expected to switch the buffer's protection itself (test1 calls
// setProtectModeRE() / setProtectModeRW() on it).
struct Code1 : Xbyak::CodeGenerator {
// Emits a function body equivalent to `return 123;`.
Code1()
: Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE)
{
mov(eax, 123);
ret();
}
// Appends one zero byte to the code buffer; used by the tests as a probe for
// whether the buffer is currently writable.
void update()
{
db(0);
}
};
void test1(bool updateCode)
{
Code1 c;
c.setProtectModeRE();
if (updateCode) c.update(); // segmentation fault
int (*f)() = c.getCode<int (*)()>();
printf("f=%d\n", f());
c.setProtectModeRW();
c.update();
puts("ok");
}
// AutoGrow variant of the sample generator: constructed with Xbyak::AutoGrow
// and an initial size of 4096. test2 finalises it with readyRE() before
// running the code.
struct Code2 : Xbyak::CodeGenerator {
// Emits a function body equivalent to `return 123;`.
Code2()
: Xbyak::CodeGenerator(4096, Xbyak::AutoGrow)
{
mov(eax, 123);
ret();
}
// Appends one zero byte to the code buffer; used by the tests as a probe for
// whether the buffer is currently writable.
void update()
{
db(0);
}
};
void test2(bool updateCode)
{
Code2 c;
c.readyRE();
if (updateCode) c.update(); // segmentation fault
int (*f)() = c.getCode<int (*)()>();
printf("f=%d\n", f());
c.setProtectModeRW();
c.update();
puts("ok");
}
// Command-line driver for the protection samples.
//
// Usage: <program> <testNum> [update]
//   testNum  1 runs test1 (fixed buffer), 2 runs test2 (AutoGrow).
//   update   when exactly one extra argument is given, the test also tries to
//            modify the code while it is read/exec.
//
// Returns 0 when a test was selected and ran to completion, and 1 when the
// arguments are missing or name an unknown test, so that scripts can tell a
// usage error apart from a successful run.
int main(int argc, char *argv[])
{
	if (argc < 2) {
		fprintf(stderr, "%s <testNum> [update]\n", argv[0]);
		return 1;
	}
	bool update = argc == 3;
	int n = atoi(argv[1]);
	printf("n=%d update=%d\n", n, update);
	switch (n) {
	case 1: test1(update); break;
	case 2: test2(update); break;
	default:
		fprintf(stderr, "no test %d\n", n);
		return 1;
	}
	return 0;
}

View file

@ -32,7 +32,7 @@ struct Code : Xbyak::CodeGenerator {
inline int add(int a, int b) inline int add(int a, int b)
{ {
return Xbyak::CastTo<int (*)(int,int)>(buf)(a, b); return reinterpret_cast<int (*)(int, int)>(buf)(a, b);
} }
int main() int main()

View file

@ -77,7 +77,7 @@ public:
#ifdef XBYAK_VARIADIC_TEMPLATE #ifdef XBYAK_VARIADIC_TEMPLATE
call(atoi); call(atoi);
#else #else
call(Xbyak::CastTo<void*>(atoi)); call(reinterpret_cast<const void*>(atoi));
#endif #endif
add(esp, 4); add(esp, 4);
#endif #endif
@ -96,7 +96,7 @@ public:
mov(rax, (size_t)atoi); mov(rax, (size_t)atoi);
jmp(rax); jmp(rax);
#else #else
jmp(Xbyak::CastTo<void*>(atoi)); jmp(reinterpret_cast<const void*>(atoi));
#endif #endif
} }
int (*get() const)(const char *) { return getCode<int (*)(const char *)>(); } int (*get() const)(const char *) { return getCode<int (*)(const char *)>(); }
@ -171,8 +171,9 @@ int main()
return 1; return 1;
} }
int (*func)(int) = s.getCode<int (*)(int)>(); int (*func)(int) = s.getCode<int (*)(int)>();
if (Xbyak::CastTo<uint8*>(func) != p) { const uint8 *funcp = reinterpret_cast<const uint8*>(func);
fprintf(stderr, "internal error %p %p\n", p, Xbyak::CastTo<uint8*>(func)); if (funcp != p) {
fprintf(stderr, "internal error %p %p\n", p, funcp);
return 1; return 1;
} }
printf("0 + ... + %d = %d\n", 100, func(100)); printf("0 + ... + %d = %d\n", 100, func(100));

View file

@ -104,9 +104,12 @@ void putCPUinfo()
Core i7-3930K 6 2D Core i7-3930K 6 2D
*/ */
cpu.putFamily(); cpu.putFamily();
if (!cpu.has(Cpu::tINTEL)) return;
for (unsigned int i = 0; i < cpu.getDataCacheLevels(); i++) { for (unsigned int i = 0; i < cpu.getDataCacheLevels(); i++) {
printf("cache level=%u data cache size=%u cores sharing data cache=%u\n", i, cpu.getDataCacheSize(i), cpu.getCoresSharingDataCache(i)); printf("cache level=%u data cache size=%u cores sharing data cache=%u\n", i, cpu.getDataCacheSize(i), cpu.getCoresSharingDataCache(i));
} }
printf("SmtLevel =%u\n", cpu.getNumCores(Xbyak::util::SmtLevel));
printf("CoreLevel=%u\n", cpu.getNumCores(Xbyak::util::CoreLevel));
} }
int main() int main()

View file

@ -204,7 +204,7 @@ public:
push(reg[r]); push(reg[r]);
push('A' + r); push('A' + r);
push((int)str); push((int)str);
call(Xbyak::CastTo<void*>(printf)); call(reinterpret_cast<const void*>(printf));
add(esp, 4 * 4); add(esp, 4 * 4);
pop(ecx); pop(ecx);
pop(edx); pop(edx);

View file

@ -889,6 +889,34 @@ CYBOZU_TEST_AUTO(testNewLabel)
} }
} }
// Checks that a Label obtained from the zero-argument L() can be used as a
// jump target, together with forward-referenced labels for call/jnz.
// Expected control flow of the generated function:
//   eax = 0 -> (jnz not taken) -> eax = 1 -> call L2 -> eax = 2 -> ret
//   -> jmp L1 -> (jnz taken, eax != 0) -> exit: eax = 3 -> ret
CYBOZU_TEST_AUTO(returnLabel)
{
struct Code : Xbyak::CodeGenerator {
Code()
{
xor_(eax, eax);
// L1 is created and placed at the current position in one step.
Label L1 = L();
test(eax, eax);
Label exit;
// Not taken on the first pass (eax == 0), taken on the second pass.
jnz(exit);
inc(eax); // eax = 1
Label L2;
call(L2);
// Reached after the callee returns; loops back to the test at L1.
jmp(L1);
L(L2);
inc(eax); // eax = 2
ret();
L(exit);
inc(eax); // eax = 3
ret();
}
};
Code code;
int (*f)() = code.getCode<int (*)()>();
int r = f();
CYBOZU_TEST_EQUAL(r, 3);
}
CYBOZU_TEST_AUTO(testAssign) CYBOZU_TEST_AUTO(testAssign)
{ {
struct Code : Xbyak::CodeGenerator { struct Code : Xbyak::CodeGenerator {
@ -987,6 +1015,52 @@ struct GetAddressCode1 : Xbyak::CodeGenerator {
} }
}; };
/*
	Generates int f(int idx) that dispatches through an in-code table of
	label addresses and returns ret0, ret1 or ret2 for idx = 0, 1, 2.
	The table is emitted with putL() directly after the indirect jmp.
*/
struct CodeLabelTable : Xbyak::CodeGenerator {
// distinct return values so that each table slot can be told apart
enum { ret0 = 3 };
enum { ret1 = 5 };
enum { ret2 = 8 };
CodeLabelTable()
{
using namespace Xbyak;
// p0 holds the index argument, a is used for the table base and the return value
#ifdef XBYAK64_WIN
const Reg64& p0 = rcx;
const Reg64& a = rax;
#elif defined (XBYAK64_GCC)
const Reg64& p0 = rdi;
const Reg64& a = rax;
#else
const Reg32& p0 = edx;
const Reg32& a = eax;
// 32-bit build: the argument is read from the stack into edx
mov(edx, ptr [esp + 4]);
#endif
Label labelTbl, L0, L1, L2;
// load the address of the table, then jump through entry number p0
mov(a, labelTbl);
jmp(ptr [a + p0 * sizeof(void*)]);
L(labelTbl);
// table entries; presumably putL() emits the (later resolved) address of each label
putL(L0);
putL(L1);
putL(L2);
L(L0);
mov(a, ret0);
ret();
L(L1);
mov(a, ret1);
ret();
L(L2);
mov(a, ret2);
ret();
}
};
/*
	Calls the code generated by CodeLabelTable with each valid index and
	checks that the matching table entry was taken.
*/
CYBOZU_TEST_AUTO(LabelTable)
{
CodeLabelTable c;
int (*f)(int) = c.getCode<int (*)(int)>();
CYBOZU_TEST_EQUAL(f(0), c.ret0);
CYBOZU_TEST_EQUAL(f(1), c.ret1);
CYBOZU_TEST_EQUAL(f(2), c.ret2);
}
CYBOZU_TEST_AUTO(getAddress1) CYBOZU_TEST_AUTO(getAddress1)
{ {
GetAddressCode1 c; GetAddressCode1 c;
@ -1143,11 +1217,56 @@ CYBOZU_TEST_AUTO(rip_addr_with_fixed_buf)
ret(); ret();
} }
} code; } code;
Xbyak::CodeArray::protect(p, 4096, Xbyak::CodeArray::PROTECT_RE); code.setProtectModeRE();
code.getCode<void (*)()>()(); code.getCode<void (*)()>()();
CYBOZU_TEST_EQUAL(*x0, 123); CYBOZU_TEST_EQUAL(*x0, 123);
CYBOZU_TEST_EQUAL(*x1, 456); CYBOZU_TEST_EQUAL(*x1, 456);
CYBOZU_TEST_EQUAL(buf[8], 99); CYBOZU_TEST_EQUAL(buf[8], 99);
Xbyak::CodeArray::protect(p, 4096, Xbyak::CodeArray::PROTECT_RW); code.setProtectModeRW();
} }
#endif #endif
/*
	Helper for release_label_after_code.
	Uses three caller-owned labels: L1 and L2 are bound with L(),
	L3 is only referenced by a jmp and never bound.
	The generated code is not meant to be executed.
*/
struct ReleaseTestCode : Xbyak::CodeGenerator {
ReleaseTestCode(Label& L1, Label& L2, Label& L3)
{
L(L1);
jmp(L1);
L(L2);
jmp(L3); // not assigned
}
};
/*
	The code generator must unlink its labels when it is destroyed.
	The labels here outlive the generator: while it is alive, bound labels
	report a non-zero id and address; once it has gone out of scope,
	the labels (including the copies L4 and L5 of L1) must report 0 again.
*/
CYBOZU_TEST_AUTO(release_label_after_code)
{
puts("---");
{
Label L1, L2, L3, L4, L5;
{
// inner scope: code is destroyed at the closing brace while the labels stay alive
ReleaseTestCode code(L1, L2, L3);
CYBOZU_TEST_ASSERT(L1.getId() > 0);
CYBOZU_TEST_ASSERT(L1.getAddress() != 0);
CYBOZU_TEST_ASSERT(L2.getId() > 0);
CYBOZU_TEST_ASSERT(L2.getAddress() != 0);
CYBOZU_TEST_ASSERT(L3.getId() > 0);
CYBOZU_TEST_ASSERT(L3.getAddress() == 0); // L3 is not assigned
// make L4 and L5 refer to L1 in two different ways: assignL() and copy assignment
code.assignL(L4, L1);
L5 = L1;
printf("id=%d %d %d %d %d\n", L1.getId(), L2.getId(), L3.getId(), L4.getId(), L5.getId());
}
puts("code is released");
CYBOZU_TEST_ASSERT(L1.getId() == 0);
CYBOZU_TEST_ASSERT(L1.getAddress() == 0);
CYBOZU_TEST_ASSERT(L2.getId() == 0);
CYBOZU_TEST_ASSERT(L2.getAddress() == 0);
// CYBOZU_TEST_ASSERT(L3.getId() == 0); // L3 is not assigned so not cleared
CYBOZU_TEST_ASSERT(L3.getAddress() == 0);
CYBOZU_TEST_ASSERT(L4.getId() == 0);
CYBOZU_TEST_ASSERT(L4.getAddress() == 0);
CYBOZU_TEST_ASSERT(L5.getId() == 0);
CYBOZU_TEST_ASSERT(L5.getAddress() == 0);
printf("id=%d %d %d %d %d\n", L1.getId(), L2.getId(), L3.getId(), L4.getId(), L5.getId());
}
}

View file

@ -73,7 +73,6 @@ const uint64 YMM_ER = 1ULL << 36;
const uint64 VM32Y_K = 1ULL << 37; const uint64 VM32Y_K = 1ULL << 37;
const uint64 IMM_2 = 1ULL << 38; const uint64 IMM_2 = 1ULL << 38;
const uint64 IMM = IMM_1 | IMM_2; const uint64 IMM = IMM_1 | IMM_2;
const uint64 XMM = _XMM | _XMM2;
const uint64 YMM = _YMM | _YMM2; const uint64 YMM = _YMM | _YMM2;
const uint64 K = 1ULL << 43; const uint64 K = 1ULL << 43;
const uint64 _ZMM = 1ULL << 44; const uint64 _ZMM = 1ULL << 44;
@ -90,7 +89,10 @@ const uint64 ZMM_SAE = 1ULL << 48;
const uint64 ZMM_ER = 1ULL << 49; const uint64 ZMM_ER = 1ULL << 49;
#ifdef XBYAK64 #ifdef XBYAK64
const uint64 _XMM3 = 1ULL << 50; const uint64 _XMM3 = 1ULL << 50;
#else
const uint64 _XMM3 = 0;
#endif #endif
const uint64 XMM = _XMM | _XMM2 | _XMM3;
const uint64 XMM_SAE = 1ULL << 51; const uint64 XMM_SAE = 1ULL << 51;
#ifdef XBYAK64 #ifdef XBYAK64
const uint64 XMM_KZ = 1ULL << 52; const uint64 XMM_KZ = 1ULL << 52;
@ -352,7 +354,8 @@ class Test {
case VM32Y_K: case VM32Y_K:
return isXbyak_ ? "ptr [64+ymm13*2+r13] | k6" : "[64+ymm13*2+r13]{k6}"; return isXbyak_ ? "ptr [64+ymm13*2+r13] | k6" : "[64+ymm13*2+r13]{k6}";
case VM32Z_K: case VM32Z_K:
return isXbyak_ ? "ptr [64+zmm13*2+r13] | k6" : "[64+zmm13*2+r13]{k6}"; if (idx & 1) return isXbyak_ ? "ptr [64+zmm10*8+r9] | k6" : "[64+zmm10*8+r9]{k6}";
return isXbyak_ ? "ptr [64+zmm30*2+r13] | k6" : "[64+zmm30*2+r13]{k6}";
case VM32Z: case VM32Z:
return isXbyak_ ? "ptr [64+zmm13*2+rcx]" : "[64+zmm13*2+rcx]"; return isXbyak_ ? "ptr [64+zmm13*2+rcx]" : "[64+zmm13*2+rcx]";
case M_1to2: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to2}"; case M_1to2: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to2}";
@ -607,7 +610,7 @@ public:
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i]; const Tbl *p = &tbl[i];
put(p->name, K, _XMM, _XMM | MEM, IMM8); put(p->name, K, XMM, _XMM | MEM, IMM8);
if (!p->supportYMM) continue; if (!p->supportYMM) continue;
put(p->name, K, _YMM, _YMM | MEM, IMM8); put(p->name, K, _YMM, _YMM | MEM, IMM8);
put(p->name, K, _ZMM, _ZMM | MEM, IMM8); put(p->name, K, _ZMM, _ZMM | MEM, IMM8);
@ -626,10 +629,10 @@ public:
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl *p = &tbl[i]; const Tbl *p = &tbl[i];
put(p->name, XMM | _XMM3, XMM_SAE | XMM | MEM); put(p->name, XMM, XMM_SAE | XMM | MEM);
} }
} }
put("vcomiss", _XMM3, XMM | MEM); put("vcomiss", XMM, _XMM3 | MEM);
put("vcomiss", XMM, XMM_SAE); put("vcomiss", XMM, XMM_SAE);
#endif #endif
} }
@ -673,10 +676,10 @@ public:
"vpbroadcastq", "vpbroadcastq",
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
put(tbl[i], XMM_KZ | ZMM_KZ, _XMM | _MEM); put(tbl[i], XMM_KZ | ZMM_KZ, XMM | _MEM);
} }
} }
put("vbroadcasti32x2", XMM_KZ | YMM_KZ | ZMM_KZ, _XMM | _MEM); put("vbroadcasti32x2", XMM_KZ | YMM_KZ | ZMM_KZ, XMM | _MEM);
put("vbroadcasti32x4", YMM_KZ | ZMM_KZ, _MEM); put("vbroadcasti32x4", YMM_KZ | ZMM_KZ, _MEM);
put("vbroadcasti64x2", YMM_KZ | ZMM_KZ, _MEM); put("vbroadcasti64x2", YMM_KZ | ZMM_KZ, _MEM);
put("vbroadcasti32x8", ZMM_KZ, _MEM); put("vbroadcasti32x8", ZMM_KZ, _MEM);
@ -684,14 +687,14 @@ public:
} }
void putMisc1() void putMisc1()
{ {
put("vmaskmovps", XMM, XMM, MEM); put("vmaskmovps", _XMM, _XMM, MEM);
put("vmaskmovps", YMM, YMM, MEM); put("vmaskmovps", YMM, YMM, MEM);
put("vmaskmovpd", YMM, YMM, MEM); put("vmaskmovpd", YMM, YMM, MEM);
put("vmaskmovpd", XMM, XMM, MEM); put("vmaskmovpd", _XMM, _XMM, MEM);
put("vmaskmovps", MEM, XMM, XMM); put("vmaskmovps", MEM, _XMM, _XMM);
put("vmaskmovpd", MEM, XMM, XMM); put("vmaskmovpd", MEM, _XMM, _XMM);
put("vbroadcastf128", YMM, MEM); put("vbroadcastf128", YMM, MEM);
put("vbroadcasti128", YMM, MEM); put("vbroadcasti128", YMM, MEM);
@ -710,8 +713,8 @@ public:
} }
} }
put("vinsertf128", YMM, YMM, XMM | MEM, IMM8); put("vinsertf128", YMM, YMM, _XMM | _XMM2 | MEM, IMM8);
put("vinserti128", YMM, YMM, XMM | MEM, IMM8); put("vinserti128", YMM, YMM, _XMM | _XMM2 | MEM, IMM8);
put("vperm2f128", YMM, YMM, YMM | MEM, IMM8); put("vperm2f128", YMM, YMM, YMM | MEM, IMM8);
put("vperm2i128", YMM, YMM, YMM | MEM, IMM8); put("vperm2i128", YMM, YMM, YMM | MEM, IMM8);
@ -721,9 +724,9 @@ public:
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const char *name = tbl[i]; const char *name = tbl[i];
put(name, XMM, XMM, MEM); put(name, _XMM, _XMM, MEM);
put(name, YMM, YMM, MEM); put(name, YMM, YMM, MEM);
put(name, MEM, XMM, XMM); put(name, MEM, _XMM, _XMM);
put(name, MEM, YMM, YMM); put(name, MEM, YMM, YMM);
} }
} }
@ -760,29 +763,29 @@ public:
put(name, MEM, ZMM); put(name, MEM, ZMM);
put(name, ZMM, MEM); put(name, ZMM, MEM);
#ifdef XBYAK64 #ifdef XBYAK64
put(name, MEM, _XMM3); put(name, MEM, XMM);
put(name, _XMM3, MEM); put(name, XMM, MEM);
#endif #endif
} }
} }
void put_vmov() void put_vmov()
{ {
#ifdef XBYAK64 #ifdef XBYAK64
put("vmovd", _XMM3, MEM|REG32); put("vmovd", XMM, MEM|REG32);
put("vmovd", MEM|REG32, _XMM3); put("vmovd", MEM|REG32, XMM);
put("vmovq", _XMM3, MEM|REG64|XMM); put("vmovq", XMM, MEM|REG64|XMM);
put("vmovq", MEM|REG64|XMM, _XMM3); put("vmovq", MEM|REG64|XMM, XMM);
put("vmovhlps", _XMM3, _XMM3, _XMM3); put("vmovhlps", XMM, _XMM3, _XMM3);
put("vmovlhps", _XMM3, _XMM3, _XMM3); put("vmovlhps", XMM, _XMM3, _XMM3);
put("vmovntdqa", _XMM3|_YMM3|ZMM, MEM); put("vmovntdqa", XMM|_YMM3|ZMM, MEM);
put("vmovntdq", MEM, _XMM3 | _YMM3 | ZMM); put("vmovntdq", MEM, XMM | _YMM3 | ZMM);
put("vmovntpd", MEM, _XMM3 | _YMM3 | ZMM); put("vmovntpd", MEM, XMM | _YMM3 | ZMM);
put("vmovntps", MEM, _XMM3 | _YMM3 | ZMM); put("vmovntps", MEM, XMM | _YMM3 | ZMM);
put("vmovsd", XMM_KZ, _XMM3, _XMM3); put("vmovsd", XMM_KZ, XMM, _XMM3);
put("vmovsd", XMM_KZ, MEM); put("vmovsd", XMM_KZ, MEM);
put("vmovsd", MEM_K, XMM); put("vmovsd", MEM_K, XMM);
put("vmovss", XMM_KZ, _XMM3, _XMM3); put("vmovss", XMM_KZ, XMM, _XMM3);
put("vmovss", XMM_KZ, MEM); put("vmovss", XMM_KZ, MEM);
put("vmovss", MEM_K, XMM); put("vmovss", MEM_K, XMM);
@ -797,7 +800,7 @@ public:
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const char *name = tbl[i]; const char *name = tbl[i];
put(name, XMM_KZ, _XMM, _XMM | MEM, IMM); put(name, XMM_KZ, XMM, _XMM | MEM, IMM);
put(name, _YMM3, _YMM3, _YMM3 | _MEM, IMM); put(name, _YMM3, _YMM3, _YMM3 | _MEM, IMM);
put(name, _ZMM, _ZMM, _ZMM | _MEM, IMM); put(name, _ZMM, _ZMM, _ZMM | _MEM, IMM);
} }
@ -810,7 +813,7 @@ public:
"vmovlps", "vmovlps",
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
put(tbl[i], _XMM3, _XMM3, MEM); put(tbl[i], XMM, _XMM3, MEM);
put(tbl[i], MEM, _XMM3); put(tbl[i], MEM, _XMM3);
} }
} }
@ -836,11 +839,11 @@ public:
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i]; const Tbl& p = tbl[i];
put(p.name, _XMM|XMM_KZ, _XMM|MEM); put(p.name, XMM|XMM_KZ, _XMM|MEM);
put(p.name, _YMM|YMM_KZ, _YMM|MEM); put(p.name, _YMM|YMM_KZ, _YMM|MEM);
put(p.name, _ZMM|ZMM_KZ, _ZMM|MEM); put(p.name, _ZMM|ZMM_KZ, _ZMM|MEM);
if (!p.M_X) continue; if (!p.M_X) continue;
put(p.name, MEM|MEM_K, _XMM); put(p.name, MEM|MEM_K, XMM);
put(p.name, MEM|MEM_K, _YMM); put(p.name, MEM|MEM_K, _YMM);
put(p.name, MEM|MEM_K, _ZMM); put(p.name, MEM|MEM_K, _ZMM);
} }
@ -857,7 +860,7 @@ public:
put("vpabsd", ZMM_KZ, M_1to16 | _MEM); put("vpabsd", ZMM_KZ, M_1to16 | _MEM);
put("vpabsq", ZMM_KZ, M_1to8 | _MEM); put("vpabsq", ZMM_KZ, M_1to8 | _MEM);
put("vbroadcastf32x2", YMM_KZ | ZMM_KZ, _XMM | _MEM); put("vbroadcastf32x2", YMM_KZ | ZMM_KZ, XMM | _MEM);
put("vbroadcastf32x4", YMM_KZ | ZMM_KZ, _MEM); put("vbroadcastf32x4", YMM_KZ | ZMM_KZ, _MEM);
put("vbroadcastf64x2", YMM_KZ | ZMM_KZ, _MEM); put("vbroadcastf64x2", YMM_KZ | ZMM_KZ, _MEM);
@ -879,7 +882,7 @@ public:
}; };
for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
const Tbl& p = tbl[i]; const Tbl& p = tbl[i];
put(p.name, XMM_KZ, _XMM, _XMM|p.mem); put(p.name, XMM_KZ, XMM, _XMM|p.mem);
} }
} }
void put512_X3() void put512_X3()
@ -891,54 +894,54 @@ public:
uint64_t x2; uint64_t x2;
uint64_t xm; uint64_t xm;
} tbl[] = { } tbl[] = {
{ "vpacksswb", XMM_KZ, _XMM, _XMM | _MEM }, { "vpacksswb", XMM_KZ, XMM, _XMM | _MEM },
{ "vpacksswb", YMM_KZ, _YMM, _YMM | _MEM }, { "vpacksswb", YMM_KZ, _YMM, _YMM | _MEM },
{ "vpacksswb", ZMM_KZ, _ZMM, _ZMM | _MEM }, { "vpacksswb", ZMM_KZ, _ZMM, _ZMM | _MEM },
{ "vpackssdw", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, { "vpackssdw", XMM_KZ, XMM, _XMM | M_1to4 | _MEM },
{ "vpackssdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM }, { "vpackssdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM },
{ "vpackssdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM }, { "vpackssdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
{ "vpackusdw", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, { "vpackusdw", XMM_KZ, XMM, _XMM | M_1to4 | _MEM },
{ "vpackusdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM }, { "vpackusdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM },
{ "vpackusdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM }, { "vpackusdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
{ "vpackuswb", XMM_KZ, _XMM, _XMM | _MEM }, { "vpackuswb", XMM_KZ, XMM, _XMM | _MEM },
{ "vpackuswb", YMM_KZ, _YMM, _YMM | _MEM }, { "vpackuswb", YMM_KZ, _YMM, _YMM | _MEM },
{ "vpackuswb", ZMM_KZ, _ZMM, _ZMM | _MEM }, { "vpackuswb", ZMM_KZ, _ZMM, _ZMM | _MEM },
{ "vpaddb", XMM_KZ, _XMM, _XMM | _MEM }, { "vpaddb", XMM_KZ, XMM, _XMM | _MEM },
{ "vpaddw", XMM_KZ, _XMM, _XMM | _MEM }, { "vpaddw", XMM_KZ, _XMM, _XMM | _MEM },
{ "vpaddd", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, { "vpaddd", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
{ "vpaddq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, { "vpaddq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
{ "vpaddsb", XMM_KZ, _XMM, _XMM | _MEM }, { "vpaddsb", XMM_KZ, XMM, _XMM | _MEM },
{ "vpaddsb", ZMM_KZ, _ZMM, _ZMM | _MEM }, { "vpaddsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
{ "vpaddsw", XMM_KZ, _XMM, _XMM | _MEM }, { "vpaddsw", XMM_KZ, XMM, _XMM | _MEM },
{ "vpaddsw", ZMM_KZ, _ZMM, _ZMM | _MEM }, { "vpaddsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
{ "vpaddusb", XMM_KZ, _XMM, _XMM | MEM }, { "vpaddusb", XMM_KZ, XMM, _XMM | MEM },
{ "vpaddusb", ZMM_KZ, _ZMM, _ZMM | MEM }, { "vpaddusb", ZMM_KZ, _ZMM, _ZMM | MEM },
{ "vpaddusw", XMM_KZ, _XMM, _XMM | MEM }, { "vpaddusw", XMM_KZ, XMM, _XMM | MEM },
{ "vpaddusw", ZMM_KZ, _ZMM, _ZMM | MEM }, { "vpaddusw", ZMM_KZ, _ZMM, _ZMM | MEM },
{ "vpsubb", XMM_KZ, _XMM, _XMM | _MEM }, { "vpsubb", XMM_KZ, XMM, _XMM | _MEM },
{ "vpsubw", XMM_KZ, _XMM, _XMM | _MEM }, { "vpsubw", XMM_KZ, XMM, _XMM | _MEM },
{ "vpsubd", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, { "vpsubd", XMM_KZ, XMM, _XMM | M_1to4 | _MEM },
{ "vpsubq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, { "vpsubq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
{ "vpsubsb", XMM_KZ, _XMM, _XMM | _MEM }, { "vpsubsb", XMM_KZ, XMM, _XMM | _MEM },
{ "vpsubsb", ZMM_KZ, _ZMM, _ZMM | _MEM }, { "vpsubsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
{ "vpsubsw", XMM_KZ, _XMM, _XMM | _MEM }, { "vpsubsw", XMM_KZ, XMM, _XMM | _MEM },
{ "vpsubsw", ZMM_KZ, _ZMM, _ZMM | _MEM }, { "vpsubsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
{ "vpsubusb", XMM_KZ, _XMM, _XMM | MEM }, { "vpsubusb", XMM_KZ, XMM, _XMM | MEM },
{ "vpsubusb", ZMM_KZ, _ZMM, _ZMM | MEM }, { "vpsubusb", ZMM_KZ, _ZMM, _ZMM | MEM },
{ "vpsubusw", XMM_KZ, _XMM, _XMM | MEM }, { "vpsubusw", XMM_KZ, XMM, _XMM | MEM },
{ "vpsubusw", ZMM_KZ, _ZMM, _ZMM | MEM }, { "vpsubusw", ZMM_KZ, _ZMM, _ZMM | MEM },
{ "vpandd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM }, { "vpandd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
@ -983,137 +986,137 @@ public:
{ "vpminud", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 }, { "vpminud", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
{ "vpminuq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 }, { "vpminuq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },
{ "vpslldq", _XMM3, _XMM3 | _MEM, IMM8 }, { "vpslldq", XMM, _XMM3 | _MEM, IMM8 },
{ "vpslldq", _YMM3, _YMM3 | _MEM, IMM8 }, { "vpslldq", _YMM3, _YMM3 | _MEM, IMM8 },
{ "vpslldq", _ZMM, _ZMM | _MEM, IMM8 }, { "vpslldq", _ZMM, _ZMM | _MEM, IMM8 },
{ "vpsrldq", _XMM3, _XMM3 | _MEM, IMM8 }, { "vpsrldq", XMM, _XMM3 | _MEM, IMM8 },
{ "vpsrldq", _YMM3, _YMM3 | _MEM, IMM8 }, { "vpsrldq", _YMM3, _YMM3 | _MEM, IMM8 },
{ "vpsrldq", _ZMM, _ZMM | _MEM, IMM8 }, { "vpsrldq", _ZMM, _ZMM | _MEM, IMM8 },
{ "vpsraw", XMM_KZ, _XMM | _MEM, IMM8 }, { "vpsraw", XMM_KZ, XMM | _MEM, IMM8 },
{ "vpsraw", ZMM_KZ, _ZMM | _MEM, IMM8 }, { "vpsraw", ZMM_KZ, _ZMM | _MEM, IMM8 },
{ "vpsrad", XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 }, { "vpsrad", XMM_KZ, XMM | M_1to4 | _MEM, IMM8 },
{ "vpsrad", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 }, { "vpsrad", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
{ "vpsraq", XMM, XMM, IMM8 }, { "vpsraq", XMM, XMM, IMM8 },
{ "vpsraq", XMM_KZ, _XMM | M_1to2 | _MEM, IMM8 }, { "vpsraq", XMM_KZ, XMM | M_1to2 | _MEM, IMM8 },
{ "vpsraq", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 }, { "vpsraq", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 },
{ "vpsllw", _XMM3, _XMM3 | _MEM, IMM8 }, { "vpsllw", XMM, _XMM3 | _MEM, IMM8 },
{ "vpslld", _XMM3, _XMM3 | _MEM | M_1to4, IMM8 }, { "vpslld", XMM, _XMM3 | _MEM | M_1to4, IMM8 },
{ "vpsllq", _XMM3, _XMM3 | _MEM | M_1to2, IMM8 }, { "vpsllq", XMM, _XMM3 | _MEM | M_1to2, IMM8 },
{ "vpsrlw", XMM_KZ, _XMM | _MEM, IMM8 }, { "vpsrlw", XMM_KZ, XMM | _MEM, IMM8 },
{ "vpsrlw", ZMM_KZ, _ZMM | _MEM, IMM8 }, { "vpsrlw", ZMM_KZ, _ZMM | _MEM, IMM8 },
{ "vpsrld", XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 }, { "vpsrld", XMM_KZ, XMM | M_1to4 | _MEM, IMM8 },
{ "vpsrld", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 }, { "vpsrld", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
{ "vpsrlq", _XMM3, _XMM3 | _MEM | M_1to2, IMM8 }, { "vpsrlq", XMM, _XMM3 | _MEM | M_1to2, IMM8 },
{ "vpsrlq", _ZMM, _ZMM | _MEM | M_1to8, IMM8 }, { "vpsrlq", _ZMM, _ZMM | _MEM | M_1to8, IMM8 },
{ "vpsravw", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, { "vpsravw", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsravw", _ZMM, _ZMM, _MEM }, { "vpsravw", _ZMM, _ZMM, _MEM },
{ "vpsravd", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, { "vpsravd", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsravd", _ZMM, _ZMM, M_1to16 | _MEM }, { "vpsravd", _ZMM, _ZMM, M_1to16 | _MEM },
{ "vpsravq", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, { "vpsravq", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsravq", _ZMM, _ZMM, M_1to8 | _MEM }, { "vpsravq", _ZMM, _ZMM, M_1to8 | _MEM },
{ "vpsllvw", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, { "vpsllvw", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsllvw", _ZMM, _ZMM, _MEM }, { "vpsllvw", _ZMM, _ZMM, _MEM },
{ "vpsllvd", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, { "vpsllvd", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsllvd", _ZMM, _ZMM, M_1to16 | _MEM }, { "vpsllvd", _ZMM, _ZMM, M_1to16 | _MEM },
{ "vpsllvq", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, { "vpsllvq", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsllvq", _ZMM, _ZMM, M_1to8 | _MEM }, { "vpsllvq", _ZMM, _ZMM, M_1to8 | _MEM },
{ "vpsrlvw", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, { "vpsrlvw", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsrlvw", _ZMM, _ZMM, _MEM }, { "vpsrlvw", _ZMM, _ZMM, _MEM },
{ "vpsrlvd", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, { "vpsrlvd", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsrlvd", _ZMM, _ZMM, M_1to16 | _MEM }, { "vpsrlvd", _ZMM, _ZMM, M_1to16 | _MEM },
{ "vpsrlvq", XMM_KZ | _XMM, _XMM, _XMM | _MEM }, { "vpsrlvq", XMM_KZ | XMM, _XMM, _XMM | _MEM },
{ "vpsrlvq", _ZMM, _ZMM, M_1to8 | _MEM }, { "vpsrlvq", _ZMM, _ZMM, M_1to8 | _MEM },
{ "vpshufb", _XMM | XMM_KZ, _XMM, _XMM | _MEM }, { "vpshufb", XMM | XMM_KZ, _XMM, _XMM | _MEM },
{ "vpshufb", ZMM_KZ, _ZMM, _MEM }, { "vpshufb", ZMM_KZ, _ZMM, _MEM },
{ "vpshufhw", _XMM | XMM_KZ, _XMM | _MEM, IMM8 }, { "vpshufhw", XMM | XMM_KZ, _XMM | _MEM, IMM8 },
{ "vpshufhw", ZMM_KZ, _MEM, IMM8 }, { "vpshufhw", ZMM_KZ, _MEM, IMM8 },
{ "vpshuflw", _XMM | XMM_KZ, _XMM | _MEM, IMM8 }, { "vpshuflw", XMM | XMM_KZ, _XMM | _MEM, IMM8 },
{ "vpshuflw", ZMM_KZ, _MEM, IMM8 }, { "vpshuflw", ZMM_KZ, _MEM, IMM8 },
{ "vpshufd", _XMM | XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 }, { "vpshufd", XMM | XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
{ "vpshufd", _ZMM | ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 }, { "vpshufd", _ZMM | ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
{ "vpord", _XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, { "vpord", XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
{ "vpord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM }, { "vpord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM },
{ "vporq", _XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM }, { "vporq", XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
{ "vporq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM }, { "vporq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM },
{ "vpxord", _XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, { "vpxord", XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
{ "vpxord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM }, { "vpxord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM },
{ "vpxorq", _XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM }, { "vpxorq", XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
{ "vpxorq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM }, { "vpxorq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM },
{ "vpsadbw", _XMM3, _XMM, _XMM | _MEM }, { "vpsadbw", XMM, _XMM, _XMM | _MEM },
{ "vpsadbw", _ZMM, _ZMM, _MEM }, { "vpsadbw", _ZMM, _ZMM, _MEM },
{ "vpmuldq", _XMM3, _XMM, _XMM | M_1to2 | _MEM }, { "vpmuldq", XMM, _XMM, _XMM | M_1to2 | _MEM },
{ "vpmuldq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, { "vpmuldq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
{ "vpmulhrsw", _XMM3, _XMM, _XMM | _MEM }, { "vpmulhrsw", XMM, _XMM, _XMM | _MEM },
{ "vpmulhrsw", ZMM_KZ, _ZMM, _MEM }, { "vpmulhrsw", ZMM_KZ, _ZMM, _MEM },
{ "vpmulhuw", _XMM3, _XMM, _XMM | _MEM }, { "vpmulhuw", XMM, _XMM, _XMM | _MEM },
{ "vpmulhuw", ZMM_KZ, _ZMM, _MEM }, { "vpmulhuw", ZMM_KZ, _ZMM, _MEM },
{ "vpmulhw", _XMM3, _XMM, _XMM | _MEM }, { "vpmulhw", XMM, _XMM, _XMM | _MEM },
{ "vpmulhw", ZMM_KZ, _ZMM, _MEM }, { "vpmulhw", ZMM_KZ, _ZMM, _MEM },
{ "vpmullw", _XMM3, _XMM, _XMM | _MEM }, { "vpmullw", XMM, _XMM, _XMM | _MEM },
{ "vpmullw", ZMM_KZ, _ZMM, _MEM }, { "vpmullw", ZMM_KZ, _ZMM, _MEM },
{ "vpmulld", _XMM3, _XMM, M_1to4 | _MEM }, { "vpmulld", XMM, _XMM, M_1to4 | _MEM },
{ "vpmulld", ZMM_KZ, _ZMM, M_1to16 | _MEM }, { "vpmulld", ZMM_KZ, _ZMM, M_1to16 | _MEM },
{ "vpmullq", _XMM3, _XMM, M_1to2 | _MEM }, { "vpmullq", XMM, _XMM, M_1to2 | _MEM },
{ "vpmullq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, { "vpmullq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
{ "vpmuludq", _XMM3, _XMM, M_1to2 | _MEM }, { "vpmuludq", XMM, _XMM, M_1to2 | _MEM },
{ "vpmuludq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, { "vpmuludq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
{ "vpunpckhbw", _XMM3, _XMM, _XMM | _MEM }, { "vpunpckhbw", XMM, _XMM, _XMM | _MEM },
{ "vpunpckhbw", _ZMM, _ZMM, _MEM }, { "vpunpckhbw", _ZMM, _ZMM, _MEM },
{ "vpunpckhwd", _XMM3, _XMM, _XMM | _MEM }, { "vpunpckhwd", XMM, _XMM, _XMM | _MEM },
{ "vpunpckhwd", _ZMM, _ZMM, _MEM }, { "vpunpckhwd", _ZMM, _ZMM, _MEM },
{ "vpunpckhdq", _XMM3, _XMM, M_1to4 | _MEM }, { "vpunpckhdq", XMM, _XMM, M_1to4 | _MEM },
{ "vpunpckhdq", _ZMM, _ZMM, M_1to16 | _MEM }, { "vpunpckhdq", _ZMM, _ZMM, M_1to16 | _MEM },
{ "vpunpckhqdq", _XMM3, _XMM, M_1to2 | _MEM }, { "vpunpckhqdq", XMM, _XMM, M_1to2 | _MEM },
{ "vpunpckhqdq", _ZMM, _ZMM, M_1to8 | _MEM }, { "vpunpckhqdq", _ZMM, _ZMM, M_1to8 | _MEM },
{ "vpunpcklbw", _XMM3, _XMM, _XMM | _MEM }, { "vpunpcklbw", XMM, _XMM, _XMM | _MEM },
{ "vpunpcklbw", _ZMM, _ZMM, _MEM }, { "vpunpcklbw", _ZMM, _ZMM, _MEM },
{ "vpunpcklwd", _XMM3, _XMM, _XMM | _MEM }, { "vpunpcklwd", XMM, _XMM, _XMM | _MEM },
{ "vpunpcklwd", _ZMM, _ZMM, _MEM }, { "vpunpcklwd", _ZMM, _ZMM, _MEM },
{ "vpunpckldq", _XMM3, _XMM, M_1to4 | _MEM }, { "vpunpckldq", XMM, _XMM, M_1to4 | _MEM },
{ "vpunpckldq", _ZMM, _ZMM, M_1to16 | _MEM }, { "vpunpckldq", _ZMM, _ZMM, M_1to16 | _MEM },
{ "vpunpcklqdq", _XMM3, _XMM, M_1to2 | _MEM }, { "vpunpcklqdq", XMM, _XMM, M_1to2 | _MEM },
{ "vpunpcklqdq", _ZMM, _ZMM, M_1to8 | _MEM }, { "vpunpcklqdq", _ZMM, _ZMM, M_1to8 | _MEM },
{ "vextractf32x4", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 }, { "vextractf32x4", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
@ -1126,7 +1129,7 @@ public:
{ "vextracti32x8", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 }, { "vextracti32x8", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
{ "vextracti64x4", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 }, { "vextracti64x4", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
{ "vextractps", REG32 | _MEM, _XMM3, IMM8 }, { "vextractps", REG32 | _MEM, XMM, IMM8 },
{ "vpermb", XMM_KZ, _XMM, _XMM | _MEM }, { "vpermb", XMM_KZ, _XMM, _XMM | _MEM },
{ "vpermb", ZMM_KZ, _ZMM, _ZMM | _MEM }, { "vpermb", ZMM_KZ, _ZMM, _ZMM | _MEM },
@ -1175,7 +1178,7 @@ public:
uint64_t xm; uint64_t xm;
} tbl[] = { } tbl[] = {
#ifdef XBYAK64 #ifdef XBYAK64
{ "vinsertps", _XMM, _XMM, _XMM3 | _MEM }, { "vinsertps", XMM, _XMM, _XMM3 | _MEM },
{ "vshufpd", XMM_KZ, _XMM, M_1to2 | _MEM }, { "vshufpd", XMM_KZ, _XMM, M_1to2 | _MEM },
{ "vshufpd", ZMM_KZ, _ZMM, M_1to8 | _MEM }, { "vshufpd", ZMM_KZ, _ZMM, M_1to8 | _MEM },
@ -1208,14 +1211,14 @@ public:
put(p.name, p.x1, p.x2, p.xm, IMM8); put(p.name, p.x1, p.x2, p.xm, IMM8);
} }
#ifdef XBYAK64 #ifdef XBYAK64
put("vpextrb", _REG64 | _MEM, _XMM3, IMM8); put("vpextrb", _REG64 | _MEM, XMM, IMM8);
put("vpextrw", _REG64 | _MEM, _XMM3, IMM8); put("vpextrw", _REG64 | _MEM, XMM, IMM8);
put("vpextrd", _REG32 | _MEM, _XMM3, IMM8); put("vpextrd", _REG32 | _MEM, XMM, IMM8);
put("vpextrq", _REG64 | _MEM, _XMM3, IMM8); put("vpextrq", _REG64 | _MEM, XMM, IMM8);
put("vpinsrb", _XMM3, _XMM3, _REG32 | _MEM, IMM8); put("vpinsrb", XMM, _XMM3, _REG32 | _MEM, IMM8);
put("vpinsrw", _XMM3, _XMM3, _REG32 | _MEM, IMM8); put("vpinsrw", XMM, _XMM3, _REG32 | _MEM, IMM8);
put("vpinsrd", _XMM3, _XMM3, _REG32 | _MEM, IMM8); put("vpinsrd", XMM, _XMM3, _REG32 | _MEM, IMM8);
put("vpinsrq", _XMM3, _XMM3, _REG64 | _MEM, IMM8); put("vpinsrq", XMM, _XMM3, _REG64 | _MEM, IMM8);
#endif #endif
} }
void put512_FMA() void put512_FMA()
@ -1345,7 +1348,7 @@ public:
} else if (suf == "ps") { } else if (suf == "ps") {
mem = M_1to4; mem = M_1to4;
} }
put(p, _XMM3 | XMM_KZ, _XMM, mem | _MEM); put(p, XMM | XMM_KZ, _XMM, mem | _MEM);
if (!sufTbl[j].supportYMM) continue; if (!sufTbl[j].supportYMM) continue;
mem = 0; mem = 0;
if (suf == "pd") { if (suf == "pd") {
@ -1466,23 +1469,23 @@ public:
put("vcvtqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4); put("vcvtqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4);
put("vcvtqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER); put("vcvtqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);
put("vcvtsd2si", REG32 | REG64, _XMM3 | _MEM | XMM_ER); put("vcvtsd2si", REG32 | REG64, XMM | _MEM | XMM_ER);
put("vcvtsd2usi", REG32 | REG64, _XMM3 | _MEM | XMM_ER); put("vcvtsd2usi", REG32 | REG64, XMM | _MEM | XMM_ER);
put("vcvtsd2ss", XMM_KZ, _XMM3, _XMM3 | _MEM | XMM_ER); put("vcvtsd2ss", XMM_KZ, XMM, _XMM3 | _MEM | XMM_ER);
put("vcvtsi2sd", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64); put("vcvtsi2sd", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64);
put("vcvtsi2sd", XMM, XMM_ER, REG64); put("vcvtsi2sd", XMM, XMM_ER, REG64);
put("vcvtsi2ss", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64); put("vcvtsi2ss", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64);
put("vcvtsi2ss", XMM, XMM_ER, REG32 | REG64); put("vcvtsi2ss", XMM, XMM_ER, REG32 | REG64);
put("vcvtss2sd", XMM_KZ, _XMM3, _XMM3 | _MEM | XMM_SAE); put("vcvtss2sd", XMM_KZ, XMM, _XMM3 | _MEM | XMM_SAE);
put("vcvtss2si", REG32 | REG64, _XMM3 | _MEM | XMM_ER); put("vcvtss2si", REG32 | REG64, XMM | _MEM | XMM_ER);
put("vcvtss2usi", REG32 | REG64, _XMM3 | _MEM | XMM_ER); put("vcvtss2usi", REG32 | REG64, XMM | _MEM | XMM_ER);
put("vcvtpd2dq", XMM_KZ, _XMM | M_xword | M_1to2); put("vcvtpd2dq", XMM_KZ, _XMM | M_xword | M_1to2);
put("vcvtpd2dq", XMM_KZ, _YMM | M_yword | MY_1to4); put("vcvtpd2dq", XMM_KZ, _YMM | M_yword | MY_1to4);
@ -1516,13 +1519,13 @@ public:
put("vcvttps2uqq", YMM_KZ, _XMM | _MEM | M_1to4); put("vcvttps2uqq", YMM_KZ, _XMM | _MEM | M_1to4);
put("vcvttps2uqq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE); put("vcvttps2uqq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE);
put("vcvttsd2si", REG32 | REG64, _XMM3 | _MEM | XMM_SAE); put("vcvttsd2si", REG32 | REG64, XMM | _MEM | XMM_SAE);
put("vcvttsd2usi", REG32 | REG64, _XMM3 | _MEM | XMM_SAE); put("vcvttsd2usi", REG32 | REG64, XMM | _MEM | XMM_SAE);
put("vcvttss2si", REG32 | REG64, _XMM3 | _MEM | XMM_SAE); put("vcvttss2si", REG32 | REG64, XMM | _MEM | XMM_SAE);
put("vcvttss2usi", REG32 | REG64, _XMM3 | _MEM | XMM_SAE); put("vcvttss2usi", REG32 | REG64, XMM | _MEM | XMM_SAE);
put("vcvtudq2pd", XMM_KZ, _XMM | _MEM | M_1to2); put("vcvtudq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
put("vcvtudq2pd", YMM_KZ, _XMM | _MEM | M_1to4); put("vcvtudq2pd", YMM_KZ, _XMM | _MEM | M_1to4);
@ -1540,10 +1543,10 @@ public:
put("vcvtuqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4); put("vcvtuqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4);
put("vcvtuqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER); put("vcvtuqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);
put("vcvtusi2sd", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64); put("vcvtusi2sd", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64);
put("vcvtusi2sd", XMM, XMM_ER, REG64); put("vcvtusi2sd", XMM, XMM_ER, REG64);
put("vcvtusi2ss", _XMM3, _XMM3, REG32 | REG64 | MEM32 | MEM64); put("vcvtusi2ss", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64);
put("vcvtusi2ss", XMM, XMM_ER, REG32 | REG64); put("vcvtusi2ss", XMM, XMM_ER, REG32 | REG64);
#endif #endif
} }

View file

@ -40,8 +40,8 @@ struct Code : Xbyak::CodeGenerator {
cmpss(xmm0, ptr[rip + label], 0); cmpss(xmm0, ptr[rip + label], 0);
test(dword[rip + label], 33); test(dword[rip + label], 33);
bt(dword[rip + label ], 3); bt(dword[rip + label ], 3);
vblendpd(xmm0, dword[rip + label], 3); vblendpd(xmm0, xmm0, dword[rip + label], 3);
vpalignr(xmm0, qword[rip + label], 4); vpalignr(xmm0, xmm0, qword[rip + label], 4);
vextractf128(dword[rip + label], ymm3, 12); vextractf128(dword[rip + label], ymm3, 12);
vperm2i128(ymm0, ymm1, qword[rip + label], 13); vperm2i128(ymm0, ymm1, qword[rip + label], 13);
vcvtps2ph(ptr[rip + label], xmm2, 44); vcvtps2ph(ptr[rip + label], xmm2, 44);

View file

@ -129,6 +129,55 @@ struct Code : public Xbyak::CodeGenerator {
add(rax, sf.p[2]); add(rax, sf.p[2]);
add(rax, sf.p[3]); add(rax, sf.p[3]);
} }
/*
int64_t f(const int64_t a[13]) { return sum-of-a[]; }
Uses a StackFrame with 1 parameter and 13 temporary registers:
every element is first loaded into its own temporary, then all
temporaries are added into rax.
*/
void gen13()
{
StackFrame sf(this, 1, 13);
// load a[i] into temporary i (8-byte elements)
for (int i = 0; i < 13; i++) {
mov(sf.t[i], ptr[sf.p[0] + i * 8]);
}
// accumulate the 13 temporaries in the return register
mov(rax, sf.t[0]);
for (int i = 1; i < 13; i++) {
add(rax, sf.t[i]);
}
}
/*
same as gen13, but the 13 working registers are made of
11 StackFrame temporaries plus rcx and rdx, which are requested
with UseRCX | UseRDX and appended to a copy of the temporary pack
*/
void gen14()
{
StackFrame sf(this, 1, 11 | UseRCX | UseRDX);
// t = sf.t extended by rcx and rdx, 13 registers in total
Pack t = sf.t;
t.append(rcx);
t.append(rdx);
for (int i = 0; i < 13; i++) {
mov(t[i], ptr[sf.p[0] + i * 8]);
}
mov(rax, t[0]);
for (int i = 1; i < 13; i++) {
add(rax, t[i]);
}
}
/*
return (1 << 15) - 1;
Uses 14 StackFrame temporaries plus rax (15 registers) and a stack
area requested with the third StackFrame argument (8).
Register i is set to 1 << i and all of them are summed into [rsp].
*/
void gen15()
{
StackFrame sf(this, 0, 14, 8);
Pack t = sf.t;
t.append(rax);
for (int i = 0; i < 15; i++) {
mov(t[i], 1 << i);
}
// accumulate in memory because rax itself is one of the 15 source registers
mov(qword[rsp], 0);
for (int i = 0; i < 15; i++) {
add(ptr[rsp], t[i]);
}
mov(rax, ptr[rsp]);
}
}; };
struct Code2 : Xbyak::CodeGenerator { struct Code2 : Xbyak::CodeGenerator {
@ -152,8 +201,14 @@ struct Code2 : Xbyak::CodeGenerator {
add(rax, sf.p[i]); add(rax, sf.p[i]);
} }
} }
/*
	Generates a function that returns the value of rsp as seen inside a
	StackFrame built from the given parameter count, temporary count
	(possibly combined with option flags) and stack size in bytes.
*/
void gen2(int pNum, int tNum, int stackSizeByte)
{
StackFrame sf(this, pNum, tNum, stackSizeByte);
mov(rax, rsp);
}
}; };
static int errNum = 0; static int errNum = 0;
void check(int x, int y) void check(int x, int y)
{ {
@ -167,19 +222,19 @@ void verify(const Xbyak::uint8 *f, int pNum)
{ {
switch (pNum) { switch (pNum) {
case 0: case 0:
check(1, Xbyak::CastTo<int (*)()>(f)()); check(1, reinterpret_cast<int (*)()>(f)());
return; return;
case 1: case 1:
check(11, Xbyak::CastTo<int (*)(int)>(f)(10)); check(11, reinterpret_cast<int (*)(int)>(f)(10));
return; return;
case 2: case 2:
check(111, Xbyak::CastTo<int (*)(int, int)>(f)(10, 100)); check(111, reinterpret_cast<int (*)(int, int)>(f)(10, 100));
return; return;
case 3: case 3:
check(1111, Xbyak::CastTo<int (*)(int, int, int)>(f)(10, 100, 1000)); check(1111, reinterpret_cast<int (*)(int, int, int)>(f)(10, 100, 1000));
return; return;
case 4: case 4:
check(11111, Xbyak::CastTo<int (*)(int, int, int, int)>(f)(10, 100, 1000, 10000)); check(11111, reinterpret_cast<int (*)(int, int, int, int)>(f)(10, 100, 1000, 10000));
return; return;
default: default:
printf("ERR pNum=%d\n", pNum); printf("ERR pNum=%d\n", pNum);
@ -212,6 +267,15 @@ void testAll()
const Xbyak::uint8 *f = code.getCurr(); const Xbyak::uint8 *f = code.getCurr();
code.gen(pNum, tNum | opt, stackSize); code.gen(pNum, tNum | opt, stackSize);
verify(f, pNum); verify(f, pNum);
/*
check rsp is 16-byte aligned if stackSize > 0
*/
if (stackSize > 0) {
Code2 c2;
c2.gen2(pNum, tNum | opt, stackSize);
uint64_t addr = c2.getCode<uint64_t (*)()>()();
check(addr % 16, 0);
}
} }
} }
} }
@ -268,6 +332,20 @@ void testPartial()
int (*f12)(int, int, int, int) = code.getCurr<int (*)(int, int, int, int)>(); int (*f12)(int, int, int, int) = code.getCurr<int (*)(int, int, int, int)>();
code.gen12(); code.gen12();
check(24, f12(3, 5, 7, 9)); check(24, f12(3, 5, 7, 9));
{
int64_t tbl[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 };
int64_t (*f13)(const int64_t*) = code.getCurr<int64_t (*)(const int64_t*)>();
code.gen13();
check(91, f13(tbl));
int64_t (*f14)(const int64_t*) = code.getCurr<int64_t (*)(const int64_t*)>();
code.gen14();
check(91, f14(tbl));
}
int (*f15)() = code.getCurr<int (*)()>();
code.gen15();
check((1 << 15) - 1, f15());
} }
void put(const Xbyak::util::Pack& p) void put(const Xbyak::util::Pack& p)

View file

@ -40,6 +40,8 @@
// This covers -std=(gnu|c)++(0x|11|1y), -stdlib=libc++, and modern Microsoft. // This covers -std=(gnu|c)++(0x|11|1y), -stdlib=libc++, and modern Microsoft.
#if ((defined(_MSC_VER) && (_MSC_VER >= 1600)) || defined(_LIBCPP_VERSION) ||\ #if ((defined(_MSC_VER) && (_MSC_VER >= 1600)) || defined(_LIBCPP_VERSION) ||\
((__cplusplus >= 201103) || defined(__GXX_EXPERIMENTAL_CXX0X__))) ((__cplusplus >= 201103) || defined(__GXX_EXPERIMENTAL_CXX0X__)))
#include <unordered_set>
#define XBYAK_STD_UNORDERED_SET std::unordered_set
#include <unordered_map> #include <unordered_map>
#define XBYAK_STD_UNORDERED_MAP std::unordered_map #define XBYAK_STD_UNORDERED_MAP std::unordered_map
#define XBYAK_STD_UNORDERED_MULTIMAP std::unordered_multimap #define XBYAK_STD_UNORDERED_MULTIMAP std::unordered_multimap
@ -49,16 +51,22 @@
libstdcxx 20070719 (from GCC 4.2.1, the last GPL 2 version). libstdcxx 20070719 (from GCC 4.2.1, the last GPL 2 version).
*/ */
#elif XBYAK_GNUC_PREREQ(4, 5) || (XBYAK_GNUC_PREREQ(4, 2) && __GLIBCXX__ >= 20070719) || defined(__INTEL_COMPILER) || defined(__llvm__) #elif XBYAK_GNUC_PREREQ(4, 5) || (XBYAK_GNUC_PREREQ(4, 2) && __GLIBCXX__ >= 20070719) || defined(__INTEL_COMPILER) || defined(__llvm__)
#include <tr1/unordered_set>
#define XBYAK_STD_UNORDERED_SET std::tr1::unordered_set
#include <tr1/unordered_map> #include <tr1/unordered_map>
#define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map #define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map
#define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap #define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap
#elif defined(_MSC_VER) && (_MSC_VER >= 1500) && (_MSC_VER < 1600) #elif defined(_MSC_VER) && (_MSC_VER >= 1500) && (_MSC_VER < 1600)
#include <unordered_set>
#define XBYAK_STD_UNORDERED_SET std::tr1::unordered_set
#include <unordered_map> #include <unordered_map>
#define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map #define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map
#define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap #define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap
#else #else
#include <set>
#define XBYAK_STD_UNORDERED_SET std::set
#include <map> #include <map>
#define XBYAK_STD_UNORDERED_MAP std::map #define XBYAK_STD_UNORDERED_MAP std::map
#define XBYAK_STD_UNORDERED_MULTIMAP std::multimap #define XBYAK_STD_UNORDERED_MULTIMAP std::multimap
@ -105,7 +113,7 @@ namespace Xbyak {
enum { enum {
DEFAULT_MAX_CODE_SIZE = 4096, DEFAULT_MAX_CODE_SIZE = 4096,
VERSION = 0x5670 /* 0xABCD = A.BC(D) */ VERSION = 0x5770 /* 0xABCD = A.BC(D) */
}; };
#ifndef MIE_INTEGER_TYPE_DEFINED #ifndef MIE_INTEGER_TYPE_DEFINED
@ -178,7 +186,8 @@ enum {
ERR_INVALID_ZERO, ERR_INVALID_ZERO,
ERR_INVALID_RIP_IN_AUTO_GROW, ERR_INVALID_RIP_IN_AUTO_GROW,
ERR_INVALID_MIB_ADDRESS, ERR_INVALID_MIB_ADDRESS,
ERR_INTERNAL ERR_INTERNAL,
ERR_X2APIC_IS_NOT_SUPPORTED
}; };
class Error : public std::exception { class Error : public std::exception {
@ -240,6 +249,7 @@ public:
"invalid rip in AutoGrow", "invalid rip in AutoGrow",
"invalid mib address", "invalid mib address",
"internal error", "internal error",
"x2APIC is not supported"
}; };
assert((size_t)err_ < sizeof(errTbl) / sizeof(*errTbl)); assert((size_t)err_ < sizeof(errTbl) / sizeof(*errTbl));
return errTbl[err_]; return errTbl[err_];
@ -617,6 +627,12 @@ struct RegRip {
const Label* label_; const Label* label_;
bool isAddr_; bool isAddr_;
explicit RegRip(sint64 disp = 0, const Label* label = 0, bool isAddr = false) : disp_(disp), label_(label), isAddr_(isAddr) {} explicit RegRip(sint64 disp = 0, const Label* label = 0, bool isAddr = false) : disp_(disp), label_(label), isAddr_(isAddr) {}
// rip + int displacement: result has disp_ increased by disp; label_ and isAddr_ are carried over unchanged
friend const RegRip operator+(const RegRip& r, int disp) {
	const sint64 shifted = r.disp_ + disp;
	return RegRip(shifted, r.label_, r.isAddr_);
}
// rip - int displacement: result has disp_ decreased by disp; label_ and isAddr_ are carried over unchanged
friend const RegRip operator-(const RegRip& r, int disp) {
	const sint64 shifted = r.disp_ - disp;
	return RegRip(shifted, r.label_, r.isAddr_);
}
friend const RegRip operator+(const RegRip& r, sint64 disp) { friend const RegRip operator+(const RegRip& r, sint64 disp) {
return RegRip(r.disp_ + disp, r.label_, r.isAddr_); return RegRip(r.disp_ + disp, r.label_, r.isAddr_);
} }
@ -786,6 +802,7 @@ inline RegExp operator-(const RegExp& e, size_t disp)
// 2nd parameter for constructor of CodeArray(maxSize, userPtr, alloc) // 2nd parameter for constructor of CodeArray(maxSize, userPtr, alloc)
void *const AutoGrow = (void*)1; //-V566 void *const AutoGrow = (void*)1; //-V566
void *const DontSetProtectRWE = (void*)2; //-V566
class CodeArray { class CodeArray {
enum Type { enum Type {
@ -825,6 +842,7 @@ protected:
size_t size_; size_t size_;
bool isCalledCalcJmpAddress_; bool isCalledCalcJmpAddress_;
bool useProtect() const { return alloc_->useProtect(); }
/* /*
allocate new memory and copy old data to the new area allocate new memory and copy old data to the new area
*/ */
@ -848,7 +866,6 @@ protected:
uint64 disp = i->getVal(top_); uint64 disp = i->getVal(top_);
rewrite(i->codeOffset, disp, i->jmpSize); rewrite(i->codeOffset, disp, i->jmpSize);
} }
if (alloc_->useProtect() && !protect(top_, size_, PROTECT_RWE)) throw Error(ERR_CANT_PROTECT);
isCalledCalcJmpAddress_ = true; isCalledCalcJmpAddress_ = true;
} }
public: public:
@ -858,7 +875,7 @@ public:
PROTECT_RE = 2 // read/exec PROTECT_RE = 2 // read/exec
}; };
explicit CodeArray(size_t maxSize, void *userPtr = 0, Allocator *allocator = 0) explicit CodeArray(size_t maxSize, void *userPtr = 0, Allocator *allocator = 0)
: type_(userPtr == AutoGrow ? AUTO_GROW : userPtr ? USER_BUF : ALLOC_BUF) : type_(userPtr == AutoGrow ? AUTO_GROW : (userPtr == 0 || userPtr == DontSetProtectRWE) ? ALLOC_BUF : USER_BUF)
, alloc_(allocator ? allocator : (Allocator*)&defaultAllocator_) , alloc_(allocator ? allocator : (Allocator*)&defaultAllocator_)
, maxSize_(maxSize) , maxSize_(maxSize)
, top_(type_ == USER_BUF ? reinterpret_cast<uint8*>(userPtr) : alloc_->alloc((std::max<size_t>)(maxSize, 1))) , top_(type_ == USER_BUF ? reinterpret_cast<uint8*>(userPtr) : alloc_->alloc((std::max<size_t>)(maxSize, 1)))
@ -866,7 +883,7 @@ public:
, isCalledCalcJmpAddress_(false) , isCalledCalcJmpAddress_(false)
{ {
if (maxSize_ > 0 && top_ == 0) throw Error(ERR_CANT_ALLOC); if (maxSize_ > 0 && top_ == 0) throw Error(ERR_CANT_ALLOC);
if ((type_ == ALLOC_BUF && alloc_->useProtect()) && !protect(top_, maxSize, PROTECT_RWE)) { if ((type_ == ALLOC_BUF && userPtr != DontSetProtectRWE && useProtect()) && !setProtectMode(PROTECT_RWE, false)) {
alloc_->free(top_); alloc_->free(top_);
throw Error(ERR_CANT_PROTECT); throw Error(ERR_CANT_PROTECT);
} }
@ -874,10 +891,19 @@ public:
virtual ~CodeArray() virtual ~CodeArray()
{ {
if (isAllocType()) { if (isAllocType()) {
if (alloc_->useProtect()) protect(top_, maxSize_, PROTECT_RW); if (useProtect()) setProtectModeRW(false);
alloc_->free(top_); alloc_->free(top_);
} }
} }
/*
	apply mode to the code buffer by calling protect(top_, maxSize_, mode)
	@param mode protection to request
	@param throwException if true, a failure is reported by throwing Error(ERR_CANT_PROTECT)
	@return true if protect() succeeded; false if it failed and throwException is false
*/
bool setProtectMode(ProtectMode mode, bool throwException = true)
{
	if (protect(top_, maxSize_, mode)) return true;
	if (!throwException) return false;
	throw Error(ERR_CANT_PROTECT);
}
// shorthand for setProtectMode(PROTECT_RE, throwException)
bool setProtectModeRE(bool throwException = true)
{
	return setProtectMode(PROTECT_RE, throwException);
}
// shorthand for setProtectMode(PROTECT_RW, throwException)
bool setProtectModeRW(bool throwException = true)
{
	return setProtectMode(PROTECT_RW, throwException);
}
void resetSize() void resetSize()
{ {
size_ = 0; size_ = 0;
@ -909,10 +935,10 @@ public:
void dq(uint64 code) { db(code, 8); } void dq(uint64 code) { db(code, 8); }
const uint8 *getCode() const { return top_; } const uint8 *getCode() const { return top_; }
template<class F> template<class F>
const F getCode() const { return CastTo<F>(top_); } const F getCode() const { return reinterpret_cast<F>(top_); }
const uint8 *getCurr() const { return &top_[size_]; } const uint8 *getCurr() const { return &top_[size_]; }
template<class F> template<class F>
const F getCurr() const { return CastTo<F>(&top_[size_]); } const F getCurr() const { return reinterpret_cast<F>(&top_[size_]); }
size_t getSize() const { return size_; } size_t getSize() const { return size_; }
void setSize(size_t size) void setSize(size_t size)
{ {
@ -995,6 +1021,9 @@ public:
size_t pageSize = sysconf(_SC_PAGESIZE); size_t pageSize = sysconf(_SC_PAGESIZE);
size_t iaddr = reinterpret_cast<size_t>(addr); size_t iaddr = reinterpret_cast<size_t>(addr);
size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1)); size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1));
#ifndef NDEBUG
if (pageSize != 4096) fprintf(stderr, "large page(%zd) is used. not tested enough.\n", pageSize);
#endif
return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0; return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0;
#else #else
return true; return true;
@ -1115,6 +1144,7 @@ public:
Label(const Label& rhs); Label(const Label& rhs);
Label& operator=(const Label& rhs); Label& operator=(const Label& rhs);
~Label(); ~Label();
// reset this Label to the unlinked state: no id and no manager
void clear()
{
	id = 0;
	mgr = 0;
}
int getId() const { return id; } int getId() const { return id; }
const uint8 *getAddress() const; const uint8 *getAddress() const;
@ -1153,6 +1183,7 @@ class LabelManager {
}; };
typedef XBYAK_STD_UNORDERED_MAP<int, ClabelVal> ClabelDefList; typedef XBYAK_STD_UNORDERED_MAP<int, ClabelVal> ClabelDefList;
typedef XBYAK_STD_UNORDERED_MULTIMAP<int, const JmpLabel> ClabelUndefList; typedef XBYAK_STD_UNORDERED_MULTIMAP<int, const JmpLabel> ClabelUndefList;
typedef XBYAK_STD_UNORDERED_SET<Label*> LabelPtrList;
CodeArray *base_; CodeArray *base_;
// global : stateList_.front(), local : stateList_.back() // global : stateList_.front(), local : stateList_.back()
@ -1160,6 +1191,7 @@ class LabelManager {
mutable int labelId_; mutable int labelId_;
ClabelDefList clabelDefList_; ClabelDefList clabelDefList_;
ClabelUndefList clabelUndefList_; ClabelUndefList clabelUndefList_;
LabelPtrList labelPtrList_;
int getId(const Label& label) const int getId(const Label& label) const
{ {
@ -1208,9 +1240,14 @@ class LabelManager {
return true; return true;
} }
friend class Label; friend class Label;
void incRefCount(int id) { clabelDefList_[id].refCount++; } void incRefCount(int id, Label *label)
void decRefCount(int id)
{ {
clabelDefList_[id].refCount++;
labelPtrList_.insert(label);
}
void decRefCount(int id, Label *label)
{
labelPtrList_.erase(label);
ClabelDefList::iterator i = clabelDefList_.find(id); ClabelDefList::iterator i = clabelDefList_.find(id);
if (i == clabelDefList_.end()) return; if (i == clabelDefList_.end()) return;
if (i->second.refCount == 1) { if (i->second.refCount == 1) {
@ -1229,11 +1266,23 @@ class LabelManager {
#endif #endif
return !list.empty(); return !list.empty();
} }
// detach all labels linked to LabelManager
void resetLabelPtrList()
{
for (LabelPtrList::iterator i = labelPtrList_.begin(), ie = labelPtrList_.end(); i != ie; ++i) {
(*i)->clear();
}
labelPtrList_.clear();
}
public: public:
LabelManager() LabelManager()
{ {
reset(); reset();
} }
~LabelManager()
{
	// clear() every Label still registered here (this zeroes its mgr and id),
	// so that a later Label::~Label() does not call back into this destroyed manager
	resetLabelPtrList();
}
void reset() void reset()
{ {
base_ = 0; base_ = 0;
@ -1243,6 +1292,7 @@ public:
stateList_.push_back(SlabelState()); stateList_.push_back(SlabelState());
clabelDefList_.clear(); clabelDefList_.clear();
clabelUndefList_.clear(); clabelUndefList_.clear();
resetLabelPtrList();
} }
void enterLocal() void enterLocal()
{ {
@ -1275,10 +1325,11 @@ public:
SlabelState& st = *label.c_str() == '.' ? stateList_.back() : stateList_.front(); SlabelState& st = *label.c_str() == '.' ? stateList_.back() : stateList_.front();
define_inner(st.defList, st.undefList, label, base_->getSize()); define_inner(st.defList, st.undefList, label, base_->getSize());
} }
void defineClabel(const Label& label) void defineClabel(Label& label)
{ {
define_inner(clabelDefList_, clabelUndefList_, getId(label), base_->getSize()); define_inner(clabelDefList_, clabelUndefList_, getId(label), base_->getSize());
label.mgr = this; label.mgr = this;
labelPtrList_.insert(&label);
} }
void assign(Label& dst, const Label& src) void assign(Label& dst, const Label& src)
{ {
@ -1286,6 +1337,7 @@ public:
if (i == clabelDefList_.end()) throw Error(ERR_LABEL_ISNOT_SET_BY_L); if (i == clabelDefList_.end()) throw Error(ERR_LABEL_ISNOT_SET_BY_L);
define_inner(clabelDefList_, clabelUndefList_, dst.id, i->second.offset); define_inner(clabelDefList_, clabelUndefList_, dst.id, i->second.offset);
dst.mgr = this; dst.mgr = this;
labelPtrList_.insert(&dst);
} }
bool getOffset(size_t *offset, std::string& label) const bool getOffset(size_t *offset, std::string& label) const
{ {
@ -1333,19 +1385,19 @@ inline Label::Label(const Label& rhs)
{ {
id = rhs.id; id = rhs.id;
mgr = rhs.mgr; mgr = rhs.mgr;
if (mgr) mgr->incRefCount(id); if (mgr) mgr->incRefCount(id, this);
} }
inline Label& Label::operator=(const Label& rhs) inline Label& Label::operator=(const Label& rhs)
{ {
if (id) throw Error(ERR_LABEL_IS_ALREADY_SET_BY_L); if (id) throw Error(ERR_LABEL_IS_ALREADY_SET_BY_L);
id = rhs.id; id = rhs.id;
mgr = rhs.mgr; mgr = rhs.mgr;
if (mgr) mgr->incRefCount(id); if (mgr) mgr->incRefCount(id, this);
return *this; return *this;
} }
inline Label::~Label() inline Label::~Label()
{ {
if (id && mgr) mgr->decRefCount(id); if (id && mgr) mgr->decRefCount(id, this);
} }
inline const uint8* Label::getAddress() const inline const uint8* Label::getAddress() const
{ {
@ -1463,6 +1515,7 @@ private:
T_B64 = 1 << 27, // m64bcst T_B64 = 1 << 27, // m64bcst
T_M_K = 1 << 28, // mem{k} T_M_K = 1 << 28, // mem{k}
T_VSIB = 1 << 29, T_VSIB = 1 << 29,
T_MEM_EVEX = 1 << 30, // use evex if mem
T_XXX T_XXX
}; };
void vex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false) void vex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false)
@ -1500,7 +1553,7 @@ private:
if ((a > 0 && a != v) + (b > 0 && b != v) + (c > 0 && c != v) > 0) return Error(err); if ((a > 0 && a != v) + (b > 0 && b != v) + (c > 0 && c != v) > 0) return Error(err);
return v; return v;
} }
int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0, uint32 VL = 0) int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0, uint32 VL = 0, bool Hi16Vidx = false)
{ {
if (!(type & (T_EVEX | T_MUST_EVEX))) throw Error(ERR_EVEX_IS_INVALID); if (!(type & (T_EVEX | T_MUST_EVEX))) throw Error(ERR_EVEX_IS_INVALID);
int w = (type & T_EW1) ? 1 : 0; int w = (type & T_EW1) ? 1 : 0;
@ -1543,7 +1596,7 @@ private:
} }
} }
} }
bool Vp = !(v ? v->isExtIdx2() : 0); bool Vp = !((v ? v->isExtIdx2() : 0) | Hi16Vidx);
bool z = reg.hasZero() || base.hasZero() || (v ? v->hasZero() : false); bool z = reg.hasZero() || base.hasZero() || (v ? v->hasZero() : false);
if (aaa == 0) aaa = verifyDuplicate(base.getOpmaskIdx(), reg.getOpmaskIdx(), (v ? v->getOpmaskIdx() : 0), ERR_OPMASK_IS_ALREADY_SET); if (aaa == 0) aaa = verifyDuplicate(base.getOpmaskIdx(), reg.getOpmaskIdx(), (v ? v->getOpmaskIdx() : 0), ERR_OPMASK_IS_ALREADY_SET);
db(0x62); db(0x62);
@ -1935,10 +1988,11 @@ private:
const Address& addr = op2.getAddress(); const Address& addr = op2.getAddress();
const RegExp& regExp = addr.getRegExp(); const RegExp& regExp = addr.getRegExp();
const Reg& base = regExp.getBase(); const Reg& base = regExp.getBase();
const Reg& index = regExp.getIndex();
if (BIT == 64 && addr.is32bit()) db(0x67); if (BIT == 64 && addr.is32bit()) db(0x67);
int disp8N = 0; int disp8N = 0;
bool x = regExp.getIndex().isExtIdx(); bool x = index.isExtIdx();
if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() || addr.getOpmaskIdx()) { if ((type & (T_MUST_EVEX|T_MEM_EVEX)) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() || addr.getOpmaskIdx()) {
int aaa = addr.getOpmaskIdx(); int aaa = addr.getOpmaskIdx();
if (aaa && !(type & T_M_K)) throw Error(ERR_INVALID_OPMASK_WITH_MEMORY); if (aaa && !(type & T_M_K)) throw Error(ERR_INVALID_OPMASK_WITH_MEMORY);
bool b = false; bool b = false;
@ -1946,8 +2000,8 @@ private:
if (!(type & (T_B32 | T_B64))) throw Error(ERR_INVALID_BROADCAST); if (!(type & (T_B32 | T_B64))) throw Error(ERR_INVALID_BROADCAST);
b = true; b = true;
} }
int VL = regExp.isVsib() ? regExp.getIndex().getBit() : 0; int VL = regExp.isVsib() ? index.getBit() : 0;
disp8N = evex(r, base, p1, type, code, x, b, aaa, VL); disp8N = evex(r, base, p1, type, code, x, b, aaa, VL, index.isExtIdx2());
} else { } else {
vex(r, base, p1, type, code, x); vex(r, base, p1, type, code, x);
} }
@ -2147,7 +2201,8 @@ public:
const Segment es, cs, ss, ds, fs, gs; const Segment es, cs, ss, ds, fs, gs;
#endif #endif
void L(const std::string& label) { labelMgr_.defineSlabel(label); } void L(const std::string& label) { labelMgr_.defineSlabel(label); }
void L(const Label& label) { labelMgr_.defineClabel(label); } void L(Label& label) { labelMgr_.defineClabel(label); }
// convenience overload: create a new Label, define it via L(Label&), and return it by value
Label L()
{
	Label fresh;
	L(fresh);
	return fresh;
}
void inLocalLabel() { labelMgr_.enterLocal(); } void inLocalLabel() { labelMgr_.enterLocal(); }
void outLocalLabel() { labelMgr_.leaveLocal(); } void outLocalLabel() { labelMgr_.leaveLocal(); }
/* /*
@ -2178,7 +2233,7 @@ public:
// call(function pointer) // call(function pointer)
#ifdef XBYAK_VARIADIC_TEMPLATE #ifdef XBYAK_VARIADIC_TEMPLATE
template<class Ret, class... Params> template<class Ret, class... Params>
void call(Ret(*func)(Params...)) { call(CastTo<const void*>(func)); } void call(Ret(*func)(Params...)) { call(reinterpret_cast<const void*>(func)); }
#endif #endif
void call(const void *addr) { opJmpAbs(addr, T_NEAR, 0, 0xE8); } void call(const void *addr) { opJmpAbs(addr, T_NEAR, 0, 0xE8); }
@ -2436,11 +2491,16 @@ public:
MUST call ready() to complete generating code if you use AutoGrow mode. MUST call ready() to complete generating code if you use AutoGrow mode.
It is not necessary for the other mode if hasUndefinedLabel() is true. It is not necessary for the other mode if hasUndefinedLabel() is true.
*/ */
void ready() void ready(ProtectMode mode = PROTECT_RWE)
{ {
if (hasUndefinedLabel()) throw Error(ERR_LABEL_IS_NOT_FOUND); if (hasUndefinedLabel()) throw Error(ERR_LABEL_IS_NOT_FOUND);
if (isAutoGrow()) calcJmpAddress(); if (isAutoGrow()) {
calcJmpAddress();
if (useProtect()) setProtectMode(mode);
}
} }
// same as ready(), but requests PROTECT_RE (read/exec) instead of the default mode
void readyRE()
{
	ready(PROTECT_RE);
}
#ifdef XBYAK_TEST #ifdef XBYAK_TEST
void dump(bool doClear = true) void dump(bool doClear = true)
{ {

View file

@ -1,4 +1,4 @@
const char *getVersionString() const { return "5.67"; } const char *getVersionString() const { return "5.77"; }
void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); } void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); }
void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@ -1023,7 +1023,7 @@ void vhsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
void vhsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_F2 | T_0F | T_YMM, 0x7D); } void vhsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_F2 | T_0F | T_YMM, 0x7D); }
void vinsertf128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { if (!(y1.isYMM() && y2.isYMM() && op.isXMEM())) throw Error(ERR_BAD_COMBINATION); opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x18, imm); } void vinsertf128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { if (!(y1.isYMM() && y2.isYMM() && op.isXMEM())) throw Error(ERR_BAD_COMBINATION); opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x18, imm); }
void vinserti128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { if (!(y1.isYMM() && y2.isYMM() && op.isXMEM())) throw Error(ERR_BAD_COMBINATION); opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x38, imm); } void vinserti128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { if (!(y1.isYMM() && y2.isYMM() && op.isXMEM())) throw Error(ERR_BAD_COMBINATION); opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x38, imm); }
void vinsertps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0x21, imm); } void vinsertps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0x21, imm); }
void vlddqu(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, cvtIdx0(x), addr, T_0F | T_F2 | T_W0 | T_YMM, 0xF0); } void vlddqu(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, cvtIdx0(x), addr, T_0F | T_F2 | T_W0 | T_YMM, 0xF0); }
void vldmxcsr(const Address& addr) { opAVX_X_X_XM(xm2, xm0, addr, T_0F, 0xAE); } void vldmxcsr(const Address& addr) { opAVX_X_X_XM(xm2, xm0, addr, T_0F, 0xAE); }
void vmaskmovdqu(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, T_0F | T_66, 0xF7); } void vmaskmovdqu(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, T_0F | T_66, 0xF7); }
@ -1206,28 +1206,28 @@ void vpshuflw(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm,
void vpsignb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x08); } void vpsignb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x08); }
void vpsignd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x0A); } void vpsignd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x0A); }
void vpsignw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x09); } void vpsignw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x09); }
void vpslld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); } void vpslld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); }
void vpslld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xF2); } void vpslld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xF2); }
void vpslldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 7), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x73, imm); } void vpslldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 7), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x73, imm); }
void vpsllq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0x73, imm); } void vpsllq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64 | T_MEM_EVEX, 0x73, imm); }
void vpsllq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0xF3); } void vpsllq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0xF3); }
void vpsllvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x47); } void vpsllvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x47); }
void vpsllvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x47); } void vpsllvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x47); }
void vpsllw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); } void vpsllw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm); }
void vpsllw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xF1); } void vpsllw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xF1); }
void vpsrad(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); } void vpsrad(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); }
void vpsrad(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xE2); } void vpsrad(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xE2); }
void vpsravd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x46); } void vpsravd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x46); }
void vpsraw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); } void vpsraw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm); }
void vpsraw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xE1); } void vpsraw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xE1); }
void vpsrld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); } void vpsrld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); }
void vpsrld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xD2); } void vpsrld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xD2); }
void vpsrldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 3), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x73, imm); } void vpsrldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 3), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x73, imm); }
void vpsrlq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0x73, imm); } void vpsrlq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64 | T_MEM_EVEX, 0x73, imm); }
void vpsrlq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0xD3); } void vpsrlq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0xD3); }
void vpsrlvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x45); } void vpsrlvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x45); }
void vpsrlvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x45); } void vpsrlvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x45); }
void vpsrlw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); } void vpsrlw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm); }
void vpsrlw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xD1); } void vpsrlw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xD1); }
void vpsubb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF8); } void vpsubb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF8); }
void vpsubd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0xFA); } void vpsubd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0xFA); }

View file

@ -9,6 +9,11 @@
*/ */
#include "xbyak.h" #include "xbyak.h"
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#define XBYAK_INTEL_CPU_SPECIFIC
#endif
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER #ifdef _MSC_VER
#if (_MSC_VER < 1400) && defined(XBYAK32) #if (_MSC_VER < 1400) && defined(XBYAK32)
static inline __declspec(naked) void __cpuid(int[4], int) static inline __declspec(naked) void __cpuid(int[4], int)
@ -47,14 +52,30 @@
#endif #endif
#endif #endif
#endif #endif
#endif
namespace Xbyak { namespace util { namespace Xbyak { namespace util {
// topology level kinds; compared against the level-type field read from CPUID leaf 0xB,
// and used (minus 1) as the index into Cpu::numCores_
enum IntelCpuTopologyLevel {
	SmtLevel = 1,
	CoreLevel = 2
};
/** /**
CPU detection class CPU detection class
*/ */
class Cpu { class Cpu {
uint64 type_; uint64 type_;
//system topology
bool x2APIC_supported_;
static const size_t maxTopologyLevels = 2;
unsigned int numCores_[maxTopologyLevels];
static const unsigned int maxNumberCacheLevels = 10;
unsigned int dataCacheSize_[maxNumberCacheLevels];
unsigned int coresSharignDataCache_[maxNumberCacheLevels];
unsigned int dataCacheLevels_;
unsigned int get32bitAsBE(const char *x) const unsigned int get32bitAsBE(const char *x) const
{ {
return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24); return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
@ -65,7 +86,7 @@ class Cpu {
} }
void setFamily() void setFamily()
{ {
unsigned int data[4]; unsigned int data[4] = {};
getCpuid(1, data); getCpuid(1, data);
stepping = data[0] & mask(4); stepping = data[0] & mask(4);
model = (data[0] >> 4) & mask(4); model = (data[0] >> 4) & mask(4);
@ -88,6 +109,39 @@ class Cpu {
{ {
return (val >> base) & ((1u << (end - base)) - 1); return (val >> base) & ((1u << (end - base)) - 1);
} }
/**
	Query CPUID leaf 0xB and store the logical-processor counts reported for
	the SMT and core topology levels in numCores_.

	Does nothing unless the tINTEL bit is set in type_.
	x2APIC_supported_ is set only when leaf 0xB yields a non-zero SMT-level
	count. Otherwise both numCores_ entries are reset to 0, so getNumCores()
	reports the topology as unsupported instead of dividing by a zero SMT count.
*/
void setNumCores()
{
	if ((type_ & tINTEL) == 0) return;

	unsigned int data[4] = {};

	/* CAUTION: These numbers are configuration as shipped by Intel. */
	getCpuidEx(0x0, 0, data);
	if (data[0] >= 0xB) {
		/*
			if leaf 11 exists(x2APIC is supported),
			we use it to get the number of smt cores and cores on socket

			leaf 0xB can be zeroed-out by a hypervisor
		*/
		for (unsigned int i = 0; i < maxTopologyLevels; i++) {
			getCpuidEx(0xB, i, data);
			IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15);
			if (level == SmtLevel || level == CoreLevel) {
				numCores_[level - 1] = extractBit(data[1], 0, 15);
			}
		}
		/*
			Trust leaf 0xB only if it produced a usable SMT-level count:
			getNumCores(CoreLevel) divides by numCores_[SmtLevel - 1],
			so a zeroed-out leaf must not be flagged as supported.
		*/
		if (numCores_[SmtLevel - 1] != 0) {
			x2APIC_supported_ = true;
			return;
		}
	}
	/*
		Failed to determine num of cores without (valid) x2APIC information.
		TODO: USE initial APIC ID to determine ncores.
	*/
	numCores_[SmtLevel - 1] = 0;
	numCores_[CoreLevel - 1] = 0;
}
void setCacheHierarchy() void setCacheHierarchy()
{ {
if ((type_ & tINTEL) == 0) return; if ((type_ & tINTEL) == 0) return;
@ -96,21 +150,12 @@ class Cpu {
// const unsigned int INSTRUCTION_CACHE = 2; // const unsigned int INSTRUCTION_CACHE = 2;
const unsigned int UNIFIED_CACHE = 3; const unsigned int UNIFIED_CACHE = 3;
unsigned int smt_width = 0; unsigned int smt_width = 0;
unsigned int n_cores = 0; unsigned int logical_cores = 0;
unsigned int data[4]; unsigned int data[4] = {};
/* if (x2APIC_supported_) {
if leaf 11 exists, we use it to get the number of smt cores and cores on socket smt_width = numCores_[0];
If x2APIC is supported, these are the only correct numbers. logical_cores = numCores_[1];
leaf 0xB can be zeroed-out by a hypervisor
*/
getCpuidEx(0x0, 0, data);
if (data[0] >= 0xB) {
getCpuidEx(0xB, 0, data); // CPUID for SMT Level
smt_width = data[1] & 0x7FFF;
getCpuidEx(0xB, 1, data); // CPUID for CORE Level
n_cores = data[1] & 0x7FFF;
} }
/* /*
@ -118,29 +163,29 @@ class Cpu {
the first level of data cache is not shared (which is the the first level of data cache is not shared (which is the
case for every existing architecture) and use this to case for every existing architecture) and use this to
determine the SMT width for arch not supporting leaf 11. determine the SMT width for arch not supporting leaf 11.
when leaf 4 reports a number of core less than n_cores when leaf 4 reports a number of core less than numCores_
on socket reported by leaf 11, then it is a correct number on socket reported by leaf 11, then it is a correct number
of cores not an upperbound. of cores not an upperbound.
*/ */
for (int i = 0; data_cache_levels < maxNumberCacheLevels; i++) { for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) {
getCpuidEx(0x4, i, data); getCpuidEx(0x4, i, data);
unsigned int cacheType = extractBit(data[0], 0, 4); unsigned int cacheType = extractBit(data[0], 0, 4);
if (cacheType == NO_CACHE) break; if (cacheType == NO_CACHE) break;
if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) { if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
unsigned int nb_logical_cores = extractBit(data[0], 14, 25) + 1; unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1;
if (n_cores != 0) { // true only if leaf 0xB is supported and valid if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
nb_logical_cores = (std::min)(nb_logical_cores, n_cores); actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
} }
assert(nb_logical_cores != 0); assert(actual_logical_cores != 0);
data_cache_size[data_cache_levels] = dataCacheSize_[dataCacheLevels_] =
(extractBit(data[1], 22, 31) + 1) (extractBit(data[1], 22, 31) + 1)
* (extractBit(data[1], 12, 21) + 1) * (extractBit(data[1], 12, 21) + 1)
* (extractBit(data[1], 0, 11) + 1) * (extractBit(data[1], 0, 11) + 1)
* (data[2] + 1); * (data[2] + 1);
if (cacheType == DATA_CACHE && smt_width == 0) smt_width = nb_logical_cores; if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
assert(smt_width != 0); assert(smt_width != 0);
cores_sharing_data_cache[data_cache_levels] = (std::max)(nb_logical_cores / smt_width, 1u); coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u);
data_cache_levels++; dataCacheLevels_++;
} }
} }
} }
@ -154,22 +199,25 @@ public:
int displayFamily; // family + extFamily int displayFamily; // family + extFamily
int displayModel; // model + extModel int displayModel; // model + extModel
// may I move these members into private? unsigned int getNumCores(IntelCpuTopologyLevel level) {
static const unsigned int maxNumberCacheLevels = 10; if (!x2APIC_supported_) throw Error(ERR_X2APIC_IS_NOT_SUPPORTED);
unsigned int data_cache_size[maxNumberCacheLevels]; switch (level) {
unsigned int cores_sharing_data_cache[maxNumberCacheLevels]; case SmtLevel: return numCores_[level - 1];
unsigned int data_cache_levels; case CoreLevel: return numCores_[level - 1] / numCores_[SmtLevel - 1];
default: throw Error(ERR_X2APIC_IS_NOT_SUPPORTED);
}
}
unsigned int getDataCacheLevels() const { return data_cache_levels; } unsigned int getDataCacheLevels() const { return dataCacheLevels_; }
unsigned int getCoresSharingDataCache(unsigned int i) const unsigned int getCoresSharingDataCache(unsigned int i) const
{ {
if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER); if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER);
return cores_sharing_data_cache[i]; return coresSharignDataCache_[i];
} }
unsigned int getDataCacheSize(unsigned int i) const unsigned int getDataCacheSize(unsigned int i) const
{ {
if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER); if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER);
return data_cache_size[i]; return dataCacheSize_[i];
} }
/* /*
@ -177,30 +225,45 @@ public:
*/ */
static inline void getCpuid(unsigned int eaxIn, unsigned int data[4]) static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
{ {
#ifdef _MSC_VER #ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
__cpuid(reinterpret_cast<int*>(data), eaxIn); __cpuid(reinterpret_cast<int*>(data), eaxIn);
#else #else
__cpuid(eaxIn, data[0], data[1], data[2], data[3]); __cpuid(eaxIn, data[0], data[1], data[2], data[3]);
#endif
#else
(void)eaxIn;
(void)data;
#endif #endif
} }
static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4]) static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
{ {
#ifdef _MSC_VER #ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn); __cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
#else #else
__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]); __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
#endif
#else
(void)eaxIn;
(void)ecxIn;
(void)data;
#endif #endif
} }
static inline uint64 getXfeature() static inline uint64 getXfeature()
{ {
#ifdef _MSC_VER #ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
return _xgetbv(0); return _xgetbv(0);
#else #else
unsigned int eax, edx; unsigned int eax, edx;
// xgetbv is not supported on gcc 4.2 // xgetbv is not supported on gcc 4.2
// __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0)); // __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
__asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0)); __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
return ((uint64)edx << 32) | eax; return ((uint64)edx << 32) | eax;
#endif
#else
return 0;
#endif #endif
} }
typedef uint64 Type; typedef uint64 Type;
@ -271,9 +334,13 @@ public:
Cpu() Cpu()
: type_(NONE) : type_(NONE)
, data_cache_levels(0) , x2APIC_supported_(false)
, numCores_()
, dataCacheSize_()
, coresSharignDataCache_()
, dataCacheLevels_(0)
{ {
unsigned int data[4]; unsigned int data[4] = {};
const unsigned int& EAX = data[0]; const unsigned int& EAX = data[0];
const unsigned int& EBX = data[1]; const unsigned int& EBX = data[1];
const unsigned int& ECX = data[2]; const unsigned int& ECX = data[2];
@ -363,6 +430,7 @@ public:
if (ECX & (1U << 0)) type_ |= tPREFETCHWT1; if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
} }
setFamily(); setFamily();
setNumCores();
setCacheHierarchy(); setCacheHierarchy();
} }
void putFamily() const void putFamily() const
@ -381,12 +449,17 @@ class Clock {
public: public:
static inline uint64 getRdtsc() static inline uint64 getRdtsc()
{ {
#ifdef _MSC_VER #ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
return __rdtsc(); return __rdtsc();
#else #else
unsigned int eax, edx; unsigned int eax, edx;
__asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx)); __asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
return ((uint64)edx << 32) | eax; return ((uint64)edx << 32) | eax;
#endif
#else
// TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu
return 0;
#endif #endif
} }
Clock() Clock()
@ -416,7 +489,7 @@ const int UseRCX = 1 << 6;
const int UseRDX = 1 << 7; const int UseRDX = 1 << 7;
class Pack { class Pack {
static const size_t maxTblNum = 10; static const size_t maxTblNum = 15;
const Xbyak::Reg64 *tbl_[maxTblNum]; const Xbyak::Reg64 *tbl_[maxTblNum];
size_t n_; size_t n_;
public: public:
@ -476,7 +549,7 @@ public:
const Xbyak::Reg64& operator[](size_t n) const const Xbyak::Reg64& operator[](size_t n) const
{ {
if (n >= n_) { if (n >= n_) {
fprintf(stderr, "ERR Pack bad n=%d\n", (int)n); fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_);
throw Error(ERR_BAD_PARAMETER); throw Error(ERR_BAD_PARAMETER);
} }
return *tbl_[n]; return *tbl_[n];
@ -518,6 +591,7 @@ class StackFrame {
static const int rcxPos = 3; static const int rcxPos = 3;
static const int rdxPos = 2; static const int rdxPos = 2;
#endif #endif
static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax
Xbyak::CodeGenerator *code_; Xbyak::CodeGenerator *code_;
int pNum_; int pNum_;
int tNum_; int tNum_;
@ -527,7 +601,7 @@ class StackFrame {
int P_; int P_;
bool makeEpilog_; bool makeEpilog_;
Xbyak::Reg64 pTbl_[4]; Xbyak::Reg64 pTbl_[4];
Xbyak::Reg64 tTbl_[10]; Xbyak::Reg64 tTbl_[maxRegNum];
Pack p_; Pack p_;
Pack t_; Pack t_;
StackFrame(const StackFrame&); StackFrame(const StackFrame&);
@ -539,7 +613,7 @@ public:
make stack frame make stack frame
@param sf [in] this @param sf [in] this
@param pNum [in] num of function parameter(0 <= pNum <= 4) @param pNum [in] num of function parameter(0 <= pNum <= 4)
@param tNum [in] num of temporary register(0 <= tNum <= 10, with UseRCX, UseRDX) @param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14
@param stackSizeByte [in] local stack size @param stackSizeByte [in] local stack size
@param makeEpilog [in] automatically call close() if true @param makeEpilog [in] automatically call close() if true
@ -566,27 +640,17 @@ public:
using namespace Xbyak; using namespace Xbyak;
if (pNum < 0 || pNum > 4) throw Error(ERR_BAD_PNUM); if (pNum < 0 || pNum > 4) throw Error(ERR_BAD_PNUM);
const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0); const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
if (allRegNum < pNum || allRegNum > 14) throw Error(ERR_BAD_TNUM); if (tNum_ < 0 || allRegNum > maxRegNum) throw Error(ERR_BAD_TNUM);
const Reg64& _rsp = code->rsp; const Reg64& _rsp = code->rsp;
const AddressFrame& _ptr = code->ptr;
saveNum_ = (std::max)(0, allRegNum - noSaveNum); saveNum_ = (std::max)(0, allRegNum - noSaveNum);
const int *tbl = getOrderTbl() + noSaveNum; const int *tbl = getOrderTbl() + noSaveNum;
P_ = saveNum_ + (stackSizeByte + 7) / 8; for (int i = 0; i < saveNum_; i++) {
if (P_ > 0 && (P_ & 1) == 0) P_++; // here (rsp % 16) == 8, then increment P_ for 16 byte alignment code->push(Reg64(tbl[i]));
}
P_ = (stackSizeByte + 7) / 8;
if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // (rsp % 16) == 8, then increment P_ for 16 byte alignment
P_ *= 8; P_ *= 8;
if (P_ > 0) code->sub(_rsp, P_); if (P_ > 0) code->sub(_rsp, P_);
#ifdef XBYAK64_WIN
for (int i = 0; i < (std::min)(saveNum_, 4); i++) {
code->mov(_ptr [_rsp + P_ + (i + 1) * 8], Reg64(tbl[i]));
}
for (int i = 4; i < saveNum_; i++) {
code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i]));
}
#else
for (int i = 0; i < saveNum_; i++) {
code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i]));
}
#endif
int pos = 0; int pos = 0;
for (int i = 0; i < pNum; i++) { for (int i = 0; i < pNum; i++) {
pTbl_[i] = Xbyak::Reg64(getRegIdx(pos)); pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
@ -607,21 +671,11 @@ public:
{ {
using namespace Xbyak; using namespace Xbyak;
const Reg64& _rsp = code_->rsp; const Reg64& _rsp = code_->rsp;
const AddressFrame& _ptr = code_->ptr;
const int *tbl = getOrderTbl() + noSaveNum; const int *tbl = getOrderTbl() + noSaveNum;
#ifdef XBYAK64_WIN
for (int i = 0; i < (std::min)(saveNum_, 4); i++) {
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ + (i + 1) * 8]);
}
for (int i = 4; i < saveNum_; i++) {
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
}
#else
for (int i = 0; i < saveNum_; i++) {
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
}
#endif
if (P_ > 0) code_->add(_rsp, P_); if (P_ > 0) code_->add(_rsp, P_);
for (int i = 0; i < saveNum_; i++) {
code_->pop(Reg64(tbl[saveNum_ - 1 - i]));
}
if (callRet) code_->ret(); if (callRet) code_->ret();
} }
@ -633,9 +687,6 @@ public:
} catch (std::exception& e) { } catch (std::exception& e) {
printf("ERR:StackFrame %s\n", e.what()); printf("ERR:StackFrame %s\n", e.what());
exit(1); exit(1);
} catch (...) {
printf("ERR:StackFrame otherwise\n");
exit(1);
} }
} }
private: private:
@ -654,7 +705,7 @@ private:
} }
int getRegIdx(int& pos) const int getRegIdx(int& pos) const
{ {
assert(pos < 14); assert(pos < maxRegNum);
using namespace Xbyak; using namespace Xbyak;
const int *tbl = getOrderTbl(); const int *tbl = getOrderTbl();
int r = tbl[pos++]; int r = tbl[pos++];