diff --git a/.travis.yml b/.travis.yml index 77be4eb..b0cade9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -37,8 +37,8 @@ script: - make - cd .. - - ./build/asmjit_test + - ./build/asmjit_test_unit - ./build/asmjit_test_x86 after_success: - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then valgrind --leak-check=full --show-reachable=yes ./build/asmjit_test; fi; + - if [ "$TRAVIS_OS_NAME" = "linux" ]; then valgrind --leak-check=full --show-reachable=yes ./build/asmjit_test_unit; fi; diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ee62f7..77c2726 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -320,31 +320,31 @@ EndIf() # ============================================================================= # AsmJit library is always embedded into the tests executable. This way it's -# much easier to test private functions compared to just linking to AsmJit. +# much easier to test private functions than just linking to `libasmjit.so`. If(ASMJIT_BUILD_TEST) - AsmJit_AddSource(ASMJIT_TEST_SRC asmjit/test broken.cpp broken.h main.cpp) + AsmJit_AddSource(ASMJIT_TEST_SRC test asmjit_test_unit.cpp broken.cpp broken.h) Set(ASMJIT_TEST_CFLAGS ${ASMJIT_CFLAGS} ${ASMJIT_DEFINE}ASMJIT_STATIC ${ASMJIT_DEFINE}ASMJIT_TEST) - Add_Executable(asmjit_test ${ASMJIT_SRC} ${ASMJIT_TEST_SRC}) - Target_Link_Libraries(asmjit_test ${ASMJIT_DEPS}) + Add_Executable(asmjit_test_unit ${ASMJIT_SRC} ${ASMJIT_TEST_SRC}) + Target_Link_Libraries(asmjit_test_unit ${ASMJIT_DEPS}) If(${CMAKE_BUILD_TYPE}) If(${CMAKE_BUILD_TYPE} MATCHES "Debug") - Set_Target_Properties(asmjit_test PROPERTIES COMPILE_FLAGS ${ASMJIT_TEST_CFLAGS} ${ASMJIT_CFLAGS_DBG}) + Set_Target_Properties(asmjit_test_unit PROPERTIES COMPILE_FLAGS ${ASMJIT_TEST_CFLAGS} ${ASMJIT_CFLAGS_DBG}) Else() - Set_Target_Properties(asmjit_test PROPERTIES COMPILE_FLAGS ${ASMJIT_TEST_CFLAGS} ${ASMJIT_CFLAGS_REL}) + Set_Target_Properties(asmjit_test_unit PROPERTIES COMPILE_FLAGS ${ASMJIT_TEST_CFLAGS} ${ASMJIT_CFLAGS_REL}) EndIf() Else() - 
Target_Compile_Options(asmjit_test PUBLIC ${ASMJIT_TEST_CFLAGS} + Target_Compile_Options(asmjit_test_unit PUBLIC ${ASMJIT_TEST_CFLAGS} $<$:${ASMJIT_CFLAGS_DBG}> $<$>:${ASMJIT_CFLAGS_REL}>) EndIf() - Set_Target_Properties(asmjit_test PROPERTIES LINK_FLAGS "${ASMJIT_LFLAGS}") + Set_Target_Properties(asmjit_test_unit PROPERTIES LINK_FLAGS "${ASMJIT_LFLAGS}") EndIf() # ============================================================================= @@ -359,7 +359,7 @@ If(ASMJIT_BUILD_SAMPLES) ) ForEach(file ${ASMJIT_SRC_SAMPLES}) - Add_Executable(${file} src/app/test/${file}.cpp) + Add_Executable(${file} src/test/${file}.cpp) Target_Link_Libraries(${file} asmjit ${ASMJIT_DEPS}) EndForEach(file) EndIf() diff --git a/src/app/test/asmjit_test_opcode.cpp b/src/app/test/asmjit_test_opcode.cpp deleted file mode 100644 index eb71d40..0000000 --- a/src/app/test/asmjit_test_opcode.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// [AsmJit] -// Complete x86/x64 JIT and Remote Assembler for C++. -// -// [License] -// Zlib - See LICENSE.md file in the package. - -// This file is used to test opcodes generated by AsmJit. Output can be -// disassembled in your IDE or by your favourite disassembler. Instructions -// are grouped by category and then sorted alphabetically. 
- -// [Dependencies - AsmJit] -#include - -// [Dependencies - Test] -#include "genopcode.h" - -// [Dependencies - C] -#include -#include -#include - -typedef void (*VoidFunc)(void); - -int main(int argc, char* argv[]) { - using namespace asmjit; - using namespace asmjit::host; - - FileLogger logger(stdout); - logger.setOption(kLoggerOptionBinaryForm, true); - - JitRuntime runtime; - X86Assembler a(&runtime); - - a.setLogger(&logger); - asmgen::opcode(a); - - VoidFunc p = asmjit_cast(a.make()); - p(); - runtime.release((void*)p); - - return 0; -} diff --git a/src/app/test/genopcode.h b/src/app/test/genopcode.h deleted file mode 100644 index da5d0d5..0000000 --- a/src/app/test/genopcode.h +++ /dev/null @@ -1,2839 +0,0 @@ -// [AsmJit] -// Complete x86/x64 JIT and Remote Assembler for C++. -// -// [License] -// Zlib - See LICENSE.md file in the package. - -// [Guard] -#ifndef _APP_TEST_GENOPCODE_H -#define _APP_TEST_GENOPCODE_H - -// [Dependencies] -#include - -namespace asmgen { - -enum { kGenOpCodeInstCount = 2670 }; - -// Generate all instructions asmjit can emit. -static void opcode(asmjit::X86Assembler& a) { - using namespace asmjit; - using namespace asmjit::x86; - - const X86GpReg& zax = a.zax; - const X86GpReg& zdx = a.zdx; - const X86GpReg& zcx = a.zcx; - const X86GpReg& zbx = a.zbx; - const X86GpReg& zsp = a.zsp; - const X86GpReg& zbp = a.zbp; - const X86GpReg& zsi = a.zsi; - const X86GpReg& zdi = a.zdi; - - // Prevent crashing when the generated function is called (for debugging to - // see disassembly). - a.ret(); - - // When any problem is found this section can be used to customize the index - // of the registers used. - X86GpReg gp0 = zax; - X86GpReg gp1 = zsi; - X86FpReg fpx = fp6; - - X86Mem ptr_gp0 = ptr(gp0); - X86Mem ptr_gp1 = ptr(gp1); - - X86Mem vm32x = ptr(gp0, xmm1); - X86Mem vm32y = ptr(gp0, ymm1); - - X86Mem intptr_gp0 = a.intptr_ptr(gp0); - X86Mem intptr_gp1 = a.intptr_ptr(gp1); - - // Base. 
- a.adc(al, 1); - a.adc(ah, 1); - a.adc(bl, 1); - a.adc(bh, 1); - a.adc(ax, 1); - a.adc(bx, 1); - a.adc(eax, 1); - a.adc(ebx, 1); - a.adc(gp0, gp1); - a.adc(gp0, intptr_gp1); - a.adc(gp0, 0); - a.adc(intptr_gp0, gp1); - a.adc(intptr_gp0, 0); - a.add(al, 1); - a.add(ah, 1); - a.add(bl, 1); - a.add(bh, 1); - a.add(ax, 1); - a.add(bx, 1); - a.add(eax, 1); - a.add(ebx, 1); - a.add(gp0, gp1); - a.add(gp0, intptr_gp1); - a.add(gp0, 0); - a.add(intptr_gp0, gp1); - a.add(intptr_gp0, 0); - a.and_(al, 1); - a.and_(ah, 1); - a.and_(bl, 1); - a.and_(bh, 1); - a.and_(ax, 1); - a.and_(bx, 1); - a.and_(eax, 1); - a.and_(ebx, 1); - a.and_(gp0, gp1); - a.and_(gp0, intptr_gp1); - a.and_(gp0, 0); - a.and_(intptr_gp0, gp1); - a.and_(intptr_gp0, 0); - a.bswap(gp0); - a.bt(gp0, gp1); - a.bt(intptr_gp0, gp1); - a.bt(gp0, 0); - a.bt(intptr_gp0, 0); - a.btc(gp0, gp1); - a.btc(intptr_gp0, gp1); - a.btc(gp0, 0); - a.btc(intptr_gp0, 0); - a.btr(gp0, gp1); - a.btr(intptr_gp0, gp1); - a.btr(gp0, 0); - a.btr(intptr_gp0, 0); - a.bts(gp0, gp1); - a.bts(intptr_gp0, gp1); - a.bts(gp0, 0); - a.bts(intptr_gp0, 0); - a.call(gp0); - a.call(intptr_gp0); - a.cbw(); - a.cwde(); - a.clc(); - a.cld(); - a.cmc(); - a.cmp(al, 1); - a.cmp(ah, 1); - a.cmp(bl, 1); - a.cmp(bh, 1); - a.cmp(ax, 1); - a.cmp(bx, 1); - a.cmp(eax, 1); - a.cmp(ebx, 1); - a.cmp(gp0, gp1); - a.cmp(gp0, intptr_gp1); - a.cmp(gp0, 0); - a.cmp(intptr_gp0, gp1); - a.cmp(intptr_gp0, 0); - a.cmpxchg(gp0, gp1); - a.cmpxchg(intptr_gp0, gp1); - a.cmpxchg8b(ptr_gp0); - a.cpuid(); - a.crc32(gp0, ptr_gp1); - a.dec(gp0); - a.dec(intptr_gp0); - a.div(gp0); - a.div(intptr_gp0); - a.idiv(gp0); - a.idiv(intptr_gp0); - a.imul(gp0); - a.imul(intptr_gp0); - a.imul(gp0, gp1); - a.imul(gp0, intptr_gp1); - a.imul(gp0, 0); - a.imul(gp0, gp1, 0); - a.imul(gp0, intptr_gp1, 0); - a.inc(gp0); - a.inc(intptr_gp0); - a.int3(); - a.lea(gp0, intptr_gp1); - a.mov(gp0, gp1); - a.mov(gp0, intptr_gp1); - a.mov(gp0, 0); - a.mov(intptr_gp0, gp1); - a.mov(intptr_gp0, 0); - 
a.movsx(gp0, al); - a.movsx(gp0, byte_ptr(gp1)); - a.movzx(gp0, al); - a.movzx(gp0, byte_ptr(gp1)); - a.movbe(gp0, ptr_gp1); - a.movbe(ptr_gp0, gp1); - a.mul(gp0); - a.mul(intptr_gp0); - a.neg(gp0); - a.neg(intptr_gp0); - a.nop(); - a.not_(gp0); - a.not_(intptr_gp0); - a.or_(al, 1); - a.or_(ah, 1); - a.or_(bl, 1); - a.or_(bh, 1); - a.or_(ax, 1); - a.or_(bx, 1); - a.or_(eax, 1); - a.or_(ebx, 1); - a.or_(gp0, gp1); - a.or_(gp0, intptr_gp1); - a.or_(gp0, 0); - a.or_(intptr_gp0, gp1); - a.or_(intptr_gp0, 0); - a.pop(gp0); - a.pop(intptr_gp0); - a.push(gp0); - a.push(intptr_gp0); - a.push(0); - a.rcl(gp0, cl); - a.rcl(gp0, 0); - a.rcl(gp0, 1); - a.rcl(intptr_gp0, cl); - a.rcl(intptr_gp0, 0); - a.rcl(intptr_gp0, 1); - a.rcr(gp0, cl); - a.rcr(gp0, 0); - a.rcr(gp0, 1); - a.rcr(intptr_gp0, cl); - a.rcr(intptr_gp0, 0); - a.rcr(intptr_gp0, 1); - a.rdtsc(); - a.rdtscp(); - a.ret(); - a.ret(0); - a.rol(gp0, cl); - a.rol(gp0, 0); - a.rol(gp0, 1); - a.rol(intptr_gp0, cl); - a.rol(intptr_gp0, 0); - a.rol(intptr_gp0, 1); - a.ror(gp0, cl); - a.ror(gp0, 0); - a.ror(gp0, 1); - a.ror(intptr_gp0, cl); - a.ror(intptr_gp0, 0); - a.ror(intptr_gp0, 1); - a.sbb(al, 1); - a.sbb(ah, 1); - a.sbb(bl, 1); - a.sbb(bh, 1); - a.sbb(ax, 1); - a.sbb(bx, 1); - a.sbb(eax, 1); - a.sbb(ebx, 1); - a.sbb(gp0, gp1); - a.sbb(gp0, intptr_gp1); - a.sbb(gp0, 0); - a.sbb(intptr_gp0, gp1); - a.sbb(intptr_gp0, 0); - a.sal(gp0, cl); - a.sal(gp0, 0); - a.sal(gp0, 1); - a.sal(intptr_gp0, cl); - a.sal(intptr_gp0, 0); - a.sal(intptr_gp0, 1); - a.sar(gp0, cl); - a.sar(gp0, 0); - a.sar(gp0, 1); - a.sar(intptr_gp0, cl); - a.sar(intptr_gp0, 0); - a.sar(intptr_gp0, 1); - a.shl(gp0, cl); - a.shl(gp0, 0); - a.shl(gp0, 1); - a.shl(intptr_gp0, cl); - a.shl(intptr_gp0, 0); - a.shl(intptr_gp0, 1); - a.shr(gp0, cl); - a.shr(gp0, 0); - a.shr(gp0, 1); - a.shr(intptr_gp0, cl); - a.shr(intptr_gp0, 0); - a.shr(intptr_gp0, 1); - a.shld(gp0, gp1, cl); - a.shld(gp0, gp1, 0); - a.shld(intptr_gp0, gp1, cl); - a.shld(intptr_gp0, gp1, 0); - 
a.shrd(gp0, gp1, cl); - a.shrd(gp0, gp1, 0); - a.shrd(intptr_gp0, gp1, cl); - a.shrd(intptr_gp0, gp1, 0); - a.stc(); - a.std(); - a.sub(al, 1); - a.sub(ah, 1); - a.sub(bl, 1); - a.sub(bh, 1); - a.sub(ax, 1); - a.sub(bx, 1); - a.sub(eax, 1); - a.sub(ebx, 1); - a.sub(gp0, gp1); - a.sub(gp0, intptr_gp1); - a.sub(gp0, 0); - a.sub(intptr_gp0, gp1); - a.sub(intptr_gp0, 0); - a.test(gp0, gp1); - a.test(gp0, 0); - a.test(intptr_gp0, gp1); - a.test(intptr_gp0, 0); - a.ud2(); - a.xadd(gp0, gp1); - a.xadd(intptr_gp0, gp1); - a.xchg(gp0, gp1); - a.xchg(intptr_gp0, gp1); - a.xchg(gp0, intptr_gp1); - a.xor_(al, 1); - a.xor_(ah, 1); - a.xor_(bl, 1); - a.xor_(bh, 1); - a.xor_(ax, 1); - a.xor_(bx, 1); - a.xor_(eax, 1); - a.xor_(ebx, 1); - a.xor_(gp0, gp1); - a.xor_(gp0, intptr_gp1); - a.xor_(gp0, 0); - a.xor_(intptr_gp0, gp1); - a.xor_(intptr_gp0, 0); - - a.nop(); - - a.lodsb(); - a.lodsd(); - a.lodsw(); - a.rep_lodsb(); - a.rep_lodsd(); - a.rep_lodsw(); - - a.movsb(); - a.movsd(); - a.movsw(); - a.rep_movsb(); - a.rep_movsd(); - a.rep_movsw(); - - a.stosb(); - a.stosd(); - a.stosw(); - a.rep_stosb(); - a.rep_stosd(); - a.rep_stosw(); - - a.cmpsb(); - a.cmpsd(); - a.cmpsw(); - a.repe_cmpsb(); - a.repe_cmpsd(); - a.repe_cmpsw(); - a.repne_cmpsb(); - a.repne_cmpsd(); - a.repne_cmpsw(); - - a.scasb(); - a.scasd(); - a.scasw(); - a.repe_scasb(); - a.repe_scasd(); - a.repe_scasw(); - a.repne_scasb(); - a.repne_scasd(); - a.repne_scasw(); - - // Label...Jcc/Jecxz/Jmp. - { - a.nop(); - - Label L(a); - a.bind(L); - - a.ja(L); - a.jae(L); - a.jb(L); - a.jbe(L); - a.jc(L); - a.je(L); - a.jg(L); - a.jge(L); - a.jl(L); - a.jle(L); - a.jna(L); - a.jnae(L); - a.jnb(L); - a.jnbe(L); - a.jnc(L); - a.jne(L); - a.jng(L); - a.jnge(L); - a.jnl(L); - a.jnle(L); - a.jno(L); - a.jnp(L); - a.jns(L); - a.jnz(L); - a.jo(L); - a.jp(L); - a.jpe(L); - a.jpo(L); - a.js(L); - a.jz(L); - a.jecxz(ecx, L); - a.jmp(L); - } - - // Jcc/Jecxz/Jmp...Label. 
- { - a.nop(); - - Label L(a); - a.ja(L); - a.jae(L); - a.jb(L); - a.jbe(L); - a.jc(L); - a.je(L); - a.jg(L); - a.jge(L); - a.jl(L); - a.jle(L); - a.jna(L); - a.jnae(L); - a.jnb(L); - a.jnbe(L); - a.jnc(L); - a.jne(L); - a.jng(L); - a.jnge(L); - a.jnl(L); - a.jnle(L); - a.jno(L); - a.jnp(L); - a.jns(L); - a.jnz(L); - a.jo(L); - a.jp(L); - a.jpe(L); - a.jpo(L); - a.js(L); - a.jz(L); - a.jecxz(ecx, L); - a.jmp(L); - a.bind(L); - } - - // Fpu. - a.nop(); - - a.f2xm1(); - a.fabs(); - a.fadd(fp0, fpx); - a.fadd(fpx, fp0); - a.fadd(dword_ptr(gp0)); - a.fadd(qword_ptr(gp0)); - a.faddp(fpx); - a.faddp(); - a.fbld(dword_ptr(gp0)); - a.fbstp(dword_ptr(gp0)); - a.fchs(); - a.fclex(); - a.fcom(fpx); - a.fcom(); - a.fcom(dword_ptr(gp0)); - a.fcom(qword_ptr(gp0)); - a.fcomp(fpx); - a.fcomp(); - a.fcomp(dword_ptr(gp0)); - a.fcomp(qword_ptr(gp0)); - a.fcompp(); - a.fcos(); - a.fdecstp(); - a.fdiv(fp0, fpx); - a.fdiv(fpx, fp0); - a.fdiv(dword_ptr(gp0)); - a.fdiv(qword_ptr(gp0)); - a.fdivp(fpx); - a.fdivp(); - a.fdivr(fp0, fpx); - a.fdivr(fpx, fp0); - a.fdivr(dword_ptr(gp0)); - a.fdivr(qword_ptr(gp0)); - a.fdivrp(fpx); - a.fdivrp(); - a.fiadd(dword_ptr(gp0)); - a.ficom(word_ptr(gp0)); - a.ficom(dword_ptr(gp0)); - a.ficomp(word_ptr(gp0)); - a.ficomp(dword_ptr(gp0)); - a.fidiv(word_ptr(gp0)); - a.fidiv(dword_ptr(gp0)); - a.fidivr(word_ptr(gp0)); - a.fidivr(dword_ptr(gp0)); - a.fild(word_ptr(gp0)); - a.fild(dword_ptr(gp0)); - a.fild(qword_ptr(gp0)); - a.fimul(word_ptr(gp0)); - a.fimul(dword_ptr(gp0)); - a.fincstp(); - a.finit(); - a.fninit(); - a.fisub(word_ptr(gp0)); - a.fisub(dword_ptr(gp0)); - a.fisubr(word_ptr(gp0)); - a.fisubr(dword_ptr(gp0)); - a.fist(word_ptr(gp0)); - a.fist(dword_ptr(gp0)); - a.fistp(word_ptr(gp0)); - a.fistp(dword_ptr(gp0)); - a.fistp(qword_ptr(gp0)); - a.fld(dword_ptr(gp0)); - a.fld(qword_ptr(gp0)); - a.fld(tword_ptr(gp0)); - a.fld1(); - a.fldl2t(); - a.fldl2e(); - a.fldpi(); - a.fldlg2(); - a.fldln2(); - a.fldz(); - a.fldcw(ptr_gp0); - a.fldenv(ptr_gp0); - 
a.fmul(fp0, fpx); - a.fmul(fpx, fp0); - a.fmul(dword_ptr(gp0)); - a.fmul(qword_ptr(gp0)); - a.fmulp(fpx); - a.fmulp(); - a.fnclex(); - a.fnop(); - a.fnsave(ptr_gp0); - a.fnstenv(ptr_gp0); - a.fnstcw(ptr_gp0); - a.fpatan(); - a.fprem(); - a.fprem1(); - a.fptan(); - a.frndint(); - a.frstor(ptr_gp0); - a.fsave(ptr_gp0); - a.fscale(); - a.fsin(); - a.fsincos(); - a.fsqrt(); - a.fst(dword_ptr(gp0)); - a.fst(qword_ptr(gp0)); - a.fstp(dword_ptr(gp0)); - a.fstp(qword_ptr(gp0)); - a.fstp(tword_ptr(gp0)); - a.fstcw(ptr_gp0); - a.fstenv(ptr_gp0); - a.fsub(fp0, fpx); - a.fsub(fpx, fp0); - a.fsub(dword_ptr(gp0)); - a.fsub(qword_ptr(gp0)); - a.fsubp(fpx); - a.fsubp(); - a.fsubr(fp0, fpx); - a.fsubr(fpx, fp0); - a.fsubr(dword_ptr(gp0)); - a.fsubr(qword_ptr(gp0)); - a.fsubrp(fpx); - a.fsubrp(); - a.ftst(); - a.fucom(fpx); - a.fucom(); - a.fucom(fpx); - a.fucomi(fpx); - a.fucomip(fpx); - a.fucomp(fpx); - a.fucompp(); - a.fxam(); - a.fxrstor(ptr_gp0); - a.fxsave(ptr_gp0); - a.fxtract(); - a.fyl2x(); - a.fyl2xp1(); - - // MMX/MMX-EXT. 
- a.nop(); - - a.movd(ptr_gp0, mm7); - a.movd(eax, mm7); - a.movd(mm0, ptr_gp0); - a.movd(mm0, esi); - a.movq(mm0, mm7); - a.movq(ptr_gp0, mm7); - a.movq(mm0, ptr_gp0); - a.packuswb(mm0, mm7); - a.packuswb(mm0, ptr_gp0); - a.paddb(mm0, mm7); - a.paddb(mm0, ptr_gp0); - a.paddw(mm0, mm7); - a.paddw(mm0, ptr_gp0); - a.paddd(mm0, mm7); - a.paddd(mm0, ptr_gp0); - a.paddsb(mm0, mm7); - a.paddsb(mm0, ptr_gp0); - a.paddsw(mm0, mm7); - a.paddsw(mm0, ptr_gp0); - a.paddusb(mm0, mm7); - a.paddusb(mm0, ptr_gp0); - a.paddusw(mm0, mm7); - a.paddusw(mm0, ptr_gp0); - a.pand(mm0, mm7); - a.pand(mm0, ptr_gp0); - a.pandn(mm0, mm7); - a.pandn(mm0, ptr_gp0); - a.pcmpeqb(mm0, mm7); - a.pcmpeqb(mm0, ptr_gp0); - a.pcmpeqw(mm0, mm7); - a.pcmpeqw(mm0, ptr_gp0); - a.pcmpeqd(mm0, mm7); - a.pcmpeqd(mm0, ptr_gp0); - a.pcmpgtb(mm0, mm7); - a.pcmpgtb(mm0, ptr_gp0); - a.pcmpgtw(mm0, mm7); - a.pcmpgtw(mm0, ptr_gp0); - a.pcmpgtd(mm0, mm7); - a.pcmpgtd(mm0, ptr_gp0); - a.pmulhw(mm0, mm7); - a.pmulhw(mm0, ptr_gp0); - a.pmullw(mm0, mm7); - a.pmullw(mm0, ptr_gp0); - a.por(mm0, mm7); - a.por(mm0, ptr_gp0); - a.pmaddwd(mm0, mm7); - a.pmaddwd(mm0, ptr_gp0); - a.pslld(mm0, mm7); - a.pslld(mm0, ptr_gp0); - a.pslld(mm0, 0); - a.psllq(mm0, mm7); - a.psllq(mm0, ptr_gp0); - a.psllq(mm0, 0); - a.psllw(mm0, mm7); - a.psllw(mm0, ptr_gp0); - a.psllw(mm0, 0); - a.psrad(mm0, mm7); - a.psrad(mm0, ptr_gp0); - a.psrad(mm0, 0); - a.psraw(mm0, mm7); - a.psraw(mm0, ptr_gp0); - a.psraw(mm0, 0); - a.psrld(mm0, mm7); - a.psrld(mm0, ptr_gp0); - a.psrld(mm0, 0); - a.psrlq(mm0, mm7); - a.psrlq(mm0, ptr_gp0); - a.psrlq(mm0, 0); - a.psrlw(mm0, mm7); - a.psrlw(mm0, ptr_gp0); - a.psrlw(mm0, 0); - a.psubb(mm0, mm7); - a.psubb(mm0, ptr_gp0); - a.psubw(mm0, mm7); - a.psubw(mm0, ptr_gp0); - a.psubd(mm0, mm7); - a.psubd(mm0, ptr_gp0); - a.psubsb(mm0, mm7); - a.psubsb(mm0, ptr_gp0); - a.psubsw(mm0, mm7); - a.psubsw(mm0, ptr_gp0); - a.psubusb(mm0, mm7); - a.psubusb(mm0, ptr_gp0); - a.psubusw(mm0, mm7); - a.psubusw(mm0, ptr_gp0); - 
a.punpckhbw(mm0, mm7); - a.punpckhbw(mm0, ptr_gp0); - a.punpckhwd(mm0, mm7); - a.punpckhwd(mm0, ptr_gp0); - a.punpckhdq(mm0, mm7); - a.punpckhdq(mm0, ptr_gp0); - a.punpcklbw(mm0, mm7); - a.punpcklbw(mm0, ptr_gp0); - a.punpcklwd(mm0, mm7); - a.punpcklwd(mm0, ptr_gp0); - a.punpckldq(mm0, mm7); - a.punpckldq(mm0, ptr_gp0); - a.pxor(mm0, mm7); - a.pxor(mm0, ptr_gp0); - a.emms(); - - // 3DNOW! - a.nop(); - - a.pf2id(mm0, mm7); - a.pf2id(mm0, ptr_gp0); - a.pf2iw(mm0, mm7); - a.pf2iw(mm0, ptr_gp0); - a.pfacc(mm0, mm7); - a.pfacc(mm0, ptr_gp0); - a.pfadd(mm0, mm7); - a.pfadd(mm0, ptr_gp0); - a.pfcmpeq(mm0, mm7); - a.pfcmpeq(mm0, ptr_gp0); - a.pfcmpge(mm0, mm7); - a.pfcmpge(mm0, ptr_gp0); - a.pfcmpgt(mm0, mm7); - a.pfcmpgt(mm0, ptr_gp0); - a.pfmax(mm0, mm7); - a.pfmax(mm0, ptr_gp0); - a.pfmin(mm0, mm7); - a.pfmin(mm0, ptr_gp0); - a.pfmul(mm0, mm7); - a.pfmul(mm0, ptr_gp0); - a.pfnacc(mm0, mm7); - a.pfnacc(mm0, ptr_gp0); - a.pfpnacc(mm0, mm7); - a.pfpnacc(mm0, ptr_gp0); - a.pfrcp(mm0, mm7); - a.pfrcp(mm0, ptr_gp0); - a.pfrcpit1(mm0, mm7); - a.pfrcpit1(mm0, ptr_gp0); - a.pfrcpit2(mm0, mm7); - a.pfrcpit2(mm0, ptr_gp0); - a.pfrsqit1(mm0, mm7); - a.pfrsqit1(mm0, ptr_gp0); - a.pfrsqrt(mm0, mm7); - a.pfrsqrt(mm0, ptr_gp0); - a.pfsub(mm0, mm7); - a.pfsub(mm0, ptr_gp0); - a.pfsubr(mm0, mm7); - a.pfsubr(mm0, ptr_gp0); - a.pi2fd(mm0, mm7); - a.pi2fd(mm0, ptr_gp0); - a.pi2fw(mm0, mm7); - a.pi2fw(mm0, ptr_gp0); - a.pswapd(mm0, mm7); - a.pswapd(mm0, ptr_gp0); - a.prefetch3dnow(ptr_gp0); - a.prefetchw3dnow(ptr_gp0); - a.femms(); - - // SSE. 
- a.nop(); - - a.addps(xmm0, xmm7); - a.addps(xmm0, ptr_gp0); - a.addss(xmm0, xmm7); - a.addss(xmm0, ptr_gp0); - a.andnps(xmm0, xmm7); - a.andnps(xmm0, ptr_gp0); - a.andps(xmm0, xmm7); - a.andps(xmm0, ptr_gp0); - a.cmpps(xmm0, xmm7, 0); - a.cmpps(xmm0, ptr_gp0, 0); - a.cmpss(xmm0, xmm7, 0); - a.cmpss(xmm0, ptr_gp0, 0); - a.comiss(xmm0, xmm7); - a.comiss(xmm0, ptr_gp0); - a.cvtpi2ps(xmm0, mm7); - a.cvtpi2ps(xmm0, ptr_gp0); - a.cvtps2pi(mm0, xmm7); - a.cvtps2pi(mm0, ptr_gp0); - a.cvtsi2ss(xmm0, gp0); - a.cvtsi2ss(xmm0, ptr_gp0); - a.cvtss2si(gp0, xmm7); - a.cvtss2si(gp0, ptr_gp0); - a.cvttps2pi(mm0, xmm7); - a.cvttps2pi(mm0, ptr_gp0); - a.cvttss2si(gp0, xmm7); - a.cvttss2si(gp0, ptr_gp0); - a.divps(xmm0, xmm7); - a.divps(xmm0, ptr_gp0); - a.divss(xmm0, xmm7); - a.divss(xmm0, ptr_gp0); - a.ldmxcsr(ptr_gp0); - a.maskmovq(mm0, mm7); - a.maxps(xmm0, xmm7); - a.maxps(xmm0, ptr_gp0); - a.maxss(xmm0, xmm7); - a.maxss(xmm0, ptr_gp0); - a.minps(xmm0, xmm7); - a.minps(xmm0, ptr_gp0); - a.minss(xmm0, xmm7); - a.minss(xmm0, ptr_gp0); - a.movaps(xmm0, xmm7); - a.movaps(xmm0, ptr_gp0); - a.movaps(ptr_gp0, xmm7); - a.movd(ptr_gp0, xmm7); - a.movd(eax, xmm7); - a.movd(xmm0, ptr_gp0); - a.movd(xmm0, eax); - a.movq(mm0, mm7); - a.movq(xmm0, xmm7); - a.movq(ptr_gp0, xmm7); - a.movq(xmm0, ptr_gp0); - a.movntq(ptr_gp0, mm7); - a.movhlps(xmm0, xmm7); - a.movhps(xmm0, ptr_gp0); - a.movhps(ptr_gp0, xmm7); - a.movlhps(xmm0, xmm7); - a.movlps(xmm0, ptr_gp0); - a.movlps(ptr_gp0, xmm7); - a.movntps(ptr_gp0, xmm7); - a.movss(xmm0, ptr_gp0); - a.movss(ptr_gp0, xmm7); - a.movups(xmm0, xmm7); - a.movups(xmm0, ptr_gp0); - a.movups(ptr_gp0, xmm7); - a.mulps(xmm0, xmm7); - a.mulps(xmm0, ptr_gp0); - a.mulss(xmm0, xmm7); - a.mulss(xmm0, ptr_gp0); - a.orps(xmm0, xmm7); - a.orps(xmm0, ptr_gp0); - a.pavgb(mm0, mm7); - a.pavgb(mm0, ptr_gp0); - a.pavgw(mm0, mm7); - a.pavgw(mm0, ptr_gp0); - a.pextrw(gp0, mm7, 0); - a.pinsrw(mm0, eax, 0); - a.pinsrw(mm0, ptr_gp0, 0); - a.pmaxsw(mm0, mm7); - a.pmaxsw(mm0, 
ptr_gp0); - a.pmaxub(mm0, mm7); - a.pmaxub(mm0, ptr_gp0); - a.pminsw(mm0, mm7); - a.pminsw(mm0, ptr_gp0); - a.pminub(mm0, mm7); - a.pminub(mm0, ptr_gp0); - a.pmovmskb(gp0, mm7); - a.pmulhuw(mm0, mm7); - a.pmulhuw(mm0, ptr_gp0); - a.psadbw(mm0, mm7); - a.psadbw(mm0, ptr_gp0); - a.pshufw(mm0, mm7, 0); - a.pshufw(mm0, ptr_gp0, 0); - a.rcpps(xmm0, xmm7); - a.rcpps(xmm0, ptr_gp0); - a.rcpss(xmm0, xmm7); - a.rcpss(xmm0, ptr_gp0); - a.prefetch(ptr_gp0, 0); - a.psadbw(xmm0, xmm7); - a.psadbw(xmm0, ptr_gp0); - a.rsqrtps(xmm0, xmm7); - a.rsqrtps(xmm0, ptr_gp0); - a.rsqrtss(xmm0, xmm7); - a.rsqrtss(xmm0, ptr_gp0); - a.sfence(); - a.shufps(xmm0, xmm7, 0); - a.shufps(xmm0, ptr_gp0, 0); - a.sqrtps(xmm0, xmm7); - a.sqrtps(xmm0, ptr_gp0); - a.sqrtss(xmm0, xmm7); - a.sqrtss(xmm0, ptr_gp0); - a.stmxcsr(ptr_gp0); - a.subps(xmm0, xmm7); - a.subps(xmm0, ptr_gp0); - a.subss(xmm0, xmm7); - a.subss(xmm0, ptr_gp0); - a.ucomiss(xmm0, xmm7); - a.ucomiss(xmm0, ptr_gp0); - a.unpckhps(xmm0, xmm7); - a.unpckhps(xmm0, ptr_gp0); - a.unpcklps(xmm0, xmm7); - a.unpcklps(xmm0, ptr_gp0); - a.xorps(xmm0, xmm7); - a.xorps(xmm0, ptr_gp0); - - // SSE2. 
- a.nop(); - - a.addpd(xmm0, xmm7); - a.addpd(xmm0, ptr_gp0); - a.addsd(xmm0, xmm7); - a.addsd(xmm0, ptr_gp0); - a.andnpd(xmm0, xmm7); - a.andnpd(xmm0, ptr_gp0); - a.andpd(xmm0, xmm7); - a.andpd(xmm0, ptr_gp0); - a.clflush(ptr_gp0); - a.cmppd(xmm0, xmm7, 0); - a.cmppd(xmm0, ptr_gp0, 0); - a.cmpsd(xmm0, xmm7, 0); - a.cmpsd(xmm0, ptr_gp0, 0); - a.comisd(xmm0, xmm7); - a.comisd(xmm0, ptr_gp0); - a.cvtdq2pd(xmm0, xmm7); - a.cvtdq2pd(xmm0, ptr_gp0); - a.cvtdq2ps(xmm0, xmm7); - a.cvtdq2ps(xmm0, ptr_gp0); - a.cvtpd2dq(xmm0, xmm7); - a.cvtpd2dq(xmm0, ptr_gp0); - a.cvtpd2pi(mm0, xmm7); - a.cvtpd2pi(mm0, ptr_gp0); - a.cvtpd2ps(xmm0, xmm7); - a.cvtpd2ps(xmm0, ptr_gp0); - a.cvtpi2pd(xmm0, mm7); - a.cvtpi2pd(xmm0, ptr_gp0); - a.cvtps2dq(xmm0, xmm7); - a.cvtps2dq(xmm0, ptr_gp0); - a.cvtps2pd(xmm0, xmm7); - a.cvtps2pd(xmm0, ptr_gp0); - a.cvtsd2si(gp0, xmm7); - a.cvtsd2si(gp0, ptr_gp0); - a.cvtsd2ss(xmm0, xmm7); - a.cvtsd2ss(xmm0, ptr_gp0); - a.cvtsi2sd(xmm0, zsi); - a.cvtsi2sd(xmm0, ptr_gp0); - a.cvtss2sd(xmm0, xmm7); - a.cvtss2sd(xmm0, ptr_gp0); - a.cvtss2si(gp0, xmm7); - a.cvtss2si(gp0, ptr_gp0); - a.cvttpd2pi(mm0, xmm7); - a.cvttpd2pi(mm0, ptr_gp0); - a.cvttpd2dq(xmm0, xmm7); - a.cvttpd2dq(xmm0, ptr_gp0); - a.cvttps2dq(xmm0, xmm7); - a.cvttps2dq(xmm0, ptr_gp0); - a.cvttsd2si(gp0, xmm7); - a.cvttsd2si(gp0, ptr_gp0); - a.divpd(xmm0, xmm7); - a.divpd(xmm0, ptr_gp0); - a.divsd(xmm0, xmm7); - a.divsd(xmm0, ptr_gp0); - a.lfence(); - a.maskmovdqu(xmm0, xmm7); - a.maxpd(xmm0, xmm7); - a.maxpd(xmm0, ptr_gp0); - a.maxsd(xmm0, xmm7); - a.maxsd(xmm0, ptr_gp0); - a.mfence(); - a.minpd(xmm0, xmm7); - a.minpd(xmm0, ptr_gp0); - a.minsd(xmm0, xmm7); - a.minsd(xmm0, ptr_gp0); - a.movdqa(xmm0, xmm7); - a.movdqa(xmm0, ptr_gp0); - a.movdqa(ptr_gp0, xmm7); - a.movdqu(xmm0, xmm7); - a.movdqu(xmm0, ptr_gp0); - a.movdqu(ptr_gp0, xmm7); - a.movmskps(gp0, xmm7); - a.movmskpd(gp0, xmm7); - a.movsd(xmm0, xmm7); - a.movsd(xmm0, ptr_gp0); - a.movsd(ptr_gp0, xmm7); - a.movapd(xmm0, ptr_gp0); - 
a.movapd(ptr_gp0, xmm7); - a.movdq2q(mm0, xmm7); - a.movq2dq(xmm0, mm7); - a.movhpd(xmm0, ptr_gp0); - a.movhpd(ptr_gp0, xmm7); - a.movlpd(xmm0, ptr_gp0); - a.movlpd(ptr_gp0, xmm7); - a.movntdq(ptr_gp0, xmm7); - a.movnti(ptr_gp0, zsi); - a.movntpd(ptr_gp0, xmm7); - a.movupd(xmm0, ptr_gp0); - a.movupd(ptr_gp0, xmm7); - a.mulpd(xmm0, xmm7); - a.mulpd(xmm0, ptr_gp0); - a.mulsd(xmm0, xmm7); - a.mulsd(xmm0, ptr_gp0); - a.orpd(xmm0, xmm7); - a.orpd(xmm0, ptr_gp0); - a.packsswb(xmm0, xmm7); - a.packsswb(xmm0, ptr_gp0); - a.packssdw(xmm0, xmm7); - a.packssdw(xmm0, ptr_gp0); - a.packuswb(xmm0, xmm7); - a.packuswb(xmm0, ptr_gp0); - a.paddb(xmm0, xmm7); - a.paddb(xmm0, ptr_gp0); - a.paddw(xmm0, xmm7); - a.paddw(xmm0, ptr_gp0); - a.paddd(xmm0, xmm7); - a.paddd(xmm0, ptr_gp0); - a.paddq(mm0, mm7); - a.paddq(mm0, ptr_gp0); - a.paddq(xmm0, xmm7); - a.paddq(xmm0, ptr_gp0); - a.paddsb(xmm0, xmm7); - a.paddsb(xmm0, ptr_gp0); - a.paddsw(xmm0, xmm7); - a.paddsw(xmm0, ptr_gp0); - a.paddusb(xmm0, xmm7); - a.paddusb(xmm0, ptr_gp0); - a.paddusw(xmm0, xmm7); - a.paddusw(xmm0, ptr_gp0); - a.pand(xmm0, xmm7); - a.pand(xmm0, ptr_gp0); - a.pandn(xmm0, xmm7); - a.pandn(xmm0, ptr_gp0); - a.pause(); - a.pavgb(xmm0, xmm7); - a.pavgb(xmm0, ptr_gp0); - a.pavgw(xmm0, xmm7); - a.pavgw(xmm0, ptr_gp0); - a.pcmpeqb(xmm0, xmm7); - a.pcmpeqb(xmm0, ptr_gp0); - a.pcmpeqw(xmm0, xmm7); - a.pcmpeqw(xmm0, ptr_gp0); - a.pcmpeqd(xmm0, xmm7); - a.pcmpeqd(xmm0, ptr_gp0); - a.pcmpgtb(xmm0, xmm7); - a.pcmpgtb(xmm0, ptr_gp0); - a.pcmpgtw(xmm0, xmm7); - a.pcmpgtw(xmm0, ptr_gp0); - a.pcmpgtd(xmm0, xmm7); - a.pcmpgtd(xmm0, ptr_gp0); - a.pmaxsw(xmm0, xmm7); - a.pmaxsw(xmm0, ptr_gp0); - a.pmaxub(xmm0, xmm7); - a.pmaxub(xmm0, ptr_gp0); - a.pminsw(xmm0, xmm7); - a.pminsw(xmm0, ptr_gp0); - a.pminub(xmm0, xmm7); - a.pminub(xmm0, ptr_gp0); - a.pmovmskb(gp0, xmm7); - a.pmulhw(xmm0, xmm7); - a.pmulhw(xmm0, ptr_gp0); - a.pmulhuw(xmm0, xmm7); - a.pmulhuw(xmm0, ptr_gp0); - a.pmullw(xmm0, xmm7); - a.pmullw(xmm0, ptr_gp0); - 
a.pmuludq(mm0, mm7); - a.pmuludq(mm0, ptr_gp0); - a.pmuludq(xmm0, xmm7); - a.pmuludq(xmm0, ptr_gp0); - a.por(xmm0, xmm7); - a.por(xmm0, ptr_gp0); - a.pslld(xmm0, xmm7); - a.pslld(xmm0, ptr_gp0); - a.pslld(xmm0, 0); - a.psllq(xmm0, xmm7); - a.psllq(xmm0, ptr_gp0); - a.psllq(xmm0, 0); - a.psllw(xmm0, xmm7); - a.psllw(xmm0, ptr_gp0); - a.psllw(xmm0, 0); - a.pslldq(xmm0, 0); - a.psrad(xmm0, xmm7); - a.psrad(xmm0, ptr_gp0); - a.psrad(xmm0, 0); - a.psraw(xmm0, xmm7); - a.psraw(xmm0, ptr_gp0); - a.psraw(xmm0, 0); - a.psubb(xmm0, xmm7); - a.psubb(xmm0, ptr_gp0); - a.psubw(xmm0, xmm7); - a.psubw(xmm0, ptr_gp0); - a.psubd(xmm0, xmm7); - a.psubd(xmm0, ptr_gp0); - a.psubq(mm0, mm7); - a.psubq(mm0, ptr_gp0); - a.psubq(xmm0, xmm7); - a.psubq(xmm0, ptr_gp0); - a.pmaddwd(xmm0, xmm7); - a.pmaddwd(xmm0, ptr_gp0); - a.pshufd(xmm0, xmm7, 0); - a.pshufd(xmm0, ptr_gp0, 0); - a.pshufhw(xmm0, xmm7, 0); - a.pshufhw(xmm0, ptr_gp0, 0); - a.pshuflw(xmm0, xmm7, 0); - a.pshuflw(xmm0, ptr_gp0, 0); - a.psrld(xmm0, xmm7); - a.psrld(xmm0, ptr_gp0); - a.psrld(xmm0, 0); - a.psrlq(xmm0, xmm7); - a.psrlq(xmm0, ptr_gp0); - a.psrlq(xmm0, 0); - a.psrldq(xmm0, 0); - a.psrlw(xmm0, xmm7); - a.psrlw(xmm0, ptr_gp0); - a.psrlw(xmm0, 0); - a.psubsb(xmm0, xmm7); - a.psubsb(xmm0, ptr_gp0); - a.psubsw(xmm0, xmm7); - a.psubsw(xmm0, ptr_gp0); - a.psubusb(xmm0, xmm7); - a.psubusb(xmm0, ptr_gp0); - a.psubusw(xmm0, xmm7); - a.psubusw(xmm0, ptr_gp0); - a.punpckhbw(xmm0, xmm7); - a.punpckhbw(xmm0, ptr_gp0); - a.punpckhwd(xmm0, xmm7); - a.punpckhwd(xmm0, ptr_gp0); - a.punpckhdq(xmm0, xmm7); - a.punpckhdq(xmm0, ptr_gp0); - a.punpckhqdq(xmm0, xmm7); - a.punpckhqdq(xmm0, ptr_gp0); - a.punpcklbw(xmm0, xmm7); - a.punpcklbw(xmm0, ptr_gp0); - a.punpcklwd(xmm0, xmm7); - a.punpcklwd(xmm0, ptr_gp0); - a.punpckldq(xmm0, xmm7); - a.punpckldq(xmm0, ptr_gp0); - a.punpcklqdq(xmm0, xmm7); - a.punpcklqdq(xmm0, ptr_gp0); - a.pxor(xmm0, xmm7); - a.pxor(xmm0, ptr_gp0); - a.sqrtpd(xmm0, xmm7); - a.sqrtpd(xmm0, ptr_gp0); - a.sqrtsd(xmm0, xmm7); 
- a.sqrtsd(xmm0, ptr_gp0); - a.subpd(xmm0, xmm7); - a.subpd(xmm0, ptr_gp0); - a.subsd(xmm0, xmm7); - a.subsd(xmm0, ptr_gp0); - a.ucomisd(xmm0, xmm7); - a.ucomisd(xmm0, ptr_gp0); - a.unpckhpd(xmm0, xmm7); - a.unpckhpd(xmm0, ptr_gp0); - a.unpcklpd(xmm0, xmm7); - a.unpcklpd(xmm0, ptr_gp0); - a.xorpd(xmm0, xmm7); - a.xorpd(xmm0, ptr_gp0); - - // SSE3. - a.nop(); - - a.addsubpd(xmm0, xmm7); - a.addsubpd(xmm0, ptr_gp0); - a.addsubps(xmm0, xmm7); - a.addsubps(xmm0, ptr_gp0); - a.fisttp(dword_ptr(gp0)); - a.haddpd(xmm0, xmm7); - a.haddpd(xmm0, ptr_gp0); - a.haddps(xmm0, xmm7); - a.haddps(xmm0, ptr_gp0); - a.hsubpd(xmm0, xmm7); - a.hsubpd(xmm0, ptr_gp0); - a.hsubps(xmm0, xmm7); - a.hsubps(xmm0, ptr_gp0); - a.lddqu(xmm0, ptr_gp0); - a.monitor(); - a.movddup(xmm0, xmm7); - a.movddup(xmm0, ptr_gp0); - a.movshdup(xmm0, xmm7); - a.movshdup(xmm0, ptr_gp0); - a.movsldup(xmm0, xmm7); - a.movsldup(xmm0, ptr_gp0); - a.mwait(); - - // SSSE3. - a.nop(); - - a.psignb(mm0, mm7); - a.psignb(mm0, ptr_gp0); - a.psignb(xmm0, xmm7); - a.psignb(xmm0, ptr_gp0); - a.psignw(mm0, mm7); - a.psignw(mm0, ptr_gp0); - a.psignw(xmm0, xmm7); - a.psignw(xmm0, ptr_gp0); - a.psignd(mm0, mm7); - a.psignd(mm0, ptr_gp0); - a.psignd(xmm0, xmm7); - a.psignd(xmm0, ptr_gp0); - a.phaddw(mm0, mm7); - a.phaddw(mm0, ptr_gp0); - a.phaddw(xmm0, xmm7); - a.phaddw(xmm0, ptr_gp0); - a.phaddd(mm0, mm7); - a.phaddd(mm0, ptr_gp0); - a.phaddd(xmm0, xmm7); - a.phaddd(xmm0, ptr_gp0); - a.phaddsw(mm0, mm7); - a.phaddsw(mm0, ptr_gp0); - a.phaddsw(xmm0, xmm7); - a.phaddsw(xmm0, ptr_gp0); - a.phsubw(mm0, mm7); - a.phsubw(mm0, ptr_gp0); - a.phsubw(xmm0, xmm7); - a.phsubw(xmm0, ptr_gp0); - a.phsubd(mm0, mm7); - a.phsubd(mm0, ptr_gp0); - a.phsubd(xmm0, xmm7); - a.phsubd(xmm0, ptr_gp0); - a.phsubsw(mm0, mm7); - a.phsubsw(mm0, ptr_gp0); - a.phsubsw(xmm0, xmm7); - a.phsubsw(xmm0, ptr_gp0); - a.pmaddubsw(mm0, mm7); - a.pmaddubsw(mm0, ptr_gp0); - a.pmaddubsw(xmm0, xmm7); - a.pmaddubsw(xmm0, ptr_gp0); - a.pabsb(mm0, mm7); - a.pabsb(mm0, 
ptr_gp0); - a.pabsb(xmm0, xmm7); - a.pabsb(xmm0, ptr_gp0); - a.pabsw(mm0, mm7); - a.pabsw(mm0, ptr_gp0); - a.pabsw(xmm0, xmm7); - a.pabsw(xmm0, ptr_gp0); - a.pabsd(mm0, mm7); - a.pabsd(mm0, ptr_gp0); - a.pabsd(xmm0, xmm7); - a.pabsd(xmm0, ptr_gp0); - a.pmulhrsw(mm0, mm7); - a.pmulhrsw(mm0, ptr_gp0); - a.pmulhrsw(xmm0, xmm7); - a.pmulhrsw(xmm0, ptr_gp0); - a.pshufb(mm0, mm7); - a.pshufb(mm0, ptr_gp0); - a.pshufb(xmm0, xmm7); - a.pshufb(xmm0, ptr_gp0); - a.palignr(mm0, mm7, 0); - a.palignr(mm0, ptr_gp0, 0); - a.palignr(xmm0, xmm7, 0); - a.palignr(xmm0, ptr_gp0, 0); - - // SSE4.1. - a.nop(); - - a.blendpd(xmm0, xmm7, 0); - a.blendpd(xmm0, ptr_gp0, 0); - a.blendps(xmm0, xmm7, 0); - a.blendps(xmm0, ptr_gp0, 0); - a.blendvpd(xmm0, xmm7); - a.blendvpd(xmm0, ptr_gp0); - a.blendvps(xmm0, xmm7); - a.blendvps(xmm0, ptr_gp0); - a.dppd(xmm0, xmm7, 0); - a.dppd(xmm0, ptr_gp0, 0); - a.dpps(xmm0, xmm7, 0); - a.dpps(xmm0, ptr_gp0, 0); - a.extractps(gp0, xmm7, 0); - a.extractps(ptr_gp0, xmm7, 0); - a.insertps(xmm0, xmm1, 0); - a.insertps(xmm0, ptr_gp0, 0); - a.movntdqa(xmm0, ptr_gp0); - a.mpsadbw(xmm0, xmm7, 0); - a.mpsadbw(xmm0, ptr_gp0, 0); - a.packusdw(xmm0, xmm7); - a.packusdw(xmm0, ptr_gp0); - a.pblendvb(xmm0, xmm7); - a.pblendvb(xmm0, ptr_gp0); - a.pblendw(xmm0, xmm7, 0); - a.pblendw(xmm0, ptr_gp0, 0); - a.pcmpeqq(xmm0, xmm7); - a.pcmpeqq(xmm0, ptr_gp0); - a.pextrb(gp0, xmm0, 0); - a.pextrb(ptr_gp0, xmm7, 0); - a.pextrd(gp0, xmm0, 0); - a.pextrd(ptr_gp0, xmm7, 0); - a.pextrq(gp0, xmm0, 0); - a.pextrq(ptr_gp0, xmm7, 0); - a.pextrw(gp0, xmm0, 0); - a.pextrw(ptr_gp0, xmm7, 0); - a.phminposuw(xmm0, xmm7); - a.phminposuw(xmm0, ptr_gp0); - a.pinsrb(xmm0, eax, 0); - a.pinsrb(xmm0, ptr_gp0, 0); - a.pinsrd(xmm0, eax, 0); - a.pinsrd(xmm0, ptr_gp0, 0); - a.pinsrw(xmm0, eax, 0); - a.pinsrw(xmm0, ptr_gp0, 0); - a.pmaxuw(xmm0, xmm7); - a.pmaxuw(xmm0, ptr_gp0); - a.pmaxsb(xmm0, xmm7); - a.pmaxsb(xmm0, ptr_gp0); - a.pmaxsd(xmm0, xmm7); - a.pmaxsd(xmm0, ptr_gp0); - a.pmaxud(xmm0, xmm7); - 
a.pmaxud(xmm0, ptr_gp0); - a.pminsb(xmm0, xmm7); - a.pminsb(xmm0, ptr_gp0); - a.pminuw(xmm0, xmm7); - a.pminuw(xmm0, ptr_gp0); - a.pminud(xmm0, xmm7); - a.pminud(xmm0, ptr_gp0); - a.pminsd(xmm0, xmm7); - a.pminsd(xmm0, ptr_gp0); - a.pmovsxbw(xmm0, xmm7); - a.pmovsxbw(xmm0, ptr_gp0); - a.pmovsxbd(xmm0, xmm7); - a.pmovsxbd(xmm0, ptr_gp0); - a.pmovsxbq(xmm0, xmm7); - a.pmovsxbq(xmm0, ptr_gp0); - a.pmovsxwd(xmm0, xmm7); - a.pmovsxwd(xmm0, ptr_gp0); - a.pmovsxwq(xmm0, xmm7); - a.pmovsxwq(xmm0, ptr_gp0); - a.pmovsxdq(xmm0, xmm7); - a.pmovsxdq(xmm0, ptr_gp0); - a.pmovzxbw(xmm0, xmm7); - a.pmovzxbw(xmm0, ptr_gp0); - a.pmovzxbd(xmm0, xmm7); - a.pmovzxbd(xmm0, ptr_gp0); - a.pmovzxbq(xmm0, xmm7); - a.pmovzxbq(xmm0, ptr_gp0); - a.pmovzxwd(xmm0, xmm7); - a.pmovzxwd(xmm0, ptr_gp0); - a.pmovzxwq(xmm0, xmm7); - a.pmovzxwq(xmm0, ptr_gp0); - a.pmovzxdq(xmm0, xmm7); - a.pmovzxdq(xmm0, ptr_gp0); - a.pmuldq(xmm0, xmm7); - a.pmuldq(xmm0, ptr_gp0); - a.pmulld(xmm0, xmm7); - a.pmulld(xmm0, ptr_gp0); - a.ptest(xmm0, xmm7); - a.ptest(xmm0, ptr_gp0); - a.roundps(xmm0, xmm7, 0); - a.roundps(xmm0, ptr_gp0, 0); - a.roundss(xmm0, xmm7, 0); - a.roundss(xmm0, ptr_gp0, 0); - a.roundpd(xmm0, xmm7, 0); - a.roundpd(xmm0, ptr_gp0, 0); - a.roundsd(xmm0, xmm7, 0); - a.roundsd(xmm0, ptr_gp0, 0); - - // SSE4.2. - a.nop(); - - a.pcmpestri(xmm0, xmm7, 0); - a.pcmpestri(xmm0, ptr_gp0, 0); - a.pcmpestrm(xmm0, xmm7, 0); - a.pcmpestrm(xmm0, ptr_gp0, 0); - a.pcmpistri(xmm0, xmm7, 0); - a.pcmpistri(xmm0, ptr_gp0, 0); - a.pcmpistrm(xmm0, xmm7, 0); - a.pcmpistrm(xmm0, ptr_gp0, 0); - a.pcmpgtq(xmm0, xmm7); - a.pcmpgtq(xmm0, ptr_gp0); - - // SSE4a. - a.nop(); - - a.extrq(xmm0, xmm1); - a.extrq(xmm0, 0x1, 0x2); - a.insertq(xmm0, xmm1); - a.insertq(xmm0, xmm1, 0x1, 0x2); - a.movntsd(ptr_gp0, xmm7); - a.movntss(ptr_gp0, xmm7); - - // POPCNT. - a.nop(); - - a.popcnt(gp0, ptr_gp0); - - // AESNI. 
- a.nop(); - - a.aesdec(xmm0, xmm7); - a.aesdec(xmm0, ptr_gp0); - a.aesdeclast(xmm0, xmm7); - a.aesdeclast(xmm0, ptr_gp0); - a.aesenc(xmm0, xmm7); - a.aesenc(xmm0, ptr_gp0); - a.aesenclast(xmm0, xmm7); - a.aesenclast(xmm0, ptr_gp0); - a.aesimc(xmm0, xmm7); - a.aesimc(xmm0, ptr_gp0); - a.aeskeygenassist(xmm0, xmm7, 0); - a.aeskeygenassist(xmm0, ptr_gp0, 0); - - // PCLMULQDQ. - a.nop(); - - a.pclmulqdq(xmm0, xmm7, 0); - a.pclmulqdq(xmm0, ptr_gp0, 0); - - // XSAVE. - a.nop(); - - a.xgetbv(); - a.xsetbv(); - - a.xsave(ptr_gp0); - a.xsaveopt(ptr_gp0); - a.xrstor(ptr_gp0); - - // AVX. - a.nop(); - - a.vaddpd(xmm0, xmm1, xmm2); - a.vaddpd(xmm0, xmm1, ptr_gp0); - a.vaddpd(ymm0, ymm1, ymm2); - a.vaddpd(ymm0, ymm1, ptr_gp0); - a.vaddps(xmm0, xmm1, xmm2); - a.vaddps(xmm0, xmm1, ptr_gp0); - a.vaddps(ymm0, ymm1, ymm2); - a.vaddps(ymm0, ymm1, ptr_gp0); - a.vaddsd(xmm0, xmm1, xmm2); - a.vaddsd(xmm0, xmm1, ptr_gp0); - a.vaddss(xmm0, xmm1, xmm2); - a.vaddss(xmm0, xmm1, ptr_gp0); - a.vaddsubpd(xmm0, xmm1, xmm2); - a.vaddsubpd(xmm0, xmm1, ptr_gp0); - a.vaddsubpd(ymm0, ymm1, ymm2); - a.vaddsubpd(ymm0, ymm1, ptr_gp0); - a.vaddsubps(xmm0, xmm1, xmm2); - a.vaddsubps(xmm0, xmm1, ptr_gp0); - a.vaddsubps(ymm0, ymm1, ymm2); - a.vaddsubps(ymm0, ymm1, ptr_gp0); - a.vandpd(xmm0, xmm1, xmm2); - a.vandpd(xmm0, xmm1, ptr_gp0); - a.vandpd(ymm0, ymm1, ymm2); - a.vandpd(ymm0, ymm1, ptr_gp0); - a.vandps(xmm0, xmm1, xmm2); - a.vandps(xmm0, xmm1, ptr_gp0); - a.vandps(ymm0, ymm1, ymm2); - a.vandps(ymm0, ymm1, ptr_gp0); - a.vandnpd(xmm0, xmm1, xmm2); - a.vandnpd(xmm0, xmm1, ptr_gp0); - a.vandnpd(ymm0, ymm1, ymm2); - a.vandnpd(ymm0, ymm1, ptr_gp0); - a.vandnps(xmm0, xmm1, xmm2); - a.vandnps(xmm0, xmm1, ptr_gp0); - a.vandnps(ymm0, ymm1, ymm2); - a.vandnps(ymm0, ymm1, ptr_gp0); - a.vblendpd(xmm0, xmm1, xmm2, 0); - a.vblendpd(xmm0, xmm1, ptr_gp0, 0); - a.vblendpd(ymm0, ymm1, ymm2, 0); - a.vblendpd(ymm0, ymm1, ptr_gp0, 0); - a.vblendps(xmm0, xmm1, xmm2, 0); - a.vblendps(xmm0, xmm1, ptr_gp0, 0); - 
a.vblendps(ymm0, ymm1, ymm2, 0); - a.vblendps(ymm0, ymm1, ptr_gp0, 0); - a.vblendvpd(xmm0, xmm1, xmm2, xmm3); - a.vblendvpd(xmm0, xmm1, ptr_gp0, xmm3); - a.vblendvpd(ymm0, ymm1, ymm2, ymm3); - a.vblendvpd(ymm0, ymm1, ptr_gp0, ymm3); - a.vbroadcastf128(ymm0, ptr_gp0); - a.vbroadcastsd(ymm0, ptr_gp0); - a.vbroadcastss(xmm0, ptr_gp0); - a.vbroadcastss(ymm0, ptr_gp0); - a.vcmppd(xmm0, xmm1, xmm2, 0); - a.vcmppd(xmm0, xmm1, ptr_gp0, 0); - a.vcmppd(ymm0, ymm1, ymm2, 0); - a.vcmppd(ymm0, ymm1, ptr_gp0, 0); - a.vcmpps(xmm0, xmm1, xmm2, 0); - a.vcmpps(xmm0, xmm1, ptr_gp0, 0); - a.vcmpps(ymm0, ymm1, ymm2, 0); - a.vcmpps(ymm0, ymm1, ptr_gp0, 0); - a.vcmpsd(xmm0, xmm1, xmm2, 0); - a.vcmpsd(xmm0, xmm1, ptr_gp0, 0); - a.vcmpss(xmm0, xmm1, xmm2, 0); - a.vcmpss(xmm0, xmm1, ptr_gp0, 0); - a.vcomisd(xmm0, xmm1); - a.vcomisd(xmm0, ptr_gp0); - a.vcomiss(xmm0, xmm1); - a.vcomiss(xmm0, ptr_gp0); - a.vcvtdq2pd(xmm0, xmm1); - a.vcvtdq2pd(xmm0, ptr_gp0); - a.vcvtdq2pd(ymm0, xmm1); - a.vcvtdq2pd(ymm0, ptr_gp0); - a.vcvtdq2ps(xmm0, xmm1); - a.vcvtdq2ps(xmm0, ptr_gp0); - a.vcvtdq2ps(ymm0, ymm1); - a.vcvtdq2ps(ymm0, ptr_gp0); - a.vcvtpd2dq(xmm0, xmm1); - a.vcvtpd2dq(xmm0, ymm1); - a.vcvtpd2dq(xmm0, ptr_gp0); - a.vcvtpd2ps(xmm0, xmm1); - a.vcvtpd2ps(xmm0, ymm1); - a.vcvtpd2ps(xmm0, ptr_gp0); - a.vcvtps2dq(xmm0, xmm1); - a.vcvtps2dq(xmm0, ptr_gp0); - a.vcvtps2dq(ymm0, ymm1); - a.vcvtps2dq(ymm0, ptr_gp0); - a.vcvtps2pd(xmm0, xmm1); - a.vcvtps2pd(xmm0, ptr_gp0); - a.vcvtps2pd(ymm0, xmm1); - a.vcvtps2pd(ymm0, ptr_gp0); - a.vcvtsd2si(gp0, xmm1); - a.vcvtsd2si(gp0, ptr_gp1); - a.vcvtsd2ss(xmm0, xmm1, xmm2); - a.vcvtsd2ss(xmm0, xmm1, ptr_gp0); - a.vcvtsi2sd(xmm0, xmm1, gp0); - a.vcvtsi2sd(xmm0, xmm1, ptr_gp0); - a.vcvtsi2ss(xmm0, xmm1, gp0); - a.vcvtsi2ss(xmm0, xmm1, ptr_gp0); - a.vcvtss2sd(xmm0, xmm1, xmm2); - a.vcvtss2sd(xmm0, xmm1, ptr_gp0); - a.vcvtss2si(gp0, xmm1); - a.vcvtss2si(gp0, ptr_gp1); - a.vcvttpd2dq(xmm0, xmm1); - a.vcvttpd2dq(xmm0, ymm1); - a.vcvttpd2dq(xmm0, ptr_gp0); - 
a.vcvttps2dq(xmm0, xmm1); - a.vcvttps2dq(xmm0, ptr_gp0); - a.vcvttps2dq(ymm0, ymm1); - a.vcvttps2dq(ymm0, ptr_gp0); - a.vcvttsd2si(gp0, xmm1); - a.vcvttsd2si(gp0, ptr_gp1); - a.vcvttss2si(gp0, xmm1); - a.vcvttss2si(gp0, ptr_gp1); - a.vdivpd(xmm0, xmm1, xmm2); - a.vdivpd(xmm0, xmm1, ptr_gp0); - a.vdivpd(ymm0, ymm1, ymm2); - a.vdivpd(ymm0, ymm1, ptr_gp0); - a.vdivps(xmm0, xmm1, xmm2); - a.vdivps(xmm0, xmm1, ptr_gp0); - a.vdivps(ymm0, ymm1, ymm2); - a.vdivps(ymm0, ymm1, ptr_gp0); - a.vdivsd(xmm0, xmm1, xmm2); - a.vdivsd(xmm0, xmm1, ptr_gp0); - a.vdivss(xmm0, xmm1, xmm2); - a.vdivss(xmm0, xmm1, ptr_gp0); - a.vdppd(xmm0, xmm1, xmm2, 0); - a.vdppd(xmm0, xmm1, ptr_gp0, 0); - a.vdpps(xmm0, xmm1, xmm2, 0); - a.vdpps(xmm0, xmm1, ptr_gp0, 0); - a.vdpps(ymm0, ymm1, ymm2, 0); - a.vdpps(ymm0, ymm1, ptr_gp0, 0); - a.vextractf128(xmm0, ymm1, 0); - a.vextractf128(ptr_gp0, ymm1, 0); - a.vextractps(gp0, xmm1, 0); - a.vextractps(ptr_gp0, xmm1, 0); - a.vhaddpd(xmm0, xmm1, xmm2); - a.vhaddpd(xmm0, xmm1, ptr_gp0); - a.vhaddpd(ymm0, ymm1, ymm2); - a.vhaddpd(ymm0, ymm1, ptr_gp0); - a.vhaddps(xmm0, xmm1, xmm2); - a.vhaddps(xmm0, xmm1, ptr_gp0); - a.vhaddps(ymm0, ymm1, ymm2); - a.vhaddps(ymm0, ymm1, ptr_gp0); - a.vhsubpd(xmm0, xmm1, xmm2); - a.vhsubpd(xmm0, xmm1, ptr_gp0); - a.vhsubpd(ymm0, ymm1, ymm2); - a.vhsubpd(ymm0, ymm1, ptr_gp0); - a.vhsubps(xmm0, xmm1, xmm2); - a.vhsubps(xmm0, xmm1, ptr_gp0); - a.vhsubps(ymm0, ymm1, ymm2); - a.vhsubps(ymm0, ymm1, ptr_gp0); - a.vinsertf128(ymm0, ymm1, xmm2, 0); - a.vinsertf128(ymm0, ymm1, ptr_gp0, 0); - a.vinsertps(xmm0, xmm1, xmm2, 0); - a.vinsertps(xmm0, xmm1, ptr_gp0, 0); - a.vlddqu(xmm0, ptr_gp0); - a.vlddqu(ymm0, ptr_gp0); - a.vldmxcsr(ptr_gp0); - a.vmaskmovdqu(xmm0, xmm1); - a.vmaskmovps(xmm0, xmm1, ptr_gp0); - a.vmaskmovps(ymm0, ymm1, ptr_gp0); - a.vmaskmovpd(xmm0, xmm1, ptr_gp0); - a.vmaskmovpd(ymm0, ymm1, ptr_gp0); - a.vmaskmovps(ptr_gp0, xmm0, xmm1); - a.vmaskmovps(ptr_gp0, ymm0, ymm1); - a.vmaskmovpd(ptr_gp0, xmm0, xmm1); - 
a.vmaskmovpd(ptr_gp0, ymm0, ymm1); - a.vmaxpd(xmm0, xmm1, xmm2); - a.vmaxpd(xmm0, xmm1, ptr_gp0); - a.vmaxpd(ymm0, ymm1, ymm2); - a.vmaxpd(ymm0, ymm1, ptr_gp0); - a.vmaxps(xmm0, xmm1, xmm2); - a.vmaxps(xmm0, xmm1, ptr_gp0); - a.vmaxps(ymm0, ymm1, ymm2); - a.vmaxps(ymm0, ymm1, ptr_gp0); - a.vmaxsd(xmm0, xmm1, xmm2); - a.vmaxsd(xmm0, xmm1, ptr_gp0); - a.vmaxss(xmm0, xmm1, xmm2); - a.vmaxss(xmm0, xmm1, ptr_gp0); - a.vminpd(xmm0, xmm1, xmm2); - a.vminpd(xmm0, xmm1, ptr_gp0); - a.vminpd(ymm0, ymm1, ymm2); - a.vminpd(ymm0, ymm1, ptr_gp0); - a.vminps(xmm0, xmm1, xmm2); - a.vminps(xmm0, xmm1, ptr_gp0); - a.vminps(ymm0, ymm1, ymm2); - a.vminps(ymm0, ymm1, ptr_gp0); - a.vminsd(xmm0, xmm1, xmm2); - a.vminsd(xmm0, xmm1, ptr_gp0); - a.vminss(xmm0, xmm1, xmm2); - a.vminss(xmm0, xmm1, ptr_gp0); - a.vmovapd(xmm0, xmm1); - a.vmovapd(xmm0, ptr_gp0); - a.vmovapd(ptr_gp0, xmm1); - a.vmovapd(ymm0, ymm1); - a.vmovapd(ymm0, ptr_gp0); - a.vmovapd(ptr_gp0, ymm1); - a.vmovaps(xmm0, xmm1); - a.vmovaps(xmm0, ptr_gp0); - a.vmovaps(ptr_gp0, xmm1); - a.vmovaps(ymm0, ymm1); - a.vmovaps(ymm0, ptr_gp0); - a.vmovaps(ptr_gp0, ymm1); - a.vmovd(xmm0, gp0); - a.vmovd(xmm0, ptr_gp0); - a.vmovd(gp0, xmm1); - a.vmovd(ptr_gp0, xmm1); - a.vmovddup(xmm0, xmm1); - a.vmovddup(xmm0, ptr_gp0); - a.vmovddup(ymm0, ymm1); - a.vmovddup(ymm0, ptr_gp0); - a.vmovdqa(xmm0, xmm1); - a.vmovdqa(xmm0, ptr_gp0); - a.vmovdqa(ptr_gp0, xmm1); - a.vmovdqa(ymm0, ymm1); - a.vmovdqa(ymm0, ptr_gp0); - a.vmovdqa(ptr_gp0, ymm1); - a.vmovdqu(xmm0, xmm1); - a.vmovdqu(xmm0, ptr_gp0); - a.vmovdqu(ptr_gp0, xmm1); - a.vmovdqu(ymm0, ymm1); - a.vmovdqu(ymm0, ptr_gp0); - a.vmovdqu(ptr_gp0, ymm1); - a.vmovhlps(xmm0, xmm1, xmm2); - a.vmovhpd(xmm0, xmm1, ptr_gp0); - a.vmovhpd(ptr_gp0, xmm1); - a.vmovhps(xmm0, xmm1, ptr_gp0); - a.vmovhps(ptr_gp0, xmm1); - a.vmovlhps(xmm0, xmm1, xmm2); - a.vmovlpd(xmm0, xmm1, ptr_gp0); - a.vmovlpd(ptr_gp0, xmm1); - a.vmovlps(xmm0, xmm1, ptr_gp0); - a.vmovlps(ptr_gp0, xmm1); - a.vmovmskpd(gp0, xmm1); - 
a.vmovmskpd(gp0, ymm1); - a.vmovmskps(gp0, xmm1); - a.vmovmskps(gp0, ymm1); - a.vmovntdq(ptr_gp0, xmm1); - a.vmovntdq(ptr_gp0, ymm1); - a.vmovntdqa(xmm0, ptr_gp0); - a.vmovntpd(ptr_gp0, xmm1); - a.vmovntpd(ptr_gp0, ymm1); - a.vmovntps(ptr_gp0, xmm1); - a.vmovntps(ptr_gp0, ymm1); - a.vmovsd(xmm0, xmm1, xmm2); - a.vmovsd(xmm0, ptr_gp0); - a.vmovsd(ptr_gp0, xmm1); - a.vmovshdup(xmm0, xmm1); - a.vmovshdup(xmm0, ptr_gp0); - a.vmovshdup(ymm0, ymm1); - a.vmovshdup(ymm0, ptr_gp0); - a.vmovsldup(xmm0, xmm1); - a.vmovsldup(xmm0, ptr_gp0); - a.vmovsldup(ymm0, ymm1); - a.vmovsldup(ymm0, ptr_gp0); - a.vmovss(xmm0, xmm1, xmm2); - a.vmovss(xmm0, ptr_gp0); - a.vmovss(ptr_gp0, xmm1); - a.vmovupd(xmm0, xmm1); - a.vmovupd(xmm0, ptr_gp0); - a.vmovupd(ptr_gp0, xmm1); - a.vmovupd(ymm0, ymm1); - a.vmovupd(ymm0, ptr_gp0); - a.vmovupd(ptr_gp0, ymm1); - a.vmovups(xmm0, xmm1); - a.vmovups(xmm0, ptr_gp0); - a.vmovups(ptr_gp0, xmm1); - a.vmovups(ymm0, ymm1); - a.vmovups(ymm0, ptr_gp0); - a.vmovups(ptr_gp0, ymm1); - a.vmpsadbw(xmm0, xmm1, xmm2, 0); - a.vmpsadbw(xmm0, xmm1, ptr_gp0, 0); - a.vmulpd(xmm0, xmm1, xmm2); - a.vmulpd(xmm0, xmm1, ptr_gp0); - a.vmulpd(ymm0, ymm1, ymm2); - a.vmulpd(ymm0, ymm1, ptr_gp0); - a.vmulps(xmm0, xmm1, xmm2); - a.vmulps(xmm0, xmm1, ptr_gp0); - a.vmulps(ymm0, ymm1, ymm2); - a.vmulps(ymm0, ymm1, ptr_gp0); - a.vmulsd(xmm0, xmm1, xmm2); - a.vmulsd(xmm0, xmm1, ptr_gp0); - a.vmulss(xmm0, xmm1, xmm2); - a.vmulss(xmm0, xmm1, ptr_gp0); - a.vorpd(xmm0, xmm1, xmm2); - a.vorpd(xmm0, xmm1, ptr_gp0); - a.vorpd(ymm0, ymm1, ymm2); - a.vorpd(ymm0, ymm1, ptr_gp0); - a.vorps(xmm0, xmm1, xmm2); - a.vorps(xmm0, xmm1, ptr_gp0); - a.vorps(ymm0, ymm1, ymm2); - a.vorps(ymm0, ymm1, ptr_gp0); - a.vpabsb(xmm0, xmm1); - a.vpabsb(xmm0, ptr_gp0); - a.vpabsd(xmm0, xmm1); - a.vpabsd(xmm0, ptr_gp0); - a.vpabsw(xmm0, xmm1); - a.vpabsw(xmm0, ptr_gp0); - a.vpackssdw(xmm0, xmm1, xmm2); - a.vpackssdw(xmm0, xmm1, ptr_gp0); - a.vpacksswb(xmm0, xmm1, xmm2); - a.vpacksswb(xmm0, xmm1, ptr_gp0); - 
a.vpackusdw(xmm0, xmm1, xmm2); - a.vpackusdw(xmm0, xmm1, ptr_gp0); - a.vpackuswb(xmm0, xmm1, xmm2); - a.vpackuswb(xmm0, xmm1, ptr_gp0); - a.vpaddb(xmm0, xmm1, xmm2); - a.vpaddb(xmm0, xmm1, ptr_gp0); - a.vpaddd(xmm0, xmm1, xmm2); - a.vpaddd(xmm0, xmm1, ptr_gp0); - a.vpaddq(xmm0, xmm1, xmm2); - a.vpaddq(xmm0, xmm1, ptr_gp0); - a.vpaddw(xmm0, xmm1, xmm2); - a.vpaddw(xmm0, xmm1, ptr_gp0); - a.vpaddsb(xmm0, xmm1, xmm2); - a.vpaddsb(xmm0, xmm1, ptr_gp0); - a.vpaddsw(xmm0, xmm1, xmm2); - a.vpaddsw(xmm0, xmm1, ptr_gp0); - a.vpaddusb(xmm0, xmm1, xmm2); - a.vpaddusb(xmm0, xmm1, ptr_gp0); - a.vpaddusw(xmm0, xmm1, xmm2); - a.vpaddusw(xmm0, xmm1, ptr_gp0); - a.vpalignr(xmm0, xmm1, xmm2, 0); - a.vpalignr(xmm0, xmm1, ptr_gp0, 0); - a.vpand(xmm0, xmm1, xmm2); - a.vpand(xmm0, xmm1, ptr_gp0); - a.vpandn(xmm0, xmm1, xmm2); - a.vpandn(xmm0, xmm1, ptr_gp0); - a.vpavgb(xmm0, xmm1, xmm2); - a.vpavgb(xmm0, xmm1, ptr_gp0); - a.vpavgw(xmm0, xmm1, xmm2); - a.vpavgw(xmm0, xmm1, ptr_gp0); - a.vpblendvb(xmm0, xmm1, xmm2, xmm3); - a.vpblendvb(xmm0, xmm1, ptr_gp0, xmm3); - a.vpblendw(xmm0, xmm1, xmm2, 0); - a.vpblendw(xmm0, xmm1, ptr_gp0, 0); - a.vpcmpeqb(xmm0, xmm1, xmm2); - a.vpcmpeqb(xmm0, xmm1, ptr_gp0); - a.vpcmpeqd(xmm0, xmm1, xmm2); - a.vpcmpeqd(xmm0, xmm1, ptr_gp0); - a.vpcmpeqq(xmm0, xmm1, xmm2); - a.vpcmpeqq(xmm0, xmm1, ptr_gp0); - a.vpcmpeqw(xmm0, xmm1, xmm2); - a.vpcmpeqw(xmm0, xmm1, ptr_gp0); - a.vpcmpgtb(xmm0, xmm1, xmm2); - a.vpcmpgtb(xmm0, xmm1, ptr_gp0); - a.vpcmpgtd(xmm0, xmm1, xmm2); - a.vpcmpgtd(xmm0, xmm1, ptr_gp0); - a.vpcmpgtq(xmm0, xmm1, xmm2); - a.vpcmpgtq(xmm0, xmm1, ptr_gp0); - a.vpcmpgtw(xmm0, xmm1, xmm2); - a.vpcmpgtw(xmm0, xmm1, ptr_gp0); - a.vpcmpestri(xmm0, xmm1, 0); - a.vpcmpestri(xmm0, ptr_gp0, 0); - a.vpcmpestrm(xmm0, xmm1, 0); - a.vpcmpestrm(xmm0, ptr_gp0, 0); - a.vpcmpistri(xmm0, xmm1, 0); - a.vpcmpistri(xmm0, ptr_gp0, 0); - a.vpcmpistrm(xmm0, xmm1, 0); - a.vpcmpistrm(xmm0, ptr_gp0, 0); - a.vpermilpd(xmm0, xmm1, xmm2); - a.vpermilpd(xmm0, xmm1, ptr_gp0); - 
a.vpermilpd(ymm0, ymm1, ymm2); - a.vpermilpd(ymm0, ymm1, ptr_gp0); - a.vpermilpd(xmm0, xmm1, 0); - a.vpermilpd(xmm0, ptr_gp0, 0); - a.vpermilpd(ymm0, ymm1, 0); - a.vpermilpd(ymm0, ptr_gp0, 0); - a.vpermilps(xmm0, xmm1, xmm2); - a.vpermilps(xmm0, xmm1, ptr_gp0); - a.vpermilps(ymm0, ymm1, ymm2); - a.vpermilps(ymm0, ymm1, ptr_gp0); - a.vpermilps(xmm0, xmm1, 0); - a.vpermilps(xmm0, ptr_gp0, 0); - a.vpermilps(ymm0, ymm1, 0); - a.vpermilps(ymm0, ptr_gp0, 0); - a.vperm2f128(ymm0, ymm1, ymm2, 0); - a.vperm2f128(ymm0, ymm1, ptr_gp0, 0); - a.vpextrb(gp0, xmm1, 0); - a.vpextrb(ptr_gp0, xmm1, 0); - a.vpextrd(gp0, xmm1, 0); - a.vpextrd(ptr_gp0, xmm1, 0); - a.vpextrw(gp0, xmm1, 0); - a.vpextrw(ptr_gp0, xmm1, 0); - a.vphaddd(xmm0, xmm1, xmm2); - a.vphaddd(xmm0, xmm1, ptr_gp0); - a.vphaddsw(xmm0, xmm1, xmm2); - a.vphaddsw(xmm0, xmm1, ptr_gp0); - a.vphaddw(xmm0, xmm1, xmm2); - a.vphaddw(xmm0, xmm1, ptr_gp0); - a.vphminposuw(xmm0, xmm1); - a.vphminposuw(xmm0, ptr_gp0); - a.vphsubd(xmm0, xmm1, xmm2); - a.vphsubd(xmm0, xmm1, ptr_gp0); - a.vphsubsw(xmm0, xmm1, xmm2); - a.vphsubsw(xmm0, xmm1, ptr_gp0); - a.vphsubw(xmm0, xmm1, xmm2); - a.vphsubw(xmm0, xmm1, ptr_gp0); - a.vpinsrb(xmm0, xmm1, gp0, 0); - a.vpinsrb(xmm0, xmm1, ptr_gp0, 0); - a.vpinsrd(xmm0, xmm1, gp0, 0); - a.vpinsrd(xmm0, xmm1, ptr_gp0, 0); - a.vpinsrw(xmm0, xmm1, gp0, 0); - a.vpinsrw(xmm0, xmm1, ptr_gp0, 0); - a.vpmaddubsw(xmm0, xmm1, xmm2); - a.vpmaddubsw(xmm0, xmm1, ptr_gp0); - a.vpmaddwd(xmm0, xmm1, xmm2); - a.vpmaddwd(xmm0, xmm1, ptr_gp0); - a.vpmaxsb(xmm0, xmm1, xmm2); - a.vpmaxsb(xmm0, xmm1, ptr_gp0); - a.vpmaxsd(xmm0, xmm1, xmm2); - a.vpmaxsd(xmm0, xmm1, ptr_gp0); - a.vpmaxsw(xmm0, xmm1, xmm2); - a.vpmaxsw(xmm0, xmm1, ptr_gp0); - a.vpmaxub(xmm0, xmm1, xmm2); - a.vpmaxub(xmm0, xmm1, ptr_gp0); - a.vpmaxud(xmm0, xmm1, xmm2); - a.vpmaxud(xmm0, xmm1, ptr_gp0); - a.vpmaxuw(xmm0, xmm1, xmm2); - a.vpmaxuw(xmm0, xmm1, ptr_gp0); - a.vpminsb(xmm0, xmm1, xmm2); - a.vpminsb(xmm0, xmm1, ptr_gp0); - a.vpminsd(xmm0, xmm1, xmm2); - 
a.vpminsd(xmm0, xmm1, ptr_gp0); - a.vpminsw(xmm0, xmm1, xmm2); - a.vpminsw(xmm0, xmm1, ptr_gp0); - a.vpminub(xmm0, xmm1, xmm2); - a.vpminub(xmm0, xmm1, ptr_gp0); - a.vpminud(xmm0, xmm1, xmm2); - a.vpminud(xmm0, xmm1, ptr_gp0); - a.vpminuw(xmm0, xmm1, xmm2); - a.vpminuw(xmm0, xmm1, ptr_gp0); - a.vpmovmskb(gp0, xmm1); - a.vpmovsxbd(xmm0, xmm1); - a.vpmovsxbd(xmm0, ptr_gp0); - a.vpmovsxbq(xmm0, xmm1); - a.vpmovsxbq(xmm0, ptr_gp0); - a.vpmovsxbw(xmm0, xmm1); - a.vpmovsxbw(xmm0, ptr_gp0); - a.vpmovsxdq(xmm0, xmm1); - a.vpmovsxdq(xmm0, ptr_gp0); - a.vpmovsxwd(xmm0, xmm1); - a.vpmovsxwd(xmm0, ptr_gp0); - a.vpmovsxwq(xmm0, xmm1); - a.vpmovsxwq(xmm0, ptr_gp0); - a.vpmovzxbd(xmm0, xmm1); - a.vpmovzxbd(xmm0, ptr_gp0); - a.vpmovzxbq(xmm0, xmm1); - a.vpmovzxbq(xmm0, ptr_gp0); - a.vpmovzxbw(xmm0, xmm1); - a.vpmovzxbw(xmm0, ptr_gp0); - a.vpmovzxdq(xmm0, xmm1); - a.vpmovzxdq(xmm0, ptr_gp0); - a.vpmovzxwd(xmm0, xmm1); - a.vpmovzxwd(xmm0, ptr_gp0); - a.vpmovzxwq(xmm0, xmm1); - a.vpmovzxwq(xmm0, ptr_gp0); - a.vpmuldq(xmm0, xmm1, xmm2); - a.vpmuldq(xmm0, xmm1, ptr_gp0); - a.vpmulhrsw(xmm0, xmm1, xmm2); - a.vpmulhrsw(xmm0, xmm1, ptr_gp0); - a.vpmulhuw(xmm0, xmm1, xmm2); - a.vpmulhuw(xmm0, xmm1, ptr_gp0); - a.vpmulhw(xmm0, xmm1, xmm2); - a.vpmulhw(xmm0, xmm1, ptr_gp0); - a.vpmulld(xmm0, xmm1, xmm2); - a.vpmulld(xmm0, xmm1, ptr_gp0); - a.vpmullw(xmm0, xmm1, xmm2); - a.vpmullw(xmm0, xmm1, ptr_gp0); - a.vpmuludq(xmm0, xmm1, xmm2); - a.vpmuludq(xmm0, xmm1, ptr_gp0); - a.vpor(xmm0, xmm1, xmm2); - a.vpor(xmm0, xmm1, ptr_gp0); - a.vpsadbw(xmm0, xmm1, xmm2); - a.vpsadbw(xmm0, xmm1, ptr_gp0); - a.vpshufb(xmm0, xmm1, xmm2); - a.vpshufb(xmm0, xmm1, ptr_gp0); - a.vpshufd(xmm0, xmm1, 0); - a.vpshufd(xmm0, ptr_gp0, 0); - a.vpshufhw(xmm0, xmm1, 0); - a.vpshufhw(xmm0, ptr_gp0, 0); - a.vpshuflw(xmm0, xmm1, 0); - a.vpshuflw(xmm0, ptr_gp0, 0); - a.vpsignb(xmm0, xmm1, xmm2); - a.vpsignb(xmm0, xmm1, ptr_gp0); - a.vpsignd(xmm0, xmm1, xmm2); - a.vpsignd(xmm0, xmm1, ptr_gp0); - a.vpsignw(xmm0, xmm1, xmm2); - 
a.vpsignw(xmm0, xmm1, ptr_gp0); - a.vpslld(xmm0, xmm1, xmm2); - a.vpslld(xmm0, xmm1, ptr_gp0); - a.vpslld(xmm0, xmm1, 0); - a.vpslldq(xmm0, xmm1, 0); - a.vpsllq(xmm0, xmm1, xmm2); - a.vpsllq(xmm0, xmm1, ptr_gp0); - a.vpsllq(xmm0, xmm1, 0); - a.vpsllw(xmm0, xmm1, xmm2); - a.vpsllw(xmm0, xmm1, ptr_gp0); - a.vpsllw(xmm0, xmm1, 0); - a.vpsrad(xmm0, xmm1, xmm2); - a.vpsrad(xmm0, xmm1, ptr_gp0); - a.vpsrad(xmm0, xmm1, 0); - a.vpsraw(xmm0, xmm1, xmm2); - a.vpsraw(xmm0, xmm1, ptr_gp0); - a.vpsraw(xmm0, xmm1, 0); - a.vpsrld(xmm0, xmm1, xmm2); - a.vpsrld(xmm0, xmm1, ptr_gp0); - a.vpsrld(xmm0, xmm1, 0); - a.vpsrldq(xmm0, xmm1, 0); - a.vpsrlq(xmm0, xmm1, xmm2); - a.vpsrlq(xmm0, xmm1, ptr_gp0); - a.vpsrlq(xmm0, xmm1, 0); - a.vpsrlw(xmm0, xmm1, xmm2); - a.vpsrlw(xmm0, xmm1, ptr_gp0); - a.vpsrlw(xmm0, xmm1, 0); - a.vpsubb(xmm0, xmm1, xmm2); - a.vpsubb(xmm0, xmm1, ptr_gp0); - a.vpsubd(xmm0, xmm1, xmm2); - a.vpsubd(xmm0, xmm1, ptr_gp0); - a.vpsubq(xmm0, xmm1, xmm2); - a.vpsubq(xmm0, xmm1, ptr_gp0); - a.vpsubw(xmm0, xmm1, xmm2); - a.vpsubw(xmm0, xmm1, ptr_gp0); - a.vpsubsb(xmm0, xmm1, xmm2); - a.vpsubsb(xmm0, xmm1, ptr_gp0); - a.vpsubsw(xmm0, xmm1, xmm2); - a.vpsubsw(xmm0, xmm1, ptr_gp0); - a.vpsubusb(xmm0, xmm1, xmm2); - a.vpsubusb(xmm0, xmm1, ptr_gp0); - a.vpsubusw(xmm0, xmm1, xmm2); - a.vpsubusw(xmm0, xmm1, ptr_gp0); - a.vptest(xmm0, xmm1); - a.vptest(xmm0, ptr_gp0); - a.vptest(ymm0, ymm1); - a.vptest(ymm0, ptr_gp0); - a.vpunpckhbw(xmm0, xmm1, xmm2); - a.vpunpckhbw(xmm0, xmm1, ptr_gp0); - a.vpunpckhdq(xmm0, xmm1, xmm2); - a.vpunpckhdq(xmm0, xmm1, ptr_gp0); - a.vpunpckhqdq(xmm0, xmm1, xmm2); - a.vpunpckhqdq(xmm0, xmm1, ptr_gp0); - a.vpunpckhwd(xmm0, xmm1, xmm2); - a.vpunpckhwd(xmm0, xmm1, ptr_gp0); - a.vpunpcklbw(xmm0, xmm1, xmm2); - a.vpunpcklbw(xmm0, xmm1, ptr_gp0); - a.vpunpckldq(xmm0, xmm1, xmm2); - a.vpunpckldq(xmm0, xmm1, ptr_gp0); - a.vpunpcklqdq(xmm0, xmm1, xmm2); - a.vpunpcklqdq(xmm0, xmm1, ptr_gp0); - a.vpunpcklwd(xmm0, xmm1, xmm2); - a.vpunpcklwd(xmm0, xmm1, ptr_gp0); - 
a.vpxor(xmm0, xmm1, xmm2); - a.vpxor(xmm0, xmm1, ptr_gp0); - a.vrcpps(xmm0, xmm1); - a.vrcpps(xmm0, ptr_gp0); - a.vrcpps(ymm0, ymm1); - a.vrcpps(ymm0, ptr_gp0); - a.vrcpss(xmm0, xmm1, xmm2); - a.vrcpss(xmm0, xmm1, ptr_gp0); - a.vrsqrtps(xmm0, xmm1); - a.vrsqrtps(xmm0, ptr_gp0); - a.vrsqrtps(ymm0, ymm1); - a.vrsqrtps(ymm0, ptr_gp0); - a.vrsqrtss(xmm0, xmm1, xmm2); - a.vrsqrtss(xmm0, xmm1, ptr_gp0); - a.vroundpd(xmm0, xmm1, 0); - a.vroundpd(xmm0, ptr_gp0, 0); - a.vroundpd(ymm0, ymm1, 0); - a.vroundpd(ymm0, ptr_gp0, 0); - a.vroundps(xmm0, xmm1, 0); - a.vroundps(xmm0, ptr_gp0, 0); - a.vroundps(ymm0, ymm1, 0); - a.vroundps(ymm0, ptr_gp0, 0); - a.vroundsd(xmm0, xmm1, xmm2, 0); - a.vroundsd(xmm0, xmm1, ptr_gp0, 0); - a.vroundss(xmm0, xmm1, xmm2, 0); - a.vroundss(xmm0, xmm1, ptr_gp0, 0); - a.vshufpd(xmm0, xmm1, xmm2, 0); - a.vshufpd(xmm0, xmm1, ptr_gp0, 0); - a.vshufpd(ymm0, ymm1, ymm2, 0); - a.vshufpd(ymm0, ymm1, ptr_gp0, 0); - a.vshufps(xmm0, xmm1, xmm2, 0); - a.vshufps(xmm0, xmm1, ptr_gp0, 0); - a.vshufps(ymm0, ymm1, ymm2, 0); - a.vshufps(ymm0, ymm1, ptr_gp0, 0); - a.vsqrtpd(xmm0, xmm1); - a.vsqrtpd(xmm0, ptr_gp0); - a.vsqrtpd(ymm0, ymm1); - a.vsqrtpd(ymm0, ptr_gp0); - a.vsqrtps(xmm0, xmm1); - a.vsqrtps(xmm0, ptr_gp0); - a.vsqrtps(ymm0, ymm1); - a.vsqrtps(ymm0, ptr_gp0); - a.vsqrtsd(xmm0, xmm1, xmm2); - a.vsqrtsd(xmm0, xmm1, ptr_gp0); - a.vsqrtss(xmm0, xmm1, xmm2); - a.vsqrtss(xmm0, xmm1, ptr_gp0); - a.vstmxcsr(ptr_gp0); - a.vsubpd(xmm0, xmm1, xmm2); - a.vsubpd(xmm0, xmm1, ptr_gp0); - a.vsubpd(ymm0, ymm1, ymm2); - a.vsubpd(ymm0, ymm1, ptr_gp0); - a.vsubps(xmm0, xmm1, xmm2); - a.vsubps(xmm0, xmm1, ptr_gp0); - a.vsubps(ymm0, ymm1, ymm2); - a.vsubps(ymm0, ymm1, ptr_gp0); - a.vsubsd(xmm0, xmm1, xmm2); - a.vsubsd(xmm0, xmm1, ptr_gp0); - a.vsubss(xmm0, xmm1, xmm2); - a.vsubss(xmm0, xmm1, ptr_gp0); - a.vtestps(xmm0, xmm1); - a.vtestps(xmm0, ptr_gp0); - a.vtestps(ymm0, ymm1); - a.vtestps(ymm0, ptr_gp0); - a.vtestpd(xmm0, xmm1); - a.vtestpd(xmm0, ptr_gp0); - a.vtestpd(ymm0, 
ymm1); - a.vtestpd(ymm0, ptr_gp0); - a.vucomisd(xmm0, xmm1); - a.vucomisd(xmm0, ptr_gp0); - a.vucomiss(xmm0, xmm1); - a.vucomiss(xmm0, ptr_gp0); - a.vunpckhpd(xmm0, xmm1, xmm2); - a.vunpckhpd(xmm0, xmm1, ptr_gp0); - a.vunpckhpd(ymm0, ymm1, ymm2); - a.vunpckhpd(ymm0, ymm1, ptr_gp0); - a.vunpckhps(xmm0, xmm1, xmm2); - a.vunpckhps(xmm0, xmm1, ptr_gp0); - a.vunpckhps(ymm0, ymm1, ymm2); - a.vunpckhps(ymm0, ymm1, ptr_gp0); - a.vunpcklpd(xmm0, xmm1, xmm2); - a.vunpcklpd(xmm0, xmm1, ptr_gp0); - a.vunpcklpd(ymm0, ymm1, ymm2); - a.vunpcklpd(ymm0, ymm1, ptr_gp0); - a.vunpcklps(xmm0, xmm1, xmm2); - a.vunpcklps(xmm0, xmm1, ptr_gp0); - a.vunpcklps(ymm0, ymm1, ymm2); - a.vunpcklps(ymm0, ymm1, ptr_gp0); - a.vxorpd(xmm0, xmm1, xmm2); - a.vxorpd(xmm0, xmm1, ptr_gp0); - a.vxorpd(ymm0, ymm1, ymm2); - a.vxorpd(ymm0, ymm1, ptr_gp0); - a.vxorps(xmm0, xmm1, xmm2); - a.vxorps(xmm0, xmm1, ptr_gp0); - a.vxorps(ymm0, ymm1, ymm2); - a.vxorps(ymm0, ymm1, ptr_gp0); - a.vzeroall(); - a.vzeroupper(); - - // AVX+AESNI. - a.nop(); - - a.vaesdec(xmm0, xmm1, xmm2); - a.vaesdec(xmm0, xmm1, ptr_gp0); - a.vaesdeclast(xmm0, xmm1, xmm2); - a.vaesdeclast(xmm0, xmm1, ptr_gp0); - a.vaesenc(xmm0, xmm1, xmm2); - a.vaesenc(xmm0, xmm1, ptr_gp0); - a.vaesenclast(xmm0, xmm1, xmm2); - a.vaesenclast(xmm0, xmm1, ptr_gp0); - a.vaesimc(xmm0, xmm1); - a.vaesimc(xmm0, ptr_gp0); - a.vaeskeygenassist(xmm0, xmm1, 0); - a.vaeskeygenassist(xmm0, ptr_gp0, 0); - - // AVX+PCLMULQDQ. - a.nop(); - - a.vpclmulqdq(xmm0, xmm1, xmm2, 0); - a.vpclmulqdq(xmm0, xmm1, ptr_gp0, 0); - - // AVX2. 
- a.nop(); - - a.vbroadcasti128(ymm0, ptr_gp0); - a.vbroadcastsd(ymm0, xmm1); - a.vbroadcastss(xmm0, xmm1); - a.vbroadcastss(ymm0, xmm1); - a.vextracti128(xmm0, ymm1, 0); - a.vextracti128(ptr_gp0, ymm1, 0); - a.vgatherdpd(xmm0, vm32x, xmm2); - a.vgatherdpd(ymm0, vm32y, ymm2); - a.vgatherdps(xmm0, vm32x, xmm2); - a.vgatherdps(ymm0, vm32y, ymm2); - a.vgatherqpd(xmm0, vm32x, xmm2); - a.vgatherqpd(ymm0, vm32y, ymm2); - a.vgatherqps(xmm0, vm32x, xmm2); - a.vgatherqps(xmm0, vm32y, xmm2); - a.vinserti128(ymm0, ymm1, xmm2, 0); - a.vinserti128(ymm0, ymm1, ptr_gp0, 0); - a.vmovntdqa(ymm0, ptr_gp0); - a.vmpsadbw(ymm0, ymm1, ymm2, 0); - a.vmpsadbw(ymm0, ymm1, ptr_gp0, 0); - a.vpabsb(ymm0, ymm1); - a.vpabsb(ymm0, ptr_gp0); - a.vpabsd(ymm0, ymm1); - a.vpabsd(ymm0, ptr_gp0); - a.vpabsw(ymm0, ymm1); - a.vpabsw(ymm0, ptr_gp0); - a.vpackssdw(ymm0, ymm1, ymm2); - a.vpackssdw(ymm0, ymm1, ptr_gp0); - a.vpacksswb(ymm0, ymm1, ymm2); - a.vpacksswb(ymm0, ymm1, ptr_gp0); - a.vpackusdw(ymm0, ymm1, ymm2); - a.vpackusdw(ymm0, ymm1, ptr_gp0); - a.vpackuswb(ymm0, ymm1, ymm2); - a.vpackuswb(ymm0, ymm1, ptr_gp0); - a.vpaddb(ymm0, ymm1, ymm2); - a.vpaddb(ymm0, ymm1, ptr_gp0); - a.vpaddd(ymm0, ymm1, ymm2); - a.vpaddd(ymm0, ymm1, ptr_gp0); - a.vpaddq(ymm0, ymm1, ymm2); - a.vpaddq(ymm0, ymm1, ptr_gp0); - a.vpaddw(ymm0, ymm1, ymm2); - a.vpaddw(ymm0, ymm1, ptr_gp0); - a.vpaddsb(ymm0, ymm1, ymm2); - a.vpaddsb(ymm0, ymm1, ptr_gp0); - a.vpaddsw(ymm0, ymm1, ymm2); - a.vpaddsw(ymm0, ymm1, ptr_gp0); - a.vpaddusb(ymm0, ymm1, ymm2); - a.vpaddusb(ymm0, ymm1, ptr_gp0); - a.vpaddusw(ymm0, ymm1, ymm2); - a.vpaddusw(ymm0, ymm1, ptr_gp0); - a.vpalignr(ymm0, ymm1, ymm2, 0); - a.vpalignr(ymm0, ymm1, ptr_gp0, 0); - a.vpand(ymm0, ymm1, ymm2); - a.vpand(ymm0, ymm1, ptr_gp0); - a.vpandn(ymm0, ymm1, ymm2); - a.vpandn(ymm0, ymm1, ptr_gp0); - a.vpavgb(ymm0, ymm1, ymm2); - a.vpavgb(ymm0, ymm1, ptr_gp0); - a.vpavgw(ymm0, ymm1, ymm2); - a.vpavgw(ymm0, ymm1, ptr_gp0); - a.vpblendd(xmm0, xmm1, xmm2, 0); - a.vpblendd(xmm0, xmm1, 
ptr_gp0, 0); - a.vpblendd(ymm0, ymm1, ymm2, 0); - a.vpblendd(ymm0, ymm1, ptr_gp0, 0); - a.vpblendvb(ymm0, ymm1, ymm2, ymm3); - a.vpblendvb(ymm0, ymm1, ptr_gp0, ymm3); - a.vpblendw(ymm0, ymm1, ymm2, 0); - a.vpblendw(ymm0, ymm1, ptr_gp0, 0); - a.vpbroadcastb(xmm0, xmm1); - a.vpbroadcastb(xmm0, ptr_gp0); - a.vpbroadcastb(ymm0, xmm1); - a.vpbroadcastb(ymm0, ptr_gp0); - a.vpbroadcastd(xmm0, xmm1); - a.vpbroadcastd(xmm0, ptr_gp0); - a.vpbroadcastd(ymm0, xmm1); - a.vpbroadcastd(ymm0, ptr_gp0); - a.vpbroadcastq(xmm0, xmm1); - a.vpbroadcastq(xmm0, ptr_gp0); - a.vpbroadcastq(ymm0, xmm1); - a.vpbroadcastq(ymm0, ptr_gp0); - a.vpbroadcastw(xmm0, xmm1); - a.vpbroadcastw(xmm0, ptr_gp0); - a.vpbroadcastw(ymm0, xmm1); - a.vpbroadcastw(ymm0, ptr_gp0); - a.vpcmpeqb(ymm0, ymm1, ymm2); - a.vpcmpeqb(ymm0, ymm1, ptr_gp0); - a.vpcmpeqd(ymm0, ymm1, ymm2); - a.vpcmpeqd(ymm0, ymm1, ptr_gp0); - a.vpcmpeqq(ymm0, ymm1, ymm2); - a.vpcmpeqq(ymm0, ymm1, ptr_gp0); - a.vpcmpeqw(ymm0, ymm1, ymm2); - a.vpcmpeqw(ymm0, ymm1, ptr_gp0); - a.vpcmpgtb(ymm0, ymm1, ymm2); - a.vpcmpgtb(ymm0, ymm1, ptr_gp0); - a.vpcmpgtd(ymm0, ymm1, ymm2); - a.vpcmpgtd(ymm0, ymm1, ptr_gp0); - a.vpcmpgtq(ymm0, ymm1, ymm2); - a.vpcmpgtq(ymm0, ymm1, ptr_gp0); - a.vpcmpgtw(ymm0, ymm1, ymm2); - a.vpcmpgtw(ymm0, ymm1, ptr_gp0); - a.vperm2i128(ymm0, ymm1, ymm2, 0); - a.vperm2i128(ymm0, ymm1, ptr_gp0, 0); - a.vpermd(ymm0, ymm1, ymm2); - a.vpermd(ymm0, ymm1, ptr_gp0); - a.vpermps(ymm0, ymm1, ymm2); - a.vpermps(ymm0, ymm1, ptr_gp0); - a.vpermpd(ymm0, ymm1, 0); - a.vpermpd(ymm0, ptr_gp0, 0); - a.vpermq(ymm0, ymm1, 0); - a.vpermq(ymm0, ptr_gp0, 0); - a.vpgatherdd(xmm0, vm32x, xmm2); - a.vpgatherdd(ymm0, vm32y, ymm2); - a.vpgatherdq(xmm0, vm32x, xmm2); - a.vpgatherdq(ymm0, vm32y, ymm2); - a.vpgatherqd(xmm0, vm32x, xmm2); - a.vpgatherqd(xmm0, vm32y, xmm2); - a.vpgatherqq(xmm0, vm32x, xmm2); - a.vpgatherqq(ymm0, vm32y, ymm2); - a.vpmovmskb(gp0, ymm1); - a.vpmovsxbd(ymm0, ptr_gp0); - a.vpmovsxbd(ymm0, xmm1); - a.vpmovsxbq(ymm0, ptr_gp0); - 
a.vpmovsxbq(ymm0, xmm1); - a.vpmovsxbw(ymm0, ptr_gp0); - a.vpmovsxbw(ymm0, xmm1); - a.vpmovsxdq(ymm0, ptr_gp0); - a.vpmovsxdq(ymm0, xmm1); - a.vpmovsxwd(ymm0, ptr_gp0); - a.vpmovsxwd(ymm0, xmm1); - a.vpmovsxwq(ymm0, ptr_gp0); - a.vpmovsxwq(ymm0, xmm1); - a.vpmovzxbd(ymm0, ptr_gp0); - a.vpmovzxbd(ymm0, xmm1); - a.vpmovzxbq(ymm0, ptr_gp0); - a.vpmovzxbq(ymm0, xmm1); - a.vpmovzxbw(ymm0, ptr_gp0); - a.vpmovzxbw(ymm0, xmm1); - a.vpmovzxdq(ymm0, ptr_gp0); - a.vpmovzxdq(ymm0, xmm1); - a.vpmovzxwd(ymm0, ptr_gp0); - a.vpmovzxwd(ymm0, xmm1); - a.vpmovzxwq(ymm0, ptr_gp0); - a.vpmovzxwq(ymm0, xmm1); - a.vpshufd(ymm0, ptr_gp0, 0); - a.vpshufd(ymm0, ymm1, 0); - a.vpshufhw(ymm0, ptr_gp0, 0); - a.vpshufhw(ymm0, ymm1, 0); - a.vpshuflw(ymm0, ptr_gp0, 0); - a.vpshuflw(ymm0, ymm1, 0); - a.vpslld(ymm0, ymm1, 0); - a.vpslldq(ymm0, ymm1, 0); - a.vpsllq(ymm0, ymm1, 0); - a.vpsllw(ymm0, ymm1, 0); - a.vpsrad(ymm0, ymm1, 0); - a.vpsraw(ymm0, ymm1, 0); - a.vpsrld(ymm0, ymm1, 0); - a.vpsrldq(ymm0, ymm1, 0); - a.vpsrlq(ymm0, ymm1, 0); - a.vpsrlw(ymm0, ymm1, 0); - a.vphaddd(ymm0, ymm1, ptr_gp0); - a.vphaddd(ymm0, ymm1, ymm2); - a.vphaddsw(ymm0, ymm1, ptr_gp0); - a.vphaddsw(ymm0, ymm1, ymm2); - a.vphaddw(ymm0, ymm1, ptr_gp0); - a.vphaddw(ymm0, ymm1, ymm2); - a.vphsubd(ymm0, ymm1, ptr_gp0); - a.vphsubd(ymm0, ymm1, ymm2); - a.vphsubsw(ymm0, ymm1, ptr_gp0); - a.vphsubsw(ymm0, ymm1, ymm2); - a.vphsubw(ymm0, ymm1, ptr_gp0); - a.vphsubw(ymm0, ymm1, ymm2); - a.vpmaddubsw(ymm0, ymm1, ptr_gp0); - a.vpmaddubsw(ymm0, ymm1, ymm2); - a.vpmaddwd(ymm0, ymm1, ptr_gp0); - a.vpmaddwd(ymm0, ymm1, ymm2); - a.vpmaskmovd(ptr_gp0, xmm1, xmm2); - a.vpmaskmovd(ptr_gp0, ymm1, ymm2); - a.vpmaskmovd(xmm0, xmm1, ptr_gp0); - a.vpmaskmovd(ymm0, ymm1, ptr_gp0); - a.vpmaskmovq(ptr_gp0, xmm1, xmm2); - a.vpmaskmovq(ptr_gp0, ymm1, ymm2); - a.vpmaskmovq(xmm0, xmm1, ptr_gp0); - a.vpmaskmovq(ymm0, ymm1, ptr_gp0); - a.vpmaxsb(ymm0, ymm1, ptr_gp0); - a.vpmaxsb(ymm0, ymm1, ymm2); - a.vpmaxsd(ymm0, ymm1, ptr_gp0); - a.vpmaxsd(ymm0, ymm1, 
ymm2); - a.vpmaxsw(ymm0, ymm1, ptr_gp0); - a.vpmaxsw(ymm0, ymm1, ymm2); - a.vpmaxub(ymm0, ymm1, ptr_gp0); - a.vpmaxub(ymm0, ymm1, ymm2); - a.vpmaxud(ymm0, ymm1, ptr_gp0); - a.vpmaxud(ymm0, ymm1, ymm2); - a.vpmaxuw(ymm0, ymm1, ptr_gp0); - a.vpmaxuw(ymm0, ymm1, ymm2); - a.vpminsb(ymm0, ymm1, ptr_gp0); - a.vpminsb(ymm0, ymm1, ymm2); - a.vpminsd(ymm0, ymm1, ptr_gp0); - a.vpminsd(ymm0, ymm1, ymm2); - a.vpminsw(ymm0, ymm1, ptr_gp0); - a.vpminsw(ymm0, ymm1, ymm2); - a.vpminub(ymm0, ymm1, ptr_gp0); - a.vpminub(ymm0, ymm1, ymm2); - a.vpminud(ymm0, ymm1, ptr_gp0); - a.vpminud(ymm0, ymm1, ymm2); - a.vpminuw(ymm0, ymm1, ptr_gp0); - a.vpminuw(ymm0, ymm1, ymm2); - a.vpmuldq(ymm0, ymm1, ptr_gp0); - a.vpmuldq(ymm0, ymm1, ymm2); - a.vpmulhrsw(ymm0, ymm1, ptr_gp0); - a.vpmulhrsw(ymm0, ymm1, ymm2); - a.vpmulhuw(ymm0, ymm1, ptr_gp0); - a.vpmulhuw(ymm0, ymm1, ymm2); - a.vpmulhw(ymm0, ymm1, ptr_gp0); - a.vpmulhw(ymm0, ymm1, ymm2); - a.vpmulld(ymm0, ymm1, ptr_gp0); - a.vpmulld(ymm0, ymm1, ymm2); - a.vpmullw(ymm0, ymm1, ptr_gp0); - a.vpmullw(ymm0, ymm1, ymm2); - a.vpmuludq(ymm0, ymm1, ptr_gp0); - a.vpmuludq(ymm0, ymm1, ymm2); - a.vpor(ymm0, ymm1, ptr_gp0); - a.vpor(ymm0, ymm1, ymm2); - a.vpsadbw(ymm0, ymm1, ptr_gp0); - a.vpsadbw(ymm0, ymm1, ymm2); - a.vpshufb(ymm0, ymm1, ptr_gp0); - a.vpshufb(ymm0, ymm1, ymm2); - a.vpsignb(ymm0, ymm1, ptr_gp0); - a.vpsignb(ymm0, ymm1, ymm2); - a.vpsignd(ymm0, ymm1, ptr_gp0); - a.vpsignd(ymm0, ymm1, ymm2); - a.vpsignw(ymm0, ymm1, ptr_gp0); - a.vpsignw(ymm0, ymm1, ymm2); - a.vpslld(ymm0, ymm1, ptr_gp0); - a.vpslld(ymm0, ymm1, xmm2); - a.vpsllq(ymm0, ymm1, ptr_gp0); - a.vpsllq(ymm0, ymm1, xmm2); - a.vpsllvd(xmm0, xmm1, ptr_gp0); - a.vpsllvd(xmm0, xmm1, xmm2); - a.vpsllvd(ymm0, ymm1, ptr_gp0); - a.vpsllvd(ymm0, ymm1, ymm2); - a.vpsllvq(xmm0, xmm1, ptr_gp0); - a.vpsllvq(xmm0, xmm1, xmm2); - a.vpsllvq(ymm0, ymm1, ptr_gp0); - a.vpsllvq(ymm0, ymm1, ymm2); - a.vpsllw(ymm0, ymm1, ptr_gp0); - a.vpsllw(ymm0, ymm1, xmm2); - a.vpsrad(ymm0, ymm1, ptr_gp0); - 
a.vpsrad(ymm0, ymm1, xmm2); - a.vpsravd(xmm0, xmm1, ptr_gp0); - a.vpsravd(xmm0, xmm1, xmm2); - a.vpsravd(ymm0, ymm1, ptr_gp0); - a.vpsravd(ymm0, ymm1, ymm2); - a.vpsraw(ymm0, ymm1, ptr_gp0); - a.vpsraw(ymm0, ymm1, xmm2); - a.vpsrld(ymm0, ymm1, ptr_gp0); - a.vpsrld(ymm0, ymm1, xmm2); - a.vpsrlq(ymm0, ymm1, ptr_gp0); - a.vpsrlq(ymm0, ymm1, xmm2); - a.vpsrlvd(xmm0, xmm1, ptr_gp0); - a.vpsrlvd(xmm0, xmm1, xmm2); - a.vpsrlvd(ymm0, ymm1, ptr_gp0); - a.vpsrlvd(ymm0, ymm1, ymm2); - a.vpsrlvq(xmm0, xmm1, ptr_gp0); - a.vpsrlvq(xmm0, xmm1, xmm2); - a.vpsrlvq(ymm0, ymm1, ptr_gp0); - a.vpsrlvq(ymm0, ymm1, ymm2); - a.vpsrlw(ymm0, ymm1, ptr_gp0); - a.vpsrlw(ymm0, ymm1, xmm2); - a.vpsubb(ymm0, ymm1, ptr_gp0); - a.vpsubb(ymm0, ymm1, ymm2); - a.vpsubd(ymm0, ymm1, ptr_gp0); - a.vpsubd(ymm0, ymm1, ymm2); - a.vpsubq(ymm0, ymm1, ptr_gp0); - a.vpsubq(ymm0, ymm1, ymm2); - a.vpsubsb(ymm0, ymm1, ptr_gp0); - a.vpsubsb(ymm0, ymm1, ymm2); - a.vpsubsw(ymm0, ymm1, ptr_gp0); - a.vpsubsw(ymm0, ymm1, ymm2); - a.vpsubusb(ymm0, ymm1, ptr_gp0); - a.vpsubusb(ymm0, ymm1, ymm2); - a.vpsubusw(ymm0, ymm1, ptr_gp0); - a.vpsubusw(ymm0, ymm1, ymm2); - a.vpsubw(ymm0, ymm1, ptr_gp0); - a.vpsubw(ymm0, ymm1, ymm2); - a.vpunpckhbw(ymm0, ymm1, ptr_gp0); - a.vpunpckhbw(ymm0, ymm1, ymm2); - a.vpunpckhdq(ymm0, ymm1, ptr_gp0); - a.vpunpckhdq(ymm0, ymm1, ymm2); - a.vpunpckhqdq(ymm0, ymm1, ptr_gp0); - a.vpunpckhqdq(ymm0, ymm1, ymm2); - a.vpunpckhwd(ymm0, ymm1, ptr_gp0); - a.vpunpckhwd(ymm0, ymm1, ymm2); - a.vpunpcklbw(ymm0, ymm1, ptr_gp0); - a.vpunpcklbw(ymm0, ymm1, ymm2); - a.vpunpckldq(ymm0, ymm1, ptr_gp0); - a.vpunpckldq(ymm0, ymm1, ymm2); - a.vpunpcklqdq(ymm0, ymm1, ptr_gp0); - a.vpunpcklqdq(ymm0, ymm1, ymm2); - a.vpunpcklwd(ymm0, ymm1, ptr_gp0); - a.vpunpcklwd(ymm0, ymm1, ymm2); - a.vpxor(ymm0, ymm1, ptr_gp0); - a.vpxor(ymm0, ymm1, ymm2); - - // FMA3. 
- a.nop(); - - a.vfmadd132pd(xmm0, xmm1, ptr_gp0); - a.vfmadd132pd(xmm0, xmm1, xmm2); - a.vfmadd132pd(ymm0, ymm1, ptr_gp0); - a.vfmadd132pd(ymm0, ymm1, ymm2); - a.vfmadd132ps(xmm0, xmm1, ptr_gp0); - a.vfmadd132ps(xmm0, xmm1, xmm2); - a.vfmadd132ps(ymm0, ymm1, ptr_gp0); - a.vfmadd132ps(ymm0, ymm1, ymm2); - a.vfmadd132sd(xmm0, xmm1, ptr_gp0); - a.vfmadd132sd(xmm0, xmm1, xmm2); - a.vfmadd132ss(xmm0, xmm1, ptr_gp0); - a.vfmadd132ss(xmm0, xmm1, xmm2); - a.vfmadd213pd(xmm0, xmm1, ptr_gp0); - a.vfmadd213pd(xmm0, xmm1, xmm2); - a.vfmadd213pd(ymm0, ymm1, ptr_gp0); - a.vfmadd213pd(ymm0, ymm1, ymm2); - a.vfmadd213ps(xmm0, xmm1, ptr_gp0); - a.vfmadd213ps(xmm0, xmm1, xmm2); - a.vfmadd213ps(ymm0, ymm1, ptr_gp0); - a.vfmadd213ps(ymm0, ymm1, ymm2); - a.vfmadd213sd(xmm0, xmm1, ptr_gp0); - a.vfmadd213sd(xmm0, xmm1, xmm2); - a.vfmadd213ss(xmm0, xmm1, ptr_gp0); - a.vfmadd213ss(xmm0, xmm1, xmm2); - a.vfmadd231pd(xmm0, xmm1, ptr_gp0); - a.vfmadd231pd(xmm0, xmm1, xmm2); - a.vfmadd231pd(ymm0, ymm1, ptr_gp0); - a.vfmadd231pd(ymm0, ymm1, ymm2); - a.vfmadd231ps(xmm0, xmm1, ptr_gp0); - a.vfmadd231ps(xmm0, xmm1, xmm2); - a.vfmadd231ps(ymm0, ymm1, ptr_gp0); - a.vfmadd231ps(ymm0, ymm1, ymm2); - a.vfmadd231sd(xmm0, xmm1, ptr_gp0); - a.vfmadd231sd(xmm0, xmm1, xmm2); - a.vfmadd231ss(xmm0, xmm1, ptr_gp0); - a.vfmadd231ss(xmm0, xmm1, xmm2); - a.vfmaddsub132pd(xmm0, xmm1, ptr_gp0); - a.vfmaddsub132pd(xmm0, xmm1, xmm2); - a.vfmaddsub132pd(ymm0, ymm1, ptr_gp0); - a.vfmaddsub132pd(ymm0, ymm1, ymm2); - a.vfmaddsub132ps(xmm0, xmm1, ptr_gp0); - a.vfmaddsub132ps(xmm0, xmm1, xmm2); - a.vfmaddsub132ps(ymm0, ymm1, ptr_gp0); - a.vfmaddsub132ps(ymm0, ymm1, ymm2); - a.vfmaddsub213pd(xmm0, xmm1, ptr_gp0); - a.vfmaddsub213pd(xmm0, xmm1, xmm2); - a.vfmaddsub213pd(ymm0, ymm1, ptr_gp0); - a.vfmaddsub213pd(ymm0, ymm1, ymm2); - a.vfmaddsub213ps(xmm0, xmm1, ptr_gp0); - a.vfmaddsub213ps(xmm0, xmm1, xmm2); - a.vfmaddsub213ps(ymm0, ymm1, ptr_gp0); - a.vfmaddsub213ps(ymm0, ymm1, ymm2); - a.vfmaddsub231pd(xmm0, xmm1, ptr_gp0); 
- a.vfmaddsub231pd(xmm0, xmm1, xmm2); - a.vfmaddsub231pd(ymm0, ymm1, ptr_gp0); - a.vfmaddsub231pd(ymm0, ymm1, ymm2); - a.vfmaddsub231ps(xmm0, xmm1, ptr_gp0); - a.vfmaddsub231ps(xmm0, xmm1, xmm2); - a.vfmaddsub231ps(ymm0, ymm1, ptr_gp0); - a.vfmaddsub231ps(ymm0, ymm1, ymm2); - a.vfmsub132pd(xmm0, xmm1, ptr_gp0); - a.vfmsub132pd(xmm0, xmm1, xmm2); - a.vfmsub132pd(ymm0, ymm1, ptr_gp0); - a.vfmsub132pd(ymm0, ymm1, ymm2); - a.vfmsub132ps(xmm0, xmm1, ptr_gp0); - a.vfmsub132ps(xmm0, xmm1, xmm2); - a.vfmsub132ps(ymm0, ymm1, ptr_gp0); - a.vfmsub132ps(ymm0, ymm1, ymm2); - a.vfmsub132sd(xmm0, xmm1, ptr_gp0); - a.vfmsub132sd(xmm0, xmm1, xmm2); - a.vfmsub132ss(xmm0, xmm1, ptr_gp0); - a.vfmsub132ss(xmm0, xmm1, xmm2); - a.vfmsub213pd(xmm0, xmm1, ptr_gp0); - a.vfmsub213pd(xmm0, xmm1, xmm2); - a.vfmsub213pd(ymm0, ymm1, ptr_gp0); - a.vfmsub213pd(ymm0, ymm1, ymm2); - a.vfmsub213ps(xmm0, xmm1, ptr_gp0); - a.vfmsub213ps(xmm0, xmm1, xmm2); - a.vfmsub213ps(ymm0, ymm1, ptr_gp0); - a.vfmsub213ps(ymm0, ymm1, ymm2); - a.vfmsub213sd(xmm0, xmm1, ptr_gp0); - a.vfmsub213sd(xmm0, xmm1, xmm2); - a.vfmsub213ss(xmm0, xmm1, ptr_gp0); - a.vfmsub213ss(xmm0, xmm1, xmm2); - a.vfmsub231pd(xmm0, xmm1, ptr_gp0); - a.vfmsub231pd(xmm0, xmm1, xmm2); - a.vfmsub231pd(ymm0, ymm1, ptr_gp0); - a.vfmsub231pd(ymm0, ymm1, ymm2); - a.vfmsub231ps(xmm0, xmm1, ptr_gp0); - a.vfmsub231ps(xmm0, xmm1, xmm2); - a.vfmsub231ps(ymm0, ymm1, ptr_gp0); - a.vfmsub231ps(ymm0, ymm1, ymm2); - a.vfmsub231sd(xmm0, xmm1, ptr_gp0); - a.vfmsub231sd(xmm0, xmm1, xmm2); - a.vfmsub231ss(xmm0, xmm1, ptr_gp0); - a.vfmsub231ss(xmm0, xmm1, xmm2); - a.vfmsubadd132pd(xmm0, xmm1, ptr_gp0); - a.vfmsubadd132pd(xmm0, xmm1, xmm2); - a.vfmsubadd132pd(ymm0, ymm1, ptr_gp0); - a.vfmsubadd132pd(ymm0, ymm1, ymm2); - a.vfmsubadd132ps(xmm0, xmm1, ptr_gp0); - a.vfmsubadd132ps(xmm0, xmm1, xmm2); - a.vfmsubadd132ps(ymm0, ymm1, ptr_gp0); - a.vfmsubadd132ps(ymm0, ymm1, ymm2); - a.vfmsubadd213pd(xmm0, xmm1, ptr_gp0); - a.vfmsubadd213pd(xmm0, xmm1, xmm2); - 
a.vfmsubadd213pd(ymm0, ymm1, ptr_gp0); - a.vfmsubadd213pd(ymm0, ymm1, ymm2); - a.vfmsubadd213ps(xmm0, xmm1, ptr_gp0); - a.vfmsubadd213ps(xmm0, xmm1, xmm2); - a.vfmsubadd213ps(ymm0, ymm1, ptr_gp0); - a.vfmsubadd213ps(ymm0, ymm1, ymm2); - a.vfmsubadd231pd(xmm0, xmm1, ptr_gp0); - a.vfmsubadd231pd(xmm0, xmm1, xmm2); - a.vfmsubadd231pd(ymm0, ymm1, ptr_gp0); - a.vfmsubadd231pd(ymm0, ymm1, ymm2); - a.vfmsubadd231ps(xmm0, xmm1, ptr_gp0); - a.vfmsubadd231ps(xmm0, xmm1, xmm2); - a.vfmsubadd231ps(ymm0, ymm1, ptr_gp0); - a.vfmsubadd231ps(ymm0, ymm1, ymm2); - a.vfnmadd132pd(xmm0, xmm1, ptr_gp0); - a.vfnmadd132pd(xmm0, xmm1, xmm2); - a.vfnmadd132pd(ymm0, ymm1, ptr_gp0); - a.vfnmadd132pd(ymm0, ymm1, ymm2); - a.vfnmadd132ps(xmm0, xmm1, ptr_gp0); - a.vfnmadd132ps(xmm0, xmm1, xmm2); - a.vfnmadd132ps(ymm0, ymm1, ptr_gp0); - a.vfnmadd132ps(ymm0, ymm1, ymm2); - a.vfnmadd132sd(xmm0, xmm1, ptr_gp0); - a.vfnmadd132sd(xmm0, xmm1, xmm2); - a.vfnmadd132ss(xmm0, xmm1, ptr_gp0); - a.vfnmadd132ss(xmm0, xmm1, xmm2); - a.vfnmadd213pd(xmm0, xmm1, ptr_gp0); - a.vfnmadd213pd(xmm0, xmm1, xmm2); - a.vfnmadd213pd(ymm0, ymm1, ptr_gp0); - a.vfnmadd213pd(ymm0, ymm1, ymm2); - a.vfnmadd213ps(xmm0, xmm1, ptr_gp0); - a.vfnmadd213ps(xmm0, xmm1, xmm2); - a.vfnmadd213ps(ymm0, ymm1, ptr_gp0); - a.vfnmadd213ps(ymm0, ymm1, ymm2); - a.vfnmadd213sd(xmm0, xmm1, ptr_gp0); - a.vfnmadd213sd(xmm0, xmm1, xmm2); - a.vfnmadd213ss(xmm0, xmm1, ptr_gp0); - a.vfnmadd213ss(xmm0, xmm1, xmm2); - a.vfnmadd231pd(xmm0, xmm1, ptr_gp0); - a.vfnmadd231pd(xmm0, xmm1, xmm2); - a.vfnmadd231pd(ymm0, ymm1, ptr_gp0); - a.vfnmadd231pd(ymm0, ymm1, ymm2); - a.vfnmadd231ps(xmm0, xmm1, ptr_gp0); - a.vfnmadd231ps(xmm0, xmm1, xmm2); - a.vfnmadd231ps(ymm0, ymm1, ptr_gp0); - a.vfnmadd231ps(ymm0, ymm1, ymm2); - a.vfnmadd231sd(xmm0, xmm1, ptr_gp0); - a.vfnmadd231sd(xmm0, xmm1, xmm2); - a.vfnmadd231ss(xmm0, xmm1, ptr_gp0); - a.vfnmadd231ss(xmm0, xmm1, xmm2); - a.vfnmsub132pd(xmm0, xmm1, ptr_gp0); - a.vfnmsub132pd(xmm0, xmm1, xmm2); - a.vfnmsub132pd(ymm0, 
ymm1, ptr_gp0); - a.vfnmsub132pd(ymm0, ymm1, ymm2); - a.vfnmsub132ps(xmm0, xmm1, ptr_gp0); - a.vfnmsub132ps(xmm0, xmm1, xmm2); - a.vfnmsub132ps(ymm0, ymm1, ptr_gp0); - a.vfnmsub132ps(ymm0, ymm1, ymm2); - a.vfnmsub132sd(xmm0, xmm1, ptr_gp0); - a.vfnmsub132sd(xmm0, xmm1, xmm2); - a.vfnmsub132ss(xmm0, xmm1, ptr_gp0); - a.vfnmsub132ss(xmm0, xmm1, xmm2); - a.vfnmsub213pd(xmm0, xmm1, ptr_gp0); - a.vfnmsub213pd(xmm0, xmm1, xmm2); - a.vfnmsub213pd(ymm0, ymm1, ptr_gp0); - a.vfnmsub213pd(ymm0, ymm1, ymm2); - a.vfnmsub213ps(xmm0, xmm1, ptr_gp0); - a.vfnmsub213ps(xmm0, xmm1, xmm2); - a.vfnmsub213ps(ymm0, ymm1, ptr_gp0); - a.vfnmsub213ps(ymm0, ymm1, ymm2); - a.vfnmsub213sd(xmm0, xmm1, ptr_gp0); - a.vfnmsub213sd(xmm0, xmm1, xmm2); - a.vfnmsub213ss(xmm0, xmm1, ptr_gp0); - a.vfnmsub213ss(xmm0, xmm1, xmm2); - a.vfnmsub231pd(xmm0, xmm1, ptr_gp0); - a.vfnmsub231pd(xmm0, xmm1, xmm2); - a.vfnmsub231pd(ymm0, ymm1, ptr_gp0); - a.vfnmsub231pd(ymm0, ymm1, ymm2); - a.vfnmsub231ps(xmm0, xmm1, ptr_gp0); - a.vfnmsub231ps(xmm0, xmm1, xmm2); - a.vfnmsub231ps(ymm0, ymm1, ptr_gp0); - a.vfnmsub231ps(ymm0, ymm1, ymm2); - a.vfnmsub231sd(xmm0, xmm1, ptr_gp0); - a.vfnmsub231sd(xmm0, xmm1, xmm2); - a.vfnmsub231ss(xmm0, xmm1, ptr_gp0); - a.vfnmsub231ss(xmm0, xmm1, xmm2); - - // FMA4. 
- a.nop(); - - a.vfmaddpd(xmm0, xmm1, xmm2, xmm3); - a.vfmaddpd(xmm0, xmm1, ptr_gp0, xmm3); - a.vfmaddpd(xmm0, xmm1, xmm2, ptr_gp0); - a.vfmaddpd(ymm0, ymm1, ymm2, ymm3); - a.vfmaddpd(ymm0, ymm1, ptr_gp0, ymm3); - a.vfmaddpd(ymm0, ymm1, ymm2, ptr_gp0); - a.vfmaddps(xmm0, xmm1, xmm2, xmm3); - a.vfmaddps(xmm0, xmm1, ptr_gp0, xmm3); - a.vfmaddps(xmm0, xmm1, xmm2, ptr_gp0); - a.vfmaddps(ymm0, ymm1, ymm2, ymm3); - a.vfmaddps(ymm0, ymm1, ptr_gp0, ymm3); - a.vfmaddps(ymm0, ymm1, ymm2, ptr_gp0); - a.vfmaddsd(xmm0, xmm1, xmm2, xmm3); - a.vfmaddsd(xmm0, xmm1, ptr_gp0, xmm3); - a.vfmaddsd(xmm0, xmm1, xmm2, ptr_gp0); - a.vfmaddss(xmm0, xmm1, xmm2, xmm3); - a.vfmaddss(xmm0, xmm1, ptr_gp0, xmm3); - a.vfmaddss(xmm0, xmm1, xmm2, ptr_gp0); - a.vfmaddsubpd(xmm0, xmm1, xmm2, xmm3); - a.vfmaddsubpd(xmm0, xmm1, ptr_gp0, xmm3); - a.vfmaddsubpd(xmm0, xmm1, xmm2, ptr_gp0); - a.vfmaddsubpd(ymm0, ymm1, ymm2, ymm3); - a.vfmaddsubpd(ymm0, ymm1, ptr_gp0, ymm3); - a.vfmaddsubpd(ymm0, ymm1, ymm2, ptr_gp0); - a.vfmaddsubps(xmm0, xmm1, xmm2, xmm3); - a.vfmaddsubps(xmm0, xmm1, ptr_gp0, xmm3); - a.vfmaddsubps(xmm0, xmm1, xmm2, ptr_gp0); - a.vfmaddsubps(ymm0, ymm1, ymm2, ymm3); - a.vfmaddsubps(ymm0, ymm1, ptr_gp0, ymm3); - a.vfmaddsubps(ymm0, ymm1, ymm2, ptr_gp0); - a.vfmsubaddpd(xmm0, xmm1, xmm2, xmm3); - a.vfmsubaddpd(xmm0, xmm1, ptr_gp0, xmm3); - a.vfmsubaddpd(xmm0, xmm1, xmm2, ptr_gp0); - a.vfmsubaddpd(ymm0, ymm1, ymm2, ymm3); - a.vfmsubaddpd(ymm0, ymm1, ptr_gp0, ymm3); - a.vfmsubaddpd(ymm0, ymm1, ymm2, ptr_gp0); - a.vfmsubaddps(xmm0, xmm1, xmm2, xmm3); - a.vfmsubaddps(xmm0, xmm1, ptr_gp0, xmm3); - a.vfmsubaddps(xmm0, xmm1, xmm2, ptr_gp0); - a.vfmsubaddps(ymm0, ymm1, ymm2, ymm3); - a.vfmsubaddps(ymm0, ymm1, ptr_gp0, ymm3); - a.vfmsubaddps(ymm0, ymm1, ymm2, ptr_gp0); - a.vfmsubpd(xmm0, xmm1, xmm2, xmm3); - a.vfmsubpd(xmm0, xmm1, ptr_gp0, xmm3); - a.vfmsubpd(xmm0, xmm1, xmm2, ptr_gp0); - a.vfmsubpd(ymm0, ymm1, ymm2, ymm3); - a.vfmsubpd(ymm0, ymm1, ptr_gp0, ymm3); - a.vfmsubpd(ymm0, ymm1, ymm2, 
ptr_gp0); - a.vfmsubps(xmm0, xmm1, xmm2, xmm3); - a.vfmsubps(xmm0, xmm1, ptr_gp0, xmm3); - a.vfmsubps(xmm0, xmm1, xmm2, ptr_gp0); - a.vfmsubps(ymm0, ymm1, ymm2, ymm3); - a.vfmsubps(ymm0, ymm1, ptr_gp0, ymm3); - a.vfmsubps(ymm0, ymm1, ymm2, ptr_gp0); - a.vfmsubsd(xmm0, xmm1, xmm2, xmm3); - a.vfmsubsd(xmm0, xmm1, ptr_gp0, xmm3); - a.vfmsubsd(xmm0, xmm1, xmm2, ptr_gp0); - a.vfmsubss(xmm0, xmm1, xmm2, xmm3); - a.vfmsubss(xmm0, xmm1, ptr_gp0, xmm3); - a.vfmsubss(xmm0, xmm1, xmm2, ptr_gp0); - a.vfnmaddpd(xmm0, xmm1, xmm2, xmm3); - a.vfnmaddpd(xmm0, xmm1, ptr_gp0, xmm3); - a.vfnmaddpd(xmm0, xmm1, xmm2, ptr_gp0); - a.vfnmaddpd(ymm0, ymm1, ymm2, ymm3); - a.vfnmaddpd(ymm0, ymm1, ptr_gp0, ymm3); - a.vfnmaddpd(ymm0, ymm1, ymm2, ptr_gp0); - a.vfnmaddps(xmm0, xmm1, xmm2, xmm3); - a.vfnmaddps(xmm0, xmm1, ptr_gp0, xmm3); - a.vfnmaddps(xmm0, xmm1, xmm2, ptr_gp0); - a.vfnmaddps(ymm0, ymm1, ymm2, ymm3); - a.vfnmaddps(ymm0, ymm1, ptr_gp0, ymm3); - a.vfnmaddps(ymm0, ymm1, ymm2, ptr_gp0); - a.vfnmaddsd(xmm0, xmm1, xmm2, xmm3); - a.vfnmaddsd(xmm0, xmm1, ptr_gp0, xmm3); - a.vfnmaddsd(xmm0, xmm1, xmm2, ptr_gp0); - a.vfnmaddss(xmm0, xmm1, xmm2, xmm3); - a.vfnmaddss(xmm0, xmm1, ptr_gp0, xmm3); - a.vfnmaddss(xmm0, xmm1, xmm2, ptr_gp0); - a.vfnmsubpd(xmm0, xmm1, xmm2, xmm3); - a.vfnmsubpd(xmm0, xmm1, ptr_gp0, xmm3); - a.vfnmsubpd(xmm0, xmm1, xmm2, ptr_gp0); - a.vfnmsubpd(ymm0, ymm1, ymm2, ymm3); - a.vfnmsubpd(ymm0, ymm1, ptr_gp0, ymm3); - a.vfnmsubpd(ymm0, ymm1, ymm2, ptr_gp0); - a.vfnmsubps(xmm0, xmm1, xmm2, xmm3); - a.vfnmsubps(xmm0, xmm1, ptr_gp0, xmm3); - a.vfnmsubps(xmm0, xmm1, xmm2, ptr_gp0); - a.vfnmsubps(ymm0, ymm1, ymm2, ymm3); - a.vfnmsubps(ymm0, ymm1, ptr_gp0, ymm3); - a.vfnmsubps(ymm0, ymm1, ymm2, ptr_gp0); - a.vfnmsubsd(xmm0, xmm1, xmm2, xmm3); - a.vfnmsubsd(xmm0, xmm1, ptr_gp0, xmm3); - a.vfnmsubsd(xmm0, xmm1, xmm2, ptr_gp0); - a.vfnmsubss(xmm0, xmm1, xmm2, xmm3); - a.vfnmsubss(xmm0, xmm1, ptr_gp0, xmm3); - a.vfnmsubss(xmm0, xmm1, xmm2, ptr_gp0); - - // XOP. 
- a.nop(); - - a.vfrczpd(xmm0, xmm1); - a.vfrczpd(xmm0, ptr_gp0); - a.vfrczpd(ymm0, ymm1); - a.vfrczpd(ymm0, ptr_gp0); - a.vfrczps(xmm0, xmm1); - a.vfrczps(xmm0, ptr_gp0); - a.vfrczps(ymm0, ymm1); - a.vfrczps(ymm0, ptr_gp0); - a.vfrczsd(xmm0, xmm1); - a.vfrczsd(xmm0, ptr_gp0); - a.vfrczss(xmm0, xmm1); - a.vfrczss(xmm0, ptr_gp0); - a.vpcmov(xmm0, xmm1, xmm2, xmm3); - a.vpcmov(xmm0, xmm1, ptr_gp0, xmm3); - a.vpcmov(xmm0, xmm1, xmm2, ptr_gp0); - a.vpcmov(ymm0, ymm1, ymm2, ymm3); - a.vpcmov(ymm0, ymm1, ptr_gp0, ymm3); - a.vpcmov(ymm0, ymm1, ymm2, ptr_gp0); - a.vpcomb(xmm0, xmm1, xmm2, 0); - a.vpcomb(xmm0, xmm1, ptr_gp0, 0); - a.vpcomd(xmm0, xmm1, xmm2, 0); - a.vpcomd(xmm0, xmm1, ptr_gp0, 0); - a.vpcomq(xmm0, xmm1, xmm2, 0); - a.vpcomq(xmm0, xmm1, ptr_gp0, 0); - a.vpcomw(xmm0, xmm1, xmm2, 0); - a.vpcomw(xmm0, xmm1, ptr_gp0, 0); - a.vpcomub(xmm0, xmm1, xmm2, 0); - a.vpcomub(xmm0, xmm1, ptr_gp0, 0); - a.vpcomud(xmm0, xmm1, xmm2, 0); - a.vpcomud(xmm0, xmm1, ptr_gp0, 0); - a.vpcomuq(xmm0, xmm1, xmm2, 0); - a.vpcomuq(xmm0, xmm1, ptr_gp0, 0); - a.vpcomuw(xmm0, xmm1, xmm2, 0); - a.vpcomuw(xmm0, xmm1, ptr_gp0, 0); - a.vpermil2pd(xmm0, xmm1, xmm2, xmm3); - a.vpermil2pd(xmm0, xmm1, ptr_gp0, xmm3); - a.vpermil2pd(xmm0, xmm1, xmm2, ptr_gp0); - a.vpermil2pd(ymm0, ymm1, ymm2, ymm3); - a.vpermil2pd(ymm0, ymm1, ptr_gp0, ymm3); - a.vpermil2pd(ymm0, ymm1, ymm2, ptr_gp0); - a.vpermil2ps(xmm0, xmm1, xmm2, xmm3); - a.vpermil2ps(xmm0, xmm1, ptr_gp0, xmm3); - a.vpermil2ps(xmm0, xmm1, xmm2, ptr_gp0); - a.vpermil2ps(ymm0, ymm1, ymm2, ymm3); - a.vpermil2ps(ymm0, ymm1, ptr_gp0, ymm3); - a.vpermil2ps(ymm0, ymm1, ymm2, ptr_gp0); - a.vphaddbd(xmm0, xmm1); - a.vphaddbd(xmm0, ptr_gp0); - a.vphaddbq(xmm0, xmm1); - a.vphaddbq(xmm0, ptr_gp0); - a.vphaddbw(xmm0, xmm1); - a.vphaddbw(xmm0, ptr_gp0); - a.vphadddq(xmm0, xmm1); - a.vphadddq(xmm0, ptr_gp0); - a.vphaddwd(xmm0, xmm1); - a.vphaddwd(xmm0, ptr_gp0); - a.vphaddwq(xmm0, xmm1); - a.vphaddwq(xmm0, ptr_gp0); - a.vphaddubd(xmm0, xmm1); - a.vphaddubd(xmm0, 
ptr_gp0); - a.vphaddubq(xmm0, xmm1); - a.vphaddubq(xmm0, ptr_gp0); - a.vphaddubw(xmm0, xmm1); - a.vphaddubw(xmm0, ptr_gp0); - a.vphaddudq(xmm0, xmm1); - a.vphaddudq(xmm0, ptr_gp0); - a.vphadduwd(xmm0, xmm1); - a.vphadduwd(xmm0, ptr_gp0); - a.vphadduwq(xmm0, xmm1); - a.vphadduwq(xmm0, ptr_gp0); - a.vphsubbw(xmm0, xmm1); - a.vphsubbw(xmm0, ptr_gp0); - a.vphsubdq(xmm0, xmm1); - a.vphsubdq(xmm0, ptr_gp0); - a.vphsubwd(xmm0, xmm1); - a.vphsubwd(xmm0, ptr_gp0); - a.vpmacsdd(xmm0, xmm1, xmm2, xmm3); - a.vpmacsdd(xmm0, xmm1, ptr_gp0, xmm3); - a.vpmacsdqh(xmm0, xmm1, xmm2, xmm3); - a.vpmacsdqh(xmm0, xmm1, ptr_gp0, xmm3); - a.vpmacsdql(xmm0, xmm1, xmm2, xmm3); - a.vpmacsdql(xmm0, xmm1, ptr_gp0, xmm3); - a.vpmacswd(xmm0, xmm1, xmm2, xmm3); - a.vpmacswd(xmm0, xmm1, ptr_gp0, xmm3); - a.vpmacsww(xmm0, xmm1, xmm2, xmm3); - a.vpmacsww(xmm0, xmm1, ptr_gp0, xmm3); - a.vpmacssdd(xmm0, xmm1, xmm2, xmm3); - a.vpmacssdd(xmm0, xmm1, ptr_gp0, xmm3); - a.vpmacssdqh(xmm0, xmm1, xmm2, xmm3); - a.vpmacssdqh(xmm0, xmm1, ptr_gp0, xmm3); - a.vpmacssdql(xmm0, xmm1, xmm2, xmm3); - a.vpmacssdql(xmm0, xmm1, ptr_gp0, xmm3); - a.vpmacsswd(xmm0, xmm1, xmm2, xmm3); - a.vpmacsswd(xmm0, xmm1, ptr_gp0, xmm3); - a.vpmacssww(xmm0, xmm1, xmm2, xmm3); - a.vpmacssww(xmm0, xmm1, ptr_gp0, xmm3); - a.vpmadcsswd(xmm0, xmm1, xmm2, xmm3); - a.vpmadcsswd(xmm0, xmm1, ptr_gp0, xmm3); - a.vpmadcswd(xmm0, xmm1, xmm2, xmm3); - a.vpmadcswd(xmm0, xmm1, ptr_gp0, xmm3); - a.vpperm(xmm0, xmm1, xmm2, xmm3); - a.vpperm(xmm0, xmm1, ptr_gp0, xmm3); - a.vpperm(xmm0, xmm1, xmm2, ptr_gp0); - a.vprotb(xmm0, xmm1, xmm2); - a.vprotb(xmm0, ptr_gp0, xmm2); - a.vprotb(xmm0, xmm1, ptr_gp0); - a.vprotb(xmm0, xmm1, 0); - a.vprotb(xmm0, ptr_gp0, 0); - a.vprotd(xmm0, xmm1, xmm2); - a.vprotd(xmm0, ptr_gp0, xmm2); - a.vprotd(xmm0, xmm1, ptr_gp0); - a.vprotd(xmm0, xmm1, 0); - a.vprotd(xmm0, ptr_gp0, 0); - a.vprotq(xmm0, xmm1, xmm2); - a.vprotq(xmm0, ptr_gp0, xmm2); - a.vprotq(xmm0, xmm1, ptr_gp0); - a.vprotq(xmm0, xmm1, 0); - a.vprotq(xmm0, 
ptr_gp0, 0); - a.vprotw(xmm0, xmm1, xmm2); - a.vprotw(xmm0, ptr_gp0, xmm2); - a.vprotw(xmm0, xmm1, ptr_gp0); - a.vprotw(xmm0, xmm1, 0); - a.vprotw(xmm0, ptr_gp0, 0); - a.vpshab(xmm0, xmm1, xmm2); - a.vpshab(xmm0, ptr_gp0, xmm2); - a.vpshab(xmm0, xmm1, ptr_gp0); - a.vpshad(xmm0, xmm1, xmm2); - a.vpshad(xmm0, ptr_gp0, xmm2); - a.vpshad(xmm0, xmm1, ptr_gp0); - a.vpshaq(xmm0, xmm1, xmm2); - a.vpshaq(xmm0, ptr_gp0, xmm2); - a.vpshaq(xmm0, xmm1, ptr_gp0); - a.vpshaw(xmm0, xmm1, xmm2); - a.vpshaw(xmm0, ptr_gp0, xmm2); - a.vpshaw(xmm0, xmm1, ptr_gp0); - a.vpshlb(xmm0, xmm1, xmm2); - a.vpshlb(xmm0, ptr_gp0, xmm2); - a.vpshlb(xmm0, xmm1, ptr_gp0); - a.vpshld(xmm0, xmm1, xmm2); - a.vpshld(xmm0, ptr_gp0, xmm2); - a.vpshld(xmm0, xmm1, ptr_gp0); - a.vpshlq(xmm0, xmm1, xmm2); - a.vpshlq(xmm0, ptr_gp0, xmm2); - a.vpshlq(xmm0, xmm1, ptr_gp0); - a.vpshlw(xmm0, xmm1, xmm2); - a.vpshlw(xmm0, ptr_gp0, xmm2); - a.vpshlw(xmm0, xmm1, ptr_gp0); - - // BMI. - a.nop(); - - a.andn(gp0, gp1, zcx); - a.andn(gp0, gp1, ptr_gp1); - a.bextr(gp0, gp1, zcx); - a.bextr(gp0, ptr_gp1, zcx); - a.blsi(gp0, gp1); - a.blsi(gp0, ptr_gp1); - a.blsmsk(gp0, gp1); - a.blsmsk(gp0, ptr_gp1); - a.blsr(gp0, gp1); - a.blsr(gp0, ptr_gp1); - - // LZCNT. - a.nop(); - - a.lzcnt(gp0, gp1); - a.lzcnt(gp0, ptr_gp1); - - // TZCNT. - a.nop(); - - a.tzcnt(gp0, gp1); - a.tzcnt(gp0, ptr_gp1); - - // BMI2. - a.nop(); - - a.bzhi(gp0, gp1, zcx); - a.bzhi(gp0, ptr_gp1, zcx); - a.mulx(gp0, gp1, zcx); - a.mulx(gp0, gp1, ptr_gp1); - a.pdep(gp0, gp1, zcx); - a.pdep(gp0, gp1, ptr_gp1); - a.pext(gp0, gp1, zcx); - a.pext(gp0, gp1, ptr_gp1); - a.rorx(gp0, gp1, 0); - a.rorx(gp0, ptr_gp1, 0); - a.sarx(gp0, gp1, zcx); - a.sarx(gp0, ptr_gp1, zcx); - a.shlx(gp0, gp1, zcx); - a.shlx(gp0, ptr_gp1, zcx); - a.shrx(gp0, gp1, zcx); - a.shrx(gp0, ptr_gp1, zcx); - - // RDRAND. - a.nop(); - - a.rdrand(gp0); - - // F16C. 
- a.nop(); - - a.vcvtph2ps(xmm0, xmm1); - a.vcvtph2ps(xmm0, ptr_gp1); - a.vcvtph2ps(ymm0, xmm1); - a.vcvtph2ps(ymm0, ptr_gp1); - a.vcvtps2ph(xmm0, xmm1, 0); - a.vcvtps2ph(ptr_gp0, xmm1, 0); - a.vcvtps2ph(xmm0, ymm1, 0); - a.vcvtps2ph(ptr_gp0, ymm1, 0); - - // Mark the end of the stream. - a.nop(); -} - -} // asmgen namespace - -// [Guard] -#endif // _APP_TEST_GENOPCODE_H diff --git a/src/asmjit/asmjit.h b/src/asmjit/asmjit.h index 9b143d4..fb6cdf8 100644 --- a/src/asmjit/asmjit.h +++ b/src/asmjit/asmjit.h @@ -365,15 +365,15 @@ //! \brief Contributions. // [Dependencies - Base] -#include "base.h" +#include "./base.h" // [Dependencies - X86/X64] #if defined(ASMJIT_BUILD_X86) || defined(ASMJIT_BUILD_X64) -#include "x86.h" +#include "./x86.h" #endif // ASMJIT_BUILD_X86 || ASMJIT_BUILD_X64 // [Dependencies - Host] -#include "host.h" +#include "./host.h" // [Guard] #endif // _ASMJIT_ASMJIT_H diff --git a/src/asmjit/base.h b/src/asmjit/base.h index 47e1bfe..73ec4b3 100644 --- a/src/asmjit/base.h +++ b/src/asmjit/base.h @@ -9,26 +9,26 @@ #define _ASMJIT_BASE_H // [Dependencies - AsmJit] -#include "build.h" +#include "./build.h" -#include "base/assembler.h" -#include "base/codegen.h" -#include "base/compiler.h" -#include "base/constpool.h" -#include "base/containers.h" -#include "base/cpuinfo.h" -#include "base/cputicks.h" -#include "base/error.h" -#include "base/globals.h" -#include "base/intutil.h" -#include "base/lock.h" -#include "base/logger.h" -#include "base/operand.h" -#include "base/runtime.h" -#include "base/string.h" -#include "base/vectypes.h" -#include "base/vmem.h" -#include "base/zone.h" +#include "./base/assembler.h" +#include "./base/codegen.h" +#include "./base/compiler.h" +#include "./base/constpool.h" +#include "./base/containers.h" +#include "./base/cpuinfo.h" +#include "./base/cputicks.h" +#include "./base/error.h" +#include "./base/globals.h" +#include "./base/intutil.h" +#include "./base/lock.h" +#include "./base/logger.h" +#include 
"./base/operand.h" +#include "./base/runtime.h" +#include "./base/string.h" +#include "./base/vectypes.h" +#include "./base/vmem.h" +#include "./base/zone.h" // [Guard] #endif // _ASMJIT_BASE_H diff --git a/src/asmjit/build.h b/src/asmjit/build.h index c61c222..b08d995 100644 --- a/src/asmjit/build.h +++ b/src/asmjit/build.h @@ -389,7 +389,7 @@ typedef unsigned __int64 uint64_t; // Include a unit testing package if this is a `asmjit_test` build. #if defined(ASMJIT_TEST) -#include "./test/broken.h" +#include "../test/broken.h" #endif // ASMJIT_TEST // [Guard] diff --git a/src/asmjit/host.h b/src/asmjit/host.h index 987f485..9c0e5eb 100644 --- a/src/asmjit/host.h +++ b/src/asmjit/host.h @@ -9,14 +9,14 @@ #define _ASMJIT_HOST_H // [Dependencies - Core] -#include "base.h" +#include "./base.h" // ============================================================================ // [asmjit::host - X86 / X64] // ============================================================================ #if defined(ASMJIT_ARCH_X86) || defined(ASMJIT_ARCH_X64) -#include "x86.h" +#include "./x86.h" namespace asmjit { diff --git a/src/asmjit/x86.h b/src/asmjit/x86.h index ef5e006..a1be281 100644 --- a/src/asmjit/x86.h +++ b/src/asmjit/x86.h @@ -9,13 +9,13 @@ #define _ASMJIT_X86_H // [Dependencies - AsmJit] -#include "base.h" +#include "./base.h" -#include "x86/x86assembler.h" -#include "x86/x86compiler.h" -#include "x86/x86cpuinfo.h" -#include "x86/x86inst.h" -#include "x86/x86operand.h" +#include "./x86/x86assembler.h" +#include "./x86/x86compiler.h" +#include "./x86/x86cpuinfo.h" +#include "./x86/x86inst.h" +#include "./x86/x86operand.h" // [Guard] #endif // _ASMJIT_X86_H diff --git a/src/asmjit/x86/x86assembler.cpp b/src/asmjit/x86/x86assembler.cpp index a7487d4..f431e48 100644 --- a/src/asmjit/x86/x86assembler.cpp +++ b/src/asmjit/x86/x86assembler.cpp @@ -155,11 +155,17 @@ static ASMJIT_INLINE bool x86RexIsInvalid(uint32_t rex) { //! Encode ModR/M. 
static ASMJIT_INLINE uint32_t x86EncodeMod(uint32_t m, uint32_t o, uint32_t rm) { + ASMJIT_ASSERT(m <= 3); // Mod: 2-bit field, bits 7..6 of the byte. + ASMJIT_ASSERT(o <= 7); // Reg/opcode: 3-bit field, bits 5..3. + ASMJIT_ASSERT(rm <= 7); // R/M: 3-bit field, bits 2..0. return (m << 6) + (o << 3) + rm; } //! Encode SIB. static ASMJIT_INLINE uint32_t x86EncodeSib(uint32_t s, uint32_t i, uint32_t b) { + ASMJIT_ASSERT(s <= 3); // Scale: 2-bit field, bits 7..6 of the byte. + ASMJIT_ASSERT(i <= 7); // Index: 3-bit field, bits 5..3. + ASMJIT_ASSERT(b <= 7); // Base: 3-bit field, bits 2..0. return (s << 6) + (i << 3) + b; } @@ -187,7 +193,7 @@ static ASMJIT_INLINE uint32_t x86RegAndVvvv(uint32_t regIndex, uint32_t vvvvInde //! Get `O` field of `opCode`. static ASMJIT_INLINE uint32_t x86ExtractO(uint32_t opCode) { - return (opCode >> kX86InstOpCode_O_Shift) & 0x7; + return (opCode >> kX86InstOpCode_O_Shift) & 0x07; } // ============================================================================ @@ -1429,7 +1435,7 @@ _Prepare: // INC r16|r32 is not encodable in 64-bit mode. if (Arch == kArchX86 && (o0->getSize() == 2 || o0->getSize() == 4)) { opCode &= kX86InstOpCode_PP_66 | kX86InstOpCode_W; - opCode |= extendedInfo.getSecondaryOpCode() + (static_cast(rmReg) & 0x7); + opCode |= extendedInfo.getSecondaryOpCode() + (static_cast(rmReg) & 0x07); goto _EmitX86Op; } else { @@ -3493,7 +3499,7 @@ _EmitX86OpWithOpReg: if (rex & ~static_cast(_kX86InstOptionNoRex)) { rex |= kX86ByteRex; - opReg &= 0x7; + opReg &= 0x07; EMIT_BYTE(rex); if (x86RexIsInvalid(rex)) @@ -3524,8 +3530,8 @@ _EmitX86R: if (rex & ~static_cast(_kX86InstOptionNoRex)) { rex |= kX86ByteRex; - opReg &= 0x7; - rmReg &= 0x7; + opReg &= 0x07; + rmReg &= 0x07; EMIT_BYTE(rex); if (x86RexIsInvalid(rex)) @@ -3582,14 +3588,14 @@ _EmitX86M: if (rex & ~static_cast(_kX86InstOptionNoRex)) { rex |= kX86ByteRex; - opReg &= 0x7; + opReg &= 0x07; EMIT_BYTE(rex); if (x86RexIsInvalid(rex)) goto _IllegalInst; } - mBase &= 0x7; + mBase &= 0x07; } // Instruction opcodes. @@ -3643,7 +3649,7 @@ _EmitSib: uint32_t shift = rmMem->getShift(); // Esp/Rsp/R12 register can't be used as an index.
- mIndex &= 0x7; + mIndex &= 0x07; ASMJIT_ASSERT(mIndex != kX86RegIndexSp); if (mBase != kX86RegIndexBp && dispOffset == 0) { @@ -3734,7 +3740,7 @@ _EmitSib: } else { // [Disp32 + Index * Scale]. - mIndex &= 0x7; + mIndex &= 0x07; ASMJIT_ASSERT(mIndex != kX86RegIndexSp); uint32_t shift = rmMem->getShift(); @@ -3857,8 +3863,8 @@ _EmitFpuOp: } \ } \ \ - mBase &= 0x7; \ - opReg &= 0x7; + mBase &= 0x07; \ + opReg &= 0x07; _EmitAvxOp: { @@ -3920,7 +3926,7 @@ _EmitAvxR: } } - EMIT_BYTE(x86EncodeMod(3, opReg, static_cast(rmReg))); + EMIT_BYTE(x86EncodeMod(3, opReg & 0x07, static_cast(rmReg))); if (imLen == 0) goto _EmitDone; @@ -3939,7 +3945,7 @@ _EmitAvxV: goto _IllegalInst; if (Arch == kArchX64) - mIndex &= 0x7; + mIndex &= 0x07; dispOffset = rmMem->getDisplacement(); if (rmMem->isBaseIndexType()) { @@ -4046,8 +4052,8 @@ _EmitAvxV: EMIT_OP(opCode); \ } \ \ - mBase &= 0x7; \ - opReg &= 0x7; + mBase &= 0x07; \ + opReg &= 0x07; _EmitXopR: { @@ -4074,7 +4080,7 @@ _EmitXopR: rmReg &= 0x07; } - EMIT_BYTE(x86EncodeMod(3, opReg, static_cast(rmReg))); + EMIT_BYTE(x86EncodeMod(3, opReg & 0x07, static_cast(rmReg))); if (imLen == 0) goto _EmitDone; diff --git a/src/asmjit/x86/x86operand.h b/src/asmjit/x86/x86operand.h index 7e6929a..f64e1e0 100644 --- a/src/asmjit/x86/x86operand.h +++ b/src/asmjit/x86/x86operand.h @@ -1887,7 +1887,7 @@ static ASMJIT_INLINE X86Mem ptr(const Label& label, const X86GpReg& index, uint3 //! Create `[RIP + disp]` memory operand with no/custom size information. static ASMJIT_INLINE X86Mem ptr(const X86RipReg& rip, int32_t disp = 0, uint32_t size = 0) { - return X86Mem(rip, disp, size); + return X86Mem(rip, disp, size); } //! Create `[pAbs + disp]` absolute memory operand with no/custom size information. 
diff --git a/src/app/test/asmjit_bench_x86.cpp b/src/test/asmjit_bench_x86.cpp similarity index 96% rename from src/app/test/asmjit_bench_x86.cpp rename to src/test/asmjit_bench_x86.cpp index ef7b97a..c33b53f 100644 --- a/src/app/test/asmjit_bench_x86.cpp +++ b/src/test/asmjit_bench_x86.cpp @@ -5,11 +5,11 @@ // Zlib - See LICENSE.md file in the package. // [Dependencies - AsmJit] -#include +#include "../asmjit/asmjit.h" // [Dependencies - Test] -#include "genblend.h" -#include "genopcode.h" +#include "./asmjit_test_opcode.h" +#include "./genblend.h" // [Dependencies - C] #include diff --git a/src/test/asmjit_test_opcode.cpp b/src/test/asmjit_test_opcode.cpp new file mode 100644 index 0000000..ba5e2ca --- /dev/null +++ b/src/test/asmjit_test_opcode.cpp @@ -0,0 +1,75 @@ +// [AsmJit] +// Complete x86/x64 JIT and Remote Assembler for C++. +// +// [License] +// Zlib - See LICENSE.md file in the package. + +// This file is used to test opcodes generated by AsmJit. Output can be +// disassembled in your IDE or by your favourite disassembler. Instructions +// are grouped by category and then sorted alphabetically. 
+ +// [Dependencies - AsmJit] +#include "../asmjit/asmjit.h" + +// [Dependencies - Test] +#include "./asmjit_test_opcode.h" + +// [Dependencies - C] +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +typedef void (*VoidFunc)(void); + +struct OpcodeDumpInfo { + uint32_t arch; // Architecture id passed to the `X86Assembler` constructor. + bool useRex; // Forwarded to `asmgen::opcode()` as its `useRex` argument. +}; + +static const char* archIdToString(uint32_t archId) { + switch (archId) { + case asmjit::kArchNone: return "None"; + case asmjit::kArchX86: return "X86"; + case asmjit::kArchX64: return "X64"; + case asmjit::kArchArm: return "ARM"; + default: return ""; + } +} + +int main(int argc, char* argv[]) { + asmjit::FileLogger logger(stdout); + logger.setOption(asmjit::kLoggerOptionBinaryForm, true); + + OpcodeDumpInfo infoList[] = { +# if defined(ASMJIT_BUILD_X86) + { asmjit::kArchX86, false }, +# endif // ASMJIT_BUILD_X86 +# if defined(ASMJIT_BUILD_X64) + { asmjit::kArchX64, false }, + { asmjit::kArchX64, true } +# endif // ASMJIT_BUILD_X64 + }; + + for (size_t i = 0; i < ASMJIT_ARRAY_SIZE(infoList); i++) { + const OpcodeDumpInfo& info = infoList[i]; + + printf("Opcodes [ARCH=%s REX=%s]\n", + archIdToString(info.arch), + info.useRex ? "true" : "false"); + + asmjit::JitRuntime runtime; + asmjit::X86Assembler a(&runtime, info.arch); + a.setLogger(&logger); + + asmgen::opcode(a, info.useRex); + VoidFunc p = asmjit_cast<VoidFunc>(a.make()); + + // Only call the generated code if `make()` produced it and it targets the host architecture. + if (p != NULL && info.arch == asmjit::kArchHost) + p(); + + runtime.release((void*)p); + } + + return 0; +} diff --git a/src/test/asmjit_test_opcode.h b/src/test/asmjit_test_opcode.h new file mode 100644 index 0000000..fd0f75b --- /dev/null +++ b/src/test/asmjit_test_opcode.h @@ -0,0 +1,2859 @@ +// [AsmJit] +// Complete x86/x64 JIT and Remote Assembler for C++. +// +// [License] +// Zlib - See LICENSE.md file in the package.
+ +// [Guard] +#ifndef _TEST_ASMJIT_TEST_OPCODE_H +#define _TEST_ASMJIT_TEST_OPCODE_H + +// [Dependencies] +#include "../asmjit/asmjit.h" + +namespace asmgen { + +enum { kGenOpCodeInstCount = 2670 }; + +// Generate all instructions asmjit can emit. +static void opcode(asmjit::X86Assembler& a, bool useRex = false) { + using namespace asmjit; + using namespace asmjit::x86; + + // Prevent crash when the generated function is called to see disassembly. + a.ret(); + + // All instructions use the following registers that can be changed to see if + // the `X86Assembler` is properly encoding all possible combinations. If the + // `useRex` argument is true the `A` version will in most cases contain + // a register having index 8 (if encodable). + X86GpReg gLoA = useRex ? r8b : al; + X86GpReg gLoB = bl; + + X86GpReg gHiA = ah; + X86GpReg gHiB = bh; + + X86GpReg gwA = useRex ? r8w : ax; + X86GpReg gwB = si; + + X86GpReg gdA = useRex ? r8d : eax; + X86GpReg gdB = esi; + + X86GpReg gzA = useRex ? r8 : a.zax; + X86GpReg gzB = a.zsi; + X86GpReg gzC = a.zcx; + + X86FpReg fpA = fp0; + X86FpReg fpB = fp7; + + X86MmReg mmA = mm0; + X86MmReg mmB = mm7; + + X86XmmReg xmmA = useRex ? xmm8 : xmm0; + X86XmmReg xmmB = xmm1; + X86XmmReg xmmC = xmm2; + X86XmmReg xmmD = xmm3; + + X86YmmReg ymmA = useRex ? ymm8 : ymm0; + X86YmmReg ymmB = ymm1; + X86YmmReg ymmC = ymm2; + X86YmmReg ymmD = ymm3; + + X86Mem anyptr_gpA = ptr(gzA); + X86Mem anyptr_gpB = ptr(gzB); + X86Mem anyptr_gpC = ptr(gzC); + X86Mem intptr_gpA = a.intptr_ptr(gzA); + X86Mem intptr_gpB = a.intptr_ptr(gzB); + + X86Mem vmxptr_gpB = ptr(gzB, xmmB); + X86Mem vmyptr_gpB = ptr(gzB, ymmB); + + // Base.
+ a.adc(gLoA, 1); + a.adc(gLoB, 1); + a.adc(gHiA, 1); + a.adc(gHiB, 1); + a.adc(gwA, 1); + a.adc(gwB, 1); + a.adc(gdA, 1); + a.adc(gdB, 1); + a.adc(gzA, 1); + a.adc(gzA, gzB); + a.adc(gzA, intptr_gpB); + a.adc(intptr_gpA, 1); + a.adc(intptr_gpA, gzB); + a.add(gLoA, 1); + a.add(gLoB, 1); + a.add(gHiA, 1); + a.add(gHiB, 1); + a.add(gwA, 1); + a.add(gwB, 1); + a.add(gdA, 1); + a.add(gdB, 1); + a.add(gzA, 1); + a.add(gzA, gzB); + a.add(gzA, intptr_gpB); + a.add(intptr_gpA, 1); + a.add(intptr_gpA, gzB); + a.and_(gLoA, 1); + a.and_(gLoB, 1); + a.and_(gHiA, 1); + a.and_(gHiB, 1); + a.and_(gwA, 1); + a.and_(gwB, 1); + a.and_(gdA, 1); + a.and_(gdB, 1); + a.and_(gzA, 1); + a.and_(gzA, gzB); + a.and_(gzA, intptr_gpB); + a.and_(intptr_gpA, 1); + a.and_(intptr_gpA, gzB); + a.bswap(gzA); + a.bt(gzA, 1); + a.bt(gzA, gzB); + a.bt(intptr_gpA, 1); + a.bt(intptr_gpA, gzB); + a.btc(gzA, 1); + a.btc(gzA, gzB); + a.btc(intptr_gpA, 1); + a.btc(intptr_gpA, gzB); + a.btr(gzA, 1); + a.btr(gzA, gzB); + a.btr(intptr_gpA, 1); + a.btr(intptr_gpA, gzB); + a.bts(gzA, 1); + a.bts(gzA, gzB); + a.bts(intptr_gpA, 1); + a.bts(intptr_gpA, gzB); + a.call(gzA); + a.call(intptr_gpA); + a.cbw(); + a.cwde(); + a.clc(); + a.cld(); + a.cmc(); + a.cmp(gLoA, 1); + a.cmp(gLoB, 1); + a.cmp(gHiA, 1); + a.cmp(gHiB, 1); + a.cmp(gwA, 1); + a.cmp(gwB, 1); + a.cmp(gdA, 1); + a.cmp(gdB, 1); + a.cmp(gzA, 1); + a.cmp(gzA, gzB); + a.cmp(gzA, intptr_gpB); + a.cmp(intptr_gpA, 1); + a.cmp(intptr_gpA, gzB); + a.cmpxchg(gzA, gzB); + a.cmpxchg(intptr_gpA, gzB); + a.cmpxchg8b(anyptr_gpA); + a.cpuid(); + a.crc32(gzA, anyptr_gpB); + a.dec(gzA); + a.dec(intptr_gpA); + a.div(gzA); + a.div(intptr_gpA); + a.idiv(gzA); + a.idiv(intptr_gpA); + a.imul(gzA); + a.imul(intptr_gpA); + a.imul(gzA, 1); + a.imul(gzA, gzB); + a.imul(gzA, gzB, 1); + a.imul(gzA, intptr_gpB); + a.imul(gzA, intptr_gpB, 1); + a.inc(gzA); + a.inc(intptr_gpA); + a.int3(); + a.lea(gzA, intptr_gpB); + a.mov(gzA, 1); + a.mov(gzA, gzB); + a.mov(gzA, intptr_gpB); + 
a.mov(intptr_gpA, 1); + a.mov(intptr_gpA, gzB); + a.movsx(gzA, gLoB); + a.movsx(gzA, byte_ptr(gzB)); + a.movzx(gzA, gLoB); + a.movzx(gzA, byte_ptr(gzB)); + a.movbe(gzA, anyptr_gpB); + a.movbe(anyptr_gpA, gzB); + a.mul(gzA); + a.mul(intptr_gpA); + a.neg(gzA); + a.neg(intptr_gpA); + a.nop(); + a.not_(gzA); + a.not_(intptr_gpA); + a.or_(gLoA, 1); + a.or_(gLoB, 1); + a.or_(gHiA, 1); + a.or_(gHiB, 1); + a.or_(gwA, 1); + a.or_(gwB, 1); + a.or_(gdA, 1); + a.or_(gdB, 1); + a.or_(gzA, 1); + a.or_(gzA, gzB); + a.or_(gzA, intptr_gpB); + a.or_(intptr_gpA, 1); + a.or_(intptr_gpA, gzB); + a.pop(gzA); + a.pop(intptr_gpA); + a.push(gzA); + a.push(intptr_gpA); + a.push(0); + a.rcl(gzA, 0); + a.rcl(gzA, 1); + a.rcl(gzA, cl); + a.rcl(intptr_gpA, 0); + a.rcl(intptr_gpA, 1); + a.rcl(intptr_gpA, cl); + a.rcr(gzA, 0); + a.rcr(gzA, 1); + a.rcr(gzA, cl); + a.rcr(intptr_gpA, 0); + a.rcr(intptr_gpA, 1); + a.rcr(intptr_gpA, cl); + a.rdtsc(); + a.rdtscp(); + a.ret(); + a.ret(0); + a.rol(gzA, 0); + a.rol(gzA, 1); + a.rol(gzA, cl); + a.rol(intptr_gpA, 0); + a.rol(intptr_gpA, 1); + a.rol(intptr_gpA, cl); + a.ror(gzA, 0); + a.ror(gzA, 1); + a.ror(gzA, cl); + a.ror(intptr_gpA, 0); + a.ror(intptr_gpA, 1); + a.ror(intptr_gpA, cl); + a.sbb(gLoA, 1); + a.sbb(gLoB, 1); + a.sbb(gHiA, 1); + a.sbb(gHiB, 1); + a.sbb(gwA, 1); + a.sbb(gwB, 1); + a.sbb(gdA, 1); + a.sbb(gdB, 1); + a.sbb(gzA, 1); + a.sbb(gzA, gzB); + a.sbb(gzA, intptr_gpB); + a.sbb(intptr_gpA, 1); + a.sbb(intptr_gpA, gzB); + a.sal(gzA, cl); + a.sal(gzA, 0); + a.sal(gzA, 1); + a.sal(intptr_gpA, 0); + a.sal(intptr_gpA, 1); + a.sal(intptr_gpA, cl); + a.sar(gzA, 0); + a.sar(gzA, 1); + a.sar(gzA, cl); + a.sar(intptr_gpA, 0); + a.sar(intptr_gpA, 1); + a.sar(intptr_gpA, cl); + a.shl(gzA, 0); + a.shl(gzA, 1); + a.shl(gzA, cl); + a.shl(intptr_gpA, 0); + a.shl(intptr_gpA, 1); + a.shl(intptr_gpA, cl); + a.shr(gzA, 0); + a.shr(gzA, 1); + a.shr(gzA, cl); + a.shr(intptr_gpA, 0); + a.shr(intptr_gpA, 1); + a.shr(intptr_gpA, cl); + a.shld(gzA, gzB, 0); + 
a.shld(gzA, gzB, cl); + a.shld(intptr_gpA, gzB, 0); + a.shld(intptr_gpA, gzB, cl); + a.shrd(gzA, gzB, 0); + a.shrd(gzA, gzB, cl); + a.shrd(intptr_gpA, gzB, 0); + a.shrd(intptr_gpA, gzB, cl); + a.stc(); + a.std(); + a.sub(gLoA, 1); + a.sub(gLoB, 1); + a.sub(gHiA, 1); + a.sub(gHiB, 1); + a.sub(gwA, 1); + a.sub(gwB, 1); + a.sub(gdA, 1); + a.sub(gdB, 1); + a.sub(gzA, 1); + a.sub(gzA, gzB); + a.sub(gzA, intptr_gpB); + a.sub(intptr_gpA, 1); + a.sub(intptr_gpA, gzB); + a.test(gzA, 1); + a.test(gzA, gzB); + a.test(intptr_gpA, 1); + a.test(intptr_gpA, gzB); + a.ud2(); + a.xadd(gzA, gzB); + a.xadd(intptr_gpA, gzB); + a.xchg(gzA, gzB); + a.xchg(intptr_gpA, gzB); + a.xchg(gzA, intptr_gpB); + a.xor_(gLoA, 1); + a.xor_(gLoB, 1); + a.xor_(gHiA, 1); + a.xor_(gHiB, 1); + a.xor_(gwA, 1); + a.xor_(gwB, 1); + a.xor_(gdA, 1); + a.xor_(gdB, 1); + a.xor_(gzA, 1); + a.xor_(gzA, gzB); + a.xor_(gzA, intptr_gpB); + a.xor_(intptr_gpA, 1); + a.xor_(intptr_gpA, gzB); + + a.nop(); + + a.lodsb(); + a.lodsd(); + a.lodsw(); + a.rep_lodsb(); + a.rep_lodsd(); + a.rep_lodsw(); + + a.movsb(); + a.movsd(); + a.movsw(); + a.rep_movsb(); + a.rep_movsd(); + a.rep_movsw(); + + a.stosb(); + a.stosd(); + a.stosw(); + a.rep_stosb(); + a.rep_stosd(); + a.rep_stosw(); + + a.cmpsb(); + a.cmpsd(); + a.cmpsw(); + a.repe_cmpsb(); + a.repe_cmpsd(); + a.repe_cmpsw(); + a.repne_cmpsb(); + a.repne_cmpsd(); + a.repne_cmpsw(); + + a.scasb(); + a.scasd(); + a.scasw(); + a.repe_scasb(); + a.repe_scasd(); + a.repe_scasw(); + a.repne_scasb(); + a.repne_scasd(); + a.repne_scasw(); + + // Label...Jcc/Jecxz/Jmp. 
+ { + a.nop(); + + Label L(a); + a.bind(L); + + a.ja(L); + a.jae(L); + a.jb(L); + a.jbe(L); + a.jc(L); + a.je(L); + a.jg(L); + a.jge(L); + a.jl(L); + a.jle(L); + a.jna(L); + a.jnae(L); + a.jnb(L); + a.jnbe(L); + a.jnc(L); + a.jne(L); + a.jng(L); + a.jnge(L); + a.jnl(L); + a.jnle(L); + a.jno(L); + a.jnp(L); + a.jns(L); + a.jnz(L); + a.jo(L); + a.jp(L); + a.jpe(L); + a.jpo(L); + a.js(L); + a.jz(L); + a.jecxz(ecx, L); + a.jmp(L); + } + + // Jcc/Jecxz/Jmp...Label. + { + a.nop(); + + Label L(a); + a.ja(L); + a.jae(L); + a.jb(L); + a.jbe(L); + a.jc(L); + a.je(L); + a.jg(L); + a.jge(L); + a.jl(L); + a.jle(L); + a.jna(L); + a.jnae(L); + a.jnb(L); + a.jnbe(L); + a.jnc(L); + a.jne(L); + a.jng(L); + a.jnge(L); + a.jnl(L); + a.jnle(L); + a.jno(L); + a.jnp(L); + a.jns(L); + a.jnz(L); + a.jo(L); + a.jp(L); + a.jpe(L); + a.jpo(L); + a.js(L); + a.jz(L); + a.jecxz(ecx, L); + a.jmp(L); + a.bind(L); + } + + // Fpu. + a.nop(); + + a.f2xm1(); + a.fabs(); + a.fadd(fpA, fpB); + a.fadd(fpB, fpA); + a.fadd(dword_ptr(gzA)); + a.fadd(qword_ptr(gzA)); + a.faddp(fpB); + a.faddp(); + a.fbld(dword_ptr(gzA)); + a.fbstp(dword_ptr(gzA)); + a.fchs(); + a.fclex(); + a.fcom(fpB); + a.fcom(); + a.fcom(dword_ptr(gzA)); + a.fcom(qword_ptr(gzA)); + a.fcomp(fpB); + a.fcomp(); + a.fcomp(dword_ptr(gzA)); + a.fcomp(qword_ptr(gzA)); + a.fcompp(); + a.fcos(); + a.fdecstp(); + a.fdiv(fpA, fpB); + a.fdiv(fpB, fpA); + a.fdiv(dword_ptr(gzA)); + a.fdiv(qword_ptr(gzA)); + a.fdivp(fpB); + a.fdivp(); + a.fdivr(fpA, fpB); + a.fdivr(fpB, fpA); + a.fdivr(dword_ptr(gzA)); + a.fdivr(qword_ptr(gzA)); + a.fdivrp(fpB); + a.fdivrp(); + a.fiadd(dword_ptr(gzA)); + a.ficom(word_ptr(gzA)); + a.ficom(dword_ptr(gzA)); + a.ficomp(word_ptr(gzA)); + a.ficomp(dword_ptr(gzA)); + a.fidiv(word_ptr(gzA)); + a.fidiv(dword_ptr(gzA)); + a.fidivr(word_ptr(gzA)); + a.fidivr(dword_ptr(gzA)); + a.fild(word_ptr(gzA)); + a.fild(dword_ptr(gzA)); + a.fild(qword_ptr(gzA)); + a.fimul(word_ptr(gzA)); + a.fimul(dword_ptr(gzA)); + a.fincstp(); + a.finit(); 
+ a.fninit(); + a.fisub(word_ptr(gzA)); + a.fisub(dword_ptr(gzA)); + a.fisubr(word_ptr(gzA)); + a.fisubr(dword_ptr(gzA)); + a.fist(word_ptr(gzA)); + a.fist(dword_ptr(gzA)); + a.fistp(word_ptr(gzA)); + a.fistp(dword_ptr(gzA)); + a.fistp(qword_ptr(gzA)); + a.fld(dword_ptr(gzA)); + a.fld(qword_ptr(gzA)); + a.fld(tword_ptr(gzA)); + a.fld1(); + a.fldl2t(); + a.fldl2e(); + a.fldpi(); + a.fldlg2(); + a.fldln2(); + a.fldz(); + a.fldcw(anyptr_gpA); + a.fldenv(anyptr_gpA); + a.fmul(fpA, fpB); + a.fmul(fpB, fpA); + a.fmul(dword_ptr(gzA)); + a.fmul(qword_ptr(gzA)); + a.fmulp(fpB); + a.fmulp(); + a.fnclex(); + a.fnop(); + a.fnsave(anyptr_gpA); + a.fnstenv(anyptr_gpA); + a.fnstcw(anyptr_gpA); + a.fpatan(); + a.fprem(); + a.fprem1(); + a.fptan(); + a.frndint(); + a.frstor(anyptr_gpA); + a.fsave(anyptr_gpA); + a.fscale(); + a.fsin(); + a.fsincos(); + a.fsqrt(); + a.fst(dword_ptr(gzA)); + a.fst(qword_ptr(gzA)); + a.fstp(dword_ptr(gzA)); + a.fstp(qword_ptr(gzA)); + a.fstp(tword_ptr(gzA)); + a.fstcw(anyptr_gpA); + a.fstenv(anyptr_gpA); + a.fsub(fpA, fpB); + a.fsub(fpB, fpA); + a.fsub(dword_ptr(gzA)); + a.fsub(qword_ptr(gzA)); + a.fsubp(fpB); + a.fsubp(); + a.fsubr(fpA, fpB); + a.fsubr(fpB, fpA); + a.fsubr(dword_ptr(gzA)); + a.fsubr(qword_ptr(gzA)); + a.fsubrp(fpB); + a.fsubrp(); + a.ftst(); + a.fucom(fpB); + a.fucom(); + a.fucom(fpB); + a.fucomi(fpB); + a.fucomip(fpB); + a.fucomp(fpB); + a.fucompp(); + a.fxam(); + a.fxrstor(anyptr_gpA); + a.fxsave(anyptr_gpA); + a.fxtract(); + a.fyl2x(); + a.fyl2xp1(); + + // MMX/MMX-EXT. 
+ a.nop(); + + a.movd(anyptr_gpA, mmB); + a.movd(gdA, mmB); + a.movd(mmA, anyptr_gpB); + a.movd(mmA, esi); + a.movq(mmA, mmB); + a.movq(anyptr_gpA, mmB); + a.movq(mmA, anyptr_gpB); + a.packuswb(mmA, mmB); + a.packuswb(mmA, anyptr_gpB); + a.paddb(mmA, mmB); + a.paddb(mmA, anyptr_gpB); + a.paddw(mmA, mmB); + a.paddw(mmA, anyptr_gpB); + a.paddd(mmA, mmB); + a.paddd(mmA, anyptr_gpB); + a.paddsb(mmA, mmB); + a.paddsb(mmA, anyptr_gpB); + a.paddsw(mmA, mmB); + a.paddsw(mmA, anyptr_gpB); + a.paddusb(mmA, mmB); + a.paddusb(mmA, anyptr_gpB); + a.paddusw(mmA, mmB); + a.paddusw(mmA, anyptr_gpB); + a.pand(mmA, mmB); + a.pand(mmA, anyptr_gpB); + a.pandn(mmA, mmB); + a.pandn(mmA, anyptr_gpB); + a.pcmpeqb(mmA, mmB); + a.pcmpeqb(mmA, anyptr_gpB); + a.pcmpeqw(mmA, mmB); + a.pcmpeqw(mmA, anyptr_gpB); + a.pcmpeqd(mmA, mmB); + a.pcmpeqd(mmA, anyptr_gpB); + a.pcmpgtb(mmA, mmB); + a.pcmpgtb(mmA, anyptr_gpB); + a.pcmpgtw(mmA, mmB); + a.pcmpgtw(mmA, anyptr_gpB); + a.pcmpgtd(mmA, mmB); + a.pcmpgtd(mmA, anyptr_gpB); + a.pmulhw(mmA, mmB); + a.pmulhw(mmA, anyptr_gpB); + a.pmullw(mmA, mmB); + a.pmullw(mmA, anyptr_gpB); + a.por(mmA, mmB); + a.por(mmA, anyptr_gpB); + a.pmaddwd(mmA, mmB); + a.pmaddwd(mmA, anyptr_gpB); + a.pslld(mmA, mmB); + a.pslld(mmA, anyptr_gpB); + a.pslld(mmA, 0); + a.psllq(mmA, mmB); + a.psllq(mmA, anyptr_gpB); + a.psllq(mmA, 0); + a.psllw(mmA, mmB); + a.psllw(mmA, anyptr_gpB); + a.psllw(mmA, 0); + a.psrad(mmA, mmB); + a.psrad(mmA, anyptr_gpB); + a.psrad(mmA, 0); + a.psraw(mmA, mmB); + a.psraw(mmA, anyptr_gpB); + a.psraw(mmA, 0); + a.psrld(mmA, mmB); + a.psrld(mmA, anyptr_gpB); + a.psrld(mmA, 0); + a.psrlq(mmA, mmB); + a.psrlq(mmA, anyptr_gpB); + a.psrlq(mmA, 0); + a.psrlw(mmA, mmB); + a.psrlw(mmA, anyptr_gpB); + a.psrlw(mmA, 0); + a.psubb(mmA, mmB); + a.psubb(mmA, anyptr_gpB); + a.psubw(mmA, mmB); + a.psubw(mmA, anyptr_gpB); + a.psubd(mmA, mmB); + a.psubd(mmA, anyptr_gpB); + a.psubsb(mmA, mmB); + a.psubsb(mmA, anyptr_gpB); + a.psubsw(mmA, mmB); + a.psubsw(mmA, anyptr_gpB); + 
a.psubusb(mmA, mmB); + a.psubusb(mmA, anyptr_gpB); + a.psubusw(mmA, mmB); + a.psubusw(mmA, anyptr_gpB); + a.punpckhbw(mmA, mmB); + a.punpckhbw(mmA, anyptr_gpB); + a.punpckhwd(mmA, mmB); + a.punpckhwd(mmA, anyptr_gpB); + a.punpckhdq(mmA, mmB); + a.punpckhdq(mmA, anyptr_gpB); + a.punpcklbw(mmA, mmB); + a.punpcklbw(mmA, anyptr_gpB); + a.punpcklwd(mmA, mmB); + a.punpcklwd(mmA, anyptr_gpB); + a.punpckldq(mmA, mmB); + a.punpckldq(mmA, anyptr_gpB); + a.pxor(mmA, mmB); + a.pxor(mmA, anyptr_gpB); + a.emms(); + + // 3DNOW! + a.nop(); + + a.pf2id(mmA, mmB); + a.pf2id(mmA, anyptr_gpB); + a.pf2iw(mmA, mmB); + a.pf2iw(mmA, anyptr_gpB); + a.pfacc(mmA, mmB); + a.pfacc(mmA, anyptr_gpB); + a.pfadd(mmA, mmB); + a.pfadd(mmA, anyptr_gpB); + a.pfcmpeq(mmA, mmB); + a.pfcmpeq(mmA, anyptr_gpB); + a.pfcmpge(mmA, mmB); + a.pfcmpge(mmA, anyptr_gpB); + a.pfcmpgt(mmA, mmB); + a.pfcmpgt(mmA, anyptr_gpB); + a.pfmax(mmA, mmB); + a.pfmax(mmA, anyptr_gpB); + a.pfmin(mmA, mmB); + a.pfmin(mmA, anyptr_gpB); + a.pfmul(mmA, mmB); + a.pfmul(mmA, anyptr_gpB); + a.pfnacc(mmA, mmB); + a.pfnacc(mmA, anyptr_gpB); + a.pfpnacc(mmA, mmB); + a.pfpnacc(mmA, anyptr_gpB); + a.pfrcp(mmA, mmB); + a.pfrcp(mmA, anyptr_gpB); + a.pfrcpit1(mmA, mmB); + a.pfrcpit1(mmA, anyptr_gpB); + a.pfrcpit2(mmA, mmB); + a.pfrcpit2(mmA, anyptr_gpB); + a.pfrsqit1(mmA, mmB); + a.pfrsqit1(mmA, anyptr_gpB); + a.pfrsqrt(mmA, mmB); + a.pfrsqrt(mmA, anyptr_gpB); + a.pfsub(mmA, mmB); + a.pfsub(mmA, anyptr_gpB); + a.pfsubr(mmA, mmB); + a.pfsubr(mmA, anyptr_gpB); + a.pi2fd(mmA, mmB); + a.pi2fd(mmA, anyptr_gpB); + a.pi2fw(mmA, mmB); + a.pi2fw(mmA, anyptr_gpB); + a.pswapd(mmA, mmB); + a.pswapd(mmA, anyptr_gpB); + a.prefetch3dnow(anyptr_gpA); + a.prefetchw3dnow(anyptr_gpA); + a.femms(); + + // SSE. 
+ a.nop(); + + a.addps(xmmA, xmmB); + a.addps(xmmA, anyptr_gpB); + a.addss(xmmA, xmmB); + a.addss(xmmA, anyptr_gpB); + a.andnps(xmmA, xmmB); + a.andnps(xmmA, anyptr_gpB); + a.andps(xmmA, xmmB); + a.andps(xmmA, anyptr_gpB); + a.cmpps(xmmA, xmmB, 0); + a.cmpps(xmmA, anyptr_gpB, 0); + a.cmpss(xmmA, xmmB, 0); + a.cmpss(xmmA, anyptr_gpB, 0); + a.comiss(xmmA, xmmB); + a.comiss(xmmA, anyptr_gpB); + a.cvtpi2ps(xmmA, mmB); + a.cvtpi2ps(xmmA, anyptr_gpB); + a.cvtps2pi(mmA, xmmB); + a.cvtps2pi(mmA, anyptr_gpB); + a.cvtsi2ss(xmmA, gzA); + a.cvtsi2ss(xmmA, anyptr_gpB); + a.cvtss2si(gzA, xmmB); + a.cvtss2si(gzA, anyptr_gpB); + a.cvttps2pi(mmA, xmmB); + a.cvttps2pi(mmA, anyptr_gpB); + a.cvttss2si(gzA, xmmB); + a.cvttss2si(gzA, anyptr_gpB); + a.divps(xmmA, xmmB); + a.divps(xmmA, anyptr_gpB); + a.divss(xmmA, xmmB); + a.divss(xmmA, anyptr_gpB); + a.ldmxcsr(anyptr_gpB); + a.maskmovq(mmA, mmB); + a.maxps(xmmA, xmmB); + a.maxps(xmmA, anyptr_gpB); + a.maxss(xmmA, xmmB); + a.maxss(xmmA, anyptr_gpB); + a.minps(xmmA, xmmB); + a.minps(xmmA, anyptr_gpB); + a.minss(xmmA, xmmB); + a.minss(xmmA, anyptr_gpB); + a.movaps(xmmA, xmmB); + a.movaps(xmmA, anyptr_gpB); + a.movaps(anyptr_gpA, xmmB); + a.movd(anyptr_gpA, xmmB); + a.movd(gdA, xmmB); + a.movd(xmmA, anyptr_gpB); + a.movd(xmmA, gdB); + a.movq(mmA, mmB); + a.movq(xmmA, xmmB); + a.movq(anyptr_gpA, xmmB); + a.movq(xmmA, anyptr_gpB); + a.movntq(anyptr_gpA, mmB); + a.movhlps(xmmA, xmmB); + a.movhps(xmmA, anyptr_gpB); + a.movhps(anyptr_gpA, xmmB); + a.movlhps(xmmA, xmmB); + a.movlps(xmmA, anyptr_gpB); + a.movlps(anyptr_gpA, xmmB); + a.movntps(anyptr_gpA, xmmB); + a.movss(xmmA, anyptr_gpB); + a.movss(anyptr_gpA, xmmB); + a.movups(xmmA, xmmB); + a.movups(xmmA, anyptr_gpB); + a.movups(anyptr_gpA, xmmB); + a.mulps(xmmA, xmmB); + a.mulps(xmmA, anyptr_gpB); + a.mulss(xmmA, xmmB); + a.mulss(xmmA, anyptr_gpB); + a.orps(xmmA, xmmB); + a.orps(xmmA, anyptr_gpB); + a.pavgb(mmA, mmB); + a.pavgb(mmA, anyptr_gpB); + a.pavgw(mmA, mmB); + a.pavgw(mmA, anyptr_gpB); 
+ a.pextrw(gzA, mmB, 0); + a.pinsrw(mmA, gdB, 0); + a.pinsrw(mmA, anyptr_gpB, 0); + a.pmaxsw(mmA, mmB); + a.pmaxsw(mmA, anyptr_gpB); + a.pmaxub(mmA, mmB); + a.pmaxub(mmA, anyptr_gpB); + a.pminsw(mmA, mmB); + a.pminsw(mmA, anyptr_gpB); + a.pminub(mmA, mmB); + a.pminub(mmA, anyptr_gpB); + a.pmovmskb(gzA, mmB); + a.pmulhuw(mmA, mmB); + a.pmulhuw(mmA, anyptr_gpB); + a.psadbw(mmA, mmB); + a.psadbw(mmA, anyptr_gpB); + a.pshufw(mmA, mmB, 0); + a.pshufw(mmA, anyptr_gpB, 0); + a.rcpps(xmmA, xmmB); + a.rcpps(xmmA, anyptr_gpB); + a.rcpss(xmmA, xmmB); + a.rcpss(xmmA, anyptr_gpB); + a.prefetch(anyptr_gpA, 0); + a.psadbw(xmmA, xmmB); + a.psadbw(xmmA, anyptr_gpB); + a.rsqrtps(xmmA, xmmB); + a.rsqrtps(xmmA, anyptr_gpB); + a.rsqrtss(xmmA, xmmB); + a.rsqrtss(xmmA, anyptr_gpB); + a.sfence(); + a.shufps(xmmA, xmmB, 0); + a.shufps(xmmA, anyptr_gpB, 0); + a.sqrtps(xmmA, xmmB); + a.sqrtps(xmmA, anyptr_gpB); + a.sqrtss(xmmA, xmmB); + a.sqrtss(xmmA, anyptr_gpB); + a.stmxcsr(anyptr_gpA); + a.subps(xmmA, xmmB); + a.subps(xmmA, anyptr_gpB); + a.subss(xmmA, xmmB); + a.subss(xmmA, anyptr_gpB); + a.ucomiss(xmmA, xmmB); + a.ucomiss(xmmA, anyptr_gpB); + a.unpckhps(xmmA, xmmB); + a.unpckhps(xmmA, anyptr_gpB); + a.unpcklps(xmmA, xmmB); + a.unpcklps(xmmA, anyptr_gpB); + a.xorps(xmmA, xmmB); + a.xorps(xmmA, anyptr_gpB); + + // SSE2. 
+ a.nop(); + + a.addpd(xmmA, xmmB); + a.addpd(xmmA, anyptr_gpB); + a.addsd(xmmA, xmmB); + a.addsd(xmmA, anyptr_gpB); + a.andnpd(xmmA, xmmB); + a.andnpd(xmmA, anyptr_gpB); + a.andpd(xmmA, xmmB); + a.andpd(xmmA, anyptr_gpB); + a.clflush(anyptr_gpA); + a.cmppd(xmmA, xmmB, 0); + a.cmppd(xmmA, anyptr_gpB, 0); + a.cmpsd(xmmA, xmmB, 0); + a.cmpsd(xmmA, anyptr_gpB, 0); + a.comisd(xmmA, xmmB); + a.comisd(xmmA, anyptr_gpB); + a.cvtdq2pd(xmmA, xmmB); + a.cvtdq2pd(xmmA, anyptr_gpB); + a.cvtdq2ps(xmmA, xmmB); + a.cvtdq2ps(xmmA, anyptr_gpB); + a.cvtpd2dq(xmmA, xmmB); + a.cvtpd2dq(xmmA, anyptr_gpB); + a.cvtpd2pi(mmA, xmmB); + a.cvtpd2pi(mmA, anyptr_gpB); + a.cvtpd2ps(xmmA, xmmB); + a.cvtpd2ps(xmmA, anyptr_gpB); + a.cvtpi2pd(xmmA, mmB); + a.cvtpi2pd(xmmA, anyptr_gpB); + a.cvtps2dq(xmmA, xmmB); + a.cvtps2dq(xmmA, anyptr_gpB); + a.cvtps2pd(xmmA, xmmB); + a.cvtps2pd(xmmA, anyptr_gpB); + a.cvtsd2si(gzA, xmmB); + a.cvtsd2si(gzA, anyptr_gpB); + a.cvtsd2ss(xmmA, xmmB); + a.cvtsd2ss(xmmA, anyptr_gpB); + a.cvtsi2sd(xmmA, gzB); + a.cvtsi2sd(xmmA, anyptr_gpB); + a.cvtss2sd(xmmA, xmmB); + a.cvtss2sd(xmmA, anyptr_gpB); + a.cvtss2si(gzA, xmmB); + a.cvtss2si(gzA, anyptr_gpB); + a.cvttpd2pi(mmA, xmmB); + a.cvttpd2pi(mmA, anyptr_gpB); + a.cvttpd2dq(xmmA, xmmB); + a.cvttpd2dq(xmmA, anyptr_gpB); + a.cvttps2dq(xmmA, xmmB); + a.cvttps2dq(xmmA, anyptr_gpB); + a.cvttsd2si(gzA, xmmB); + a.cvttsd2si(gzA, anyptr_gpB); + a.divpd(xmmA, xmmB); + a.divpd(xmmA, anyptr_gpB); + a.divsd(xmmA, xmmB); + a.divsd(xmmA, anyptr_gpB); + a.lfence(); + a.maskmovdqu(xmmA, xmmB); + a.maxpd(xmmA, xmmB); + a.maxpd(xmmA, anyptr_gpB); + a.maxsd(xmmA, xmmB); + a.maxsd(xmmA, anyptr_gpB); + a.mfence(); + a.minpd(xmmA, xmmB); + a.minpd(xmmA, anyptr_gpB); + a.minsd(xmmA, xmmB); + a.minsd(xmmA, anyptr_gpB); + a.movdqa(xmmA, xmmB); + a.movdqa(xmmA, anyptr_gpB); + a.movdqa(anyptr_gpA, xmmB); + a.movdqu(xmmA, xmmB); + a.movdqu(xmmA, anyptr_gpB); + a.movdqu(anyptr_gpA, xmmB); + a.movmskps(gzA, xmmB); + a.movmskpd(gzA, xmmB); + 
a.movsd(xmmA, xmmB); + a.movsd(xmmA, anyptr_gpB); + a.movsd(anyptr_gpA, xmmB); + a.movapd(xmmA, anyptr_gpB); + a.movapd(anyptr_gpA, xmmB); + a.movdq2q(mmA, xmmB); + a.movq2dq(xmmA, mmB); + a.movhpd(xmmA, anyptr_gpB); + a.movhpd(anyptr_gpA, xmmB); + a.movlpd(xmmA, anyptr_gpB); + a.movlpd(anyptr_gpA, xmmB); + a.movntdq(anyptr_gpA, xmmB); + a.movnti(anyptr_gpA, gzB); + a.movntpd(anyptr_gpA, xmmB); + a.movupd(xmmA, anyptr_gpB); + a.movupd(anyptr_gpA, xmmB); + a.mulpd(xmmA, xmmB); + a.mulpd(xmmA, anyptr_gpB); + a.mulsd(xmmA, xmmB); + a.mulsd(xmmA, anyptr_gpB); + a.orpd(xmmA, xmmB); + a.orpd(xmmA, anyptr_gpB); + a.packsswb(xmmA, xmmB); + a.packsswb(xmmA, anyptr_gpB); + a.packssdw(xmmA, xmmB); + a.packssdw(xmmA, anyptr_gpB); + a.packuswb(xmmA, xmmB); + a.packuswb(xmmA, anyptr_gpB); + a.paddb(xmmA, xmmB); + a.paddb(xmmA, anyptr_gpB); + a.paddw(xmmA, xmmB); + a.paddw(xmmA, anyptr_gpB); + a.paddd(xmmA, xmmB); + a.paddd(xmmA, anyptr_gpB); + a.paddq(mmA, mmB); + a.paddq(mmA, anyptr_gpB); + a.paddq(xmmA, xmmB); + a.paddq(xmmA, anyptr_gpB); + a.paddsb(xmmA, xmmB); + a.paddsb(xmmA, anyptr_gpB); + a.paddsw(xmmA, xmmB); + a.paddsw(xmmA, anyptr_gpB); + a.paddusb(xmmA, xmmB); + a.paddusb(xmmA, anyptr_gpB); + a.paddusw(xmmA, xmmB); + a.paddusw(xmmA, anyptr_gpB); + a.pand(xmmA, xmmB); + a.pand(xmmA, anyptr_gpB); + a.pandn(xmmA, xmmB); + a.pandn(xmmA, anyptr_gpB); + a.pause(); + a.pavgb(xmmA, xmmB); + a.pavgb(xmmA, anyptr_gpB); + a.pavgw(xmmA, xmmB); + a.pavgw(xmmA, anyptr_gpB); + a.pcmpeqb(xmmA, xmmB); + a.pcmpeqb(xmmA, anyptr_gpB); + a.pcmpeqw(xmmA, xmmB); + a.pcmpeqw(xmmA, anyptr_gpB); + a.pcmpeqd(xmmA, xmmB); + a.pcmpeqd(xmmA, anyptr_gpB); + a.pcmpgtb(xmmA, xmmB); + a.pcmpgtb(xmmA, anyptr_gpB); + a.pcmpgtw(xmmA, xmmB); + a.pcmpgtw(xmmA, anyptr_gpB); + a.pcmpgtd(xmmA, xmmB); + a.pcmpgtd(xmmA, anyptr_gpB); + a.pmaxsw(xmmA, xmmB); + a.pmaxsw(xmmA, anyptr_gpB); + a.pmaxub(xmmA, xmmB); + a.pmaxub(xmmA, anyptr_gpB); + a.pminsw(xmmA, xmmB); + a.pminsw(xmmA, anyptr_gpB); + a.pminub(xmmA, 
xmmB); + a.pminub(xmmA, anyptr_gpB); + a.pmovmskb(gzA, xmmB); + a.pmulhw(xmmA, xmmB); + a.pmulhw(xmmA, anyptr_gpB); + a.pmulhuw(xmmA, xmmB); + a.pmulhuw(xmmA, anyptr_gpB); + a.pmullw(xmmA, xmmB); + a.pmullw(xmmA, anyptr_gpB); + a.pmuludq(mmA, mmB); + a.pmuludq(mmA, anyptr_gpB); + a.pmuludq(xmmA, xmmB); + a.pmuludq(xmmA, anyptr_gpB); + a.por(xmmA, xmmB); + a.por(xmmA, anyptr_gpB); + a.pslld(xmmA, xmmB); + a.pslld(xmmA, anyptr_gpB); + a.pslld(xmmA, 0); + a.psllq(xmmA, xmmB); + a.psllq(xmmA, anyptr_gpB); + a.psllq(xmmA, 0); + a.psllw(xmmA, xmmB); + a.psllw(xmmA, anyptr_gpB); + a.psllw(xmmA, 0); + a.pslldq(xmmA, 0); + a.psrad(xmmA, xmmB); + a.psrad(xmmA, anyptr_gpB); + a.psrad(xmmA, 0); + a.psraw(xmmA, xmmB); + a.psraw(xmmA, anyptr_gpB); + a.psraw(xmmA, 0); + a.psubb(xmmA, xmmB); + a.psubb(xmmA, anyptr_gpB); + a.psubw(xmmA, xmmB); + a.psubw(xmmA, anyptr_gpB); + a.psubd(xmmA, xmmB); + a.psubd(xmmA, anyptr_gpB); + a.psubq(mmA, mmB); + a.psubq(mmA, anyptr_gpB); + a.psubq(xmmA, xmmB); + a.psubq(xmmA, anyptr_gpB); + a.pmaddwd(xmmA, xmmB); + a.pmaddwd(xmmA, anyptr_gpB); + a.pshufd(xmmA, xmmB, 0); + a.pshufd(xmmA, anyptr_gpB, 0); + a.pshufhw(xmmA, xmmB, 0); + a.pshufhw(xmmA, anyptr_gpB, 0); + a.pshuflw(xmmA, xmmB, 0); + a.pshuflw(xmmA, anyptr_gpB, 0); + a.psrld(xmmA, xmmB); + a.psrld(xmmA, anyptr_gpB); + a.psrld(xmmA, 0); + a.psrlq(xmmA, xmmB); + a.psrlq(xmmA, anyptr_gpB); + a.psrlq(xmmA, 0); + a.psrldq(xmmA, 0); + a.psrlw(xmmA, xmmB); + a.psrlw(xmmA, anyptr_gpB); + a.psrlw(xmmA, 0); + a.psubsb(xmmA, xmmB); + a.psubsb(xmmA, anyptr_gpB); + a.psubsw(xmmA, xmmB); + a.psubsw(xmmA, anyptr_gpB); + a.psubusb(xmmA, xmmB); + a.psubusb(xmmA, anyptr_gpB); + a.psubusw(xmmA, xmmB); + a.psubusw(xmmA, anyptr_gpB); + a.punpckhbw(xmmA, xmmB); + a.punpckhbw(xmmA, anyptr_gpB); + a.punpckhwd(xmmA, xmmB); + a.punpckhwd(xmmA, anyptr_gpB); + a.punpckhdq(xmmA, xmmB); + a.punpckhdq(xmmA, anyptr_gpB); + a.punpckhqdq(xmmA, xmmB); + a.punpckhqdq(xmmA, anyptr_gpB); + a.punpcklbw(xmmA, xmmB); + 
a.punpcklbw(xmmA, anyptr_gpB); + a.punpcklwd(xmmA, xmmB); + a.punpcklwd(xmmA, anyptr_gpB); + a.punpckldq(xmmA, xmmB); + a.punpckldq(xmmA, anyptr_gpB); + a.punpcklqdq(xmmA, xmmB); + a.punpcklqdq(xmmA, anyptr_gpB); + a.pxor(xmmA, xmmB); + a.pxor(xmmA, anyptr_gpB); + a.sqrtpd(xmmA, xmmB); + a.sqrtpd(xmmA, anyptr_gpB); + a.sqrtsd(xmmA, xmmB); + a.sqrtsd(xmmA, anyptr_gpB); + a.subpd(xmmA, xmmB); + a.subpd(xmmA, anyptr_gpB); + a.subsd(xmmA, xmmB); + a.subsd(xmmA, anyptr_gpB); + a.ucomisd(xmmA, xmmB); + a.ucomisd(xmmA, anyptr_gpB); + a.unpckhpd(xmmA, xmmB); + a.unpckhpd(xmmA, anyptr_gpB); + a.unpcklpd(xmmA, xmmB); + a.unpcklpd(xmmA, anyptr_gpB); + a.xorpd(xmmA, xmmB); + a.xorpd(xmmA, anyptr_gpB); + + // SSE3. + a.nop(); + + a.addsubpd(xmmA, xmmB); + a.addsubpd(xmmA, anyptr_gpB); + a.addsubps(xmmA, xmmB); + a.addsubps(xmmA, anyptr_gpB); + a.fisttp(dword_ptr(gzA)); + a.haddpd(xmmA, xmmB); + a.haddpd(xmmA, anyptr_gpB); + a.haddps(xmmA, xmmB); + a.haddps(xmmA, anyptr_gpB); + a.hsubpd(xmmA, xmmB); + a.hsubpd(xmmA, anyptr_gpB); + a.hsubps(xmmA, xmmB); + a.hsubps(xmmA, anyptr_gpB); + a.lddqu(xmmA, anyptr_gpB); + a.monitor(); + a.movddup(xmmA, xmmB); + a.movddup(xmmA, anyptr_gpB); + a.movshdup(xmmA, xmmB); + a.movshdup(xmmA, anyptr_gpB); + a.movsldup(xmmA, xmmB); + a.movsldup(xmmA, anyptr_gpB); + a.mwait(); + + // SSSE3. 
+ a.nop(); + + a.psignb(mmA, mmB); + a.psignb(mmA, anyptr_gpB); + a.psignb(xmmA, xmmB); + a.psignb(xmmA, anyptr_gpB); + a.psignw(mmA, mmB); + a.psignw(mmA, anyptr_gpB); + a.psignw(xmmA, xmmB); + a.psignw(xmmA, anyptr_gpB); + a.psignd(mmA, mmB); + a.psignd(mmA, anyptr_gpB); + a.psignd(xmmA, xmmB); + a.psignd(xmmA, anyptr_gpB); + a.phaddw(mmA, mmB); + a.phaddw(mmA, anyptr_gpB); + a.phaddw(xmmA, xmmB); + a.phaddw(xmmA, anyptr_gpB); + a.phaddd(mmA, mmB); + a.phaddd(mmA, anyptr_gpB); + a.phaddd(xmmA, xmmB); + a.phaddd(xmmA, anyptr_gpB); + a.phaddsw(mmA, mmB); + a.phaddsw(mmA, anyptr_gpB); + a.phaddsw(xmmA, xmmB); + a.phaddsw(xmmA, anyptr_gpB); + a.phsubw(mmA, mmB); + a.phsubw(mmA, anyptr_gpB); + a.phsubw(xmmA, xmmB); + a.phsubw(xmmA, anyptr_gpB); + a.phsubd(mmA, mmB); + a.phsubd(mmA, anyptr_gpB); + a.phsubd(xmmA, xmmB); + a.phsubd(xmmA, anyptr_gpB); + a.phsubsw(mmA, mmB); + a.phsubsw(mmA, anyptr_gpB); + a.phsubsw(xmmA, xmmB); + a.phsubsw(xmmA, anyptr_gpB); + a.pmaddubsw(mmA, mmB); + a.pmaddubsw(mmA, anyptr_gpB); + a.pmaddubsw(xmmA, xmmB); + a.pmaddubsw(xmmA, anyptr_gpB); + a.pabsb(mmA, mmB); + a.pabsb(mmA, anyptr_gpB); + a.pabsb(xmmA, xmmB); + a.pabsb(xmmA, anyptr_gpB); + a.pabsw(mmA, mmB); + a.pabsw(mmA, anyptr_gpB); + a.pabsw(xmmA, xmmB); + a.pabsw(xmmA, anyptr_gpB); + a.pabsd(mmA, mmB); + a.pabsd(mmA, anyptr_gpB); + a.pabsd(xmmA, xmmB); + a.pabsd(xmmA, anyptr_gpB); + a.pmulhrsw(mmA, mmB); + a.pmulhrsw(mmA, anyptr_gpB); + a.pmulhrsw(xmmA, xmmB); + a.pmulhrsw(xmmA, anyptr_gpB); + a.pshufb(mmA, mmB); + a.pshufb(mmA, anyptr_gpB); + a.pshufb(xmmA, xmmB); + a.pshufb(xmmA, anyptr_gpB); + a.palignr(mmA, mmB, 0); + a.palignr(mmA, anyptr_gpB, 0); + a.palignr(xmmA, xmmB, 0); + a.palignr(xmmA, anyptr_gpB, 0); + + // SSE4.1. 
+ a.nop(); + + a.blendpd(xmmA, xmmB, 0); + a.blendpd(xmmA, anyptr_gpB, 0); + a.blendps(xmmA, xmmB, 0); + a.blendps(xmmA, anyptr_gpB, 0); + a.blendvpd(xmmA, xmmB); + a.blendvpd(xmmA, anyptr_gpB); + a.blendvps(xmmA, xmmB); + a.blendvps(xmmA, anyptr_gpB); + a.dppd(xmmA, xmmB, 0); + a.dppd(xmmA, anyptr_gpB, 0); + a.dpps(xmmA, xmmB, 0); + a.dpps(xmmA, anyptr_gpB, 0); + a.extractps(gzA, xmmB, 0); + a.extractps(anyptr_gpA, xmmB, 0); + a.insertps(xmmA, xmmB, 0); + a.insertps(xmmA, anyptr_gpB, 0); + a.movntdqa(xmmA, anyptr_gpB); + a.mpsadbw(xmmA, xmmB, 0); + a.mpsadbw(xmmA, anyptr_gpB, 0); + a.packusdw(xmmA, xmmB); + a.packusdw(xmmA, anyptr_gpB); + a.pblendvb(xmmA, xmmB); + a.pblendvb(xmmA, anyptr_gpB); + a.pblendw(xmmA, xmmB, 0); + a.pblendw(xmmA, anyptr_gpB, 0); + a.pcmpeqq(xmmA, xmmB); + a.pcmpeqq(xmmA, anyptr_gpB); + a.pextrb(gzA, xmmA, 0); + a.pextrb(anyptr_gpA, xmmB, 0); + a.pextrd(gzA, xmmA, 0); + a.pextrd(anyptr_gpA, xmmB, 0); + a.pextrq(gzA, xmmA, 0); + a.pextrq(anyptr_gpA, xmmB, 0); + a.pextrw(gzA, xmmA, 0); + a.pextrw(anyptr_gpA, xmmB, 0); + a.phminposuw(xmmA, xmmB); + a.phminposuw(xmmA, anyptr_gpB); + a.pinsrb(xmmA, gdB, 0); + a.pinsrb(xmmA, anyptr_gpB, 0); + a.pinsrd(xmmA, gdB, 0); + a.pinsrd(xmmA, anyptr_gpB, 0); + a.pinsrw(xmmA, gdB, 0); + a.pinsrw(xmmA, anyptr_gpB, 0); + a.pmaxuw(xmmA, xmmB); + a.pmaxuw(xmmA, anyptr_gpB); + a.pmaxsb(xmmA, xmmB); + a.pmaxsb(xmmA, anyptr_gpB); + a.pmaxsd(xmmA, xmmB); + a.pmaxsd(xmmA, anyptr_gpB); + a.pmaxud(xmmA, xmmB); + a.pmaxud(xmmA, anyptr_gpB); + a.pminsb(xmmA, xmmB); + a.pminsb(xmmA, anyptr_gpB); + a.pminuw(xmmA, xmmB); + a.pminuw(xmmA, anyptr_gpB); + a.pminud(xmmA, xmmB); + a.pminud(xmmA, anyptr_gpB); + a.pminsd(xmmA, xmmB); + a.pminsd(xmmA, anyptr_gpB); + a.pmovsxbw(xmmA, xmmB); + a.pmovsxbw(xmmA, anyptr_gpB); + a.pmovsxbd(xmmA, xmmB); + a.pmovsxbd(xmmA, anyptr_gpB); + a.pmovsxbq(xmmA, xmmB); + a.pmovsxbq(xmmA, anyptr_gpB); + a.pmovsxwd(xmmA, xmmB); + a.pmovsxwd(xmmA, anyptr_gpB); + a.pmovsxwq(xmmA, xmmB); + 
a.pmovsxwq(xmmA, anyptr_gpB); + a.pmovsxdq(xmmA, xmmB); + a.pmovsxdq(xmmA, anyptr_gpB); + a.pmovzxbw(xmmA, xmmB); + a.pmovzxbw(xmmA, anyptr_gpB); + a.pmovzxbd(xmmA, xmmB); + a.pmovzxbd(xmmA, anyptr_gpB); + a.pmovzxbq(xmmA, xmmB); + a.pmovzxbq(xmmA, anyptr_gpB); + a.pmovzxwd(xmmA, xmmB); + a.pmovzxwd(xmmA, anyptr_gpB); + a.pmovzxwq(xmmA, xmmB); + a.pmovzxwq(xmmA, anyptr_gpB); + a.pmovzxdq(xmmA, xmmB); + a.pmovzxdq(xmmA, anyptr_gpB); + a.pmuldq(xmmA, xmmB); + a.pmuldq(xmmA, anyptr_gpB); + a.pmulld(xmmA, xmmB); + a.pmulld(xmmA, anyptr_gpB); + a.ptest(xmmA, xmmB); + a.ptest(xmmA, anyptr_gpB); + a.roundps(xmmA, xmmB, 0); + a.roundps(xmmA, anyptr_gpB, 0); + a.roundss(xmmA, xmmB, 0); + a.roundss(xmmA, anyptr_gpB, 0); + a.roundpd(xmmA, xmmB, 0); + a.roundpd(xmmA, anyptr_gpB, 0); + a.roundsd(xmmA, xmmB, 0); + a.roundsd(xmmA, anyptr_gpB, 0); + + // SSE4.2. + a.nop(); + + a.pcmpestri(xmmA, xmmB, 0); + a.pcmpestri(xmmA, anyptr_gpB, 0); + a.pcmpestrm(xmmA, xmmB, 0); + a.pcmpestrm(xmmA, anyptr_gpB, 0); + a.pcmpistri(xmmA, xmmB, 0); + a.pcmpistri(xmmA, anyptr_gpB, 0); + a.pcmpistrm(xmmA, xmmB, 0); + a.pcmpistrm(xmmA, anyptr_gpB, 0); + a.pcmpgtq(xmmA, xmmB); + a.pcmpgtq(xmmA, anyptr_gpB); + + // SSE4a. + a.nop(); + + a.extrq(xmmA, xmmB); + a.extrq(xmmA, 0x1, 0x2); + a.insertq(xmmA, xmmB); + a.insertq(xmmA, xmmB, 0x1, 0x2); + a.movntsd(anyptr_gpA, xmmB); + a.movntss(anyptr_gpA, xmmB); + + // POPCNT. + a.nop(); + + a.popcnt(gzA, anyptr_gpB); + + // AESNI. + a.nop(); + + a.aesdec(xmmA, xmmB); + a.aesdec(xmmA, anyptr_gpB); + a.aesdeclast(xmmA, xmmB); + a.aesdeclast(xmmA, anyptr_gpB); + a.aesenc(xmmA, xmmB); + a.aesenc(xmmA, anyptr_gpB); + a.aesenclast(xmmA, xmmB); + a.aesenclast(xmmA, anyptr_gpB); + a.aesimc(xmmA, xmmB); + a.aesimc(xmmA, anyptr_gpB); + a.aeskeygenassist(xmmA, xmmB, 0); + a.aeskeygenassist(xmmA, anyptr_gpB, 0); + + // PCLMULQDQ. + a.nop(); + + a.pclmulqdq(xmmA, xmmB, 0); + a.pclmulqdq(xmmA, anyptr_gpB, 0); + + // XSAVE. 
+ a.nop(); + + a.xgetbv(); + a.xsetbv(); + + a.xsave(anyptr_gpA); + a.xsaveopt(anyptr_gpA); + a.xrstor(anyptr_gpA); + + // AVX. + a.nop(); + + a.vaddpd(xmmA, xmmB, xmmC); + a.vaddpd(xmmA, xmmB, anyptr_gpC); + a.vaddpd(ymmA, ymmB, ymmC); + a.vaddpd(ymmA, ymmB, anyptr_gpC); + a.vaddps(xmmA, xmmB, xmmC); + a.vaddps(xmmA, xmmB, anyptr_gpC); + a.vaddps(ymmA, ymmB, ymmC); + a.vaddps(ymmA, ymmB, anyptr_gpC); + a.vaddsd(xmmA, xmmB, xmmC); + a.vaddsd(xmmA, xmmB, anyptr_gpC); + a.vaddss(xmmA, xmmB, xmmC); + a.vaddss(xmmA, xmmB, anyptr_gpC); + a.vaddsubpd(xmmA, xmmB, xmmC); + a.vaddsubpd(xmmA, xmmB, anyptr_gpC); + a.vaddsubpd(ymmA, ymmB, ymmC); + a.vaddsubpd(ymmA, ymmB, anyptr_gpC); + a.vaddsubps(xmmA, xmmB, xmmC); + a.vaddsubps(xmmA, xmmB, anyptr_gpC); + a.vaddsubps(ymmA, ymmB, ymmC); + a.vaddsubps(ymmA, ymmB, anyptr_gpC); + a.vandpd(xmmA, xmmB, xmmC); + a.vandpd(xmmA, xmmB, anyptr_gpC); + a.vandpd(ymmA, ymmB, ymmC); + a.vandpd(ymmA, ymmB, anyptr_gpC); + a.vandps(xmmA, xmmB, xmmC); + a.vandps(xmmA, xmmB, anyptr_gpC); + a.vandps(ymmA, ymmB, ymmC); + a.vandps(ymmA, ymmB, anyptr_gpC); + a.vandnpd(xmmA, xmmB, xmmC); + a.vandnpd(xmmA, xmmB, anyptr_gpC); + a.vandnpd(ymmA, ymmB, ymmC); + a.vandnpd(ymmA, ymmB, anyptr_gpC); + a.vandnps(xmmA, xmmB, xmmC); + a.vandnps(xmmA, xmmB, anyptr_gpC); + a.vandnps(ymmA, ymmB, ymmC); + a.vandnps(ymmA, ymmB, anyptr_gpC); + a.vblendpd(xmmA, xmmB, xmmC, 0); + a.vblendpd(xmmA, xmmB, anyptr_gpC, 0); + a.vblendpd(ymmA, ymmB, ymmC, 0); + a.vblendpd(ymmA, ymmB, anyptr_gpC, 0); + a.vblendps(xmmA, xmmB, xmmC, 0); + a.vblendps(xmmA, xmmB, anyptr_gpC, 0); + a.vblendps(ymmA, ymmB, ymmC, 0); + a.vblendps(ymmA, ymmB, anyptr_gpC, 0); + a.vblendvpd(xmmA, xmmB, xmmC, xmmD); + a.vblendvpd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vblendvpd(ymmA, ymmB, ymmC, ymmD); + a.vblendvpd(ymmA, ymmB, anyptr_gpC, ymmD); + a.vbroadcastf128(ymmA, anyptr_gpB); + a.vbroadcastsd(ymmA, anyptr_gpB); + a.vbroadcastss(xmmA, anyptr_gpB); + a.vbroadcastss(ymmA, anyptr_gpB); + a.vcmppd(xmmA, 
xmmB, xmmC, 0); + a.vcmppd(xmmA, xmmB, anyptr_gpC, 0); + a.vcmppd(ymmA, ymmB, ymmC, 0); + a.vcmppd(ymmA, ymmB, anyptr_gpC, 0); + a.vcmpps(xmmA, xmmB, xmmC, 0); + a.vcmpps(xmmA, xmmB, anyptr_gpC, 0); + a.vcmpps(ymmA, ymmB, ymmC, 0); + a.vcmpps(ymmA, ymmB, anyptr_gpC, 0); + a.vcmpsd(xmmA, xmmB, xmmC, 0); + a.vcmpsd(xmmA, xmmB, anyptr_gpC, 0); + a.vcmpss(xmmA, xmmB, xmmC, 0); + a.vcmpss(xmmA, xmmB, anyptr_gpC, 0); + a.vcomisd(xmmA, xmmB); + a.vcomisd(xmmA, anyptr_gpB); + a.vcomiss(xmmA, xmmB); + a.vcomiss(xmmA, anyptr_gpB); + a.vcvtdq2pd(xmmA, xmmB); + a.vcvtdq2pd(xmmA, anyptr_gpB); + a.vcvtdq2pd(ymmA, xmmB); + a.vcvtdq2pd(ymmA, anyptr_gpB); + a.vcvtdq2ps(xmmA, xmmB); + a.vcvtdq2ps(xmmA, anyptr_gpB); + a.vcvtdq2ps(ymmA, ymmB); + a.vcvtdq2ps(ymmA, anyptr_gpB); + a.vcvtpd2dq(xmmA, xmmB); + a.vcvtpd2dq(xmmA, ymmB); + a.vcvtpd2dq(xmmA, anyptr_gpB); + a.vcvtpd2ps(xmmA, xmmB); + a.vcvtpd2ps(xmmA, ymmB); + a.vcvtpd2ps(xmmA, anyptr_gpB); + a.vcvtps2dq(xmmA, xmmB); + a.vcvtps2dq(xmmA, anyptr_gpB); + a.vcvtps2dq(ymmA, ymmB); + a.vcvtps2dq(ymmA, anyptr_gpB); + a.vcvtps2pd(xmmA, xmmB); + a.vcvtps2pd(xmmA, anyptr_gpB); + a.vcvtps2pd(ymmA, xmmB); + a.vcvtps2pd(ymmA, anyptr_gpB); + a.vcvtsd2si(gzA, xmmB); + a.vcvtsd2si(gzA, anyptr_gpB); + a.vcvtsd2ss(xmmA, xmmB, xmmC); + a.vcvtsd2ss(xmmA, xmmB, anyptr_gpC); + a.vcvtsi2sd(xmmA, xmmB, gzA); + a.vcvtsi2sd(xmmA, xmmB, anyptr_gpC); + a.vcvtsi2ss(xmmA, xmmB, gzA); + a.vcvtsi2ss(xmmA, xmmB, anyptr_gpC); + a.vcvtss2sd(xmmA, xmmB, xmmC); + a.vcvtss2sd(xmmA, xmmB, anyptr_gpC); + a.vcvtss2si(gzA, xmmB); + a.vcvtss2si(gzA, anyptr_gpB); + a.vcvttpd2dq(xmmA, xmmB); + a.vcvttpd2dq(xmmA, ymmB); + a.vcvttpd2dq(xmmA, anyptr_gpB); + a.vcvttps2dq(xmmA, xmmB); + a.vcvttps2dq(xmmA, anyptr_gpB); + a.vcvttps2dq(ymmA, ymmB); + a.vcvttps2dq(ymmA, anyptr_gpB); + a.vcvttsd2si(gzA, xmmB); + a.vcvttsd2si(gzA, anyptr_gpB); + a.vcvttss2si(gzA, xmmB); + a.vcvttss2si(gzA, anyptr_gpB); + a.vdivpd(xmmA, xmmB, xmmC); + a.vdivpd(xmmA, xmmB, anyptr_gpC); + a.vdivpd(ymmA, 
ymmB, ymmC); + a.vdivpd(ymmA, ymmB, anyptr_gpC); + a.vdivps(xmmA, xmmB, xmmC); + a.vdivps(xmmA, xmmB, anyptr_gpC); + a.vdivps(ymmA, ymmB, ymmC); + a.vdivps(ymmA, ymmB, anyptr_gpC); + a.vdivsd(xmmA, xmmB, xmmC); + a.vdivsd(xmmA, xmmB, anyptr_gpC); + a.vdivss(xmmA, xmmB, xmmC); + a.vdivss(xmmA, xmmB, anyptr_gpC); + a.vdppd(xmmA, xmmB, xmmC, 0); + a.vdppd(xmmA, xmmB, anyptr_gpC, 0); + a.vdpps(xmmA, xmmB, xmmC, 0); + a.vdpps(xmmA, xmmB, anyptr_gpC, 0); + a.vdpps(ymmA, ymmB, ymmC, 0); + a.vdpps(ymmA, ymmB, anyptr_gpC, 0); + a.vextractf128(xmmA, ymmB, 0); + a.vextractf128(anyptr_gpA, ymmB, 0); + a.vextractps(gzA, xmmB, 0); + a.vextractps(anyptr_gpA, xmmB, 0); + a.vhaddpd(xmmA, xmmB, xmmC); + a.vhaddpd(xmmA, xmmB, anyptr_gpC); + a.vhaddpd(ymmA, ymmB, ymmC); + a.vhaddpd(ymmA, ymmB, anyptr_gpC); + a.vhaddps(xmmA, xmmB, xmmC); + a.vhaddps(xmmA, xmmB, anyptr_gpC); + a.vhaddps(ymmA, ymmB, ymmC); + a.vhaddps(ymmA, ymmB, anyptr_gpC); + a.vhsubpd(xmmA, xmmB, xmmC); + a.vhsubpd(xmmA, xmmB, anyptr_gpC); + a.vhsubpd(ymmA, ymmB, ymmC); + a.vhsubpd(ymmA, ymmB, anyptr_gpC); + a.vhsubps(xmmA, xmmB, xmmC); + a.vhsubps(xmmA, xmmB, anyptr_gpC); + a.vhsubps(ymmA, ymmB, ymmC); + a.vhsubps(ymmA, ymmB, anyptr_gpC); + a.vinsertf128(ymmA, ymmB, xmmC, 0); + a.vinsertf128(ymmA, ymmB, anyptr_gpC, 0); + a.vinsertps(xmmA, xmmB, xmmC, 0); + a.vinsertps(xmmA, xmmB, anyptr_gpC, 0); + a.vlddqu(xmmA, anyptr_gpB); + a.vlddqu(ymmA, anyptr_gpB); + a.vldmxcsr(anyptr_gpA); + a.vmaskmovdqu(xmmA, xmmB); + a.vmaskmovps(xmmA, xmmB, anyptr_gpC); + a.vmaskmovps(ymmA, ymmB, anyptr_gpC); + a.vmaskmovpd(xmmA, xmmB, anyptr_gpC); + a.vmaskmovpd(ymmA, ymmB, anyptr_gpC); + a.vmaskmovps(anyptr_gpA, xmmA, xmmB); + a.vmaskmovps(anyptr_gpA, ymmA, ymmB); + a.vmaskmovpd(anyptr_gpA, xmmA, xmmB); + a.vmaskmovpd(anyptr_gpA, ymmA, ymmB); + a.vmaxpd(xmmA, xmmB, xmmC); + a.vmaxpd(xmmA, xmmB, anyptr_gpC); + a.vmaxpd(ymmA, ymmB, ymmC); + a.vmaxpd(ymmA, ymmB, anyptr_gpC); + a.vmaxps(xmmA, xmmB, xmmC); + a.vmaxps(xmmA, xmmB, anyptr_gpC); 
+ a.vmaxps(ymmA, ymmB, ymmC); + a.vmaxps(ymmA, ymmB, anyptr_gpC); + a.vmaxsd(xmmA, xmmB, xmmC); + a.vmaxsd(xmmA, xmmB, anyptr_gpC); + a.vmaxss(xmmA, xmmB, xmmC); + a.vmaxss(xmmA, xmmB, anyptr_gpC); + a.vminpd(xmmA, xmmB, xmmC); + a.vminpd(xmmA, xmmB, anyptr_gpC); + a.vminpd(ymmA, ymmB, ymmC); + a.vminpd(ymmA, ymmB, anyptr_gpC); + a.vminps(xmmA, xmmB, xmmC); + a.vminps(xmmA, xmmB, anyptr_gpC); + a.vminps(ymmA, ymmB, ymmC); + a.vminps(ymmA, ymmB, anyptr_gpC); + a.vminsd(xmmA, xmmB, xmmC); + a.vminsd(xmmA, xmmB, anyptr_gpC); + a.vminss(xmmA, xmmB, xmmC); + a.vminss(xmmA, xmmB, anyptr_gpC); + a.vmovapd(xmmA, xmmB); + a.vmovapd(xmmA, anyptr_gpB); + a.vmovapd(anyptr_gpA, xmmB); + a.vmovapd(ymmA, ymmB); + a.vmovapd(ymmA, anyptr_gpB); + a.vmovapd(anyptr_gpA, ymmB); + a.vmovaps(xmmA, xmmB); + a.vmovaps(xmmA, anyptr_gpB); + a.vmovaps(anyptr_gpA, xmmB); + a.vmovaps(ymmA, ymmB); + a.vmovaps(ymmA, anyptr_gpB); + a.vmovaps(anyptr_gpA, ymmB); + a.vmovd(xmmA, gzA); + a.vmovd(xmmA, anyptr_gpB); + a.vmovd(gzA, xmmB); + a.vmovd(anyptr_gpA, xmmB); + a.vmovddup(xmmA, xmmB); + a.vmovddup(xmmA, anyptr_gpB); + a.vmovddup(ymmA, ymmB); + a.vmovddup(ymmA, anyptr_gpB); + a.vmovdqa(xmmA, xmmB); + a.vmovdqa(xmmA, anyptr_gpB); + a.vmovdqa(anyptr_gpA, xmmB); + a.vmovdqa(ymmA, ymmB); + a.vmovdqa(ymmA, anyptr_gpB); + a.vmovdqa(anyptr_gpA, ymmB); + a.vmovdqu(xmmA, xmmB); + a.vmovdqu(xmmA, anyptr_gpB); + a.vmovdqu(anyptr_gpA, xmmB); + a.vmovdqu(ymmA, ymmB); + a.vmovdqu(ymmA, anyptr_gpB); + a.vmovdqu(anyptr_gpA, ymmB); + a.vmovhlps(xmmA, xmmB, xmmC); + a.vmovhpd(xmmA, xmmB, anyptr_gpC); + a.vmovhpd(anyptr_gpA, xmmB); + a.vmovhps(xmmA, xmmB, anyptr_gpC); + a.vmovhps(anyptr_gpA, xmmB); + a.vmovlhps(xmmA, xmmB, xmmC); + a.vmovlpd(xmmA, xmmB, anyptr_gpC); + a.vmovlpd(anyptr_gpA, xmmB); + a.vmovlps(xmmA, xmmB, anyptr_gpC); + a.vmovlps(anyptr_gpA, xmmB); + a.vmovmskpd(gzA, xmmB); + a.vmovmskpd(gzA, ymmB); + a.vmovmskps(gzA, xmmB); + a.vmovmskps(gzA, ymmB); + a.vmovntdq(anyptr_gpA, xmmB); + 
a.vmovntdq(anyptr_gpA, ymmB); + a.vmovntdqa(xmmA, anyptr_gpB); + a.vmovntpd(anyptr_gpA, xmmB); + a.vmovntpd(anyptr_gpA, ymmB); + a.vmovntps(anyptr_gpA, xmmB); + a.vmovntps(anyptr_gpA, ymmB); + a.vmovsd(xmmA, xmmB, xmmC); + a.vmovsd(xmmA, anyptr_gpB); + a.vmovsd(anyptr_gpA, xmmB); + a.vmovshdup(xmmA, xmmB); + a.vmovshdup(xmmA, anyptr_gpB); + a.vmovshdup(ymmA, ymmB); + a.vmovshdup(ymmA, anyptr_gpB); + a.vmovsldup(xmmA, xmmB); + a.vmovsldup(xmmA, anyptr_gpB); + a.vmovsldup(ymmA, ymmB); + a.vmovsldup(ymmA, anyptr_gpB); + a.vmovss(xmmA, xmmB, xmmC); + a.vmovss(xmmA, anyptr_gpB); + a.vmovss(anyptr_gpA, xmmB); + a.vmovupd(xmmA, xmmB); + a.vmovupd(xmmA, anyptr_gpB); + a.vmovupd(anyptr_gpA, xmmB); + a.vmovupd(ymmA, ymmB); + a.vmovupd(ymmA, anyptr_gpB); + a.vmovupd(anyptr_gpA, ymmB); + a.vmovups(xmmA, xmmB); + a.vmovups(xmmA, anyptr_gpB); + a.vmovups(anyptr_gpA, xmmB); + a.vmovups(ymmA, ymmB); + a.vmovups(ymmA, anyptr_gpB); + a.vmovups(anyptr_gpA, ymmB); + a.vmpsadbw(xmmA, xmmB, xmmC, 0); + a.vmpsadbw(xmmA, xmmB, anyptr_gpC, 0); + a.vmulpd(xmmA, xmmB, xmmC); + a.vmulpd(xmmA, xmmB, anyptr_gpC); + a.vmulpd(ymmA, ymmB, ymmC); + a.vmulpd(ymmA, ymmB, anyptr_gpC); + a.vmulps(xmmA, xmmB, xmmC); + a.vmulps(xmmA, xmmB, anyptr_gpC); + a.vmulps(ymmA, ymmB, ymmC); + a.vmulps(ymmA, ymmB, anyptr_gpC); + a.vmulsd(xmmA, xmmB, xmmC); + a.vmulsd(xmmA, xmmB, anyptr_gpC); + a.vmulss(xmmA, xmmB, xmmC); + a.vmulss(xmmA, xmmB, anyptr_gpC); + a.vorpd(xmmA, xmmB, xmmC); + a.vorpd(xmmA, xmmB, anyptr_gpC); + a.vorpd(ymmA, ymmB, ymmC); + a.vorpd(ymmA, ymmB, anyptr_gpC); + a.vorps(xmmA, xmmB, xmmC); + a.vorps(xmmA, xmmB, anyptr_gpC); + a.vorps(ymmA, ymmB, ymmC); + a.vorps(ymmA, ymmB, anyptr_gpC); + a.vpabsb(xmmA, xmmB); + a.vpabsb(xmmA, anyptr_gpB); + a.vpabsd(xmmA, xmmB); + a.vpabsd(xmmA, anyptr_gpB); + a.vpabsw(xmmA, xmmB); + a.vpabsw(xmmA, anyptr_gpB); + a.vpackssdw(xmmA, xmmB, xmmC); + a.vpackssdw(xmmA, xmmB, anyptr_gpC); + a.vpacksswb(xmmA, xmmB, xmmC); + a.vpacksswb(xmmA, xmmB, anyptr_gpC); + 
a.vpackusdw(xmmA, xmmB, xmmC); + a.vpackusdw(xmmA, xmmB, anyptr_gpC); + a.vpackuswb(xmmA, xmmB, xmmC); + a.vpackuswb(xmmA, xmmB, anyptr_gpC); + a.vpaddb(xmmA, xmmB, xmmC); + a.vpaddb(xmmA, xmmB, anyptr_gpC); + a.vpaddd(xmmA, xmmB, xmmC); + a.vpaddd(xmmA, xmmB, anyptr_gpC); + a.vpaddq(xmmA, xmmB, xmmC); + a.vpaddq(xmmA, xmmB, anyptr_gpC); + a.vpaddw(xmmA, xmmB, xmmC); + a.vpaddw(xmmA, xmmB, anyptr_gpC); + a.vpaddsb(xmmA, xmmB, xmmC); + a.vpaddsb(xmmA, xmmB, anyptr_gpC); + a.vpaddsw(xmmA, xmmB, xmmC); + a.vpaddsw(xmmA, xmmB, anyptr_gpC); + a.vpaddusb(xmmA, xmmB, xmmC); + a.vpaddusb(xmmA, xmmB, anyptr_gpC); + a.vpaddusw(xmmA, xmmB, xmmC); + a.vpaddusw(xmmA, xmmB, anyptr_gpC); + a.vpalignr(xmmA, xmmB, xmmC, 0); + a.vpalignr(xmmA, xmmB, anyptr_gpC, 0); + a.vpand(xmmA, xmmB, xmmC); + a.vpand(xmmA, xmmB, anyptr_gpC); + a.vpandn(xmmA, xmmB, xmmC); + a.vpandn(xmmA, xmmB, anyptr_gpC); + a.vpavgb(xmmA, xmmB, xmmC); + a.vpavgb(xmmA, xmmB, anyptr_gpC); + a.vpavgw(xmmA, xmmB, xmmC); + a.vpavgw(xmmA, xmmB, anyptr_gpC); + a.vpblendvb(xmmA, xmmB, xmmC, xmmD); + a.vpblendvb(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpblendw(xmmA, xmmB, xmmC, 0); + a.vpblendw(xmmA, xmmB, anyptr_gpC, 0); + a.vpcmpeqb(xmmA, xmmB, xmmC); + a.vpcmpeqb(xmmA, xmmB, anyptr_gpC); + a.vpcmpeqd(xmmA, xmmB, xmmC); + a.vpcmpeqd(xmmA, xmmB, anyptr_gpC); + a.vpcmpeqq(xmmA, xmmB, xmmC); + a.vpcmpeqq(xmmA, xmmB, anyptr_gpC); + a.vpcmpeqw(xmmA, xmmB, xmmC); + a.vpcmpeqw(xmmA, xmmB, anyptr_gpC); + a.vpcmpgtb(xmmA, xmmB, xmmC); + a.vpcmpgtb(xmmA, xmmB, anyptr_gpC); + a.vpcmpgtd(xmmA, xmmB, xmmC); + a.vpcmpgtd(xmmA, xmmB, anyptr_gpC); + a.vpcmpgtq(xmmA, xmmB, xmmC); + a.vpcmpgtq(xmmA, xmmB, anyptr_gpC); + a.vpcmpgtw(xmmA, xmmB, xmmC); + a.vpcmpgtw(xmmA, xmmB, anyptr_gpC); + a.vpcmpestri(xmmA, xmmB, 0); + a.vpcmpestri(xmmA, anyptr_gpB, 0); + a.vpcmpestrm(xmmA, xmmB, 0); + a.vpcmpestrm(xmmA, anyptr_gpB, 0); + a.vpcmpistri(xmmA, xmmB, 0); + a.vpcmpistri(xmmA, anyptr_gpB, 0); + a.vpcmpistrm(xmmA, xmmB, 0); + a.vpcmpistrm(xmmA, 
anyptr_gpB, 0); + a.vpermilpd(xmmA, xmmB, xmmC); + a.vpermilpd(xmmA, xmmB, anyptr_gpC); + a.vpermilpd(ymmA, ymmB, ymmC); + a.vpermilpd(ymmA, ymmB, anyptr_gpC); + a.vpermilpd(xmmA, xmmB, 0); + a.vpermilpd(xmmA, anyptr_gpB, 0); + a.vpermilpd(ymmA, ymmB, 0); + a.vpermilpd(ymmA, anyptr_gpB, 0); + a.vpermilps(xmmA, xmmB, xmmC); + a.vpermilps(xmmA, xmmB, anyptr_gpC); + a.vpermilps(ymmA, ymmB, ymmC); + a.vpermilps(ymmA, ymmB, anyptr_gpC); + a.vpermilps(xmmA, xmmB, 0); + a.vpermilps(xmmA, anyptr_gpB, 0); + a.vpermilps(ymmA, ymmB, 0); + a.vpermilps(ymmA, anyptr_gpB, 0); + a.vperm2f128(ymmA, ymmB, ymmC, 0); + a.vperm2f128(ymmA, ymmB, anyptr_gpC, 0); + a.vpextrb(gzA, xmmB, 0); + a.vpextrb(anyptr_gpA, xmmB, 0); + a.vpextrd(gzA, xmmB, 0); + a.vpextrd(anyptr_gpA, xmmB, 0); + a.vpextrw(gzA, xmmB, 0); + a.vpextrw(anyptr_gpA, xmmB, 0); + a.vphaddd(xmmA, xmmB, xmmC); + a.vphaddd(xmmA, xmmB, anyptr_gpC); + a.vphaddsw(xmmA, xmmB, xmmC); + a.vphaddsw(xmmA, xmmB, anyptr_gpC); + a.vphaddw(xmmA, xmmB, xmmC); + a.vphaddw(xmmA, xmmB, anyptr_gpC); + a.vphminposuw(xmmA, xmmB); + a.vphminposuw(xmmA, anyptr_gpB); + a.vphsubd(xmmA, xmmB, xmmC); + a.vphsubd(xmmA, xmmB, anyptr_gpC); + a.vphsubsw(xmmA, xmmB, xmmC); + a.vphsubsw(xmmA, xmmB, anyptr_gpC); + a.vphsubw(xmmA, xmmB, xmmC); + a.vphsubw(xmmA, xmmB, anyptr_gpC); + a.vpinsrb(xmmA, xmmB, gzA, 0); + a.vpinsrb(xmmA, xmmB, anyptr_gpC, 0); + a.vpinsrd(xmmA, xmmB, gzA, 0); + a.vpinsrd(xmmA, xmmB, anyptr_gpC, 0); + a.vpinsrw(xmmA, xmmB, gzA, 0); + a.vpinsrw(xmmA, xmmB, anyptr_gpC, 0); + a.vpmaddubsw(xmmA, xmmB, xmmC); + a.vpmaddubsw(xmmA, xmmB, anyptr_gpC); + a.vpmaddwd(xmmA, xmmB, xmmC); + a.vpmaddwd(xmmA, xmmB, anyptr_gpC); + a.vpmaxsb(xmmA, xmmB, xmmC); + a.vpmaxsb(xmmA, xmmB, anyptr_gpC); + a.vpmaxsd(xmmA, xmmB, xmmC); + a.vpmaxsd(xmmA, xmmB, anyptr_gpC); + a.vpmaxsw(xmmA, xmmB, xmmC); + a.vpmaxsw(xmmA, xmmB, anyptr_gpC); + a.vpmaxub(xmmA, xmmB, xmmC); + a.vpmaxub(xmmA, xmmB, anyptr_gpC); + a.vpmaxud(xmmA, xmmB, xmmC); + a.vpmaxud(xmmA, xmmB, 
anyptr_gpC); + a.vpmaxuw(xmmA, xmmB, xmmC); + a.vpmaxuw(xmmA, xmmB, anyptr_gpC); + a.vpminsb(xmmA, xmmB, xmmC); + a.vpminsb(xmmA, xmmB, anyptr_gpC); + a.vpminsd(xmmA, xmmB, xmmC); + a.vpminsd(xmmA, xmmB, anyptr_gpC); + a.vpminsw(xmmA, xmmB, xmmC); + a.vpminsw(xmmA, xmmB, anyptr_gpC); + a.vpminub(xmmA, xmmB, xmmC); + a.vpminub(xmmA, xmmB, anyptr_gpC); + a.vpminud(xmmA, xmmB, xmmC); + a.vpminud(xmmA, xmmB, anyptr_gpC); + a.vpminuw(xmmA, xmmB, xmmC); + a.vpminuw(xmmA, xmmB, anyptr_gpC); + a.vpmovmskb(gzA, xmmB); + a.vpmovsxbd(xmmA, xmmB); + a.vpmovsxbd(xmmA, anyptr_gpB); + a.vpmovsxbq(xmmA, xmmB); + a.vpmovsxbq(xmmA, anyptr_gpB); + a.vpmovsxbw(xmmA, xmmB); + a.vpmovsxbw(xmmA, anyptr_gpB); + a.vpmovsxdq(xmmA, xmmB); + a.vpmovsxdq(xmmA, anyptr_gpB); + a.vpmovsxwd(xmmA, xmmB); + a.vpmovsxwd(xmmA, anyptr_gpB); + a.vpmovsxwq(xmmA, xmmB); + a.vpmovsxwq(xmmA, anyptr_gpB); + a.vpmovzxbd(xmmA, xmmB); + a.vpmovzxbd(xmmA, anyptr_gpB); + a.vpmovzxbq(xmmA, xmmB); + a.vpmovzxbq(xmmA, anyptr_gpB); + a.vpmovzxbw(xmmA, xmmB); + a.vpmovzxbw(xmmA, anyptr_gpB); + a.vpmovzxdq(xmmA, xmmB); + a.vpmovzxdq(xmmA, anyptr_gpB); + a.vpmovzxwd(xmmA, xmmB); + a.vpmovzxwd(xmmA, anyptr_gpB); + a.vpmovzxwq(xmmA, xmmB); + a.vpmovzxwq(xmmA, anyptr_gpB); + a.vpmuldq(xmmA, xmmB, xmmC); + a.vpmuldq(xmmA, xmmB, anyptr_gpC); + a.vpmulhrsw(xmmA, xmmB, xmmC); + a.vpmulhrsw(xmmA, xmmB, anyptr_gpC); + a.vpmulhuw(xmmA, xmmB, xmmC); + a.vpmulhuw(xmmA, xmmB, anyptr_gpC); + a.vpmulhw(xmmA, xmmB, xmmC); + a.vpmulhw(xmmA, xmmB, anyptr_gpC); + a.vpmulld(xmmA, xmmB, xmmC); + a.vpmulld(xmmA, xmmB, anyptr_gpC); + a.vpmullw(xmmA, xmmB, xmmC); + a.vpmullw(xmmA, xmmB, anyptr_gpC); + a.vpmuludq(xmmA, xmmB, xmmC); + a.vpmuludq(xmmA, xmmB, anyptr_gpC); + a.vpor(xmmA, xmmB, xmmC); + a.vpor(xmmA, xmmB, anyptr_gpC); + a.vpsadbw(xmmA, xmmB, xmmC); + a.vpsadbw(xmmA, xmmB, anyptr_gpC); + a.vpshufb(xmmA, xmmB, xmmC); + a.vpshufb(xmmA, xmmB, anyptr_gpC); + a.vpshufd(xmmA, xmmB, 0); + a.vpshufd(xmmA, anyptr_gpB, 0); + a.vpshufhw(xmmA, 
xmmB, 0); + a.vpshufhw(xmmA, anyptr_gpB, 0); + a.vpshuflw(xmmA, xmmB, 0); + a.vpshuflw(xmmA, anyptr_gpB, 0); + a.vpsignb(xmmA, xmmB, xmmC); + a.vpsignb(xmmA, xmmB, anyptr_gpC); + a.vpsignd(xmmA, xmmB, xmmC); + a.vpsignd(xmmA, xmmB, anyptr_gpC); + a.vpsignw(xmmA, xmmB, xmmC); + a.vpsignw(xmmA, xmmB, anyptr_gpC); + a.vpslld(xmmA, xmmB, xmmC); + a.vpslld(xmmA, xmmB, anyptr_gpC); + a.vpslld(xmmA, xmmB, 0); + a.vpslldq(xmmA, xmmB, 0); + a.vpsllq(xmmA, xmmB, xmmC); + a.vpsllq(xmmA, xmmB, anyptr_gpC); + a.vpsllq(xmmA, xmmB, 0); + a.vpsllw(xmmA, xmmB, xmmC); + a.vpsllw(xmmA, xmmB, anyptr_gpC); + a.vpsllw(xmmA, xmmB, 0); + a.vpsrad(xmmA, xmmB, xmmC); + a.vpsrad(xmmA, xmmB, anyptr_gpC); + a.vpsrad(xmmA, xmmB, 0); + a.vpsraw(xmmA, xmmB, xmmC); + a.vpsraw(xmmA, xmmB, anyptr_gpC); + a.vpsraw(xmmA, xmmB, 0); + a.vpsrld(xmmA, xmmB, xmmC); + a.vpsrld(xmmA, xmmB, anyptr_gpC); + a.vpsrld(xmmA, xmmB, 0); + a.vpsrldq(xmmA, xmmB, 0); + a.vpsrlq(xmmA, xmmB, xmmC); + a.vpsrlq(xmmA, xmmB, anyptr_gpC); + a.vpsrlq(xmmA, xmmB, 0); + a.vpsrlw(xmmA, xmmB, xmmC); + a.vpsrlw(xmmA, xmmB, anyptr_gpC); + a.vpsrlw(xmmA, xmmB, 0); + a.vpsubb(xmmA, xmmB, xmmC); + a.vpsubb(xmmA, xmmB, anyptr_gpC); + a.vpsubd(xmmA, xmmB, xmmC); + a.vpsubd(xmmA, xmmB, anyptr_gpC); + a.vpsubq(xmmA, xmmB, xmmC); + a.vpsubq(xmmA, xmmB, anyptr_gpC); + a.vpsubw(xmmA, xmmB, xmmC); + a.vpsubw(xmmA, xmmB, anyptr_gpC); + a.vpsubsb(xmmA, xmmB, xmmC); + a.vpsubsb(xmmA, xmmB, anyptr_gpC); + a.vpsubsw(xmmA, xmmB, xmmC); + a.vpsubsw(xmmA, xmmB, anyptr_gpC); + a.vpsubusb(xmmA, xmmB, xmmC); + a.vpsubusb(xmmA, xmmB, anyptr_gpC); + a.vpsubusw(xmmA, xmmB, xmmC); + a.vpsubusw(xmmA, xmmB, anyptr_gpC); + a.vptest(xmmA, xmmB); + a.vptest(xmmA, anyptr_gpB); + a.vptest(ymmA, ymmB); + a.vptest(ymmA, anyptr_gpB); + a.vpunpckhbw(xmmA, xmmB, xmmC); + a.vpunpckhbw(xmmA, xmmB, anyptr_gpC); + a.vpunpckhdq(xmmA, xmmB, xmmC); + a.vpunpckhdq(xmmA, xmmB, anyptr_gpC); + a.vpunpckhqdq(xmmA, xmmB, xmmC); + a.vpunpckhqdq(xmmA, xmmB, anyptr_gpC); + 
a.vpunpckhwd(xmmA, xmmB, xmmC); + a.vpunpckhwd(xmmA, xmmB, anyptr_gpC); + a.vpunpcklbw(xmmA, xmmB, xmmC); + a.vpunpcklbw(xmmA, xmmB, anyptr_gpC); + a.vpunpckldq(xmmA, xmmB, xmmC); + a.vpunpckldq(xmmA, xmmB, anyptr_gpC); + a.vpunpcklqdq(xmmA, xmmB, xmmC); + a.vpunpcklqdq(xmmA, xmmB, anyptr_gpC); + a.vpunpcklwd(xmmA, xmmB, xmmC); + a.vpunpcklwd(xmmA, xmmB, anyptr_gpC); + a.vpxor(xmmA, xmmB, xmmC); + a.vpxor(xmmA, xmmB, anyptr_gpC); + a.vrcpps(xmmA, xmmB); + a.vrcpps(xmmA, anyptr_gpB); + a.vrcpps(ymmA, ymmB); + a.vrcpps(ymmA, anyptr_gpB); + a.vrcpss(xmmA, xmmB, xmmC); + a.vrcpss(xmmA, xmmB, anyptr_gpC); + a.vrsqrtps(xmmA, xmmB); + a.vrsqrtps(xmmA, anyptr_gpB); + a.vrsqrtps(ymmA, ymmB); + a.vrsqrtps(ymmA, anyptr_gpB); + a.vrsqrtss(xmmA, xmmB, xmmC); + a.vrsqrtss(xmmA, xmmB, anyptr_gpC); + a.vroundpd(xmmA, xmmB, 0); + a.vroundpd(xmmA, anyptr_gpB, 0); + a.vroundpd(ymmA, ymmB, 0); + a.vroundpd(ymmA, anyptr_gpB, 0); + a.vroundps(xmmA, xmmB, 0); + a.vroundps(xmmA, anyptr_gpB, 0); + a.vroundps(ymmA, ymmB, 0); + a.vroundps(ymmA, anyptr_gpB, 0); + a.vroundsd(xmmA, xmmB, xmmC, 0); + a.vroundsd(xmmA, xmmB, anyptr_gpC, 0); + a.vroundss(xmmA, xmmB, xmmC, 0); + a.vroundss(xmmA, xmmB, anyptr_gpC, 0); + a.vshufpd(xmmA, xmmB, xmmC, 0); + a.vshufpd(xmmA, xmmB, anyptr_gpC, 0); + a.vshufpd(ymmA, ymmB, ymmC, 0); + a.vshufpd(ymmA, ymmB, anyptr_gpC, 0); + a.vshufps(xmmA, xmmB, xmmC, 0); + a.vshufps(xmmA, xmmB, anyptr_gpC, 0); + a.vshufps(ymmA, ymmB, ymmC, 0); + a.vshufps(ymmA, ymmB, anyptr_gpC, 0); + a.vsqrtpd(xmmA, xmmB); + a.vsqrtpd(xmmA, anyptr_gpB); + a.vsqrtpd(ymmA, ymmB); + a.vsqrtpd(ymmA, anyptr_gpB); + a.vsqrtps(xmmA, xmmB); + a.vsqrtps(xmmA, anyptr_gpB); + a.vsqrtps(ymmA, ymmB); + a.vsqrtps(ymmA, anyptr_gpB); + a.vsqrtsd(xmmA, xmmB, xmmC); + a.vsqrtsd(xmmA, xmmB, anyptr_gpC); + a.vsqrtss(xmmA, xmmB, xmmC); + a.vsqrtss(xmmA, xmmB, anyptr_gpC); + a.vstmxcsr(anyptr_gpA); + a.vsubpd(xmmA, xmmB, xmmC); + a.vsubpd(xmmA, xmmB, anyptr_gpC); + a.vsubpd(ymmA, ymmB, ymmC); + a.vsubpd(ymmA, 
ymmB, anyptr_gpC); + a.vsubps(xmmA, xmmB, xmmC); + a.vsubps(xmmA, xmmB, anyptr_gpC); + a.vsubps(ymmA, ymmB, ymmC); + a.vsubps(ymmA, ymmB, anyptr_gpC); + a.vsubsd(xmmA, xmmB, xmmC); + a.vsubsd(xmmA, xmmB, anyptr_gpC); + a.vsubss(xmmA, xmmB, xmmC); + a.vsubss(xmmA, xmmB, anyptr_gpC); + a.vtestps(xmmA, xmmB); + a.vtestps(xmmA, anyptr_gpB); + a.vtestps(ymmA, ymmB); + a.vtestps(ymmA, anyptr_gpB); + a.vtestpd(xmmA, xmmB); + a.vtestpd(xmmA, anyptr_gpB); + a.vtestpd(ymmA, ymmB); + a.vtestpd(ymmA, anyptr_gpB); + a.vucomisd(xmmA, xmmB); + a.vucomisd(xmmA, anyptr_gpB); + a.vucomiss(xmmA, xmmB); + a.vucomiss(xmmA, anyptr_gpB); + a.vunpckhpd(xmmA, xmmB, xmmC); + a.vunpckhpd(xmmA, xmmB, anyptr_gpC); + a.vunpckhpd(ymmA, ymmB, ymmC); + a.vunpckhpd(ymmA, ymmB, anyptr_gpC); + a.vunpckhps(xmmA, xmmB, xmmC); + a.vunpckhps(xmmA, xmmB, anyptr_gpC); + a.vunpckhps(ymmA, ymmB, ymmC); + a.vunpckhps(ymmA, ymmB, anyptr_gpC); + a.vunpcklpd(xmmA, xmmB, xmmC); + a.vunpcklpd(xmmA, xmmB, anyptr_gpC); + a.vunpcklpd(ymmA, ymmB, ymmC); + a.vunpcklpd(ymmA, ymmB, anyptr_gpC); + a.vunpcklps(xmmA, xmmB, xmmC); + a.vunpcklps(xmmA, xmmB, anyptr_gpC); + a.vunpcklps(ymmA, ymmB, ymmC); + a.vunpcklps(ymmA, ymmB, anyptr_gpC); + a.vxorpd(xmmA, xmmB, xmmC); + a.vxorpd(xmmA, xmmB, anyptr_gpC); + a.vxorpd(ymmA, ymmB, ymmC); + a.vxorpd(ymmA, ymmB, anyptr_gpC); + a.vxorps(xmmA, xmmB, xmmC); + a.vxorps(xmmA, xmmB, anyptr_gpC); + a.vxorps(ymmA, ymmB, ymmC); + a.vxorps(ymmA, ymmB, anyptr_gpC); + a.vzeroall(); + a.vzeroupper(); + + // AVX+AESNI. + a.nop(); + + a.vaesdec(xmmA, xmmB, xmmC); + a.vaesdec(xmmA, xmmB, anyptr_gpC); + a.vaesdeclast(xmmA, xmmB, xmmC); + a.vaesdeclast(xmmA, xmmB, anyptr_gpC); + a.vaesenc(xmmA, xmmB, xmmC); + a.vaesenc(xmmA, xmmB, anyptr_gpC); + a.vaesenclast(xmmA, xmmB, xmmC); + a.vaesenclast(xmmA, xmmB, anyptr_gpC); + a.vaesimc(xmmA, xmmB); + a.vaesimc(xmmA, anyptr_gpB); + a.vaeskeygenassist(xmmA, xmmB, 0); + a.vaeskeygenassist(xmmA, anyptr_gpB, 0); + + // AVX+PCLMULQDQ. 
+ a.nop(); + + a.vpclmulqdq(xmmA, xmmB, xmmC, 0); + a.vpclmulqdq(xmmA, xmmB, anyptr_gpC, 0); + + // AVX2. + a.nop(); + + a.vbroadcasti128(ymmA, anyptr_gpB); + a.vbroadcastsd(ymmA, xmmB); + a.vbroadcastss(xmmA, xmmB); + a.vbroadcastss(ymmA, xmmB); + a.vextracti128(xmmA, ymmB, 0); + a.vextracti128(anyptr_gpA, ymmB, 0); + a.vgatherdpd(xmmA, vmxptr_gpB, xmmC); + a.vgatherdpd(ymmA, vmyptr_gpB, ymmC); + a.vgatherdps(xmmA, vmxptr_gpB, xmmC); + a.vgatherdps(ymmA, vmyptr_gpB, ymmC); + a.vgatherqpd(xmmA, vmxptr_gpB, xmmC); + a.vgatherqpd(ymmA, vmyptr_gpB, ymmC); + a.vgatherqps(xmmA, vmxptr_gpB, xmmC); + a.vgatherqps(xmmA, vmyptr_gpB, xmmC); + a.vinserti128(ymmA, ymmB, xmmC, 0); + a.vinserti128(ymmA, ymmB, anyptr_gpC, 0); + a.vmovntdqa(ymmA, anyptr_gpB); + a.vmpsadbw(ymmA, ymmB, ymmC, 0); + a.vmpsadbw(ymmA, ymmB, anyptr_gpC, 0); + a.vpabsb(ymmA, ymmB); + a.vpabsb(ymmA, anyptr_gpB); + a.vpabsd(ymmA, ymmB); + a.vpabsd(ymmA, anyptr_gpB); + a.vpabsw(ymmA, ymmB); + a.vpabsw(ymmA, anyptr_gpB); + a.vpackssdw(ymmA, ymmB, ymmC); + a.vpackssdw(ymmA, ymmB, anyptr_gpC); + a.vpacksswb(ymmA, ymmB, ymmC); + a.vpacksswb(ymmA, ymmB, anyptr_gpC); + a.vpackusdw(ymmA, ymmB, ymmC); + a.vpackusdw(ymmA, ymmB, anyptr_gpC); + a.vpackuswb(ymmA, ymmB, ymmC); + a.vpackuswb(ymmA, ymmB, anyptr_gpC); + a.vpaddb(ymmA, ymmB, ymmC); + a.vpaddb(ymmA, ymmB, anyptr_gpC); + a.vpaddd(ymmA, ymmB, ymmC); + a.vpaddd(ymmA, ymmB, anyptr_gpC); + a.vpaddq(ymmA, ymmB, ymmC); + a.vpaddq(ymmA, ymmB, anyptr_gpC); + a.vpaddw(ymmA, ymmB, ymmC); + a.vpaddw(ymmA, ymmB, anyptr_gpC); + a.vpaddsb(ymmA, ymmB, ymmC); + a.vpaddsb(ymmA, ymmB, anyptr_gpC); + a.vpaddsw(ymmA, ymmB, ymmC); + a.vpaddsw(ymmA, ymmB, anyptr_gpC); + a.vpaddusb(ymmA, ymmB, ymmC); + a.vpaddusb(ymmA, ymmB, anyptr_gpC); + a.vpaddusw(ymmA, ymmB, ymmC); + a.vpaddusw(ymmA, ymmB, anyptr_gpC); + a.vpalignr(ymmA, ymmB, ymmC, 0); + a.vpalignr(ymmA, ymmB, anyptr_gpC, 0); + a.vpand(ymmA, ymmB, ymmC); + a.vpand(ymmA, ymmB, anyptr_gpC); + a.vpandn(ymmA, ymmB, ymmC); + 
a.vpandn(ymmA, ymmB, anyptr_gpC); + a.vpavgb(ymmA, ymmB, ymmC); + a.vpavgb(ymmA, ymmB, anyptr_gpC); + a.vpavgw(ymmA, ymmB, ymmC); + a.vpavgw(ymmA, ymmB, anyptr_gpC); + a.vpblendd(xmmA, xmmB, xmmC, 0); + a.vpblendd(xmmA, xmmB, anyptr_gpC, 0); + a.vpblendd(ymmA, ymmB, ymmC, 0); + a.vpblendd(ymmA, ymmB, anyptr_gpC, 0); + a.vpblendvb(ymmA, ymmB, ymmC, ymmD); + a.vpblendvb(ymmA, ymmB, anyptr_gpC, ymmD); + a.vpblendw(ymmA, ymmB, ymmC, 0); + a.vpblendw(ymmA, ymmB, anyptr_gpC, 0); + a.vpbroadcastb(xmmA, xmmB); + a.vpbroadcastb(xmmA, anyptr_gpB); + a.vpbroadcastb(ymmA, xmmB); + a.vpbroadcastb(ymmA, anyptr_gpB); + a.vpbroadcastd(xmmA, xmmB); + a.vpbroadcastd(xmmA, anyptr_gpB); + a.vpbroadcastd(ymmA, xmmB); + a.vpbroadcastd(ymmA, anyptr_gpB); + a.vpbroadcastq(xmmA, xmmB); + a.vpbroadcastq(xmmA, anyptr_gpB); + a.vpbroadcastq(ymmA, xmmB); + a.vpbroadcastq(ymmA, anyptr_gpB); + a.vpbroadcastw(xmmA, xmmB); + a.vpbroadcastw(xmmA, anyptr_gpB); + a.vpbroadcastw(ymmA, xmmB); + a.vpbroadcastw(ymmA, anyptr_gpB); + a.vpcmpeqb(ymmA, ymmB, ymmC); + a.vpcmpeqb(ymmA, ymmB, anyptr_gpC); + a.vpcmpeqd(ymmA, ymmB, ymmC); + a.vpcmpeqd(ymmA, ymmB, anyptr_gpC); + a.vpcmpeqq(ymmA, ymmB, ymmC); + a.vpcmpeqq(ymmA, ymmB, anyptr_gpC); + a.vpcmpeqw(ymmA, ymmB, ymmC); + a.vpcmpeqw(ymmA, ymmB, anyptr_gpC); + a.vpcmpgtb(ymmA, ymmB, ymmC); + a.vpcmpgtb(ymmA, ymmB, anyptr_gpC); + a.vpcmpgtd(ymmA, ymmB, ymmC); + a.vpcmpgtd(ymmA, ymmB, anyptr_gpC); + a.vpcmpgtq(ymmA, ymmB, ymmC); + a.vpcmpgtq(ymmA, ymmB, anyptr_gpC); + a.vpcmpgtw(ymmA, ymmB, ymmC); + a.vpcmpgtw(ymmA, ymmB, anyptr_gpC); + a.vperm2i128(ymmA, ymmB, ymmC, 0); + a.vperm2i128(ymmA, ymmB, anyptr_gpC, 0); + a.vpermd(ymmA, ymmB, ymmC); + a.vpermd(ymmA, ymmB, anyptr_gpC); + a.vpermps(ymmA, ymmB, ymmC); + a.vpermps(ymmA, ymmB, anyptr_gpC); + a.vpermpd(ymmA, ymmB, 0); + a.vpermpd(ymmA, anyptr_gpB, 0); + a.vpermq(ymmA, ymmB, 0); + a.vpermq(ymmA, anyptr_gpB, 0); + a.vpgatherdd(xmmA, vmxptr_gpB, xmmC); + a.vpgatherdd(ymmA, vmyptr_gpB, ymmC); + 
a.vpgatherdq(xmmA, vmxptr_gpB, xmmC); + a.vpgatherdq(ymmA, vmyptr_gpB, ymmC); + a.vpgatherqd(xmmA, vmxptr_gpB, xmmC); + a.vpgatherqd(xmmA, vmyptr_gpB, xmmC); + a.vpgatherqq(xmmA, vmxptr_gpB, xmmC); + a.vpgatherqq(ymmA, vmyptr_gpB, ymmC); + a.vpmovmskb(gzA, ymmB); + a.vpmovsxbd(ymmA, anyptr_gpB); + a.vpmovsxbd(ymmA, xmmB); + a.vpmovsxbq(ymmA, anyptr_gpB); + a.vpmovsxbq(ymmA, xmmB); + a.vpmovsxbw(ymmA, anyptr_gpB); + a.vpmovsxbw(ymmA, xmmB); + a.vpmovsxdq(ymmA, anyptr_gpB); + a.vpmovsxdq(ymmA, xmmB); + a.vpmovsxwd(ymmA, anyptr_gpB); + a.vpmovsxwd(ymmA, xmmB); + a.vpmovsxwq(ymmA, anyptr_gpB); + a.vpmovsxwq(ymmA, xmmB); + a.vpmovzxbd(ymmA, anyptr_gpB); + a.vpmovzxbd(ymmA, xmmB); + a.vpmovzxbq(ymmA, anyptr_gpB); + a.vpmovzxbq(ymmA, xmmB); + a.vpmovzxbw(ymmA, anyptr_gpB); + a.vpmovzxbw(ymmA, xmmB); + a.vpmovzxdq(ymmA, anyptr_gpB); + a.vpmovzxdq(ymmA, xmmB); + a.vpmovzxwd(ymmA, anyptr_gpB); + a.vpmovzxwd(ymmA, xmmB); + a.vpmovzxwq(ymmA, anyptr_gpB); + a.vpmovzxwq(ymmA, xmmB); + a.vpshufd(ymmA, anyptr_gpB, 0); + a.vpshufd(ymmA, ymmB, 0); + a.vpshufhw(ymmA, anyptr_gpB, 0); + a.vpshufhw(ymmA, ymmB, 0); + a.vpshuflw(ymmA, anyptr_gpB, 0); + a.vpshuflw(ymmA, ymmB, 0); + a.vpslld(ymmA, ymmB, 0); + a.vpslldq(ymmA, ymmB, 0); + a.vpsllq(ymmA, ymmB, 0); + a.vpsllw(ymmA, ymmB, 0); + a.vpsrad(ymmA, ymmB, 0); + a.vpsraw(ymmA, ymmB, 0); + a.vpsrld(ymmA, ymmB, 0); + a.vpsrldq(ymmA, ymmB, 0); + a.vpsrlq(ymmA, ymmB, 0); + a.vpsrlw(ymmA, ymmB, 0); + a.vphaddd(ymmA, ymmB, anyptr_gpC); + a.vphaddd(ymmA, ymmB, ymmC); + a.vphaddsw(ymmA, ymmB, anyptr_gpC); + a.vphaddsw(ymmA, ymmB, ymmC); + a.vphaddw(ymmA, ymmB, anyptr_gpC); + a.vphaddw(ymmA, ymmB, ymmC); + a.vphsubd(ymmA, ymmB, anyptr_gpC); + a.vphsubd(ymmA, ymmB, ymmC); + a.vphsubsw(ymmA, ymmB, anyptr_gpC); + a.vphsubsw(ymmA, ymmB, ymmC); + a.vphsubw(ymmA, ymmB, anyptr_gpC); + a.vphsubw(ymmA, ymmB, ymmC); + a.vpmaddubsw(ymmA, ymmB, anyptr_gpC); + a.vpmaddubsw(ymmA, ymmB, ymmC); + a.vpmaddwd(ymmA, ymmB, anyptr_gpC); + a.vpmaddwd(ymmA, ymmB, 
ymmC); + a.vpmaskmovd(anyptr_gpA, xmmB, xmmC); + a.vpmaskmovd(anyptr_gpA, ymmB, ymmC); + a.vpmaskmovd(xmmA, xmmB, anyptr_gpC); + a.vpmaskmovd(ymmA, ymmB, anyptr_gpC); + a.vpmaskmovq(anyptr_gpA, xmmB, xmmC); + a.vpmaskmovq(anyptr_gpA, ymmB, ymmC); + a.vpmaskmovq(xmmA, xmmB, anyptr_gpC); + a.vpmaskmovq(ymmA, ymmB, anyptr_gpC); + a.vpmaxsb(ymmA, ymmB, anyptr_gpC); + a.vpmaxsb(ymmA, ymmB, ymmC); + a.vpmaxsd(ymmA, ymmB, anyptr_gpC); + a.vpmaxsd(ymmA, ymmB, ymmC); + a.vpmaxsw(ymmA, ymmB, anyptr_gpC); + a.vpmaxsw(ymmA, ymmB, ymmC); + a.vpmaxub(ymmA, ymmB, anyptr_gpC); + a.vpmaxub(ymmA, ymmB, ymmC); + a.vpmaxud(ymmA, ymmB, anyptr_gpC); + a.vpmaxud(ymmA, ymmB, ymmC); + a.vpmaxuw(ymmA, ymmB, anyptr_gpC); + a.vpmaxuw(ymmA, ymmB, ymmC); + a.vpminsb(ymmA, ymmB, anyptr_gpC); + a.vpminsb(ymmA, ymmB, ymmC); + a.vpminsd(ymmA, ymmB, anyptr_gpC); + a.vpminsd(ymmA, ymmB, ymmC); + a.vpminsw(ymmA, ymmB, anyptr_gpC); + a.vpminsw(ymmA, ymmB, ymmC); + a.vpminub(ymmA, ymmB, anyptr_gpC); + a.vpminub(ymmA, ymmB, ymmC); + a.vpminud(ymmA, ymmB, anyptr_gpC); + a.vpminud(ymmA, ymmB, ymmC); + a.vpminuw(ymmA, ymmB, anyptr_gpC); + a.vpminuw(ymmA, ymmB, ymmC); + a.vpmuldq(ymmA, ymmB, anyptr_gpC); + a.vpmuldq(ymmA, ymmB, ymmC); + a.vpmulhrsw(ymmA, ymmB, anyptr_gpC); + a.vpmulhrsw(ymmA, ymmB, ymmC); + a.vpmulhuw(ymmA, ymmB, anyptr_gpC); + a.vpmulhuw(ymmA, ymmB, ymmC); + a.vpmulhw(ymmA, ymmB, anyptr_gpC); + a.vpmulhw(ymmA, ymmB, ymmC); + a.vpmulld(ymmA, ymmB, anyptr_gpC); + a.vpmulld(ymmA, ymmB, ymmC); + a.vpmullw(ymmA, ymmB, anyptr_gpC); + a.vpmullw(ymmA, ymmB, ymmC); + a.vpmuludq(ymmA, ymmB, anyptr_gpC); + a.vpmuludq(ymmA, ymmB, ymmC); + a.vpor(ymmA, ymmB, anyptr_gpC); + a.vpor(ymmA, ymmB, ymmC); + a.vpsadbw(ymmA, ymmB, anyptr_gpC); + a.vpsadbw(ymmA, ymmB, ymmC); + a.vpshufb(ymmA, ymmB, anyptr_gpC); + a.vpshufb(ymmA, ymmB, ymmC); + a.vpsignb(ymmA, ymmB, anyptr_gpC); + a.vpsignb(ymmA, ymmB, ymmC); + a.vpsignd(ymmA, ymmB, anyptr_gpC); + a.vpsignd(ymmA, ymmB, ymmC); + a.vpsignw(ymmA, ymmB, anyptr_gpC); + 
a.vpsignw(ymmA, ymmB, ymmC); + a.vpslld(ymmA, ymmB, anyptr_gpC); + a.vpslld(ymmA, ymmB, xmmC); + a.vpsllq(ymmA, ymmB, anyptr_gpC); + a.vpsllq(ymmA, ymmB, xmmC); + a.vpsllvd(xmmA, xmmB, anyptr_gpC); + a.vpsllvd(xmmA, xmmB, xmmC); + a.vpsllvd(ymmA, ymmB, anyptr_gpC); + a.vpsllvd(ymmA, ymmB, ymmC); + a.vpsllvq(xmmA, xmmB, anyptr_gpC); + a.vpsllvq(xmmA, xmmB, xmmC); + a.vpsllvq(ymmA, ymmB, anyptr_gpC); + a.vpsllvq(ymmA, ymmB, ymmC); + a.vpsllw(ymmA, ymmB, anyptr_gpC); + a.vpsllw(ymmA, ymmB, xmmC); + a.vpsrad(ymmA, ymmB, anyptr_gpC); + a.vpsrad(ymmA, ymmB, xmmC); + a.vpsravd(xmmA, xmmB, anyptr_gpC); + a.vpsravd(xmmA, xmmB, xmmC); + a.vpsravd(ymmA, ymmB, anyptr_gpC); + a.vpsravd(ymmA, ymmB, ymmC); + a.vpsraw(ymmA, ymmB, anyptr_gpC); + a.vpsraw(ymmA, ymmB, xmmC); + a.vpsrld(ymmA, ymmB, anyptr_gpC); + a.vpsrld(ymmA, ymmB, xmmC); + a.vpsrlq(ymmA, ymmB, anyptr_gpC); + a.vpsrlq(ymmA, ymmB, xmmC); + a.vpsrlvd(xmmA, xmmB, anyptr_gpC); + a.vpsrlvd(xmmA, xmmB, xmmC); + a.vpsrlvd(ymmA, ymmB, anyptr_gpC); + a.vpsrlvd(ymmA, ymmB, ymmC); + a.vpsrlvq(xmmA, xmmB, anyptr_gpC); + a.vpsrlvq(xmmA, xmmB, xmmC); + a.vpsrlvq(ymmA, ymmB, anyptr_gpC); + a.vpsrlvq(ymmA, ymmB, ymmC); + a.vpsrlw(ymmA, ymmB, anyptr_gpC); + a.vpsrlw(ymmA, ymmB, xmmC); + a.vpsubb(ymmA, ymmB, anyptr_gpC); + a.vpsubb(ymmA, ymmB, ymmC); + a.vpsubd(ymmA, ymmB, anyptr_gpC); + a.vpsubd(ymmA, ymmB, ymmC); + a.vpsubq(ymmA, ymmB, anyptr_gpC); + a.vpsubq(ymmA, ymmB, ymmC); + a.vpsubsb(ymmA, ymmB, anyptr_gpC); + a.vpsubsb(ymmA, ymmB, ymmC); + a.vpsubsw(ymmA, ymmB, anyptr_gpC); + a.vpsubsw(ymmA, ymmB, ymmC); + a.vpsubusb(ymmA, ymmB, anyptr_gpC); + a.vpsubusb(ymmA, ymmB, ymmC); + a.vpsubusw(ymmA, ymmB, anyptr_gpC); + a.vpsubusw(ymmA, ymmB, ymmC); + a.vpsubw(ymmA, ymmB, anyptr_gpC); + a.vpsubw(ymmA, ymmB, ymmC); + a.vpunpckhbw(ymmA, ymmB, anyptr_gpC); + a.vpunpckhbw(ymmA, ymmB, ymmC); + a.vpunpckhdq(ymmA, ymmB, anyptr_gpC); + a.vpunpckhdq(ymmA, ymmB, ymmC); + a.vpunpckhqdq(ymmA, ymmB, anyptr_gpC); + a.vpunpckhqdq(ymmA, ymmB, 
ymmC); + a.vpunpckhwd(ymmA, ymmB, anyptr_gpC); + a.vpunpckhwd(ymmA, ymmB, ymmC); + a.vpunpcklbw(ymmA, ymmB, anyptr_gpC); + a.vpunpcklbw(ymmA, ymmB, ymmC); + a.vpunpckldq(ymmA, ymmB, anyptr_gpC); + a.vpunpckldq(ymmA, ymmB, ymmC); + a.vpunpcklqdq(ymmA, ymmB, anyptr_gpC); + a.vpunpcklqdq(ymmA, ymmB, ymmC); + a.vpunpcklwd(ymmA, ymmB, anyptr_gpC); + a.vpunpcklwd(ymmA, ymmB, ymmC); + a.vpxor(ymmA, ymmB, anyptr_gpC); + a.vpxor(ymmA, ymmB, ymmC); + + // FMA3. + a.nop(); + + a.vfmadd132pd(xmmA, xmmB, anyptr_gpC); + a.vfmadd132pd(xmmA, xmmB, xmmC); + a.vfmadd132pd(ymmA, ymmB, anyptr_gpC); + a.vfmadd132pd(ymmA, ymmB, ymmC); + a.vfmadd132ps(xmmA, xmmB, anyptr_gpC); + a.vfmadd132ps(xmmA, xmmB, xmmC); + a.vfmadd132ps(ymmA, ymmB, anyptr_gpC); + a.vfmadd132ps(ymmA, ymmB, ymmC); + a.vfmadd132sd(xmmA, xmmB, anyptr_gpC); + a.vfmadd132sd(xmmA, xmmB, xmmC); + a.vfmadd132ss(xmmA, xmmB, anyptr_gpC); + a.vfmadd132ss(xmmA, xmmB, xmmC); + a.vfmadd213pd(xmmA, xmmB, anyptr_gpC); + a.vfmadd213pd(xmmA, xmmB, xmmC); + a.vfmadd213pd(ymmA, ymmB, anyptr_gpC); + a.vfmadd213pd(ymmA, ymmB, ymmC); + a.vfmadd213ps(xmmA, xmmB, anyptr_gpC); + a.vfmadd213ps(xmmA, xmmB, xmmC); + a.vfmadd213ps(ymmA, ymmB, anyptr_gpC); + a.vfmadd213ps(ymmA, ymmB, ymmC); + a.vfmadd213sd(xmmA, xmmB, anyptr_gpC); + a.vfmadd213sd(xmmA, xmmB, xmmC); + a.vfmadd213ss(xmmA, xmmB, anyptr_gpC); + a.vfmadd213ss(xmmA, xmmB, xmmC); + a.vfmadd231pd(xmmA, xmmB, anyptr_gpC); + a.vfmadd231pd(xmmA, xmmB, xmmC); + a.vfmadd231pd(ymmA, ymmB, anyptr_gpC); + a.vfmadd231pd(ymmA, ymmB, ymmC); + a.vfmadd231ps(xmmA, xmmB, anyptr_gpC); + a.vfmadd231ps(xmmA, xmmB, xmmC); + a.vfmadd231ps(ymmA, ymmB, anyptr_gpC); + a.vfmadd231ps(ymmA, ymmB, ymmC); + a.vfmadd231sd(xmmA, xmmB, anyptr_gpC); + a.vfmadd231sd(xmmA, xmmB, xmmC); + a.vfmadd231ss(xmmA, xmmB, anyptr_gpC); + a.vfmadd231ss(xmmA, xmmB, xmmC); + a.vfmaddsub132pd(xmmA, xmmB, anyptr_gpC); + a.vfmaddsub132pd(xmmA, xmmB, xmmC); + a.vfmaddsub132pd(ymmA, ymmB, anyptr_gpC); + a.vfmaddsub132pd(ymmA, ymmB, 
ymmC); + a.vfmaddsub132ps(xmmA, xmmB, anyptr_gpC); + a.vfmaddsub132ps(xmmA, xmmB, xmmC); + a.vfmaddsub132ps(ymmA, ymmB, anyptr_gpC); + a.vfmaddsub132ps(ymmA, ymmB, ymmC); + a.vfmaddsub213pd(xmmA, xmmB, anyptr_gpC); + a.vfmaddsub213pd(xmmA, xmmB, xmmC); + a.vfmaddsub213pd(ymmA, ymmB, anyptr_gpC); + a.vfmaddsub213pd(ymmA, ymmB, ymmC); + a.vfmaddsub213ps(xmmA, xmmB, anyptr_gpC); + a.vfmaddsub213ps(xmmA, xmmB, xmmC); + a.vfmaddsub213ps(ymmA, ymmB, anyptr_gpC); + a.vfmaddsub213ps(ymmA, ymmB, ymmC); + a.vfmaddsub231pd(xmmA, xmmB, anyptr_gpC); + a.vfmaddsub231pd(xmmA, xmmB, xmmC); + a.vfmaddsub231pd(ymmA, ymmB, anyptr_gpC); + a.vfmaddsub231pd(ymmA, ymmB, ymmC); + a.vfmaddsub231ps(xmmA, xmmB, anyptr_gpC); + a.vfmaddsub231ps(xmmA, xmmB, xmmC); + a.vfmaddsub231ps(ymmA, ymmB, anyptr_gpC); + a.vfmaddsub231ps(ymmA, ymmB, ymmC); + a.vfmsub132pd(xmmA, xmmB, anyptr_gpC); + a.vfmsub132pd(xmmA, xmmB, xmmC); + a.vfmsub132pd(ymmA, ymmB, anyptr_gpC); + a.vfmsub132pd(ymmA, ymmB, ymmC); + a.vfmsub132ps(xmmA, xmmB, anyptr_gpC); + a.vfmsub132ps(xmmA, xmmB, xmmC); + a.vfmsub132ps(ymmA, ymmB, anyptr_gpC); + a.vfmsub132ps(ymmA, ymmB, ymmC); + a.vfmsub132sd(xmmA, xmmB, anyptr_gpC); + a.vfmsub132sd(xmmA, xmmB, xmmC); + a.vfmsub132ss(xmmA, xmmB, anyptr_gpC); + a.vfmsub132ss(xmmA, xmmB, xmmC); + a.vfmsub213pd(xmmA, xmmB, anyptr_gpC); + a.vfmsub213pd(xmmA, xmmB, xmmC); + a.vfmsub213pd(ymmA, ymmB, anyptr_gpC); + a.vfmsub213pd(ymmA, ymmB, ymmC); + a.vfmsub213ps(xmmA, xmmB, anyptr_gpC); + a.vfmsub213ps(xmmA, xmmB, xmmC); + a.vfmsub213ps(ymmA, ymmB, anyptr_gpC); + a.vfmsub213ps(ymmA, ymmB, ymmC); + a.vfmsub213sd(xmmA, xmmB, anyptr_gpC); + a.vfmsub213sd(xmmA, xmmB, xmmC); + a.vfmsub213ss(xmmA, xmmB, anyptr_gpC); + a.vfmsub213ss(xmmA, xmmB, xmmC); + a.vfmsub231pd(xmmA, xmmB, anyptr_gpC); + a.vfmsub231pd(xmmA, xmmB, xmmC); + a.vfmsub231pd(ymmA, ymmB, anyptr_gpC); + a.vfmsub231pd(ymmA, ymmB, ymmC); + a.vfmsub231ps(xmmA, xmmB, anyptr_gpC); + a.vfmsub231ps(xmmA, xmmB, xmmC); + a.vfmsub231ps(ymmA, ymmB, 
anyptr_gpC); + a.vfmsub231ps(ymmA, ymmB, ymmC); + a.vfmsub231sd(xmmA, xmmB, anyptr_gpC); + a.vfmsub231sd(xmmA, xmmB, xmmC); + a.vfmsub231ss(xmmA, xmmB, anyptr_gpC); + a.vfmsub231ss(xmmA, xmmB, xmmC); + a.vfmsubadd132pd(xmmA, xmmB, anyptr_gpC); + a.vfmsubadd132pd(xmmA, xmmB, xmmC); + a.vfmsubadd132pd(ymmA, ymmB, anyptr_gpC); + a.vfmsubadd132pd(ymmA, ymmB, ymmC); + a.vfmsubadd132ps(xmmA, xmmB, anyptr_gpC); + a.vfmsubadd132ps(xmmA, xmmB, xmmC); + a.vfmsubadd132ps(ymmA, ymmB, anyptr_gpC); + a.vfmsubadd132ps(ymmA, ymmB, ymmC); + a.vfmsubadd213pd(xmmA, xmmB, anyptr_gpC); + a.vfmsubadd213pd(xmmA, xmmB, xmmC); + a.vfmsubadd213pd(ymmA, ymmB, anyptr_gpC); + a.vfmsubadd213pd(ymmA, ymmB, ymmC); + a.vfmsubadd213ps(xmmA, xmmB, anyptr_gpC); + a.vfmsubadd213ps(xmmA, xmmB, xmmC); + a.vfmsubadd213ps(ymmA, ymmB, anyptr_gpC); + a.vfmsubadd213ps(ymmA, ymmB, ymmC); + a.vfmsubadd231pd(xmmA, xmmB, anyptr_gpC); + a.vfmsubadd231pd(xmmA, xmmB, xmmC); + a.vfmsubadd231pd(ymmA, ymmB, anyptr_gpC); + a.vfmsubadd231pd(ymmA, ymmB, ymmC); + a.vfmsubadd231ps(xmmA, xmmB, anyptr_gpC); + a.vfmsubadd231ps(xmmA, xmmB, xmmC); + a.vfmsubadd231ps(ymmA, ymmB, anyptr_gpC); + a.vfmsubadd231ps(ymmA, ymmB, ymmC); + a.vfnmadd132pd(xmmA, xmmB, anyptr_gpC); + a.vfnmadd132pd(xmmA, xmmB, xmmC); + a.vfnmadd132pd(ymmA, ymmB, anyptr_gpC); + a.vfnmadd132pd(ymmA, ymmB, ymmC); + a.vfnmadd132ps(xmmA, xmmB, anyptr_gpC); + a.vfnmadd132ps(xmmA, xmmB, xmmC); + a.vfnmadd132ps(ymmA, ymmB, anyptr_gpC); + a.vfnmadd132ps(ymmA, ymmB, ymmC); + a.vfnmadd132sd(xmmA, xmmB, anyptr_gpC); + a.vfnmadd132sd(xmmA, xmmB, xmmC); + a.vfnmadd132ss(xmmA, xmmB, anyptr_gpC); + a.vfnmadd132ss(xmmA, xmmB, xmmC); + a.vfnmadd213pd(xmmA, xmmB, anyptr_gpC); + a.vfnmadd213pd(xmmA, xmmB, xmmC); + a.vfnmadd213pd(ymmA, ymmB, anyptr_gpC); + a.vfnmadd213pd(ymmA, ymmB, ymmC); + a.vfnmadd213ps(xmmA, xmmB, anyptr_gpC); + a.vfnmadd213ps(xmmA, xmmB, xmmC); + a.vfnmadd213ps(ymmA, ymmB, anyptr_gpC); + a.vfnmadd213ps(ymmA, ymmB, ymmC); + a.vfnmadd213sd(xmmA, xmmB, 
anyptr_gpC); + a.vfnmadd213sd(xmmA, xmmB, xmmC); + a.vfnmadd213ss(xmmA, xmmB, anyptr_gpC); + a.vfnmadd213ss(xmmA, xmmB, xmmC); + a.vfnmadd231pd(xmmA, xmmB, anyptr_gpC); + a.vfnmadd231pd(xmmA, xmmB, xmmC); + a.vfnmadd231pd(ymmA, ymmB, anyptr_gpC); + a.vfnmadd231pd(ymmA, ymmB, ymmC); + a.vfnmadd231ps(xmmA, xmmB, anyptr_gpC); + a.vfnmadd231ps(xmmA, xmmB, xmmC); + a.vfnmadd231ps(ymmA, ymmB, anyptr_gpC); + a.vfnmadd231ps(ymmA, ymmB, ymmC); + a.vfnmadd231sd(xmmA, xmmB, anyptr_gpC); + a.vfnmadd231sd(xmmA, xmmB, xmmC); + a.vfnmadd231ss(xmmA, xmmB, anyptr_gpC); + a.vfnmadd231ss(xmmA, xmmB, xmmC); + a.vfnmsub132pd(xmmA, xmmB, anyptr_gpC); + a.vfnmsub132pd(xmmA, xmmB, xmmC); + a.vfnmsub132pd(ymmA, ymmB, anyptr_gpC); + a.vfnmsub132pd(ymmA, ymmB, ymmC); + a.vfnmsub132ps(xmmA, xmmB, anyptr_gpC); + a.vfnmsub132ps(xmmA, xmmB, xmmC); + a.vfnmsub132ps(ymmA, ymmB, anyptr_gpC); + a.vfnmsub132ps(ymmA, ymmB, ymmC); + a.vfnmsub132sd(xmmA, xmmB, anyptr_gpC); + a.vfnmsub132sd(xmmA, xmmB, xmmC); + a.vfnmsub132ss(xmmA, xmmB, anyptr_gpC); + a.vfnmsub132ss(xmmA, xmmB, xmmC); + a.vfnmsub213pd(xmmA, xmmB, anyptr_gpC); + a.vfnmsub213pd(xmmA, xmmB, xmmC); + a.vfnmsub213pd(ymmA, ymmB, anyptr_gpC); + a.vfnmsub213pd(ymmA, ymmB, ymmC); + a.vfnmsub213ps(xmmA, xmmB, anyptr_gpC); + a.vfnmsub213ps(xmmA, xmmB, xmmC); + a.vfnmsub213ps(ymmA, ymmB, anyptr_gpC); + a.vfnmsub213ps(ymmA, ymmB, ymmC); + a.vfnmsub213sd(xmmA, xmmB, anyptr_gpC); + a.vfnmsub213sd(xmmA, xmmB, xmmC); + a.vfnmsub213ss(xmmA, xmmB, anyptr_gpC); + a.vfnmsub213ss(xmmA, xmmB, xmmC); + a.vfnmsub231pd(xmmA, xmmB, anyptr_gpC); + a.vfnmsub231pd(xmmA, xmmB, xmmC); + a.vfnmsub231pd(ymmA, ymmB, anyptr_gpC); + a.vfnmsub231pd(ymmA, ymmB, ymmC); + a.vfnmsub231ps(xmmA, xmmB, anyptr_gpC); + a.vfnmsub231ps(xmmA, xmmB, xmmC); + a.vfnmsub231ps(ymmA, ymmB, anyptr_gpC); + a.vfnmsub231ps(ymmA, ymmB, ymmC); + a.vfnmsub231sd(xmmA, xmmB, anyptr_gpC); + a.vfnmsub231sd(xmmA, xmmB, xmmC); + a.vfnmsub231ss(xmmA, xmmB, anyptr_gpC); + a.vfnmsub231ss(xmmA, xmmB, xmmC); 
+ + // FMA4. + a.nop(); + + a.vfmaddpd(xmmA, xmmB, xmmC, xmmD); + a.vfmaddpd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfmaddpd(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfmaddpd(ymmA, ymmB, ymmC, ymmD); + a.vfmaddpd(ymmA, ymmB, anyptr_gpC, ymmD); + a.vfmaddpd(ymmA, ymmB, ymmC, anyptr_gpA); + a.vfmaddps(xmmA, xmmB, xmmC, xmmD); + a.vfmaddps(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfmaddps(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfmaddps(ymmA, ymmB, ymmC, ymmD); + a.vfmaddps(ymmA, ymmB, anyptr_gpC, ymmD); + a.vfmaddps(ymmA, ymmB, ymmC, anyptr_gpA); + a.vfmaddsd(xmmA, xmmB, xmmC, xmmD); + a.vfmaddsd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfmaddsd(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfmaddss(xmmA, xmmB, xmmC, xmmD); + a.vfmaddss(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfmaddss(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfmaddsubpd(xmmA, xmmB, xmmC, xmmD); + a.vfmaddsubpd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfmaddsubpd(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfmaddsubpd(ymmA, ymmB, ymmC, ymmD); + a.vfmaddsubpd(ymmA, ymmB, anyptr_gpC, ymmD); + a.vfmaddsubpd(ymmA, ymmB, ymmC, anyptr_gpA); + a.vfmaddsubps(xmmA, xmmB, xmmC, xmmD); + a.vfmaddsubps(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfmaddsubps(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfmaddsubps(ymmA, ymmB, ymmC, ymmD); + a.vfmaddsubps(ymmA, ymmB, anyptr_gpC, ymmD); + a.vfmaddsubps(ymmA, ymmB, ymmC, anyptr_gpA); + a.vfmsubaddpd(xmmA, xmmB, xmmC, xmmD); + a.vfmsubaddpd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfmsubaddpd(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfmsubaddpd(ymmA, ymmB, ymmC, ymmD); + a.vfmsubaddpd(ymmA, ymmB, anyptr_gpC, ymmD); + a.vfmsubaddpd(ymmA, ymmB, ymmC, anyptr_gpA); + a.vfmsubaddps(xmmA, xmmB, xmmC, xmmD); + a.vfmsubaddps(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfmsubaddps(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfmsubaddps(ymmA, ymmB, ymmC, ymmD); + a.vfmsubaddps(ymmA, ymmB, anyptr_gpC, ymmD); + a.vfmsubaddps(ymmA, ymmB, ymmC, anyptr_gpA); + a.vfmsubpd(xmmA, xmmB, xmmC, xmmD); + a.vfmsubpd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfmsubpd(xmmA, xmmB, xmmC, anyptr_gpA); + 
a.vfmsubpd(ymmA, ymmB, ymmC, ymmD); + a.vfmsubpd(ymmA, ymmB, anyptr_gpC, ymmD); + a.vfmsubpd(ymmA, ymmB, ymmC, anyptr_gpA); + a.vfmsubps(xmmA, xmmB, xmmC, xmmD); + a.vfmsubps(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfmsubps(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfmsubps(ymmA, ymmB, ymmC, ymmD); + a.vfmsubps(ymmA, ymmB, anyptr_gpC, ymmD); + a.vfmsubps(ymmA, ymmB, ymmC, anyptr_gpA); + a.vfmsubsd(xmmA, xmmB, xmmC, xmmD); + a.vfmsubsd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfmsubsd(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfmsubss(xmmA, xmmB, xmmC, xmmD); + a.vfmsubss(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfmsubss(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfnmaddpd(xmmA, xmmB, xmmC, xmmD); + a.vfnmaddpd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfnmaddpd(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfnmaddpd(ymmA, ymmB, ymmC, ymmD); + a.vfnmaddpd(ymmA, ymmB, anyptr_gpC, ymmD); + a.vfnmaddpd(ymmA, ymmB, ymmC, anyptr_gpA); + a.vfnmaddps(xmmA, xmmB, xmmC, xmmD); + a.vfnmaddps(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfnmaddps(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfnmaddps(ymmA, ymmB, ymmC, ymmD); + a.vfnmaddps(ymmA, ymmB, anyptr_gpC, ymmD); + a.vfnmaddps(ymmA, ymmB, ymmC, anyptr_gpA); + a.vfnmaddsd(xmmA, xmmB, xmmC, xmmD); + a.vfnmaddsd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfnmaddsd(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfnmaddss(xmmA, xmmB, xmmC, xmmD); + a.vfnmaddss(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfnmaddss(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfnmsubpd(xmmA, xmmB, xmmC, xmmD); + a.vfnmsubpd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfnmsubpd(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfnmsubpd(ymmA, ymmB, ymmC, ymmD); + a.vfnmsubpd(ymmA, ymmB, anyptr_gpC, ymmD); + a.vfnmsubpd(ymmA, ymmB, ymmC, anyptr_gpA); + a.vfnmsubps(xmmA, xmmB, xmmC, xmmD); + a.vfnmsubps(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfnmsubps(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfnmsubps(ymmA, ymmB, ymmC, ymmD); + a.vfnmsubps(ymmA, ymmB, anyptr_gpC, ymmD); + a.vfnmsubps(ymmA, ymmB, ymmC, anyptr_gpA); + a.vfnmsubsd(xmmA, xmmB, xmmC, xmmD); + a.vfnmsubsd(xmmA, xmmB, anyptr_gpC, 
xmmD); + a.vfnmsubsd(xmmA, xmmB, xmmC, anyptr_gpA); + a.vfnmsubss(xmmA, xmmB, xmmC, xmmD); + a.vfnmsubss(xmmA, xmmB, anyptr_gpC, xmmD); + a.vfnmsubss(xmmA, xmmB, xmmC, anyptr_gpA); + + // XOP. + a.nop(); + + a.vfrczpd(xmmA, xmmB); + a.vfrczpd(xmmA, anyptr_gpB); + a.vfrczpd(ymmA, ymmB); + a.vfrczpd(ymmA, anyptr_gpB); + a.vfrczps(xmmA, xmmB); + a.vfrczps(xmmA, anyptr_gpB); + a.vfrczps(ymmA, ymmB); + a.vfrczps(ymmA, anyptr_gpB); + a.vfrczsd(xmmA, xmmB); + a.vfrczsd(xmmA, anyptr_gpB); + a.vfrczss(xmmA, xmmB); + a.vfrczss(xmmA, anyptr_gpB); + a.vpcmov(xmmA, xmmB, xmmC, xmmD); + a.vpcmov(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpcmov(xmmA, xmmB, xmmC, anyptr_gpA); + a.vpcmov(ymmA, ymmB, ymmC, ymmD); + a.vpcmov(ymmA, ymmB, anyptr_gpC, ymmD); + a.vpcmov(ymmA, ymmB, ymmC, anyptr_gpA); + a.vpcomb(xmmA, xmmB, xmmC, 0); + a.vpcomb(xmmA, xmmB, anyptr_gpC, 0); + a.vpcomd(xmmA, xmmB, xmmC, 0); + a.vpcomd(xmmA, xmmB, anyptr_gpC, 0); + a.vpcomq(xmmA, xmmB, xmmC, 0); + a.vpcomq(xmmA, xmmB, anyptr_gpC, 0); + a.vpcomw(xmmA, xmmB, xmmC, 0); + a.vpcomw(xmmA, xmmB, anyptr_gpC, 0); + a.vpcomub(xmmA, xmmB, xmmC, 0); + a.vpcomub(xmmA, xmmB, anyptr_gpC, 0); + a.vpcomud(xmmA, xmmB, xmmC, 0); + a.vpcomud(xmmA, xmmB, anyptr_gpC, 0); + a.vpcomuq(xmmA, xmmB, xmmC, 0); + a.vpcomuq(xmmA, xmmB, anyptr_gpC, 0); + a.vpcomuw(xmmA, xmmB, xmmC, 0); + a.vpcomuw(xmmA, xmmB, anyptr_gpC, 0); + a.vpermil2pd(xmmA, xmmB, xmmC, xmmD); + a.vpermil2pd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpermil2pd(xmmA, xmmB, xmmC, anyptr_gpA); + a.vpermil2pd(ymmA, ymmB, ymmC, ymmD); + a.vpermil2pd(ymmA, ymmB, anyptr_gpC, ymmD); + a.vpermil2pd(ymmA, ymmB, ymmC, anyptr_gpA); + a.vpermil2ps(xmmA, xmmB, xmmC, xmmD); + a.vpermil2ps(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpermil2ps(xmmA, xmmB, xmmC, anyptr_gpA); + a.vpermil2ps(ymmA, ymmB, ymmC, ymmD); + a.vpermil2ps(ymmA, ymmB, anyptr_gpC, ymmD); + a.vpermil2ps(ymmA, ymmB, ymmC, anyptr_gpA); + a.vphaddbd(xmmA, xmmB); + a.vphaddbd(xmmA, anyptr_gpB); + a.vphaddbq(xmmA, xmmB); + a.vphaddbq(xmmA, 
anyptr_gpB); + a.vphaddbw(xmmA, xmmB); + a.vphaddbw(xmmA, anyptr_gpB); + a.vphadddq(xmmA, xmmB); + a.vphadddq(xmmA, anyptr_gpB); + a.vphaddwd(xmmA, xmmB); + a.vphaddwd(xmmA, anyptr_gpB); + a.vphaddwq(xmmA, xmmB); + a.vphaddwq(xmmA, anyptr_gpB); + a.vphaddubd(xmmA, xmmB); + a.vphaddubd(xmmA, anyptr_gpB); + a.vphaddubq(xmmA, xmmB); + a.vphaddubq(xmmA, anyptr_gpB); + a.vphaddubw(xmmA, xmmB); + a.vphaddubw(xmmA, anyptr_gpB); + a.vphaddudq(xmmA, xmmB); + a.vphaddudq(xmmA, anyptr_gpB); + a.vphadduwd(xmmA, xmmB); + a.vphadduwd(xmmA, anyptr_gpB); + a.vphadduwq(xmmA, xmmB); + a.vphadduwq(xmmA, anyptr_gpB); + a.vphsubbw(xmmA, xmmB); + a.vphsubbw(xmmA, anyptr_gpB); + a.vphsubdq(xmmA, xmmB); + a.vphsubdq(xmmA, anyptr_gpB); + a.vphsubwd(xmmA, xmmB); + a.vphsubwd(xmmA, anyptr_gpB); + a.vpmacsdd(xmmA, xmmB, xmmC, xmmD); + a.vpmacsdd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpmacsdqh(xmmA, xmmB, xmmC, xmmD); + a.vpmacsdqh(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpmacsdql(xmmA, xmmB, xmmC, xmmD); + a.vpmacsdql(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpmacswd(xmmA, xmmB, xmmC, xmmD); + a.vpmacswd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpmacsww(xmmA, xmmB, xmmC, xmmD); + a.vpmacsww(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpmacssdd(xmmA, xmmB, xmmC, xmmD); + a.vpmacssdd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpmacssdqh(xmmA, xmmB, xmmC, xmmD); + a.vpmacssdqh(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpmacssdql(xmmA, xmmB, xmmC, xmmD); + a.vpmacssdql(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpmacsswd(xmmA, xmmB, xmmC, xmmD); + a.vpmacsswd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpmacssww(xmmA, xmmB, xmmC, xmmD); + a.vpmacssww(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpmadcsswd(xmmA, xmmB, xmmC, xmmD); + a.vpmadcsswd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpmadcswd(xmmA, xmmB, xmmC, xmmD); + a.vpmadcswd(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpperm(xmmA, xmmB, xmmC, xmmD); + a.vpperm(xmmA, xmmB, anyptr_gpC, xmmD); + a.vpperm(xmmA, xmmB, xmmC, anyptr_gpA); + a.vprotb(xmmA, xmmB, xmmC); + a.vprotb(xmmA, anyptr_gpB, xmmC); + a.vprotb(xmmA, xmmB, 
anyptr_gpC); + a.vprotb(xmmA, xmmB, 0); + a.vprotb(xmmA, anyptr_gpB, 0); + a.vprotd(xmmA, xmmB, xmmC); + a.vprotd(xmmA, anyptr_gpB, xmmC); + a.vprotd(xmmA, xmmB, anyptr_gpC); + a.vprotd(xmmA, xmmB, 0); + a.vprotd(xmmA, anyptr_gpB, 0); + a.vprotq(xmmA, xmmB, xmmC); + a.vprotq(xmmA, anyptr_gpB, xmmC); + a.vprotq(xmmA, xmmB, anyptr_gpC); + a.vprotq(xmmA, xmmB, 0); + a.vprotq(xmmA, anyptr_gpB, 0); + a.vprotw(xmmA, xmmB, xmmC); + a.vprotw(xmmA, anyptr_gpB, xmmC); + a.vprotw(xmmA, xmmB, anyptr_gpC); + a.vprotw(xmmA, xmmB, 0); + a.vprotw(xmmA, anyptr_gpB, 0); + a.vpshab(xmmA, xmmB, xmmC); + a.vpshab(xmmA, anyptr_gpB, xmmC); + a.vpshab(xmmA, xmmB, anyptr_gpC); + a.vpshad(xmmA, xmmB, xmmC); + a.vpshad(xmmA, anyptr_gpB, xmmC); + a.vpshad(xmmA, xmmB, anyptr_gpC); + a.vpshaq(xmmA, xmmB, xmmC); + a.vpshaq(xmmA, anyptr_gpB, xmmC); + a.vpshaq(xmmA, xmmB, anyptr_gpC); + a.vpshaw(xmmA, xmmB, xmmC); + a.vpshaw(xmmA, anyptr_gpB, xmmC); + a.vpshaw(xmmA, xmmB, anyptr_gpC); + a.vpshlb(xmmA, xmmB, xmmC); + a.vpshlb(xmmA, anyptr_gpB, xmmC); + a.vpshlb(xmmA, xmmB, anyptr_gpC); + a.vpshld(xmmA, xmmB, xmmC); + a.vpshld(xmmA, anyptr_gpB, xmmC); + a.vpshld(xmmA, xmmB, anyptr_gpC); + a.vpshlq(xmmA, xmmB, xmmC); + a.vpshlq(xmmA, anyptr_gpB, xmmC); + a.vpshlq(xmmA, xmmB, anyptr_gpC); + a.vpshlw(xmmA, xmmB, xmmC); + a.vpshlw(xmmA, anyptr_gpB, xmmC); + a.vpshlw(xmmA, xmmB, anyptr_gpC); + + // BMI. + a.nop(); + + a.andn(gzA, gzB, gzC); + a.andn(gzA, gzB, anyptr_gpB); + a.bextr(gzA, gzB, gzC); + a.bextr(gzA, anyptr_gpB, gzC); + a.blsi(gzA, gzB); + a.blsi(gzA, anyptr_gpB); + a.blsmsk(gzA, gzB); + a.blsmsk(gzA, anyptr_gpB); + a.blsr(gzA, gzB); + a.blsr(gzA, anyptr_gpB); + + // LZCNT. + a.nop(); + + a.lzcnt(gzA, gzB); + a.lzcnt(gzA, anyptr_gpB); + + // TZCNT. + a.nop(); + + a.tzcnt(gzA, gzB); + a.tzcnt(gzA, anyptr_gpB); + + // BMI2. 
+ a.nop(); + + a.bzhi(gzA, gzB, gzC); + a.bzhi(gzA, anyptr_gpB, gzC); + a.mulx(gzA, gzB, gzC); + a.mulx(gzA, gzB, anyptr_gpB); + a.pdep(gzA, gzB, gzC); + a.pdep(gzA, gzB, anyptr_gpB); + a.pext(gzA, gzB, gzC); + a.pext(gzA, gzB, anyptr_gpB); + a.rorx(gzA, gzB, 0); + a.rorx(gzA, anyptr_gpB, 0); + a.sarx(gzA, gzB, gzC); + a.sarx(gzA, anyptr_gpB, gzC); + a.shlx(gzA, gzB, gzC); + a.shlx(gzA, anyptr_gpB, gzC); + a.shrx(gzA, gzB, gzC); + a.shrx(gzA, anyptr_gpB, gzC); + + // RDRAND. + a.nop(); + + a.rdrand(gzA); + + // F16C. + a.nop(); + + a.vcvtph2ps(xmmA, xmmB); + a.vcvtph2ps(xmmA, anyptr_gpB); + a.vcvtph2ps(ymmA, xmmB); + a.vcvtph2ps(ymmA, anyptr_gpB); + a.vcvtps2ph(xmmA, xmmB, 0); + a.vcvtps2ph(anyptr_gpA, xmmB, 0); + a.vcvtps2ph(xmmA, ymmB, 0); + a.vcvtps2ph(anyptr_gpA, ymmB, 0); + + // Mark the end of the stream. + a.nop(); +} + +} // asmgen namespace + +// [Guard] +#endif // _TEST_ASMJIT_TEST_OPCODE_H diff --git a/src/asmjit/test/main.cpp b/src/test/asmjit_test_unit.cpp similarity index 52% rename from src/asmjit/test/main.cpp rename to src/test/asmjit_test_unit.cpp index 3b05ece..32626d3 100644 --- a/src/asmjit/test/main.cpp +++ b/src/test/asmjit_test_unit.cpp @@ -5,9 +5,7 @@ // Zlib - See LICENSE.md file in the package. 
// [Dependencies - AsmJit] -#include "../asmjit.h" - -using namespace asmjit; +#include "../asmjit/asmjit.h" // ============================================================================ // [DumpCpu] @@ -18,14 +16,14 @@ struct DumpCpuFeature { const char* name; }; -static void dumpCpuFeatures(const CpuInfo* cpuInfo, const DumpCpuFeature* data, size_t count) { +static void dumpCpuFeatures(const asmjit::CpuInfo* cpuInfo, const DumpCpuFeature* data, size_t count) { for (size_t i = 0; i < count; i++) if (cpuInfo->hasFeature(data[i].feature)) INFO(" %s", data[i].name); } static void dumpCpu(void) { - const CpuInfo* cpu = CpuInfo::getHost(); + const asmjit::CpuInfo* cpu = asmjit::CpuInfo::getHost(); INFO("Host CPU Info:"); INFO(" Vendor string : %s", cpu->getVendorString()); @@ -41,67 +39,67 @@ static void dumpCpu(void) { // -------------------------------------------------------------------------- #if defined(ASMJIT_ARCH_X86) || defined(ASMJIT_ARCH_X64) - const X86CpuInfo* x86Cpu = static_cast(cpu); + const asmjit::X86CpuInfo* x86Cpu = static_cast(cpu); static const DumpCpuFeature x86FeaturesList[] = { - { kX86CpuFeatureNX , "NX (Non-Execute Bit)" }, - { kX86CpuFeatureMT , "MT (Multi-Threading)" }, - { kX86CpuFeatureRDTSC , "RDTSC" }, - { kX86CpuFeatureRDTSCP , "RDTSCP" }, - { kX86CpuFeatureCMOV , "CMOV" }, - { kX86CpuFeatureCMPXCHG8B , "CMPXCHG8B" }, - { kX86CpuFeatureCMPXCHG16B , "CMPXCHG16B" }, - { kX86CpuFeatureCLFLUSH , "CLFLUSH" }, - { kX86CpuFeatureCLFLUSHOpt , "CLFLUSH (Opt)" }, - { kX86CpuFeaturePREFETCH , "PREFETCH" }, - { kX86CpuFeaturePREFETCHWT1 , "PREFETCHWT1" }, - { kX86CpuFeatureLahfSahf , "LAHF/SAHF" }, - { kX86CpuFeatureFXSR , "FXSR" }, - { kX86CpuFeatureFXSROpt , "FXSR (Opt)" }, - { kX86CpuFeatureMMX , "MMX" }, - { kX86CpuFeatureMMX2 , "MMX2" }, - { kX86CpuFeature3DNOW , "3DNOW" }, - { kX86CpuFeature3DNOW2 , "3DNOW2" }, - { kX86CpuFeatureSSE , "SSE" }, - { kX86CpuFeatureSSE2 , "SSE2" }, - { kX86CpuFeatureSSE3 , "SSE3" }, - { kX86CpuFeatureSSSE3 , 
"SSSE3" }, - { kX86CpuFeatureSSE4A , "SSE4A" }, - { kX86CpuFeatureSSE4_1 , "SSE4.1" }, - { kX86CpuFeatureSSE4_2 , "SSE4.2" }, - { kX86CpuFeatureMSSE , "Misaligned SSE" }, - { kX86CpuFeatureMONITOR , "MONITOR/MWAIT" }, - { kX86CpuFeatureMOVBE , "MOVBE" }, - { kX86CpuFeaturePOPCNT , "POPCNT" }, - { kX86CpuFeatureLZCNT , "LZCNT" }, - { kX86CpuFeatureAESNI , "AESNI" }, - { kX86CpuFeaturePCLMULQDQ , "PCLMULQDQ" }, - { kX86CpuFeatureRDRAND , "RDRAND" }, - { kX86CpuFeatureRDSEED , "RDSEED" }, - { kX86CpuFeatureSHA , "SHA" }, - { kX86CpuFeatureXSave , "XSAVE" }, - { kX86CpuFeatureXSaveOS , "XSAVE (OS)" }, - { kX86CpuFeatureAVX , "AVX" }, - { kX86CpuFeatureAVX2 , "AVX2" }, - { kX86CpuFeatureF16C , "F16C" }, - { kX86CpuFeatureFMA3 , "FMA3" }, - { kX86CpuFeatureFMA4 , "FMA4" }, - { kX86CpuFeatureXOP , "XOP" }, - { kX86CpuFeatureBMI , "BMI" }, - { kX86CpuFeatureBMI2 , "BMI2" }, - { kX86CpuFeatureHLE , "HLE" }, - { kX86CpuFeatureRTM , "RTM" }, - { kX86CpuFeatureADX , "ADX" }, - { kX86CpuFeatureMPX , "MPX" }, - { kX86CpuFeatureFSGSBase , "FS/GS Base" }, - { kX86CpuFeatureMOVSBSTOSBOpt , "REP MOVSB/STOSB (Opt)" }, - { kX86CpuFeatureAVX512F , "AVX512F" }, - { kX86CpuFeatureAVX512CD , "AVX512CD" }, - { kX86CpuFeatureAVX512PF , "AVX512PF" }, - { kX86CpuFeatureAVX512ER , "AVX512ER" }, - { kX86CpuFeatureAVX512DQ , "AVX512DQ" }, - { kX86CpuFeatureAVX512BW , "AVX512BW" }, - { kX86CpuFeatureAVX512VL , "AVX512VL" } + { asmjit::kX86CpuFeatureNX , "NX (Non-Execute Bit)" }, + { asmjit::kX86CpuFeatureMT , "MT (Multi-Threading)" }, + { asmjit::kX86CpuFeatureRDTSC , "RDTSC" }, + { asmjit::kX86CpuFeatureRDTSCP , "RDTSCP" }, + { asmjit::kX86CpuFeatureCMOV , "CMOV" }, + { asmjit::kX86CpuFeatureCMPXCHG8B , "CMPXCHG8B" }, + { asmjit::kX86CpuFeatureCMPXCHG16B , "CMPXCHG16B" }, + { asmjit::kX86CpuFeatureCLFLUSH , "CLFLUSH" }, + { asmjit::kX86CpuFeatureCLFLUSHOpt , "CLFLUSH (Opt)" }, + { asmjit::kX86CpuFeaturePREFETCH , "PREFETCH" }, + { asmjit::kX86CpuFeaturePREFETCHWT1 , "PREFETCHWT1" }, + { 
asmjit::kX86CpuFeatureLahfSahf , "LAHF/SAHF" }, + { asmjit::kX86CpuFeatureFXSR , "FXSR" }, + { asmjit::kX86CpuFeatureFXSROpt , "FXSR (Opt)" }, + { asmjit::kX86CpuFeatureMMX , "MMX" }, + { asmjit::kX86CpuFeatureMMX2 , "MMX2" }, + { asmjit::kX86CpuFeature3DNOW , "3DNOW" }, + { asmjit::kX86CpuFeature3DNOW2 , "3DNOW2" }, + { asmjit::kX86CpuFeatureSSE , "SSE" }, + { asmjit::kX86CpuFeatureSSE2 , "SSE2" }, + { asmjit::kX86CpuFeatureSSE3 , "SSE3" }, + { asmjit::kX86CpuFeatureSSSE3 , "SSSE3" }, + { asmjit::kX86CpuFeatureSSE4A , "SSE4A" }, + { asmjit::kX86CpuFeatureSSE4_1 , "SSE4.1" }, + { asmjit::kX86CpuFeatureSSE4_2 , "SSE4.2" }, + { asmjit::kX86CpuFeatureMSSE , "Misaligned SSE" }, + { asmjit::kX86CpuFeatureMONITOR , "MONITOR/MWAIT" }, + { asmjit::kX86CpuFeatureMOVBE , "MOVBE" }, + { asmjit::kX86CpuFeaturePOPCNT , "POPCNT" }, + { asmjit::kX86CpuFeatureLZCNT , "LZCNT" }, + { asmjit::kX86CpuFeatureAESNI , "AESNI" }, + { asmjit::kX86CpuFeaturePCLMULQDQ , "PCLMULQDQ" }, + { asmjit::kX86CpuFeatureRDRAND , "RDRAND" }, + { asmjit::kX86CpuFeatureRDSEED , "RDSEED" }, + { asmjit::kX86CpuFeatureSHA , "SHA" }, + { asmjit::kX86CpuFeatureXSave , "XSAVE" }, + { asmjit::kX86CpuFeatureXSaveOS , "XSAVE (OS)" }, + { asmjit::kX86CpuFeatureAVX , "AVX" }, + { asmjit::kX86CpuFeatureAVX2 , "AVX2" }, + { asmjit::kX86CpuFeatureF16C , "F16C" }, + { asmjit::kX86CpuFeatureFMA3 , "FMA3" }, + { asmjit::kX86CpuFeatureFMA4 , "FMA4" }, + { asmjit::kX86CpuFeatureXOP , "XOP" }, + { asmjit::kX86CpuFeatureBMI , "BMI" }, + { asmjit::kX86CpuFeatureBMI2 , "BMI2" }, + { asmjit::kX86CpuFeatureHLE , "HLE" }, + { asmjit::kX86CpuFeatureRTM , "RTM" }, + { asmjit::kX86CpuFeatureADX , "ADX" }, + { asmjit::kX86CpuFeatureMPX , "MPX" }, + { asmjit::kX86CpuFeatureFSGSBase , "FS/GS Base" }, + { asmjit::kX86CpuFeatureMOVSBSTOSBOpt , "REP MOVSB/STOSB (Opt)" }, + { asmjit::kX86CpuFeatureAVX512F , "AVX512F" }, + { asmjit::kX86CpuFeatureAVX512CD , "AVX512CD" }, + { asmjit::kX86CpuFeatureAVX512PF , "AVX512PF" }, + { 
asmjit::kX86CpuFeatureAVX512ER , "AVX512ER" }, + { asmjit::kX86CpuFeatureAVX512DQ , "AVX512DQ" }, + { asmjit::kX86CpuFeatureAVX512BW , "AVX512BW" }, + { asmjit::kX86CpuFeatureAVX512VL , "AVX512VL" } }; INFO("Host CPU Info (X86/X64):"); diff --git a/src/app/test/asmjit_test_x86.cpp b/src/test/asmjit_test_x86.cpp similarity index 97% rename from src/app/test/asmjit_test_x86.cpp rename to src/test/asmjit_test_x86.cpp index 04caa45..5a330c8 100644 --- a/src/app/test/asmjit_test_x86.cpp +++ b/src/test/asmjit_test_x86.cpp @@ -5,10 +5,10 @@ // Zlib - See LICENSE.md file in the package. // [Dependencies - AsmJit] -#include +#include "../asmjit/asmjit.h" // [Dependencies - Test] -#include "genblend.h" +#include "./genblend.h" // [Dependencies - C] #include @@ -1799,7 +1799,7 @@ struct X86Test_CallBase : public X86Test { // Call function. X86GpVar fn(c, kVarTypeIntPtr, "fn"); - c.mov(fn, imm_ptr((void*)calledFunc)); + c.mov(fn, imm_ptr(calledFunc)); X86CallNode* call = c.call(fn, kFuncConvHost, FuncBuilder3()); call->setArg(0, v2); @@ -1845,7 +1845,7 @@ struct X86Test_CallFast : public X86Test { c.addFunc(kFuncConvHost, FuncBuilder1()); c.setArg(0, var); - c.mov(fn, imm_ptr((void*)calledFunc)); + c.mov(fn, imm_ptr(calledFunc)); X86CallNode* call; call = c.call(fn, kFuncConvHostFastCall, FuncBuilder1()); @@ -1910,7 +1910,7 @@ struct X86Test_CallManyArgs : public X86Test { X86GpVar vi(c, kVarTypeInt32, "vi"); X86GpVar vj(c, kVarTypeInt32, "vj"); - c.mov(fn, imm_ptr((void*)calledFunc)); + c.mov(fn, imm_ptr(calledFunc)); c.mov(va, 0x03); c.mov(vb, 0x12); c.mov(vc, 0xA0); @@ -1977,7 +1977,7 @@ struct X86Test_CallDuplicateArgs : public X86Test { X86GpVar fn(c, kVarTypeIntPtr, "fn"); X86GpVar a(c, kVarTypeInt32, "a"); - c.mov(fn, imm_ptr((void*)calledFunc)); + c.mov(fn, imm_ptr(calledFunc)); c.mov(a, 3); // Call function. 
@@ -2031,7 +2031,7 @@ struct X86Test_CallImmArgs : public X86Test { X86GpVar fn(c, kVarTypeIntPtr, "fn"); X86GpVar rv(c, kVarTypeInt32, "rv"); - c.mov(fn, imm_ptr((void*)X86Test_CallManyArgs::calledFunc)); + c.mov(fn, imm_ptr(X86Test_CallManyArgs::calledFunc)); // Call function. X86CallNode* call = c.call(fn, kFuncConvHost, @@ -2097,7 +2097,7 @@ struct X86Test_CallPtrArgs : public X86Test { X86GpVar fn(c, kVarTypeIntPtr, "fn"); X86GpVar rv(c, kVarTypeInt32, "rv"); - c.mov(fn, imm_ptr((void*)calledFunc)); + c.mov(fn, imm_ptr(calledFunc)); // Call function. X86CallNode* call = c.call(fn, kFuncConvHost, @@ -2159,7 +2159,7 @@ struct X86Test_CallFloatAsXmmRet : public X86Test { // Prepare. X86GpVar fn(c); - c.mov(fn, imm_ptr((void*)calledFunc)); + c.mov(fn, imm_ptr(calledFunc)); // Call function. X86CallNode* call = c.call(fn, kFuncConvHost, @@ -2213,7 +2213,7 @@ struct X86Test_CallDoubleAsXmmRet : public X86Test { c.setArg(1, b); X86GpVar fn(c); - c.mov(fn, imm_ptr((void*)calledFunc)); + c.mov(fn, imm_ptr(calledFunc)); X86CallNode* call = c.call(fn, kFuncConvHost, FuncBuilder2()); @@ -2474,7 +2474,7 @@ struct X86Test_CallMisc1 : public X86Test { c.alloc(a, x86::eax); c.alloc(b, x86::ebx); - X86CallNode* call = c.call(imm_ptr((void*)dummy), kFuncConvHost, FuncBuilder2()); + X86CallNode* call = c.call(imm_ptr(dummy), kFuncConvHost, FuncBuilder2()); call->setArg(0, a); call->setArg(1, b); @@ -2520,7 +2520,7 @@ struct X86Test_CallMisc2 : public X86Test { c.setArg(0, p); c.movsd(arg, x86::ptr(p)); - c.mov(fn, imm_ptr((void*)op)); + c.mov(fn, imm_ptr(op)); X86CallNode* call = c.call(fn, kFuncConvHost, FuncBuilder1()); call->setArg(0, arg); @@ -2570,7 +2570,7 @@ struct X86Test_CallMisc3 : public X86Test { c.setArg(0, p); c.movsd(arg, x86::ptr(p)); - c.mov(fn, imm_ptr((void*)op)); + c.mov(fn, imm_ptr(op)); X86CallNode* call = c.call(fn, kFuncConvHost, FuncBuilder1()); call->setArg(0, arg); @@ -2601,6 +2601,49 @@ struct X86Test_CallMisc3 : public X86Test { static double 
op(double a) { return a * a; } }; +// ============================================================================ +// [X86Test_CallMisc4] +// ============================================================================ + +struct X86Test_CallMisc4 : public X86Test { + X86Test_CallMisc4() : X86Test("[Call] Misc #4") {} + + static void add(PodVector<X86Test*>& tests) { + tests.append(new X86Test_CallMisc4()); + } + + virtual void compile(X86Compiler& c) { + FuncBuilderX funcPrototype; + funcPrototype.setRet(kVarTypeFp64); + X86FuncNode* func = c.addFunc(kFuncConvHost, funcPrototype); + + FuncBuilderX callPrototype; + callPrototype.setRet(kVarTypeFp64); + X86CallNode* call = c.call(imm_ptr(calledFunc), kFuncConvHost, callPrototype); + + X86XmmVar ret(c, kX86VarTypeXmmSd, "ret"); + call->setRet(0, ret); + c.ret(ret); + + c.endFunc(); + } + + virtual bool run(void* _func, StringBuilder& result, StringBuilder& expect) { + typedef double (*Func)(void); + Func func = asmjit_cast<Func>(_func); + + double resultRet = func(); + double expectRet = 3.14; + + result.setFormat("ret=%g", resultRet); + expect.setFormat("ret=%g", expectRet); + + return resultRet == expectRet; + } + + static double calledFunc() { return 3.14; } +}; + // ============================================================================ // [X86Test_MiscConstPool] // ============================================================================ @@ -2732,14 +2775,14 @@ struct X86Test_MiscMultiRet : public X86Test { expect.setFormat("ret={%d %d %d %d}", e0, e1, e2, e3); return result.eq(expect); -} + } }; // ============================================================================ // [X86Test_MiscUnfollow] // ============================================================================ -// Global (I didn't find better way to really test this). +// Global (I didn't find a better way to test this). 
static jmp_buf globalJmpBuf; struct X86Test_MiscUnfollow : public X86Test { @@ -2881,6 +2924,7 @@ X86TestSuite::X86TestSuite() : ADD_TEST(X86Test_CallMisc1); ADD_TEST(X86Test_CallMisc2); ADD_TEST(X86Test_CallMisc3); + ADD_TEST(X86Test_CallMisc4); // Misc. ADD_TEST(X86Test_MiscConstPool); diff --git a/src/asmjit/test/broken.cpp b/src/test/broken.cpp similarity index 100% rename from src/asmjit/test/broken.cpp rename to src/test/broken.cpp diff --git a/src/asmjit/test/broken.h b/src/test/broken.h similarity index 100% rename from src/asmjit/test/broken.h rename to src/test/broken.h diff --git a/src/app/test/genblend.h b/src/test/genblend.h similarity index 96% rename from src/app/test/genblend.h rename to src/test/genblend.h index 83d5e41..a5e5c39 100644 --- a/src/app/test/genblend.h +++ b/src/test/genblend.h @@ -5,11 +5,11 @@ // Zlib - See LICENSE.md file in the package. // [Guard] -#ifndef _APP_TEST_GENBLEND_H -#define _APP_TEST_GENBLEND_H +#ifndef _TEST_GENBLEND_H +#define _TEST_GENBLEND_H // [Dependencies] -#include +#include "../asmjit/asmjit.h" namespace asmgen { @@ -177,4 +177,4 @@ static void blend(asmjit::X86Compiler& c) { } // asmgen namespace // [Guard] -#endif // _APP_TEST_GENBLEND_H +#endif // _TEST_GENBLEND_H